text.py 148 KB


  1. # -*- coding: utf-8 -*-
  2. """
  3. text.py
  4. """
  5. import logging
  6. logger = logging.getLogger(__name__)
  7. import regex
  8. import copy
  9. import bleach
  10. import json
  11. try:
  12. import re2 as re
  13. re.set_fallback_notification(re.FALLBACK_WARNING)
  14. except ImportError:
  15. logging.warning("Failed to load 're2'. Falling back to 're' for regular expression parsing. See https://github.com/blockspeiser/Sefaria-Project/wiki/Regular-Expression-Engines")
  16. import re
  17. from . import abstract as abst
  18. from schema import deserialize_tree, SchemaNode, JaggedArrayNode, TitledTreeNode, AddressTalmud, TermSet, TitleGroup
  19. import sefaria.system.cache as scache
  20. from sefaria.system.exceptions import InputError, BookNameError, PartialRefInputError, IndexSchemaError, NoVersionFoundError
  21. from sefaria.utils.talmud import daf_to_section
  22. from sefaria.utils.hebrew import is_hebrew, hebrew_term
  23. from sefaria.utils.util import list_depth
  24. from sefaria.datatype.jagged_array import JaggedTextArray, JaggedArray
  25. from sefaria.settings import DISABLE_INDEX_SAVE
  26. """
  27. ----------------------------------
  28. Index, IndexSet, CommentaryIndex
  29. ----------------------------------
  30. """
  31. class AbstractIndex(object):
  32. def contents(self, v2=False, raw=False, **kwargs):
  33. pass
  34. def versionSet(self):
  35. return VersionSet({"title": self.title})
  36. def versionState(self):
  37. from . import version_state
  38. return version_state.VersionState(self.title)
  39. def is_new_style(self):
  40. return bool(getattr(self, "nodes", None))
  41. def get_title(self, lang="en"):
  42. if lang == "en":
  43. return self._title
  44. if self.is_new_style():
  45. return self.nodes.primary_title(lang)
  46. else:
  47. return getattr(self, "heTitle", None)
  48. def set_title(self, title, lang="en"):
  49. if lang == "en":
  50. self._title = title #we need to store the title attr in a physical storage, not that .title is a virtual property
  51. if self.is_new_style():
  52. if lang == "en":
  53. self.nodes.key = title
  54. old_primary = self.nodes.primary_title(lang)
  55. self.nodes.add_title(title, lang, True, True)
  56. if old_primary != title: #then remove the old title, we don't want it.
  57. self.nodes.remove_title(old_primary, lang)
  58. title = property(get_title, set_title)
  59. def author_objects(self):
  60. from . import person
  61. return [person.Person().load({"key": k}) for k in getattr(self, "authors", []) if person.Person().load({"key": k})]
  62. def composition_time_period(self):
  63. return None
  64. def composition_place(self):
  65. return None
  66. def publication_place(self):
  67. return None
  68. def publication_time_period(self):
  69. return None
  70. class Index(abst.AbstractMongoRecord, AbstractIndex):
  71. """
  72. Index objects define the names and structure of texts stored in the system.
  73. There is an Index object for every simple text and for every commentator (e.g. "Rashi").
  74. Commentaries (like "Rashi on Exodus") are instanciated with :class:`CommentaryIndex` objects.
  75. """
  76. collection = 'index'
  77. history_noun = 'index'
  78. criteria_field = 'title'
  79. criteria_override_field = 'oldTitle' # used when primary attribute changes. field that holds old value.
  80. second_save = True
  81. track_pkeys = True
  82. pkeys = ["title"]
  83. required_attrs = [
  84. "title",
  85. "categories"
  86. ]
  87. optional_attrs = [
  88. "titleVariants", # required for old style
  89. "schema", # required for new style
  90. "sectionNames", # required for old style simple texts, sometimes erroneously present for commnetary
  91. "heTitle", # optional for old style
  92. "heTitleVariants", # optional for old style
  93. "maps", # optional for old style
  94. "alt_structs", # optional for new style
  95. "default_struct", # optional for new style
  96. "order", # optional for old style and new
  97. "length", # optional for old style
  98. "lengths", # optional for old style
  99. "transliteratedTitle",# optional for old style
  100. "authors",
  101. "enDesc",
  102. "heDesc",
  103. "pubDate",
  104. "compDate",
  105. "compPlace",
  106. "pubPlace",
  107. "errorMargin",
  108. "era",
  109. ]
  110. def __unicode__(self):
  111. return u"Index: {}".format(self.title)
  112. def __str__(self):
  113. return unicode(self).encode('utf-8')
  114. def __repr__(self): # Wanted to use orig_tref, but repr can not include Unicode
  115. return u"{}().load({{'title': '{}'}})".format(self.__class__.__name__, self.title)
  116. def save(self):
  117. if DISABLE_INDEX_SAVE:
  118. raise InputError("Index saving has been disabled on this system.")
  119. return super(Index, self).save()
  120. def _set_derived_attributes(self):
  121. if getattr(self, "schema", None):
  122. self.nodes = deserialize_tree(self.schema, index=self)
  123. self.nodes.validate()
  124. else:
  125. self.nodes = None
  126. self.struct_objs = {}
  127. if getattr(self, "alt_structs", None) and self.nodes:
  128. for name, struct in self.alt_structs.items():
  129. self.struct_objs[name] = deserialize_tree(struct, index=self, struct_class=TitledTreeNode)
  130. self.struct_objs[name].title_group = self.nodes.title_group
  131. self.struct_objs[name].validate()
  132. def is_complex(self):
  133. return getattr(self, "nodes", None) and self.nodes.has_children()
  134. def contents(self, v2=False, raw=False, **kwargs):
  135. if not getattr(self, "nodes", None) or raw: # Commentator
  136. return super(Index, self).contents()
  137. elif v2:
  138. return self.nodes.as_index_contents()
  139. return self.legacy_form()
  140. def legacy_form(self):
  141. """
  142. :return: Returns an Index object as a flat dictionary, in version one form.
  143. :raise: Exception if the Index cannot be expressed in the old form
  144. """
  145. if not self.nodes.is_flat():
  146. raise InputError("Index record {} can not be converted to legacy API form".format(self.title))
  147. d = {
  148. "title": self.title,
  149. "categories": self.categories,
  150. "titleVariants": self.nodes.all_node_titles("en"),
  151. "sectionNames": self.nodes.sectionNames,
  152. "heSectionNames": map(hebrew_term, self.nodes.sectionNames),
  153. "textDepth": len(self.nodes.sectionNames),
  154. "addressTypes": self.nodes.addressTypes # This isn't legacy, but it was needed for checkRef
  155. }
  156. if getattr(self, "maps", None):
  157. d["maps"] = self.maps #keep an eye on this. Format likely to change.
  158. if getattr(self, "order", None):
  159. d["order"] = self.order
  160. if getattr(self.nodes, "lengths", None):
  161. d["lengths"] = self.nodes.lengths
  162. d["length"] = self.nodes.lengths[0]
  163. if self.nodes.primary_title("he"):
  164. d["heTitle"] = self.nodes.primary_title("he")
  165. if self.nodes.all_node_titles("he"):
  166. d["heTitleVariants"] = self.nodes.all_node_titles("he")
  167. else:
  168. d["heTitleVariants"] = []
  169. return d
  170. def _saveable_attrs(self):
  171. d = {k: getattr(self, k) for k in self._saveable_attr_keys() if hasattr(self, k)}
  172. if getattr(self, "nodes", None):
  173. d["schema"] = self.nodes.serialize()
  174. if getattr(self, "struct_objs", None):
  175. d["alt_structs"] = {}
  176. for name, obj in self.struct_objs.items():
  177. c = obj.serialize()
  178. del c["titles"]
  179. d["alt_structs"][name] = c
  180. return d
  181. def is_commentary(self):
  182. return self.categories[0] == "Commentary"
  183. def get_commentary_indexes(self):
  184. if not self.is_commentary():
  185. return [self]
  186. return list({v.get_index() for v in library.get_commentary_versions(self.title)})
  187. def all_titles(self, lang):
  188. if self.nodes:
  189. return self.nodes.all_tree_titles(lang)
  190. else:
  191. return None # Handle commentary case differently?
  192. ''' Alternate Title Structures '''
  193. def set_alt_structure(self, name, struct_obj):
  194. """
  195. :param name: String
  196. :param struct_obj: :py.class:`TitledTreeNode`
  197. :return:
  198. """
  199. self.struct_objs[name] = struct_obj
  200. def get_alt_structure(self, name):
  201. """
  202. :returns: :py.class:`TitledTreeNode`
  203. """
  204. return self.struct_objs.get(name)
  205. def get_alt_structures(self):
  206. return self.struct_objs
  207. def has_alt_structures(self):
  208. return bool(self.struct_objs)
  209. #These next 3 functions parallel functions on Library, but are simpler. Refactor?
  210. def alt_titles_dict(self, lang):
  211. title_dict = {}
  212. for key, tree in self.get_alt_structures().items():
  213. title_dict.update(tree.title_dict(lang))
  214. return title_dict
  215. def alt_titles_regex(self, lang):
  216. full_title_list = self.alt_titles_dict(lang).keys()
  217. alt_titles = map(re.escape, full_title_list)
  218. reg = u'(?P<title>' + u'|'.join(sorted(alt_titles, key=len, reverse=True)) + ur')($|[:., ]+)'
  219. try:
  220. reg = re.compile(reg, max_mem= 256 * 1024 * 1024)
  221. except TypeError:
  222. reg = re.compile(reg)
  223. return reg
  224. def get_alt_struct_node(self, title, lang=None):
  225. if not lang:
  226. lang = "he" if is_hebrew(title) else "en"
  227. return self.alt_titles_dict(lang).get(title)
  228. def composition_place(self):
  229. from . import place
  230. if getattr(self, "compPlace", None) is None:
  231. return None
  232. return place.Place().load({"key": self.compPlace})
  233. def publication_place(self):
  234. from . import place
  235. if getattr(self, "pubPlace", None) is None:
  236. return None
  237. return place.Place().load({"key": self.pubPlace})
  238. # This is similar to logic on GardenStop
  239. def composition_time_period(self):
  240. return self._get_time_period("compDate", "errorMargin")
  241. def publication_time_period(self):
  242. return self._get_time_period("pubDate")
  243. def _get_time_period(self, date_field, margin_field=None):
  244. from . import time
  245. if not getattr(self, date_field, None):
  246. return None
  247. errorMargin = int(getattr(self, margin_field, 0)) if margin_field else 0
  248. startIsApprox = endIsApprox = errorMargin > 0
  249. try:
  250. year = int(getattr(self, date_field))
  251. start = year - errorMargin
  252. end = year + errorMargin
  253. except ValueError as e:
  254. years = getattr(self, date_field).split("-")
  255. if years[0] == "" and len(years) == 3: #Fix for first value being negative
  256. years[0] = -int(years[1])
  257. years[1] = int(years[2])
  258. start = int(years[0]) - errorMargin
  259. end = int(years[1]) + errorMargin
  260. return time.TimePeriod({
  261. "start": start,
  262. "startIsApprox": startIsApprox,
  263. "end": end,
  264. "endIsApprox": endIsApprox
  265. })
  266. #todo: handle lang
  267. def get_maps(self):
  268. """
  269. Returns both those maps explicitly defined on this node and those derived from a term scheme
  270. """
  271. return getattr(self, "maps", [])
  272. #todo: term schemes
  273. # Index changes behavior of load_from_dict, so this circumvents that changed behavior to call load_from_dict on the abstract superclass
  274. def update_from_dict(self, d):
  275. return super(Index, self).load_from_dict(d, is_init=False)
  276. def load_from_dict(self, d, is_init=False):
  277. if d:
  278. if not d.get("categories"):
  279. raise InputError(u"Please provide category for Index record: {}.".format(d.get("title")))
  280. # Data is being loaded from dict in old format, rewrite to new format
  281. # Assumption is that d has a complete title collection
  282. if "schema" not in d and d["categories"][0] != "Commentary":
  283. node = getattr(self, "nodes", None)
  284. if node:
  285. node._init_titles()
  286. else:
  287. node = JaggedArrayNode()
  288. node.key = d.get("title")
  289. sn = d.pop("sectionNames", None)
  290. if sn:
  291. node.sectionNames = sn
  292. node.depth = len(node.sectionNames)
  293. else:
  294. raise InputError(u"Please specify section names for Index record.")
  295. if d["categories"][0] == "Talmud":
  296. node.addressTypes = ["Talmud", "Integer"]
  297. if d["categories"][1] == "Bavli" and d.get("heTitle"):
  298. node.checkFirst = {
  299. "he": u"משנה" + " " + d.get("heTitle"),
  300. "en": "Mishnah " + d.get("title")
  301. }
  302. elif d["categories"][0] == "Mishnah":
  303. node.addressTypes = ["Perek", "Mishnah"]
  304. else:
  305. node.addressTypes = ["Integer" for x in range(node.depth)]
  306. l = d.pop("length", None)
  307. if l:
  308. node.lengths = [l]
  309. ls = d.pop("lengths", None)
  310. if ls:
  311. node.lengths = ls #overwrite if index.length is already there
  312. #Build titles
  313. node.add_title(d["title"], "en", True)
  314. tv = d.pop("titleVariants", None)
  315. if tv:
  316. for t in tv:
  317. lang = "he" if is_hebrew(t) else "en"
  318. node.add_title(t, lang)
  319. ht = d.pop("heTitle", None)
  320. if ht:
  321. node.add_title(ht, "he", True)
  322. htv = d.pop("heTitleVariants", None)
  323. if htv:
  324. for t in htv:
  325. node.add_title(t, "he")
  326. d["schema"] = node.serialize()
  327. # todo: should this functionality be on load()?
  328. if "oldTitle" in d and "title" in d and d["oldTitle"] != d["title"]:
  329. self.load({"title": d["oldTitle"]})
  330. # self.titleVariants.remove(d["oldTitle"]) # let this be determined by user
  331. return super(Index, self).load_from_dict(d, is_init)
  332. def _normalize(self):
  333. self.title = self.title.strip()
  334. self.title = self.title[0].upper() + self.title[1:]
  335. if isinstance(getattr(self, "authors", None), basestring):
  336. self.authors = [self.authors]
  337. if not self.is_commentary():
  338. if not self.is_new():
  339. for t in [self.title, self.nodes.primary_title("en"), self.nodes.key]: # This sets a precedence order
  340. if t != self.pkeys_orig_values["title"]: # One title changed, update all of them.
  341. self.title = t
  342. self.nodes.key = t
  343. self.nodes.add_title(t, "en", True, True)
  344. break
  345. if getattr(self, "nodes", None) is None:
  346. if not getattr(self, "titleVariants", None):
  347. self.titleVariants = []
  348. self.titleVariants = [v[0].upper() + v[1:] for v in self.titleVariants]
  349. # Ensure primary title is listed among title variants
  350. if self.title not in self.titleVariants:
  351. self.titleVariants.append(self.title)
  352. self.titleVariants = list(set([v for v in self.titleVariants if v]))
  353. # Not sure how these string values are sneaking in here...
  354. if getattr(self, "heTitleVariants", None) is not None and isinstance(self.heTitleVariants, basestring):
  355. self.heTitleVariants = [self.heTitleVariants]
  356. if getattr(self, "heTitle", None) is not None:
  357. if getattr(self, "heTitleVariants", None) is None:
  358. self.heTitleVariants = [self.heTitle]
  359. elif self.heTitle not in self.heTitleVariants:
  360. self.heTitleVariants.append(self.heTitle)
  361. self.heTitleVariants = list(set([v for v in getattr(self, "heTitleVariants", []) if v]))
  362. def _validate(self):
  363. assert super(Index, self)._validate()
  364. # Keys that should be non empty lists
  365. non_empty = ["categories"]
  366. ''' No longer required for new format
  367. if not self.is_commentary():
  368. non_empty.append("sectionNames")
  369. '''
  370. for key in non_empty:
  371. if not isinstance(getattr(self, key, None), list) or len(getattr(self, key, [])) == 0:
  372. raise InputError(u"{} field must be a non empty list of strings.".format(key))
  373. #allow only ASCII in text titles
  374. try:
  375. self.title.decode('ascii')
  376. except (UnicodeDecodeError, UnicodeEncodeError):
  377. raise InputError("Text title may contain only simple English characters.")
  378. # Disallow special characters in text titles
  379. if any((c in '.-\\/') for c in self.title):
  380. raise InputError("Text title may not contain periods, hyphens or slashes.")
  381. # Disallow special character in categories
  382. for cat in self.categories:
  383. if any((c in '.-') for c in cat):
  384. raise InputError("Categories may not contain periods or hyphens.")
  385. # Disallow special character in sectionNames
  386. if getattr(self, "sectionNames", None):
  387. for sec in self.sectionNames:
  388. if any((c in '.-\\/') for c in sec):
  389. raise InputError("Text Structure names may not contain periods, hyphens or slashes.")
  390. #New style records
  391. if self.nodes:
  392. # Make sure that all primary titles match
  393. if self.title != self.nodes.primary_title("en") or self.title != self.nodes.key:
  394. raise InputError(u"Primary titles mismatched in Index Record: {}, {}, {}"
  395. .format(self.title, self.nodes.primary_title("en"), self.nodes.key))
  396. # Make sure all titles are unique
  397. for lang in ["en", "he"]:
  398. all_titles = self.all_titles(lang)
  399. """
  400. # Note: Because these titles come from the keys of TitledTreeNode.titleDict(), there's no possibility for name collision.
  401. # todo: actually test for name collision
  402. if len(all_titles) != len(set(all_titles)):
  403. for title in all_titles:
  404. if all_titles.count(title) > 1:
  405. raise InputError(u'The title {} occurs twice in this Index record'.format(title))
  406. """
  407. for title in all_titles:
  408. existing = library.get_schema_node(title, lang)
  409. if existing and not self.same_record(existing.index) and existing.index.title != self.pkeys_orig_values.get("title"):
  410. raise InputError(u'A text called "{}" already exists.'.format(title))
  411. self.nodes.validate()
  412. for key, tree in self.get_alt_structures().items():
  413. tree.validate()
  414. else: # old style commentator record
  415. assert self.is_commentary(), "Saw old style index record that's not a commentary. Panic!"
  416. assert getattr(self, "titleVariants", None)
  417. if not getattr(self, "heTitle", None):
  418. raise InputError(u'Missing Hebrew title on {}.'.format(self.title))
  419. if not getattr(self, "heTitleVariants", None):
  420. raise InputError(u'Missing Hebrew title variants on {}.'.format(self.title))
  421. # Make sure all title variants are unique
  422. if getattr(self, "titleVariants", None):
  423. for variant in self.titleVariants:
  424. existing = Index().load({"titleVariants": variant})
  425. if existing and not self.same_record(existing) and existing.title != self.pkeys_orig_values.get("title"):
  426. #if not getattr(self, "oldTitle", None) or existing.title != self.oldTitle:
  427. raise InputError(u'A text called "{}" already exists.'.format(variant))
  428. if getattr(self, "authors", None) and not isinstance(self.authors, list):
  429. raise InputError(u'{} authors must be a list.'.format(self.title))
  430. return True
  431. def _prepare_second_save(self):
  432. if getattr(self, "maps", None) is None:
  433. return
  434. for i in range(len(self.maps)):
  435. nref = Ref(self.maps[i]["to"]).normal()
  436. if not nref:
  437. raise InputError(u"Couldn't understand text reference: '{}'.".format(self.maps[i]["to"]))
  438. lang = "en" #todo: get rid of this assumption
  439. existing = library.get_schema_node(self.maps[i]["from"], lang)
  440. if existing and not self.same_record(existing.index) and existing.index.title != self.pkeys_orig_values.get("title"):
  441. raise InputError(u"'{}' cannot be a shorthand name: a text with this title already exisits.".format(nref))
  442. self.maps[i]["to"] = nref
  443. def toc_contents(self):
  444. firstSection = Ref(self.title).first_available_section_ref()
  445. toc_contents_dict = {
  446. "title": self.get_title(),
  447. "heTitle": self.get_title("he"),
  448. "categories": self.categories,
  449. "firstSection": firstSection.normal() if firstSection else None
  450. }
  451. if hasattr(self,"order"):
  452. toc_contents_dict["order"] = self.order
  453. if self.categories[0] == u"Commentary2":
  454. toc_contents_dict["commentator"] = self.categories[2]
  455. toc_contents_dict["heCommentator"] = hebrew_term(self.categories[2])
  456. on_split = self.get_title().split(" on ")
  457. if len(on_split) == 2:
  458. try:
  459. i = library.get_index(on_split[1])
  460. if getattr(i, "order", None):
  461. toc_contents_dict["order"] = i.order
  462. except BookNameError:
  463. pass
  464. return toc_contents_dict
  465. class IndexSet(abst.AbstractMongoSet):
  466. """
  467. A set of :class:`Index` objects.
  468. """
  469. recordClass = Index
  470. # Index changes behavior of load_from_dict, so this circumvents that changed behavior to call load_from_dict on the abstract superclass
  471. def update(self, attrs):
  472. for rec in self:
  473. rec.update_from_dict(attrs).save()
  474. class CommentaryIndex(AbstractIndex):
  475. """
  476. A virtual Index for commentary records.
  477. :param commentator_name: A title variant of a commentator :class:`Index` record
  478. :param book_name: A title variant of a book :class:`Index` record
  479. """
  480. def __init__(self, commentator_name, book_name):
  481. """
  482. :param commentator_name: A title variant of a commentator :class:Index record
  483. :param book_name: A title variant of a book :class:Index record
  484. :return:
  485. """
  486. self.c_index = Index().load({
  487. "titleVariants": commentator_name,
  488. "categories.0": "Commentary"
  489. })
  490. if not self.c_index:
  491. raise BookNameError(u"No commentator named '{}'.".format(commentator_name))
  492. self.b_index = library.get_index(book_name)
  493. if not self.b_index:
  494. raise BookNameError(u"No book named '{}'.".format(book_name))
  495. if self.b_index.is_commentary():
  496. raise BookNameError(u"We don't yet support nested commentaries '{} on {}'.".format(commentator_name, book_name))
  497. # This whole dance is a bit of a mess.
  498. # Todo: see if we can clean it up a bit
  499. # could expose the b_index and c_index records to consumers of this object, and forget the renaming
  500. self.__dict__.update(self.c_index.contents())
  501. self.commentaryBook = self.b_index.get_title()
  502. self.commentaryCategories = self.b_index.categories
  503. self.categories = ["Commentary"] + [self.b_index.categories[0], commentator_name]
  504. self.commentator = commentator_name
  505. if getattr(self.b_index, "order", None):
  506. self.order = self.b_index.order
  507. if getattr(self, "heTitle", None):
  508. self.heCommentator = self.heBook = self.heTitle # why both?
  509. # todo: this assumes flat structure
  510. # self.nodes = JaggedArrayCommentatorNode(self.b_index.nodes, index=self)
  511. def extend_leaf_nodes(node):
  512. node.index = self
  513. try:
  514. del node.checkFirst
  515. except AttributeError:
  516. pass
  517. if node.has_children():
  518. return node
  519. #return JaggedArrayCommentatorNode(node, index=self)
  520. node.addressTypes += ["Integer"]
  521. node.sectionNames += ["Comment"]
  522. node.depth += 1
  523. return node
  524. '''
  525. commentor_index = kwargs.get("index", None)
  526. assert commentor_index.is_commentary(), "Non-commentator index {} passed to JaggedArrayCommentatorNode".format(commentor_index.title)
  527. self.basenode = basenode
  528. parameters = {
  529. "addressTypes": basenode.addressTypes + ["Integer"],
  530. "sectionNames": basenode.sectionNames + ["Comment"],
  531. "depth": basenode.depth + 1
  532. }
  533. if getattr(basenode, "lengths", None):
  534. parameters["lengths"] = basenode.lengths
  535. super(JaggedArrayCommentatorNode, self).__init__({}, parameters, **kwargs)
  536. self.key = basenode.key
  537. self.title_group = basenode.title_group.copy()
  538. '''
  539. self.nodes = self.b_index.nodes.copy(extend_leaf_nodes)
  540. self.nodes.title_group = TitleGroup() # Reset all titles
  541. en_cross_product = [c + " on " + b for c in self.c_index.titleVariants for b in self.b_index.nodes.all_node_titles("en")]
  542. self.title = self.c_index.title + " on " + self.b_index.get_title() # Calls AbstractIndex.setTitle - will set nodes.key and nodes.primary_title
  543. for title in en_cross_product:
  544. self.nodes.add_title(title, "en")
  545. cnames = getattr(self.c_index, "heTitleVariants", None)
  546. cprimary = getattr(self.c_index, "heTitle", None)
  547. if cnames and cprimary:
  548. he_cross_product = [c + u" על " + b for c in cnames for b in self.b_index.nodes.all_node_titles("he")]
  549. self.set_title(cprimary + u" על " + self.b_index.get_title("he"), "he")
  550. for title in he_cross_product:
  551. self.nodes.add_title(title, "he")
  552. else:
  553. logger.warning("No Hebrew title for {}".format(self.title))
  554. # todo: handle 'alone' titles in b_index - add "commentator on" to them
  555. self.schema = self.nodes.serialize()
  556. self.nodes = deserialize_tree(self.schema, index=self) # reinit nodes so that derived attributes are instanciated
  557. self.titleVariants = self.nodes.all_node_titles("en")
  558. self.heTitle = self.nodes.primary_title("he")
  559. self.heTitleVariants = self.nodes.all_node_titles("he")
  560. if getattr(self.nodes, "lengths", None): #seems superfluous w/ nodes above
  561. self.length = self.nodes.lengths[0]
  562. def __unicode__(self):
  563. return u"{}: {} on {}".format(self.__class__.__name__, self.c_index.title, self.b_index.title)
  564. def __str__(self):
  565. return unicode(self).encode('utf-8')
  566. def __repr__(self): # Wanted to use orig_tref, but repr can not include Unicode
  567. return u"{}({}, {})".format(self.__class__.__name__, self.c_index.title, self.b_index.title)
  568. def is_commentary(self):
  569. return True
  570. def is_complex(self):
  571. return self.b_index.is_complex()
  572. # todo: integrate alt structure on commentary?
  573. def has_alt_structures(self):
  574. return False
  575. def get_alt_structures(self):
  576. return {}
  577. def copy(self):
  578. #todo: this doesn't seem to be used.
  579. #todo: make this quicker, by utilizing copy methods of the composed objects
  580. return copy.deepcopy(self)
  581. def toc_contents(self):
  582. firstSection = Ref(self.title).first_available_section_ref()
  583. toc_contents_dict = {
  584. "title": self.title,
  585. "heTitle": getattr(self, "heTitle", None),
  586. "commentator": self.commentator,
  587. "heCommentator": self.heCommentator,
  588. "categories": self.categories,
  589. "firstSection": firstSection.normal() if firstSection else None
  590. }
  591. if hasattr(self,"order"):
  592. toc_contents_dict["order"] = self.order
  593. return toc_contents_dict
  594. #todo: this needs help
  595. def contents(self, v2=False, raw=False, **kwargs):
  596. if v2:
  597. return self.nodes.as_index_contents()
  598. attrs = copy.copy(vars(self))
  599. del attrs["c_index"]
  600. del attrs["b_index"]
  601. del attrs["nodes"]
  602. attrs['schema'] = self.nodes.serialize(expand_shared=True, expand_titles=True, translate_sections=True)
  603. if self.nodes.is_leaf():
  604. attrs["sectionNames"] = self.nodes.sectionNames
  605. attrs["heSectionNames"] = map(hebrew_term, self.nodes.sectionNames)
  606. attrs["textDepth"] = len(self.nodes.sectionNames)
  607. return attrs
  608. # Deprecated
  609. def get_index(bookname):
  610. logger.warning("Use of deprecated function: get_index()")
  611. return library.get_index(bookname)
  612. """
  613. -------------------
  614. Versions & Chunks
  615. -------------------
  616. """
  617. class AbstractSchemaContent(object):
  618. content_attr = "content"
  619. def get_content(self):
  620. return getattr(self, self.content_attr, None)
  621. def content_node(self, snode):
  622. """
  623. :param snode:
  624. :type snode SchemaContentNode:
  625. :return:
  626. """
  627. return self.sub_content(snode.version_address())
  628. #TODO: test me
  629. def sub_content(self, key_list=None, indx_list=None, value=None):
  630. """
  631. Get's or sets values deep within the content of this version.
  632. This returns the result by reference, NOT by value.
  633. http://stackoverflow.com/questions/27339165/slice-nested-list-at-variable-depth
  634. :param key_list: The node keys to traverse to get to the content node
  635. :param indx_list: The indexes of the subsection to get/set
  636. :param value: The value to set. If present, the method acts as a setter. If None, it acts as a getter.
  637. """
  638. if not key_list:
  639. key_list = []
  640. if not indx_list:
  641. indx_list = []
  642. ja = reduce(lambda d, k: d[k], key_list, self.get_content())
  643. if indx_list:
  644. sa = reduce(lambda a, i: a[i], indx_list[:-1], ja)
  645. if value is not None:
  646. sa[indx_list[-1]] = value
  647. return sa[indx_list[-1]]
  648. else:
  649. if value is not None:
  650. ja[:] = value
  651. return ja
  652. class AbstractTextRecord(object):
  653. """
  654. """
  655. text_attr = "chapter"
  656. ALLOWED_TAGS = ("i", "b", "br", "u", "strong", "em", "big", "small", "img")
  657. ALLOWED_ATTRS = {'img': lambda name, value: name == 'src' and value.startswith("data:image/")}
  658. def word_count(self):
  659. """ Returns the number of words in this text """
  660. return self.ja().word_count()
  661. def char_count(self):
  662. """ Returns the number of characters in this text """
  663. return self.ja().char_count()
  664. def verse_count(self):
  665. """ Returns the number of verses in this text """
  666. return self.ja().verse_count()
  667. def ja(self): #don't cache locally unless change is handled. Pontential to cache on JA class level
  668. return JaggedTextArray(getattr(self, self.text_attr, None))
  669. def as_string(self):
  670. content = getattr(self, self.text_attr, None)
  671. if isinstance(content, basestring):
  672. return content
  673. elif isinstance(content, list):
  674. return self.ja().flatten_to_string()
  675. else:
  676. return ""
  677. @classmethod
  678. def sanitize_text(cls, t):
  679. if isinstance(t, list):
  680. for i, v in enumerate(t):
  681. t[i] = TextChunk.sanitize_text(v)
  682. elif isinstance(t, basestring):
  683. t = bleach.clean(t, tags=cls.ALLOWED_TAGS, attributes=cls.ALLOWED_ATTRS)
  684. else:
  685. return False
  686. return t
  687. # Currently assumes that text is JA
  688. def _sanitize(self):
  689. setattr(self, self.text_attr,
  690. self.sanitize_text(getattr(self, self.text_attr, None))
  691. )
  692. class Version(abst.AbstractMongoRecord, AbstractTextRecord, AbstractSchemaContent):
  693. """
  694. A version of a text.
  695. Relates to a complete single record from the texts collection.
  696. """
  697. history_noun = 'text'
  698. collection = 'texts'
  699. content_attr = "chapter"
  700. track_pkeys = True
  701. pkeys = ["versionTitle"]
  702. required_attrs = [
  703. "language",
  704. "title", # FK to Index.title
  705. "versionSource",
  706. "versionTitle",
  707. "chapter" # required. change to "content"?
  708. ]
  709. optional_attrs = [
  710. "status",
  711. "priority",
  712. "license",
  713. "licenseVetted",
  714. "versionNotes",
  715. "digitizedBySefaria",
  716. "method",
  717. "heversionSource", # bad data?
  718. "versionUrl" # bad data?
  719. ]
  720. def __unicode__(self):
  721. return u"Version: {} <{}>".format(self.title, self.versionTitle)
  722. def __str__(self):
  723. return unicode(self).encode('utf-8')
  724. def __repr__(self): # Wanted to use orig_tref, but repr can not include Unicode
  725. return u"{}().load({{'title': '{}', 'versionTitle': '{}'}})".format(self.__class__.__name__, self.title, self.versionTitle)
  726. def _validate(self):
  727. assert super(Version, self)._validate()
  728. """
  729. Old style database text record have a field called 'chapter'
  730. Version records in the wild have a field called 'text', and not always a field called 'chapter'
  731. """
  732. return True
  733. def _normalize(self):
  734. pass
  735. def get_index(self):
  736. return library.get_index(self.title)
  737. def first_section_ref(self):
  738. """
  739. Returns a :class:`Ref` to the first non-empty location in this version.
  740. """
  741. i = self.get_index()
  742. leafnodes = i.nodes.get_leaf_nodes()
  743. for leaf in leafnodes:
  744. ja = JaggedTextArray(self.content_node(leaf))
  745. indx_array = ja.next_index()
  746. if indx_array:
  747. return Ref(_obj={
  748. "index": i,
  749. "book": leaf.full_title("en"),
  750. "type": i.categories[0],
  751. "index_node": leaf,
  752. "sections": [i + 1 for i in indx_array],
  753. "toSections": [i + 1 for i in indx_array]
  754. }).section_ref()
  755. return None
  756. def ja(self):
  757. # the quickest way to check if this is a complex text
  758. if isinstance(getattr(self, self.text_attr, None), dict):
  759. nodes = self.get_index().nodes.get_leaf_nodes()
  760. return JaggedTextArray([self.content_node(node) for node in nodes])
  761. else:
  762. return super(Version, self).ja()
  763. class VersionSet(abst.AbstractMongoSet):
  764. """
  765. A collection of :class:`Version` objects
  766. """
  767. recordClass = Version
  768. def __init__(self, query={}, page=0, limit=0, sort=[["priority", -1], ["_id", 1]], proj=None):
  769. super(VersionSet, self).__init__(query, page, limit, sort, proj)
  770. def word_count(self):
  771. return sum([v.word_count() for v in self])
  772. def char_count(self):
  773. return sum([v.char_count() for v in self])
  774. def verse_count(self):
  775. return sum([v.verse_count() for v in self])
  776. def merge(self, node=None):
  777. """
  778. Returns merged result, but does not change underlying data
  779. """
  780. for v in self:
  781. if not getattr(v, "versionTitle", None):
  782. logger.error("No version title for Version: {}".format(vars(v)))
  783. if node is None:
  784. return merge_texts([getattr(v, "chapter", []) for v in self], [getattr(v, "versionTitle", None) for v in self])
  785. return merge_texts([v.content_node(node) for v in self], [getattr(v, "versionTitle", None) for v in self])
  786. # used in VersionSet.merge(), merge_text_versions(), and export.export_merged()
  787. # todo: move this to JaggedTextArray class?
  788. # Doesn't work for complex texts
  789. def merge_texts(text, sources):
  790. """
  791. This is a recursive function that merges the text in multiple
  792. translations to fill any gaps and deliver as much text as
  793. possible.
  794. e.g. [["a", ""], ["", "b", "c"]] becomes ["a", "b", "c"]
  795. """
  796. if not (len(text) and len(sources)):
  797. return ["", []]
  798. depth = list_depth(text)
  799. if depth > 2:
  800. results = []
  801. result_sources = []
  802. for x in range(max(map(len, text))):
  803. translations = map(None, *text)[x]
  804. remove_nones = lambda x: x or []
  805. result, source = merge_texts(map(remove_nones, translations), sources)
  806. results.append(result)
  807. # NOTE - the below flattens the sources list, so downstream code can always expect
  808. # a one dimensional list, but in so doing the mapping of source names to segments
  809. # is lost for merged texts of depth > 2 (this mapping is not currenly used in general)
  810. result_sources += source
  811. return [results, result_sources]
  812. if depth == 1:
  813. text = map(lambda x: [x], text)
  814. merged = map(None, *text)
  815. text = []
  816. text_sources = []
  817. for verses in merged:
  818. # Look for the first non empty version (which will be the oldest, or one with highest priority)
  819. index, value = 0, 0
  820. for i, version in enumerate(verses):
  821. if version:
  822. index = i
  823. value = version
  824. break
  825. text.append(value)
  826. text_sources.append(sources[index])
  827. if depth == 1:
  828. # strings were earlier wrapped in lists, now unwrap
  829. text = text[0]
  830. return [text, text_sources]
  831. class TextChunk(AbstractTextRecord):
  832. """
  833. A chunk of text corresponding to the provided :class:`Ref`, language, and optionall version name.
  834. If it is possible to get a more complete text by merging multiple versions, a merged result will be returned.
  835. :param oref: :class:`Ref`
  836. :param lang: "he" or "en"
  837. :param vtitle: optional. Title of the version desired.
  838. """
  839. text_attr = "text"
  840. def __init__(self, oref, lang="en", vtitle=None):
  841. """
  842. :param oref:
  843. :type oref: Ref
  844. :param lang: "he" or "en"
  845. :param vtitle:
  846. :return:
  847. """
  848. self._oref = oref
  849. self._ref_depth = len(oref.sections)
  850. self._versions = []
  851. self._saveable = False # Can this TextChunk be saved?
  852. self.lang = lang
  853. self.is_merged = False
  854. self.sources = []
  855. self.text = self._original_text = self.empty_text()
  856. self.vtitle = vtitle
  857. self.full_version = None
  858. self.versionSource = None # handling of source is hacky
  859. if lang and vtitle:
  860. self._saveable = True
  861. v = Version().load({"title": oref.index.title, "language": lang, "versionTitle": vtitle}, oref.part_projection())
  862. if v:
  863. self._versions += [v]
  864. self.text = self._original_text = self.trim_text(v.content_node(oref.index_node))
  865. elif lang:
  866. vset = VersionSet(oref.condition_query(lang), proj=oref.part_projection())
  867. if vset.count() == 0:
  868. if VersionSet({"title": oref.index.title}).count() == 0:
  869. raise NoVersionFoundError("No text record found for '{}'".format(oref.index.title))
  870. return
  871. if vset.count() == 1:
  872. v = vset[0]
  873. self._versions += [v]
  874. self.text = self.trim_text(v.content_node(oref.index_node))
  875. #todo: Should this instance, and the non-merge below, be made saveable?
  876. else: # multiple versions available, merge
  877. merged_text, sources = vset.merge(oref.index_node) #todo: For commentaries, this merges the whole chapter. It may show up as merged, even if our part is not merged.
  878. self.text = self.trim_text(merged_text)
  879. if len(set(sources)) == 1:
  880. for v in vset:
  881. if v.versionTitle == sources[0]:
  882. self._versions += [v]
  883. break
  884. else:
  885. self.sources = sources
  886. self.is_merged = True
  887. self._versions = vset.array()
  888. else:
  889. raise Exception("TextChunk requires a language.")
  890. def __unicode__(self):
  891. args = u"{}, {}".format(self._oref, self.lang)
  892. if self.vtitle:
  893. args += u", {}".format(self.vtitle)
  894. return args
  895. def __str__(self):
  896. return unicode(self).encode('utf-8')
  897. def __repr__(self): # Wanted to use orig_tref, but repr can not include Unicode
  898. args = u"{}, {}".format(self._oref, self.lang)
  899. if self.vtitle:
  900. args += u", {}".format(self.vtitle)
  901. return u"{}({})".format(self.__class__.__name__, args)
  902. def is_empty(self):
  903. return self.ja().is_empty()
  904. def ja(self):
  905. return JaggedTextArray(self.text)
  906. def save(self):
  907. assert self._saveable, u"Tried to save a read-only text: {}".format(self._oref.normal())
  908. assert not self._oref.is_range(), u"Only non-range references can be saved: {}".format(self._oref.normal())
  909. #may support simple ranges in the future.
  910. #self._oref.is_range() and self._oref.range_index() == len(self._oref.sections) - 1
  911. if self.text == self._original_text:
  912. logger.warning(u"Aborted save of {}. No change in text.".format(self._oref.normal()))
  913. return False
  914. self._validate()
  915. self._sanitize()
  916. self._trim_ending_whitespace()
  917. if not self.version():
  918. self.full_version = Version(
  919. {
  920. "chapter": self._oref.index.nodes.create_skeleton(),
  921. "versionTitle": self.vtitle,
  922. "versionSource": self.versionSource,
  923. "language": self.lang,
  924. "title": self._oref.index.title
  925. }
  926. )
  927. else:
  928. self.full_version = Version().load({"title": self._oref.index.title, "language": self.lang, "versionTitle": self.vtitle})
  929. assert self.full_version, u"Failed to load Version record for {}, {}".format(self._oref.normal(), self.vtitle)
  930. if self.versionSource:
  931. self.full_version.versionSource = self.versionSource # hack
  932. content = self.full_version.sub_content(self._oref.index_node.version_address())
  933. self._pad(content)
  934. self.full_version.sub_content(self._oref.index_node.version_address(), [i - 1 for i in self._oref.sections], self.text)
  935. self.full_version.save()
  936. self._oref.recalibrate_next_prev_refs(len(self.text))
  937. return self
  938. def _pad(self, content):
  939. """
  940. Pads the passed content to the dimension of self._oref.
  941. Acts on the input variable 'content' in place
  942. Does not yet handle ranges
  943. :param content:
  944. :return:
  945. """
  946. for pos, val in enumerate(self._oref.sections):
  947. # at pos == 0, parent_content == content
  948. # at pos == 1, parent_content == chapter
  949. # at pos == 2, parent_content == verse
  950. # etc
  951. parent_content = reduce(lambda a, i: a[i - 1], self._oref.sections[:pos], content)
  952. # Pad out existing content to size of ref
  953. if len(parent_content) < val:
  954. for _ in range(len(parent_content), val):
  955. parent_content.append("" if pos == self._oref.index_node.depth - 1 else [])
  956. # check for strings where arrays expected, except for last pass
  957. if pos < self._ref_depth - 2 and isinstance(parent_content[val - 1], basestring):
  958. parent_content[val - 1] = [parent_content[val - 1]]
  959. def _trim_ending_whitespace(self):
  960. """
  961. Trims blank segments from end of every section
  962. :return:
  963. """
  964. self.text = JaggedTextArray(self.text).trim_ending_whitespace().array()
  965. def _validate(self):
  966. """
  967. validate that depth/breadth of the TextChunk.text matches depth/breadth of the Ref
  968. :return:
  969. """
  970. posted_depth = 0 if isinstance(self.text, basestring) else list_depth(self.text)
  971. ref_depth = self._oref.range_index() if self._oref.is_range() else self._ref_depth
  972. implied_depth = ref_depth + posted_depth
  973. if implied_depth != self._oref.index_node.depth:
  974. raise InputError(
  975. u"Text Structure Mismatch. The stored depth of {} is {}, but the text posted to {} implies a depth of {}."
  976. .format(self._oref.index_node.full_title(), self._oref.index_node.depth, self._oref.normal(), implied_depth)
  977. )
  978. #validate that length of the array matches length of the ref
  979. #todo: double check for depth >= 3
  980. if self._oref.is_spanning():
  981. span_size = self._oref.span_size()
  982. if posted_depth == 0: #possible?
  983. raise InputError(
  984. u"Text Structure Mismatch. {} implies a length of {} sections, but the text posted is a string."
  985. .format(self._oref.normal(), span_size)
  986. )
  987. elif posted_depth == 1: #possible?
  988. raise InputError(
  989. u"Text Structure Mismatch. {} implies a length of {} sections, but the text posted is a simple list."
  990. .format(self._oref.normal(), span_size)
  991. )
  992. else:
  993. posted_length = len(self.text)
  994. if posted_length != span_size:
  995. raise InputError(
  996. u"Text Structure Mismatch. {} implies a length of {} sections, but the text posted has {} elements."
  997. .format(self._oref.normal(), span_size, posted_length)
  998. )
  999. #todo: validate last section size if provided
  1000. elif self._oref.is_range():
  1001. range_length = self._oref.range_size()
  1002. if posted_depth == 0:
  1003. raise InputError(
  1004. u"Text Structure Mismatch. {} implies a length of {}, but the text posted is a string."
  1005. .format(self._oref.normal(), range_length)
  1006. )
  1007. elif posted_depth == 1:
  1008. posted_length = len(self.text)
  1009. if posted_length != range_length:
  1010. raise InputError(
  1011. u"Text Structure Mismatch. {} implies a length of {}, but the text posted has {} elements."
  1012. .format(self._oref.normal(), range_length, posted_length)
  1013. )
  1014. else: # this should never happen. The depth check should catch it.
  1015. raise InputError(
  1016. u"Text Structure Mismatch. {} implies an simple array of length {}, but the text posted has depth {}."
  1017. .format(self._oref.normal(), range_length, posted_depth)
  1018. )
  1019. #maybe use JaggedArray.subarray()?
  1020. def trim_text(self, txt):
  1021. """
  1022. Trims a text loaded from Version record with self._oref.part_projection() to the specifications of self._oref
  1023. This works on simple Refs and range refs of unlimited depth and complexity.
  1024. (in place?)
  1025. :param txt:
  1026. :return: List|String depending on depth of Ref
  1027. """
  1028. range_index = self._oref.range_index()
  1029. sections = self._oref.sections
  1030. toSections = self._oref.toSections
  1031. if not sections:
  1032. pass
  1033. else:
  1034. for i in range(0, self._ref_depth):
  1035. if i == 0 == range_index: # First level slice handled at DB level
  1036. pass
  1037. elif range_index > i: # Either not range, or range begins later. Return simple value.
  1038. if i == 0 and len(txt): # We already sliced the first level w/ Ref.part_projection()
  1039. txt = txt[0]
  1040. elif len(txt) >= sections[i]:
  1041. txt = txt[sections[i] - 1]
  1042. else:
  1043. return self.empty_text()
  1044. elif range_index == i: # Range begins here
  1045. start = sections[i] - 1
  1046. end = toSections[i]
  1047. txt = txt[start:end]
  1048. else: # range_index < i, range continues here
  1049. begin = end = txt
  1050. for _ in range(range_index, i - 1):
  1051. begin = begin[0]
  1052. end = end[-1]
  1053. begin[0] = begin[0][sections[i] - 1:]
  1054. end[-1] = end[-1][:toSections[i]]
  1055. return txt
  1056. def empty_text(self):
  1057. """
  1058. :return: Either empty array or empty string, depending on depth of Ref
  1059. """
  1060. if not self._oref.is_range() and self._ref_depth == self._oref.index_node.depth:
  1061. return ""
  1062. else:
  1063. return []
  1064. def version(self):
  1065. """
  1066. Returns the Version record for this chunk
  1067. :return Version:
  1068. :raises Exception: if the TextChunk is merged
  1069. """
  1070. if not self._versions:
  1071. return None
  1072. if len(self._versions) == 1:
  1073. return self._versions[0]
  1074. else:
  1075. raise Exception("Called TextChunk.version() on merged TextChunk.")
  1076. # Mirrors the construction of the old get_text() method.
  1077. # The TextFamily.contents() method will return a dictionary in the same format that was provided by get_text().
  1078. class TextFamily(object):
  1079. """
  1080. A text with its translations and optionally the commentary on it.
  1081. Can be instanciated with just the first argument.
  1082. :param oref: :class:`Ref`. This is the only required argument.
  1083. :param int context: Default: 1. How many context levels up to go when getting commentary. See :func:`Ref.context_ref`
  1084. :param bool commentary: Default: True. Include commentary?
  1085. :param version: optional. Name of version to use when getting text.
  1086. :param lang: None, "en" or "he". Default: None. If None, included both languages.
  1087. :param bool pad: Default: True. Pads the provided ref before processing. See :func:`Ref.padded_ref`
  1088. :param bool alts: Default: False. Adds notes of where alternate structure elements begin
  1089. """
  1090. #Attribute maps used for generating dict format
  1091. text_attr_map = {
  1092. "en": "text",
  1093. "he": "he"
  1094. }
  1095. attr_map = {
  1096. "versionTitle": {
  1097. "en": "versionTitle",
  1098. "he": "heVersionTitle"
  1099. },
  1100. "versionSource": {
  1101. "en": "versionSource",
  1102. "he": "heVersionSource"
  1103. },
  1104. "status": {
  1105. "en": "versionStatus",
  1106. "he": "heVersionStatus"
  1107. },
  1108. "license": {
  1109. "en": "license",
  1110. "he": "heLicense",
  1111. "condition": "licenseVetted",
  1112. "default": "unknown"
  1113. },
  1114. "versionNotes": {
  1115. "en": "versionNotes",
  1116. "he": "heVersionNotes"
  1117. },
  1118. "digitizedBySefaria": {
  1119. "en": "digitizedBySefaria",
  1120. "he": "heDigitizedBySefaria",
  1121. "default": False,
  1122. }
  1123. }
  1124. sourceMap = {
  1125. "en": "sources",
  1126. "he": "heSources"
  1127. }
  1128. def __init__(self, oref, context=1, commentary=True, version=None, lang=None, pad=True, alts=False):
  1129. """
  1130. :param oref:
  1131. :param context:
  1132. :param commentary:
  1133. :param version:
  1134. :param lang:
  1135. :param pad:
  1136. :param alts: Adds notes of where alt elements begin
  1137. :return:
  1138. """
  1139. oref = oref.padded_ref() if pad else oref
  1140. self.ref = oref.normal()
  1141. self.heRef = oref.he_normal()
  1142. self.isComplex = oref.index.is_complex()
  1143. self.text = None
  1144. self.he = None
  1145. self._lang = lang
  1146. self._original_oref = oref
  1147. self._context_oref = None
  1148. self._chunks = {}
  1149. self._inode = oref.index_node
  1150. self._alts = []
  1151. assert isinstance(self._inode, JaggedArrayNode), "TextFamily only works with JaggedArray nodes" # todo: handle structure nodes?
  1152. for i in range(0, context):
  1153. oref = oref.context_ref()
  1154. self._context_oref = oref
  1155. # processes "en" and "he" TextChunks, and puts the text in self.text and self.he, respectively.
  1156. for language, attr in self.text_attr_map.items():
  1157. if language == lang:
  1158. c = TextChunk(oref, language, version)
  1159. else:
  1160. c = TextChunk(oref, language)
  1161. self._chunks[language] = c
  1162. setattr(self, self.text_attr_map[language], c.text)
  1163. if oref.is_spanning():
  1164. self.spanning = True
  1165. if commentary:
  1166. from sefaria.client.wrapper import get_links
  1167. if not oref.is_spanning():
  1168. links = get_links(oref.normal()) #todo - have this function accept an object
  1169. else:
  1170. links = [get_links(r.normal()) for r in oref.split_spanning_ref()]
  1171. self.commentary = links if "error" not in links else []
  1172. # get list of available versions of this text
  1173. self.versions = oref.version_list()
  1174. # Adds decoration for the start of each alt structure reference
  1175. if alts:
  1176. # Set up empty Array that mirrors text structure
  1177. alts_ja = JaggedArray()
  1178. for key, struct in oref.index.get_alt_structures().iteritems():
  1179. # Assuming these are in order, continue if it is before ours, break if we see one after
  1180. for n in struct.get_leaf_nodes():
  1181. wholeRef = Ref(n.wholeRef)
  1182. if wholeRef.ending_ref().precedes(oref):
  1183. continue
  1184. if wholeRef.starting_ref().follows(oref):
  1185. break
  1186. #It's in our territory
  1187. wholeRefStart = wholeRef.starting_ref()
  1188. if oref.contains(wholeRefStart) and not oref == wholeRefStart:
  1189. indxs = [k - 1 for k in wholeRefStart.in_terms_of(oref)]
  1190. val = {"en":[], "he":[]}
  1191. try:
  1192. val = alts_ja.get_element(indxs)
  1193. except IndexError:
  1194. pass
  1195. val["en"] += [n.primary_title("en")]
  1196. val["he"] += [n.primary_title("he")]
  1197. val["whole"] = True
  1198. alts_ja.set_element(indxs, val)
  1199. if getattr(n, "refs", None):
  1200. for i, r in enumerate(n.refs):
  1201. # hack to skip Rishon, skip empty refs
  1202. if i==0 or not r:
  1203. continue;
  1204. subRef = Ref(r)
  1205. subRefStart = subRef.starting_ref()
  1206. if oref.contains(subRefStart) and not oref == subRefStart:
  1207. indxs = [k - 1 for k in subRefStart.in_terms_of(oref)]
  1208. val = {"en":[], "he":[]}
  1209. try:
  1210. a = alts_ja.get_element(indxs)
  1211. if a:
  1212. val = a
  1213. except IndexError:
  1214. pass
  1215. val["en"] += [n.sectionString([i + 1], "en", title=False)]
  1216. val["he"] += [n.sectionString([i + 1], "he", title=False)]
  1217. alts_ja.set_element(indxs, val)
  1218. elif subRefStart.follows(oref):
  1219. break
  1220. self._alts = alts_ja.array()
  1221. def contents(self):
  1222. """
  1223. :return dict: Returns the contents of the text family.
  1224. """
  1225. d = {k: getattr(self, k) for k in vars(self).keys() if k[0] != "_"}
  1226. d["textDepth"] = getattr(self._inode, "depth", None)
  1227. d["sectionNames"] = getattr(self._inode, "sectionNames", None)
  1228. d["addressTypes"] = getattr(self._inode, "addressTypes", None)
  1229. if getattr(self._inode, "lengths", None):
  1230. d["lengths"] = getattr(self._inode, "lengths")
  1231. if len(d["lengths"]):
  1232. d["length"] = d["lengths"][0]
  1233. elif getattr(self._inode, "length", None):
  1234. d["length"] = getattr(self._inode, "length")
  1235. d["textDepth"] = self._inode.depth
  1236. d["heTitle"] = self._inode.full_title("he")
  1237. d["titleVariants"] = self._inode.all_tree_titles("en")
  1238. d["heTitleVariants"] = self._inode.all_tree_titles("he")
  1239. for attr in ["categories", "order", "maps"]:
  1240. d[attr] = getattr(self._inode.index, attr, "")
  1241. for attr in ["book", "type"]:
  1242. d[attr] = getattr(self._original_oref, attr)
  1243. for attr in ["sections", "toSections"]:
  1244. d[attr] = getattr(self._original_oref, attr)[:]
  1245. if self._context_oref.is_commentary():
  1246. for attr in ["commentaryBook", "commentaryCategories", "commentator", "heCommentator"]:
  1247. d[attr] = getattr(self._inode.index, attr, "")
  1248. d["isComplex"] = self.isComplex
  1249. d["indexTitle"] = self._inode.index.title
  1250. d["heIndexTitle"] = self._inode.index.get_title("he")
  1251. d["sectionRef"] = self._original_oref.section_ref().normal()
  1252. d["isSpanning"] = self._original_oref.is_spanning()
  1253. if d["isSpanning"]:
  1254. d["spanningRefs"] = [r.normal() for r in self._original_oref.split_spanning_ref()]
  1255. for language, attr in self.text_attr_map.items():
  1256. chunk = self._chunks.get(language)
  1257. if chunk.is_merged:
  1258. d[self.sourceMap[language]] = chunk.sources
  1259. else:
  1260. ver = chunk.version()
  1261. if ver:
  1262. for key, val in self.attr_map.items():
  1263. if not val.get("condition") or getattr(ver, val.get("condition"), False):
  1264. d[val[language]] = getattr(ver, key, val.get("default", ""))
  1265. else:
  1266. d[val[language]] = val.get("default")
  1267. # replace ints with daf strings (3->"2a") for Talmud addresses
  1268. # this could be simpler if was done for every value - but would be slower.
  1269. if "Talmud" in self._inode.addressTypes:
  1270. for i in range(len(d["sections"])):
  1271. if self._inode.addressTypes[i] == "Talmud":
  1272. d["sections"][i] = AddressTalmud.toStr("en", d["sections"][i])
  1273. if "toSections" in d:
  1274. d["toSections"][i] = AddressTalmud.toStr("en", d["toSections"][i])
  1275. d["title"] = self._context_oref.normal()
  1276. if "heTitle" in d:
  1277. d["heBook"] = d["heTitle"]
  1278. d["heTitle"] = self._context_oref.he_normal()
  1279. if d["type"] == "Commentary" and self._context_oref.is_talmud() and len(d["sections"]) > 1:
  1280. d["title"] = "%s Line %d" % (d["title"], d["sections"][1])
  1281. elif self._context_oref.is_commentary():
  1282. dep = len(d["sections"]) if len(d["sections"]) < 2 else 2
  1283. d["title"] = d["book"] + " " + ":".join(["%s" % s for s in d["sections"][:dep]])
  1284. d["alts"] = self._alts
  1285. return d
  1286. def process_index_title_change_in_versions(indx, **kwargs):
  1287. VersionSet({"title": kwargs["old"]}).update({"title": kwargs["new"]})
  1288. if indx.is_commentary(): # and "commentaryBook" not in d: # looks useless
  1289. old_titles = library.get_commentary_version_titles(kwargs["old"])
  1290. else:
  1291. old_titles = library.get_commentary_version_titles_on_book(kwargs["old"])
  1292. old_new = [(title, title.replace(kwargs["old"], kwargs["new"], 1)) for title in old_titles]
  1293. for pair in old_new:
  1294. VersionSet({"title": pair[0]}).update({"title": pair[1]})
  1295. def process_index_delete_in_versions(indx, **kwargs):
  1296. VersionSet({"title": indx.title}).delete()
  1297. if indx.is_commentary(): # and not getattr(self, "commentator", None): # Seems useless
  1298. library.get_commentary_versions(indx.title).delete()
  1299. """
  1300. -------------------
  1301. Refs
  1302. -------------------
  1303. """
  1304. class RefCachingType(type):
  1305. """
  1306. Metaclass for Ref class.
  1307. Caches all Ref isntances according to the string they were instanciated with and their normal form.
  1308. Returns cached instance on instanciation if either instanciation string or normal form are matched.
  1309. """
  1310. def __init__(cls, name, parents, dct):
  1311. super(RefCachingType, cls).__init__(name, parents, dct)
  1312. cls.__cache = {}
  1313. def cache_size(cls):
  1314. return len(cls.__cache)
  1315. def cache_dump(cls):
  1316. return [(a, repr(b)) for (a, b) in cls.__cache.iteritems()]
  1317. def _raw_cache(cls):
  1318. return cls.__cache
  1319. def clear_cache(cls):
  1320. cls.__cache = {}
  1321. def __call__(cls, *args, **kwargs):
  1322. if len(args) == 1:
  1323. tref = args[0]
  1324. else:
  1325. tref = kwargs.get("tref")
  1326. obj_arg = kwargs.get("_obj")
  1327. if tref:
  1328. if tref in cls.__cache:
  1329. ref = cls.__cache[tref]
  1330. ref.tref = tref
  1331. return ref
  1332. else:
  1333. result = super(RefCachingType, cls).__call__(*args, **kwargs)
  1334. if result.uid() in cls.__cache:
  1335. #del result # Do we need this to keep memory clean?
  1336. cls.__cache[tref] = cls.__cache[result.uid()]
  1337. return cls.__cache[result.uid()]
  1338. cls.__cache[result.uid()] = result
  1339. cls.__cache[tref] = result
  1340. return result
  1341. elif obj_arg:
  1342. result = super(RefCachingType, cls).__call__(*args, **kwargs)
  1343. if result.uid() in cls.__cache:
  1344. #del result # Do we need this to keep memory clean?
  1345. return cls.__cache[result.uid()]
  1346. cls.__cache[result.uid()] = result
  1347. return result
  1348. else: # Default. Shouldn't be used.
  1349. return super(RefCachingType, cls).__call__(*args, **kwargs)
  1350. class Ref(object):
  1351. """
  1352. A Ref is a reference to a location. A location could be to a *book*, to a specific *segment* (e.g. verse or mishnah), to a *section* (e.g chapter), or to a *range*.
    Instantiated with a string representation of the reference, e.g.:
  1354. ::
  1355. >>> Ref("Genesis 1:3")
  1356. >>> Ref("Rashi on Genesis 1:3")
  1357. >>> Ref("Genesis 1:3-2:4")
  1358. >>> Ref("Shabbat 4b")
  1359. >>> Ref("Rashi on Shabbat 4b-5a")
  1360. """
  1361. __metaclass__ = RefCachingType
  1362. def __init__(self, tref=None, _obj=None):
  1363. """
  1364. Object is generally initialized with a textual reference - ``tref``
  1365. Internally, the _obj argument can be used to instantiate a ref with a complete dict composing the Ref data
  1366. """
  1367. self.index = None
  1368. self.book = None
  1369. self.type = None
  1370. self.sections = []
  1371. self.toSections = []
  1372. self.index_node = None
  1373. if tref:
  1374. self.__init_ref_pointer_vars()
  1375. self.orig_tref = self.tref = tref
  1376. self._lang = "he" if is_hebrew(tref) else "en"
  1377. self.__clean_tref()
  1378. self.__init_tref()
  1379. self._validate()
  1380. elif _obj:
  1381. for key, value in _obj.items():
  1382. setattr(self, key, value)
  1383. self.__init_ref_pointer_vars()
  1384. self.tref = self.normal()
  1385. self._validate()
  1386. else:
  1387. self.__init_ref_pointer_vars()
  1388. def __init_ref_pointer_vars(self):
  1389. self._normal = None
  1390. self._he_normal = None
  1391. self._url = None
  1392. self._next = None
  1393. self._prev = None
  1394. self._padded = None
  1395. self._context = {}
  1396. self._first_spanned_ref = None
  1397. self._spanned_refs = []
  1398. self._ranged_refs = []
  1399. self._range_depth = None
  1400. self._range_index = None
  1401. def _validate(self):
  1402. offset = 0
  1403. if self.is_bavli():
  1404. offset = 2
  1405. checks = [self.sections, self.toSections]
  1406. for check in checks:
  1407. if 0 in check:
  1408. raise InputError(u"{} {} must be greater than 0".format(self.book, self.index_node.sectionNames[check.index(0)]))
  1409. if getattr(self.index_node, "lengths", None) and len(check):
  1410. if check[0] > self.index_node.lengths[0] + offset:
  1411. display_size = self.index_node.address_class(0).toStr("en", self.index_node.lengths[0] + offset)
  1412. raise InputError(u"{} ends at {} {}.".format(self.book, self.index_node.sectionNames[0], display_size))
  1413. for i in range(len(self.sections)):
  1414. if self.toSections > self.sections:
  1415. break
  1416. if self.toSections < self.sections:
  1417. raise InputError(u"{} is an invalid range. Ranges must end later than they begin.".format(self.normal()))
  1418. def __clean_tref(self):
  1419. self.tref = self.tref.strip().replace(u"–", "-").replace("_", " ") # don't replace : in Hebrew, where it can indicate amud
  1420. if self._lang == "he":
  1421. return
  1422. try:
  1423. self.tref = self.tref.decode('utf-8').replace(":", ".")
  1424. except UnicodeEncodeError, e:
  1425. return {"error": "UnicodeEncodeError: %s" % e}
  1426. except AttributeError, e:
  1427. return {"error": "AttributeError: %s" % e}
  1428. try:
  1429. # capitalize first letter (don't title case all to avoid e.g., "Song Of Songs")
  1430. self.tref = self.tref[0].upper() + self.tref[1:]
  1431. except IndexError:
  1432. pass
    def __reinit_tref(self, new_tref):
        """
        Start parsing over with a replacement reference string.

        Stores ``new_tref`` as ``self.tref``, cleans it, switches the Ref's language to "en" and runs the parser again.

        :param new_tref: the reference string to parse instead of the original one
        """
        self.tref = new_tref
        # NOTE(review): the clean-up runs while self._lang still holds the previous language, so a ref
        # that started out as Hebrew skips the English-only normalization here - confirm this is intended.
        self.__clean_tref()
        self._lang = "en"
        self.__init_tref()
  1438. def __init_tref(self):
  1439. parts = [s.strip() for s in self.tref.split("-")]
  1440. if len(parts) > 2:
  1441. raise InputError(u"Couldn't understand ref '{}' (too many -'s).".format(self.tref))
  1442. base = parts[0]
  1443. title = None
  1444. tndict = library.get_title_node_dict(self._lang, with_commentary=True)
  1445. termdict = library.get_term_dict(self._lang)
  1446. for l in range(len(base), 0, -1):
  1447. self.index_node = tndict.get(base[0:l])
  1448. new_tref = termdict.get(base[0:l])
  1449. if self.index_node:
  1450. title = base[0:l]
  1451. if base[l - 1] == ".": # Take care of Refs like "Exo.14.15", where the period shouldn't get swallowed in the name.
  1452. title = base[0:l - 1]
  1453. break
  1454. if new_tref:
  1455. self.__reinit_tref(new_tref)
  1456. return
  1457. if title:
  1458. assert isinstance(self.index_node, SchemaNode)
  1459. self.index = self.index_node.index
  1460. self.book = self.index_node.full_title("en")
  1461. # checkFirst is used on Bavli records to check for a Mishnah pattern match first
  1462. if getattr(self.index_node, "checkFirst", None) and self.index_node.checkFirst.get(self._lang):
  1463. try:
  1464. check_node = library.get_schema_node(self.index_node.checkFirst[self._lang], self._lang)
  1465. assert isinstance(check_node, JaggedArrayNode) # Initially used with Mishnah records. Assumes JaggedArray.
  1466. reg = check_node.full_regex(title, self._lang, strict=True)
  1467. self.sections = self.__get_sections(reg, base, use_node=check_node)
  1468. except InputError: # Regex doesn't work
  1469. pass
  1470. except AttributeError: # Can't find node for check_node
  1471. pass
  1472. else:
  1473. old_index_node = self.index_node
  1474. self.index_node = check_node
  1475. self.index = self.index_node.index
  1476. self.book = self.index_node.full_title("en")
  1477. self.toSections = self.sections[:]
  1478. try:
  1479. self._validate()
  1480. except InputError: # created Ref doesn't validate, back it out
  1481. self.index_node = old_index_node
  1482. self.sections = []
  1483. elif self.index.is_commentary() and self._lang == "en":
  1484. if not getattr(self.index, "commentaryBook", None):
  1485. raise InputError(u"Please specify a text that {} comments on.".format(self.index.title))
  1486. else: # This may be a new version, try to build a schema node.
  1487. match = library.all_titles_regex(self._lang, commentary=True).match(base)
  1488. if match:
  1489. title = match.group('title')
  1490. on_node = library.get_schema_node(match.group('commentee')) # May be SchemaNode or JaggedArrayNode
  1491. self.index = library.get_index(match.group('commentor') + " on " + on_node.index.title)
  1492. self.index_node = self.index.nodes.title_dict(self._lang).get(title)
  1493. self.book = self.index_node.full_title("en")
  1494. if not self.index_node:
  1495. raise BookNameError(u"Can not find index record for {}".format(title))
  1496. else:
  1497. raise InputError(u"Unrecognized Index record: {}".format(base))
  1498. if title is None:
  1499. raise InputError(u"Could not find title in reference: {}".format(self.tref))
  1500. self.type = self.index_node.index.categories[0]
  1501. if title == base: # Bare book.
  1502. if self.index_node.is_default(): # Without any further specification, match the parent of the fall-through node
  1503. self.index_node = self.index_node.parent
  1504. self.book = self.index_node.full_title("en")
  1505. return
  1506. try:
  1507. reg = self.index_node.full_regex(title, self._lang) # Try to treat this as a JaggedArray
  1508. except AttributeError:
  1509. matched = self.index_node.full_title(self._lang)
  1510. msg = u"Partial reference match for '{}' - failed to find continuation for '{}'.\nValid continuations are:\n".format(self.tref, matched)
  1511. continuations = []
  1512. for child in self.index_node.children:
  1513. continuations += child.all_node_titles(self._lang)
  1514. msg += u",\n".join(continuations)
  1515. raise PartialRefInputError(msg, matched, continuations)
  1516. # Numbered Structure node - try numbered structure parsing
  1517. if self.index_node.has_children() and getattr(self.index_node, "_addressTypes", None):
  1518. try:
  1519. struct_indexes = self.__get_sections(reg, base)
  1520. self.index_node = reduce(lambda a, i: a.children[i], [s - 1 for s in struct_indexes], self.index_node)
  1521. title = self.book = self.index_node.full_title("en")
  1522. base = regex.sub(reg, title, base)
  1523. reg = self.index_node.full_regex(title, self._lang)
  1524. except InputError:
  1525. pass
  1526. #todo: ranges that cross structures
  1527. if title == base:
  1528. return
  1529. # Content node - Match primary structure address (may be stage two of numbered structure parsing)
  1530. if not self.index_node.has_children() and getattr(self.index_node, "_addressTypes", None):
  1531. try:
  1532. self.sections = self.__get_sections(reg, base)
  1533. except InputError:
  1534. pass
  1535. # Look for alternate structure
  1536. # todo: handle commentator on alt structure
  1537. if not self.sections and not self.index.is_commentary():
  1538. alt_struct_regex = self.index.alt_titles_regex(self._lang)
  1539. if alt_struct_regex:
  1540. match = alt_struct_regex.match(base)
  1541. if match:
  1542. title = match.group('title')
  1543. alt_struct_node = self.index.get_alt_struct_node(title, self._lang)
  1544. # Exact match alt structure node
  1545. if title == base:
  1546. new_tref = alt_struct_node.get_ref_from_sections([])
  1547. if new_tref:
  1548. self.__reinit_tref(new_tref)
  1549. return
  1550. try: # Some structure nodes don't have .regex() methods.
  1551. reg = alt_struct_node.full_regex(title, self._lang)
  1552. except AttributeError:
  1553. pass
  1554. else:
  1555. # Alternate numbered structure
  1556. if alt_struct_node.has_children() and getattr(alt_struct_node, "_addressTypes", None):
  1557. try:
  1558. struct_indexes = self.__get_sections(reg, base)
  1559. alt_struct_node = reduce(lambda a, i: a.children[i], [s - 1 for s in struct_indexes], alt_struct_node)
  1560. title = alt_struct_node.full_title("en")
  1561. base = regex.sub(reg, title, base)
  1562. reg = alt_struct_node.full_regex(title, self._lang)
  1563. except InputError:
  1564. pass
  1565. # Alt struct map node - (may be stage two of numbered structure parsing)
  1566. if title == base: #not a repetition of similar test above - title may have changed in numbered structure parsing
  1567. alt_struct_indexes = []
  1568. else:
  1569. alt_struct_indexes = self.__get_sections(reg, base)
  1570. new_tref = alt_struct_node.get_ref_from_sections(alt_struct_indexes)
  1571. if new_tref:
  1572. self.__reinit_tref(new_tref)
  1573. return
  1574. if not self.sections:
  1575. raise InputError(u"Failed to parse sections for ref {}".format(self.orig_tref))
  1576. self.toSections = self.sections[:]
  1577. if len(parts) == 2:
  1578. self.__init_ref_pointer_vars() # clear out any mistaken partial representations
  1579. if self._lang == "he" or any([a != "Integer" for a in self.index_node.addressTypes[1:]]): # in process. developing logic that should work for all languages / texts
  1580. # todo: handle sections names in "to" part. Handle talmud יד א - ב kind of cases.
  1581. range_parts = re.split("[., ]+", parts[1])
  1582. delta = len(self.sections) - len(range_parts)
  1583. for i in range(delta, len(self.sections)):
  1584. try:
  1585. self.toSections[i] = self.index_node._addressTypes[i].toNumber(self._lang, range_parts[i - delta])
  1586. except (ValueError, IndexError):
  1587. raise InputError(u"Couldn't understand text sections: '{}'.".format(self.tref))
  1588. elif self._lang == "en":
  1589. if self.index_node.addressTypes[0] == "Talmud":
  1590. self.__parse_talmud_range(parts[1])
  1591. else:
  1592. range_parts = re.split("[.:, ]+", parts[1])
  1593. delta = len(self.sections) - len(range_parts)
  1594. for i in range(delta, len(self.sections)):
  1595. try:
  1596. self.toSections[i] = int(range_parts[i - delta])
  1597. except (ValueError, IndexError):
  1598. raise InputError(u"Couldn't understand text sections: '{}'.".format(self.tref))
  def __get_sections(self, reg, tref, use_node=None):
      """
      Pull the section addresses out of a textual ref with a node regex.

      The regex is expected to expose named groups ``a0``, ``a1``, ... - one per
      level of the node.  Every group that took part in the match is converted
      by the address type registered for that level.

      :param reg: compiled regular expression for the node
      :param tref: the string to parse
      :param use_node: node supplying ``depth`` and ``_addressTypes``; defaults to ``self.index_node``
      :return: list with one converted address per matched level
      :raises InputError: if ``reg`` does not match ``tref``
      """
      node = use_node or self.index_node
      found = reg.match(tref)
      if not found:
          raise InputError(u"Can not parse sections from ref: {}".format(tref))
      groups = found.groupdict()
      parsed = []
      for level in range(node.depth):
          raw = groups.get(u"a{}".format(level))
          if raw is None:
              # this level was not specified in the ref
              continue
          parsed.append(node._addressTypes[level].toNumber(self._lang, raw))
      return parsed
  def __parse_talmud_range(self, range_part):
      """
      Parse the "to" half of an English Talmud range and store it in ``self.toSections``.

      Three shapes are recognised (``self.sections`` must already be set):

      * ``'Shabbat 23a-b'``      - a bare ``b``: the amud following the starting one
      * ``'Shabbat 24b-25a'``    - a full daf, converted with ``daf_to_section``
      * ``'Shabbat 24b.12-24'``  - trailing address only; the missing leading
        sections are copied from ``self.sections``

      :param range_part: the text following the dash in the ref
      :raises InputError: if a part of the range can not be read as a number
      """
      #todo: make sure to-daf isn't out of range
      self.toSections = range_part.split(".")  # this was converting space to '.', for some reason.

      # 'Shabbat 23a-b'
      if self.toSections[0] == 'b':
          self.toSections[0] = self.sections[0] + 1

      # 'Shabbat 24b-25a'
      elif regex.match(r"\d+[ab]", self.toSections[0]):
          self.toSections[0] = daf_to_section(self.toSections[0])

      # 'Shabbat 24b.12-24'
      else:
          delta = len(self.sections) - len(self.toSections)
          for i in range(delta - 1, -1, -1):
              self.toSections.insert(0, self.sections[i])

      try:
          self.toSections = [int(x) for x in self.toSections]
      except ValueError:
          # Report unparsable input the same way the non-Talmud range parsing does,
          # so that callers catching InputError (e.g. is_ref) handle it.
          raise InputError(u"Couldn't understand text sections: '{}'.".format(self.tref))
  def __eq__(self, other):
      """
      Two Refs are equal when their ``uid()`` values are equal.

      An object that does not expose a callable ``uid`` (``None``, a string, ...)
      compares unequal rather than raising AttributeError.

      :param other: the object to compare with
      :return bool:
      """
      other_uid = getattr(other, "uid", None)
      if not callable(other_uid):
          return False
      return self.uid() == other_uid()
  def __ne__(self, other):
      """
      Inequality, defined as the negation of :meth:`__eq__`.

      :param other: the object to compare with
      :return bool:
      """
      is_equal = self.__eq__(other)
      return not is_equal
  @staticmethod
  def is_ref(tref):
      """
      Test whether a string can be used to instantiate a :class:`Ref`.

      :param string tref: the string to test
      :return bool: True if ``Ref(tref)`` succeeds, False if it raises InputError
      """
      try:
          Ref(tref)
      except InputError:
          return False
      else:
          return True
  def is_talmud(self):
      """
      Is this a Talmud reference?

      True when the node has a non-empty ``addressTypes`` whose first entry is
      ``"Talmud"``.  Always returns a real bool (never ``None`` or an empty list).

      :return bool:
      """
      address_types = getattr(self.index_node, "addressTypes", None)
      return bool(address_types) and address_types[0] == "Talmud"
  def is_bavli(self):
      """
      Is this a Talmud Bavli reference?

      For a commentary the categories of the base index (``self.index.b_index``)
      are inspected; otherwise the categories of the index itself.

      :return bool:
      """
      source_index = self.index.b_index if self.is_commentary() else self.index
      return u"Bavli" in source_index.categories
  def is_commentary(self):
      """
      Is this a commentary reference?  Decided by ``self.type``.

      :return bool:
      """
      ref_type = self.type
      return ref_type == "Commentary"
  def is_range(self):
      """
      Is this reference a range?

      A Ref is a range when its starting point (``sections``) differs from its
      ending point (``toSections``), i.e. it has a dash in its text form.
      A Ref may cover a large area of text without being a range - e.g. a
      reference to a whole chapter.
      ::

          >>> Ref("Genesis 3").is_range()
          False
          >>> Ref("Genesis 3-5").is_range()
          True

      :return bool:
      """
      same_endpoints = self.sections == self.toSections
      return not same_endpoints
  def range_size(self):
      """
      How large is the range?  Only the last section index is compared.

      :return int:
      """
      #todo: rewrite with range_index to handle ranges across higher level sections
      last = self.toSections[-1]
      first = self.sections[-1]
      return last - first + 1
  def range_index(self):
      """
      At what section index does the range begin?
      ::

          >>> Ref("Leviticus 15:3 - 17:12").range_index()
          0
          >>> Ref("Leviticus 15-17").range_index()
          0
          >>> Ref("Leviticus 15:17-21").range_index()
          1
          >>> Ref("Leviticus 15:17").range_index()
          2

      The value is computed lazily by :meth:`_set_range_data` and cached on the
      instance.  ``0`` is a legitimate cached value, so the cache is tested
      against ``None`` rather than for truthiness.

      :return int:
      """
      if self._range_index is None:
          self._set_range_data()
      return self._range_index
  def range_depth(self):
      """
      How deep is the range?
      ::

          >>> Ref("Leviticus 15:3 - 17:12").range_depth()
          2
          >>> Ref("Leviticus 15-17").range_depth()
          2
          >>> Ref("Leviticus 15:17-21").range_depth()
          1
          >>> Ref("Leviticus 15:17").range_depth()
          0

      The value is computed lazily by :meth:`_set_range_data` and cached on the
      instance.  ``0`` (a Ref that is not a range) is a legitimate cached value,
      so the cache is tested against ``None`` rather than for truthiness.

      :return int:
      """
      if self._range_depth is None:
          self._set_range_data()
      return self._range_depth
  def _set_range_data(self):
      """
      Fill the ``_range_depth`` / ``_range_index`` caches.

      For a Ref that is not a range: depth 0, index equal to the node depth.
      For a range: the index is the first level at which ``sections`` and
      ``toSections`` differ, and the depth is the number of levels from there down.
      """
      if self.is_range():
          for level in range(self.index_node.depth):
              if self.sections[level] == self.toSections[level]:
                  continue
              self._range_depth = self.index_node.depth - level
              self._range_index = level
              break
      else:
          self._range_depth = 0
          self._range_index = self.index_node.depth
  def is_spanning(self):
      """
      Does this Ref span across text sections?  True when :meth:`span_size` is greater than 1.
      ::

          >>> Ref("Shabbat 13a-b").is_spanning()
          True
          >>> Ref("Shabbat 13a:3-14").is_spanning()
          False
          >>> Ref("Job 4:3-5:3").is_spanning()
          True
          >>> Ref("Job 4:5-18").is_spanning()
          False

      :return bool:
      """
      sections_covered = self.span_size()
      return sections_covered > 1
  def span_size(self):
      """
      How many sections does the span cover?
      ::

          >>> Ref("Leviticus 15:3 - 17:12").span_size()
          3
          >>> Ref("Leviticus 15-17").span_size()
          3
          >>> Ref("Leviticus 15:17-21").span_size()
          1
          >>> Ref("Leviticus 15:17").span_size()
          1

      Returns 0 when the node has no depth, has depth 1, or when no sections are set.

      :return int:
      """
      depth = getattr(self.index_node, "depth", None)
      if not depth or depth == 1:
          # text with no depth or depth 1 can't be spanning
          return 0

      specified = len(self.sections)
      if specified == 0:
          # can't be spanning if no sections set
          return 0

      # Only levels above the segment level can span; never look past what was specified.
      deepest = min(specified - 1, depth - 2)
      for level in range(deepest + 1):
          covered = self.toSections[level] - self.sections[level] + 1
          if covered > 1:
              return covered
      return 1
  def is_section_level(self):
      """
      Is this Ref section (e.g. Chapter) level?  That is: does it specify exactly
      one address fewer than the depth of its node?
      ::

          >>> Ref("Leviticus 15:3").is_section_level()
          False
          >>> Ref("Leviticus 15").is_section_level()
          True
          >>> Ref("Rashi on Leviticus 15:3").is_section_level()
          True
          >>> Ref("Rashi on Leviticus 15:3:1").is_section_level()
          False
          >>> Ref("Leviticus 15-17").is_section_level()
          True

      :return bool:
      """
      specified = len(self.sections)
      return specified + 1 == self.index_node.depth
  def is_segment_level(self):
      """
      Is this Ref segment (e.g. Verse) level?  That is: does it specify as many
      addresses as the depth of its node?
      ::

          >>> Ref("Leviticus 15:3").is_segment_level()
          True
          >>> Ref("Leviticus 15").is_segment_level()
          False
          >>> Ref("Rashi on Leviticus 15:3").is_segment_level()
          False
          >>> Ref("Rashi on Leviticus 15:3:1").is_segment_level()
          True

      :return bool:
      """
      specified = len(self.sections)
      return specified == self.index_node.depth
  1802. """ Methods to generate new Refs based on this Ref """
  def _core_dict(self):
      """
      Snapshot of the attributes that define this Ref, suitable for ``Ref(_obj=...)``.

      ``sections`` and ``toSections`` are copied, so the caller may modify them
      without affecting this Ref.

      :return dict:
      """
      core = {}
      core["index"] = self.index
      core["book"] = self.book
      core["type"] = self.type
      core["index_node"] = self.index_node
      core["sections"] = list(self.sections)
      core["toSections"] = list(self.toSections)
      return core
  def has_default_child(self):
      """
      Does the node of this Ref have a default child?  Delegates to the node.
      """
      node = self.index_node
      return node.has_default_child()
  def default_child_ref(self):
      """
      Return a Ref to the default node underneath this node.

      When the node has no default child, this Ref itself is returned.

      :return: :class:`Ref`
      """
      if self.has_default_child():
          d = self._core_dict()
          d["index_node"] = self.index_node.get_default_child()
          return Ref(_obj=d)
      return self
  def surrounding_ref(self, size=1):
      """
      Return a reference with 'size' additional segments added to each side.

      The start is clamped at segment 1 and the end at the length of the ending
      section.  Currently does not extend to sections beyond the original ref's span.

      :param int size: number of segments to add on each side
      :return: :class:`Ref`
      """
      first_segment = self.starting_ref().sections[-1]
      start = max(1, first_segment - size)

      ending_sections = self.ending_ref().sections
      # length of the section that holds the last segment (state arrays are 0 based)
      section_length = self.get_state_ja().sub_array_length([s - 1 for s in ending_sections[:-1]])
      end = min(ending_sections[-1] + size, section_length)

      d = self._core_dict()
      d["sections"] = d["sections"][:-1] + [start]
      d["toSections"] = d["toSections"][:-1] + [end]
      return Ref(_obj=d)
  def starting_ref(self):
      """
      For ranged Refs, return the Ref of the starting point.
      A Ref that is not a range is returned as-is.

      :return: :class:`Ref`
      """
      if self.is_range():
          d = self._core_dict()
          d["toSections"] = self.sections[:]
          return Ref(_obj=d)
      return self
  def ending_ref(self):
      """
      For ranged Refs, return the Ref of the ending point.
      A Ref that is not a range is returned as-is.

      :return: :class:`Ref`
      """
      if self.is_range():
          d = self._core_dict()
          d["sections"] = self.toSections[:]
          return Ref(_obj=d)
      return self
  def section_ref(self):
      """
      Return the section level Ref.
      A Ref that is not segment level is returned as-is.

      For texts of depth 2, this has the same behavior as :meth:`top_section_ref`
      ::

          >>> Ref("Rashi on Genesis 2:3:1").section_ref()
          Ref("Rashi on Genesis 2:3")
          >>> Ref("Genesis 2:3").section_ref()
          Ref("Genesis 2")

      :return: :class:`Ref`
      """
      if self.is_segment_level():
          return self.padded_ref().context_ref()
      return self
  def top_section_ref(self):
      """
      Return the highest level section Ref.

      For texts of depth 2, this has the same behavior as :meth:`section_ref`
      ::

          >>> Ref("Rashi on Genesis 2:3:1").top_section_ref()
          Ref("Rashi on Genesis 2")
          >>> Ref("Genesis 2:3").top_section_ref()
          Ref("Genesis 2")

      :return: :class:`Ref`
      """
      padded = self.padded_ref()
      # zoom out until only the first address is left
      return padded.context_ref(self.index_node.depth - 1)
  def next_section_ref(self):
      """
      Returns a Ref to the next section (e.g. Chapter).

      The search starts inside the current node.  If nothing is found there and
      the node is a leaf, the following leaves are visited in order until one of
      them yields a section.  A non-empty result is cached in ``self._next``.

      If this is the last section, returns ``None``

      :return: :class:`Ref`
      """
      if not self._next:
          self._next = self._iter_text_section()
          if self._next is None and self.index_node.is_leaf():
              # walk over the following leaves, looking for the first available section
              leaf = self.index_node.next_leaf()
              while leaf:
                  leaf_ref = leaf.ref()
                  # a depth 1 leaf has no level above its segments to step up to
                  candidate = leaf_ref._iter_text_section(depth_up=0 if leaf.depth == 1 else 1)
                  if candidate:
                      self._next = candidate
                      break
                  leaf = leaf.next_leaf()
      return self._next
  def prev_section_ref(self):
      """
      Returns a Ref to the previous section (e.g. Chapter).

      The search starts inside the current node.  If nothing is found there and
      the node is a leaf, the preceding leaves are visited in order until one of
      them yields a section.  A non-empty result is cached in ``self._prev``.

      If this is the first section, returns ``None``

      :return: :class:`Ref`
      """
      if not self._prev:
          self._prev = self._iter_text_section(False)
          if self._prev is None and self.index_node.is_leaf():
              # walk over the preceding leaves, looking for the first available section
              leaf = self.index_node.prev_leaf()
              while leaf:
                  leaf_ref = leaf.ref()
                  # a depth 1 leaf has no level above its segments to step up to
                  candidate = leaf_ref._iter_text_section(forward=False, depth_up=0 if leaf.depth == 1 else 1)
                  if candidate:
                      self._prev = candidate
                      break
                  leaf = leaf.prev_leaf()
      return self._prev
  def recalibrate_next_prev_refs(self, add_self=True):
      """
      Internal. Called when a section is inserted or deleted.

      Re-links the cached neighbours of this Ref: with ``add_self`` they are made
      to point at this Ref, otherwise they are made to point at each other.

      :param add_self: True to link this Ref in, False to link around it
      :return: None
      """
      following = self.next_section_ref()
      preceding = self.prev_section_ref()
      if add_self:
          new_prev, new_next = self, self
      else:
          new_prev, new_next = preceding, following
      if following:
          following._prev = new_prev
      if preceding:
          preceding._next = new_next
  def prev_segment_ref(self):
      """
      Returns a :class:`Ref` to the segment before the start of this Ref.

      * If the starting Ref is not segment level, it is returned unchanged.
      * Inside a section, this is simply the preceding segment number.
      * At the first segment of a section, this is the last segment of the
        previous section, or ``None`` when there is no previous section.

      :return: :class:`Ref`
      """
      start = self.starting_ref()
      if not start.is_segment_level():
          return start

      if start.sections[-1] > 1:
          d = start._core_dict()
          d["sections"] = d["toSections"] = start.sections[:-1] + [start.sections[-1] - 1]
          return Ref(_obj=d)

      previous_section = start.prev_section_ref()
      if not previous_section:
          return None
      d = previous_section._core_dict()
      # state arrays are 0 based, sections are 1 based
      last_segment = self.get_state_ja().sub_array_length([i - 1 for i in previous_section.sections])
      d["sections"] = d["toSections"] = previous_section.sections + [last_segment]
      return Ref(_obj=d)
  def next_segment_ref(self):
      """
      Returns a :class:`Ref` to the segment after the end of this Ref.

      * If the ending Ref is not segment level, it is returned unchanged.
      * Inside a section, this is simply the following segment number.
      * At the last segment of a section, this is the first segment of the next
        section, or ``None`` when there is no next section.

      :return: :class:`Ref`
      """
      r = self.ending_ref()
      if not r.is_segment_level():
          return r

      sectionRef = r.section_ref()
      # state arrays are 0 based, sections are 1 based
      sectionLength = self.get_state_ja().sub_array_length([i - 1 for i in sectionRef.sections])
      if r.sections[-1] < sectionLength:
          d = r._core_dict()
          d["sections"] = d["toSections"] = r.sections[:-1] + [r.sections[-1] + 1]
          return Ref(_obj=d)

      next_section = r.next_section_ref()
      if not next_section:
          # end of the text - mirror prev_segment_ref rather than fail on None.subref
          return None
      return next_section.subref(1)
  def last_segment_ref(self):
      """
      Returns :class:`Ref` to the last segment in the current book (or complex book part).

      Not to be confused with :meth:`ending_ref`

      :return: :class:`Ref`
      """
      d = self._core_dict()
      last_index = self.get_state_ja().last_index(self.index_node.depth)
      # state arrays are 0 based, sections are 1 based
      address = [i + 1 for i in last_index]
      d["sections"] = d["toSections"] = address
      return Ref(_obj=d)
  def first_available_section_ref(self):
      """
      Returns a :class:`Ref` to the first section inside of or following this :class:`Ref` that has some content.

      For a JaggedArrayNode the search starts at the padded form of this Ref; for
      any other SchemaNode it starts at the padded Ref of its first leaf node.

      Returns ``None`` if self is empty and no following :class:`Ref` has content.

      :return: :class:`Ref`
      """
      node = self.index_node
      if isinstance(node, JaggedArrayNode):
          candidate = self.padded_ref()
      elif isinstance(node, SchemaNode):
          leaves = node.get_leaf_nodes()
          if not len(leaves):
              return None
          candidate = leaves[0].ref().padded_ref()
      else:
          return None

      if candidate.is_empty():
          return candidate.next_section_ref()
      return candidate
  # Don't store results on the Ref cache - state objects change, and those changes don't yet propagate to this cache
  def get_state_node(self, meta=None, hint=None):
      """
      Build the state node for the schema node of this Ref.

      :param meta: passed through to ``StateNode``
      :param hint: passed through to ``StateNode``
      :return: :class:`sefaria.model.version_state.StateNode`
      """
      # imported here rather than at module level
      from . import version_state
      state_node = version_state.StateNode(snode=self.index_node, meta=meta, hint=hint)
      return state_node
  def get_state_ja(self, lang="all"):
      """
      Return the ``availableTexts`` state array of this Ref's node for a language.

      :param lang: "all", "he", or "en"
      :return: :class:`sefaria.datatype.jagged_array`
      """
      state_node = self.get_state_node(hint=[(lang, "availableTexts")])
      return state_node.ja(lang)
  def is_text_fully_available(self, lang):
      """
      Is the whole of this Ref available in the given language?

      :param lang: "he" or "en"
      :return: True if at least one complete version of ref is available in lang.
      """
      if not (self.is_section_level() or self.is_segment_level()):
          # higher level Ref - consult the state array
          state_array = self.get_state_ja(lang)
          return state_array.subarray_with_ref(self).is_full()

      # Using mongo queries to slice and merge versions
      # is much faster than actually using the Version State doc
      content = self.text(lang=lang).text
      return bool(len(content) and all(content))
  def is_text_translated(self):
      """
      Shorthand for ``is_text_fully_available("en")``.

      :return: True if at least one complete version of this :class:`Ref` is available in English.
      """
      fully_in_english = self.is_text_fully_available("en")
      return fully_in_english
  def is_empty(self):
      """
      Checks if :class:`Ref` has any corresponding data in :class:`Version` records.

      :return: Bool True if ``self.versionset()`` is empty, i.e. there is no text at this ref in any language
      """
      version_count = len(self.versionset())
      return version_count == 0
  def _iter_text_section(self, forward=True, depth_up=1):
      """
      Iterate forwards or backwards to the next available :class:`Ref` in a text

      :param forward: Boolean indicating direction to iterate
      :param depth_up: how many levels above the most granular one to traverse at. Defaults to one level above
      :return: :class:`Ref`, or ``None`` if the node is not deeper than ``depth_up`` or nothing is found
      """
      if self.index_node.depth <= depth_up:  # if there is only one level of text, don't even waste time iterating.
          return None

      # Going forward start from the end of the Ref; going backward, from its beginning.
      anchor = self.toSections if forward else self.sections
      # arrays are 0 based. text sections are 1 based. so shift the numbers back.
      starting_points = [s - 1 for s in anchor[:self.index_node.depth - depth_up]]

      # start from the neighbouring one
      if len(starting_points) > 0:
          starting_points[-1] += 1 if forward else -1

      # let the counts obj calculate the correct place to go.
      counts = self.get_state_node(hint=[("all","availableTexts")]).ja("all", "availableTexts")
      if forward:
          new_section = counts.next_index(starting_points)
      else:
          new_section = counts.prev_index(starting_points)

      if not new_section:
          return None

      # we are also scaling back the sections to the level ABOVE the lowest section type (eg, for bible we want chapter, not verse)
      # NOTE: with depth_up == 0 the slice below is [:0], which yields an empty address.
      d = self._core_dict()
      d["toSections"] = d["sections"] = [(s + 1) for s in new_section[:-depth_up]]
      return Ref(_obj=d)
  def to(self, toref):
      """
      Return a reference that begins at this :class:`Ref`, and ends at toref

      Both Refs must belong to the same book.

      :param toref: :class:`Ref` that denotes the end of the new ranged :class:`Ref`
      :return: :class:`Ref`
      """
      assert self.book == toref.book
      d = self._core_dict()
      d["toSections"] = list(toref.toSections)
      return Ref(_obj=d)
  def subref(self, subsections):
      """
      Returns a more specific reference than the current Ref

      :param subsections: int or list - the subsection(s) to append to the address of the current Ref
      :return: :class:`Ref`
      :raises AssertionError: if the result would be deeper than the node allows, or if this Ref is a range
      """
      if isinstance(subsections, int):
          subsections = [subsections]
      assert self.index_node.depth >= len(self.sections) + len(subsections), u"Tried to get subref of bottom level ref: {}".format(self.normal())
      # the message needs a placeholder, otherwise the offending ref is never shown
      assert not self.is_range(), u"Tried to get subref of ranged ref: {}".format(self.normal())
      d = self._core_dict()
      d["sections"] += subsections
      d["toSections"] += subsections
      return Ref(_obj=d)
  def subrefs(self, length):
      """
      Return a list of :class:`Ref` objects one level deeper than this :class:`Ref`, from 1 to `length`.

      :param length: Number of subrefs to return

      ::

          >>> Ref("Genesis").subrefs(4)
          [Ref('Genesis 1'),
           Ref('Genesis 2'),
           Ref('Genesis 3'),
           Ref('Genesis 4')]

      :return: List of :class:`Ref`
      """
      return [self.subref(position + 1) for position in range(length)]
  def all_subrefs(self):
      """
      Return a list of all the valid :class:`Ref` objects one level deeper than this :class:`Ref`.

      The number of subrefs is read from the state array of the node.

      ::

          >>> Ref("Genesis").all_subrefs()
          [Ref('Genesis 1'),
           Ref('Genesis 2'),
           Ref('Genesis 3'),
           Ref('Genesis 4'),
          ...]

      :return: List of :class:`Ref`
      """
      assert not self.is_range(), "Ref.all_subrefs() is not intended for use on Ranges"
      # state arrays are 0 based, sections are 1 based
      zero_based = [i - 1 for i in self.sections]
      child_count = self.get_state_ja().sub_array_length(zero_based)
      return self.subrefs(child_count)
  def context_ref(self, level=1):
      """
      :return: :class:`Ref` that is more general than this :class:`Ref`.
      :param level: how many levels to 'zoom out' from the most specific possible :class:`Ref`
      :raises InputError: if ``level`` exceeds the depth of the node

      ::

          >>> Ref("Genesis 4:5").context_ref(level = 1)
          Ref("Genesis 4")
          >>> Ref("Genesis 4:5").context_ref(level = 2)
          Ref("Genesis")

      If this :class:`Ref` is less specific than or equally specific to the level given, it is returned as-is.
      Computed results are cached per level in ``self._context``.
      """
      if level == 0:
          return self

      if self._context.get(level):
          return self._context[level]

      keep = self.index_node.depth - level
      if len(self.sections) <= keep:
          # already general enough
          return self
      if level > self.index_node.depth:
          raise InputError(u"Call to Ref.context_ref of {} exceeds Ref depth of {}.".format(level, self.index_node.depth))

      d = self._core_dict()
      d["sections"] = d["sections"][:keep]
      d["toSections"] = d["toSections"][:keep]
      self._context[level] = Ref(_obj=d)
      return self._context[level]
  def padded_ref(self):
      """
      :return: :class:`Ref` with 1s inserted to make the :class:`Ref` specific to the section level

      ::

          >>> Ref("Genesis").padded_ref()
          Ref("Genesis 1")

      If this :class:`Ref` is already specific to the section or segment level, it is returned unchanged.

      ::

          >>> Ref("Genesis 1").padded_ref()
          Ref("Genesis 1")

      For a Talmud Ref with no daf specified, the first address is 3 when "Bavli"
      is among the index categories, and 1 otherwise.
      The padded Ref is cached in ``self._padded``.
      """
      if self._padded:
          return self._padded

      if not getattr(self, "index_node", None):
          raise Exception(u"No index_node found {}".format(vars(self)))
      if len(self.sections) >= self.index_node.depth - 1:
          return self

      d = self._core_dict()
      if self.is_talmud() and len(self.sections) == 0:  # No daf specified
          first_daf = 3 if "Bavli" in self.index.categories else 1
          d["sections"].append(first_daf)
          d["toSections"].append(first_daf)

      missing = self.index_node.depth - len(d["sections"]) - 1
      filler = [1] * missing
      d["sections"] += filler
      d["toSections"] += filler  # todo: is this valid in all cases?

      self._padded = Ref(_obj=d)
      return self._padded
  def first_spanned_ref(self):
      """
      Returns the first section portion of a spanning :class:`Ref`.
      Designed to cut the wasted cost of running :meth:`split_spanning_ref`

          >>> Ref("Shabbat 6b-9a").first_spanned_ref()
          Ref('Shabbat 6b')
          >>> Ref("Shabbat 6b.12-9a.7").first_spanned_ref()
          Ref('Shabbat 6b:12-47')

      A Ref that does not span (or whose node has depth 1) is its own first
      spanned ref.  The result is cached in ``self._first_spanned_ref``.

      :return: :py:class:`Ref`
      """
      if self._first_spanned_ref:
          return self._first_spanned_ref

      if self._spanned_refs:
          # the full split was already computed - reuse it
          self._first_spanned_ref = self._spanned_refs[0]
      elif self.index_node.depth == 1 or not self.is_spanning():
          self._first_spanned_ref = self
      else:
          pivot = self.range_index()
          ref_depth = len(self.sections)

          d = self._core_dict()
          d["toSections"] = self.sections[0:pivot + 1]
          # close every deeper level at the end of its (sub)section
          for i in range(pivot + 1, ref_depth):
              d["toSections"].append(self.get_state_ja().sub_array_length([s - 1 for s in d["toSections"][0:i]]))

          head = Ref(_obj=d)
          if self.range_depth() > 2:
              # the head itself still spans lower level sections
              self._first_spanned_ref = head.first_spanned_ref()
          else:
              self._first_spanned_ref = head
      return self._first_spanned_ref
  def split_spanning_ref(self):
      """
      Return list of non-spanning :class:`Ref` objects which completely cover the area of this Ref

          >>> Ref("Shabbat 13b-14b").split_spanning_ref()
          [Ref("Shabbat 13b"), Ref("Shabbat 14a"), Ref("Shabbat 14b")]
          >>> Ref("Shabbat 13b:3 - 14b:3").split_spanning_ref()
          [Ref('Shabbat 13b:3-50'), Ref('Shabbat 14a'), Ref('Shabbat 14b:1-3')]

      A Ref that does not span (or whose node has depth 1) yields ``[self]``.
      The result is cached in ``self._spanned_refs``.
      """
      if self._spanned_refs:
          return self._spanned_refs

      if self.index_node.depth == 1 or not self.is_spanning():
          self._spanned_refs = [self]
          return self._spanned_refs

      pivot = self.range_index()
      first, last = self.sections[pivot], self.toSections[pivot]
      ref_depth = len(self.sections)

      pieces = []
      for n in range(first, last + 1):
          d = self._core_dict()
          if n == first:
              # opening piece: from the original start to the end of its section
              d["toSections"] = self.sections[0:pivot + 1]
              for i in range(pivot + 1, ref_depth):
                  d["toSections"] += [self.get_state_ja().sub_array_length([s - 1 for s in d["toSections"][0:i]])]
          elif n == last:
              # closing piece: from the beginning of its section to the original end
              d["sections"] = self.toSections[0:pivot + 1]
              d["sections"] += [1] * (ref_depth - pivot - 1)
          else:
              # middle piece: a whole section, left unexpanded
              d["sections"] = self.sections[0:pivot] + [n]
              d["toSections"] = self.sections[0:pivot] + [n]
              # If we find that we need to expand inner refs, add an `expand_middle` arg that
              # pads d["sections"] with 1s and d["toSections"] with the sub array lengths.
              # It will require handling on cached ref and passing on the recursive call below.

          if d["toSections"][-1]:  # to filter out, e.g. non-existant Rashi's, where the last index is 0
              pieces.append(Ref(_obj=d))

      depth_of_range = self.range_depth()
      if depth_of_range == 2:
          self._spanned_refs = pieces
      if depth_of_range > 2:  # recurse
          expanded = []
          for piece in pieces:
              expanded.extend(piece.split_spanning_ref())
          self._spanned_refs = expanded

      return self._spanned_refs
  2258. def range_list(self):
  2259. """
  2260. :return: list of :class:`Ref` objects corresponding to each point in the range of this :class:`Ref`
  2261. Does not work for spanning refs
  2262. """
  2263. if not self._ranged_refs:
  2264. if not self.is_range():
  2265. return [self]
  2266. if self.is_spanning():
  2267. raise InputError(u"Can not get range of spanning ref: {}".format(self))
  2268. results = []
  2269. for s in range(self.sections[-1], self.toSections[-1] + 1):
  2270. d = self._core_dict()
  2271. d["sections"][-1] = s
  2272. d["toSections"][-1] = s
  2273. results.append(Ref(_obj=d))
  2274. self._ranged_refs = results
  2275. return self._ranged_refs
  2276. def regex(self, as_list=False, anchored=True):
  2277. """
  2278. :return string: for a Regular Expression which will find any refs that match this Ref exactly, or more specifically.
  2279. E.g., "Genesis 1" yields an RE that match "Genesis 1" and "Genesis 1:3"
  2280. """
  2281. #todo: explore edge cases - book name alone, full ref to segment level
  2282. #todo: move over to the regex methods of the index nodes
  2283. patterns = []
  2284. if self.is_range():
  2285. if self.is_spanning():
  2286. s_refs = self.split_spanning_ref()
  2287. normals = []
  2288. for s_ref in s_refs:
  2289. normals += [r.normal() for r in s_ref.range_list()]
  2290. else:
  2291. normals = [r.normal() for r in self.range_list()]
  2292. for r in normals:
  2293. sections = re.sub("^%s" % re.escape(self.book), '', r)
  2294. patterns.append("%s$" % sections) # exact match
  2295. patterns.append("%s:" % sections) # more granualar, exact match followed by :
  2296. patterns.append("%s \d" % sections) # extra granularity following space
  2297. else:
  2298. sections = re.sub("^%s" % re.escape(self.book), '', self.normal())
  2299. patterns.append("%s$" % sections) # exact match
  2300. if self.index_node.has_titled_continuation():
  2301. patterns.append(u"{}({}).".format(sections, u"|".join(self.index_node.title_separators)))
  2302. elif self.index_node.has_numeric_continuation():
  2303. patterns.append("%s:" % sections) # more granualar, exact match followed by :
  2304. patterns.append("%s \d" % sections) # extra granularity following space
  2305. escaped_book = re.escape(self.book)
  2306. if anchored:
  2307. if as_list:
  2308. return ["^{}{}".format(escaped_book, p) for p in patterns]
  2309. else:
  2310. return "^%s(%s)" % (escaped_book, "|".join(patterns))
  2311. else:
  2312. if as_list:
  2313. return ["{}{}".format(escaped_book, p) for p in patterns]
  2314. else:
  2315. return "%s(%s)" % (escaped_book, "|".join(patterns))
  2316. def base_text_and_commentary_regex(self):
  2317. ref_regex_str = self.regex(anchored=False)
  2318. commentators = library.get_commentary_version_titles_on_book(self.book, with_commentary2=True)
  2319. if commentators:
  2320. pattern = ur"(^{})|(^({}) on {})".format(ref_regex_str, "|".join(commentators), ref_regex_str)
  2321. else:
  2322. pattern = ur"^{}".format(ref_regex_str)
  2323. return pattern
  2324. """ Comparisons """
  2325. def overlaps(self, other):
  2326. """
  2327. Does this Ref overlap ``other`` Ref?
  2328. :param other:
  2329. :return bool:
  2330. """
  2331. assert isinstance(other, Ref)
  2332. if not self.index_node == other.index_node:
  2333. return False
  2334. return not (self.precedes(other) or self.follows(other))
  2335. def contains(self, other):
  2336. """
  2337. Does this Ref completely contain ``other`` Ref?
  2338. :param other:
  2339. :return bool:
  2340. """
  2341. assert isinstance(other, Ref)
  2342. if not self.index_node == other.index_node:
  2343. return False
  2344. return (
  2345. (not self.starting_ref().follows(other.starting_ref()))
  2346. and
  2347. (not self.ending_ref().precedes(other.ending_ref()))
  2348. )
  2349. def precedes(self, other):
  2350. """
  2351. Does this Ref completely precede ``other`` Ref?
  2352. :param other:
  2353. :return bool:
  2354. """
  2355. assert isinstance(other, Ref)
  2356. if not self.index_node == other.index_node:
  2357. return False
  2358. my_end = self.ending_ref()
  2359. other_start = other.starting_ref()
  2360. smallest_section_len = min([len(my_end.sections), len(other_start.sections)])
  2361. # Bare book references never precede or follow
  2362. if smallest_section_len == 0:
  2363. return False
  2364. # Compare all but last section
  2365. for i in range(smallest_section_len - 1):
  2366. if my_end.sections[i] < other_start.sections[i]:
  2367. return True
  2368. if my_end.sections[i] > other_start.sections[i]:
  2369. return False
  2370. # Compare last significant section
  2371. if my_end.sections[smallest_section_len - 1] < other_start.sections[smallest_section_len - 1]:
  2372. return True
  2373. return False
  2374. def follows(self, other):
  2375. """
  2376. Does this Ref completely follow ``other`` Ref?
  2377. :param other:
  2378. :return bool:
  2379. """
  2380. assert isinstance(other, Ref)
  2381. if not self.index_node == other.index_node:
  2382. return False
  2383. my_start = self.starting_ref()
  2384. other_end = other.ending_ref()
  2385. smallest_section_len = min([len(my_start.sections), len(other_end.sections)])
  2386. # Bare book references never precede or follow
  2387. if smallest_section_len == 0:
  2388. return False
  2389. # Compare all but last section
  2390. for i in range(smallest_section_len - 1):
  2391. if my_start.sections[i] > other_end.sections[i]:
  2392. return True
  2393. if my_start.sections[i] < other_end.sections[i]:
  2394. return False
  2395. # Compare last significant section
  2396. if my_start.sections[smallest_section_len - 1] > other_end.sections[smallest_section_len - 1]:
  2397. return True
  2398. return False
  2399. def in_terms_of(self, other):
  2400. """
  2401. Returns the current reference sections in terms of another, containing reference.
  2402. Returns an array of ordinal references, not array indexes. (Meaning first is 1)
  2403. Must be called on a point Reference, not a range
  2404. ""
  2405. >>> Ref("Genesis 6:3").in_terms_of("Genesis 6")
  2406. [3]
  2407. >>> Ref("Genesis 6:3").in_terms_of("Genesis")
  2408. [6,3]
  2409. >>> Ref("Genesis 6:3").in_terms_of("Genesis 6-7")
  2410. [1,3]
  2411. >>> Ref("Genesis 6:8").in_terms_of("Genesis 6:3-7:3")
  2412. [1, 6]
  2413. :param other: :class:`Ref`
  2414. :return: array of indexes
  2415. """
  2416. #What's best behavior for these cases?
  2417. assert isinstance(other, Ref)
  2418. if not self.index_node == other.index_node:
  2419. return None
  2420. if self.is_range():
  2421. raise Exception("Ref.in_terms_of() called on ranged Ref: {}".format(self))
  2422. if not other.contains(self):
  2423. return None
  2424. ret = []
  2425. if not other.is_range():
  2426. ret = self.sections[len(other.sections):]
  2427. else:
  2428. for i in range(other.range_index(), self.index_node.depth):
  2429. ret.append(self.sections[i] + 1 - other.sections[i])
  2430. if other.sections[i] != self.sections[i] or len(other.sections) <= i + 1:
  2431. ret += self.sections[i + 1:]
  2432. break
  2433. return ret
  2434. def order_id(self):
  2435. """
  2436. Returns a unique id for this reference that establishes an ordering of references across the whole catalog.
  2437. This id will change as the ordering of the catalog changes, and may begin to overlap with other numbers because of those changes.
  2438. However, at any point in time these ids will be unique across the catalog.
  2439. Used to sort results from ElasticSearch queries
  2440. :return string:
  2441. """
  2442. #Todo: handle complex texts. Right now, all complex results are grouped under the root of the text
  2443. from sefaria.summaries import category_id_dict
  2444. cats = self.index.categories[:]
  2445. if len(cats) >= 1 and cats[0] == "Commentary":
  2446. cats = cats[1:2] + ["Commentary"] + cats[2:]
  2447. key = "/".join(cats + [self.index.title])
  2448. try:
  2449. base = category_id_dict()[key]
  2450. res = reduce(lambda x, y: x + format(y, '04'), self.sections, base)
  2451. if self.is_range():
  2452. res = reduce(lambda x, y: x + format(y, '04'), self.toSections, res + "-")
  2453. return res
  2454. except Exception as e:
  2455. logger.warning("Failed to execute order_id for {} : {}".format(self, e))
  2456. return "Z"
  2457. """ Methods for working with Versions and VersionSets """
  2458. def storage_address(self):
  2459. """
  2460. Return the storage location within a Version for this Ref.
  2461. :return string:
  2462. """
  2463. return ".".join(["chapter"] + self.index_node.address()[1:])
  2464. def part_projection(self):
  2465. """
  2466. Returns the slice and storage address to return top-level sections for Versions of this ref
  2467. Used as:
  2468. ::
  2469. Version().load({...},oref.part_projection())
  2470. **Regarding projecting complex texts:**
  2471. By specifying a projection that includes a non-existing element of our dictionary at the level of our selection,
  2472. we cause all other elements of the dictionary to be unselected.
  2473. A bit non-intuitive, but a huge savings of document size and time on the data transfer.
  2474. http://stackoverflow.com/a/15798087/213042
  2475. """
  2476. # todo: reimplement w/ aggregation pipeline (see above)
  2477. # todo: special case string 0?
  2478. projection = {k: 1 for k in Version.required_attrs + Version.optional_attrs}
  2479. del projection[Version.content_attr] # Version.content_attr == "chapter"
  2480. projection["_id"] = 0
  2481. if not self.sections:
  2482. # For simple texts, self.store_address() == "chapter".
  2483. # For complex texts, it can be a deeper branch of the dictionary: "chapter.Bereshit.Torah" or similar
  2484. projection[self.storage_address()] = 1
  2485. else:
  2486. skip = self.sections[0] - 1
  2487. limit = 1 if self.range_index() > 0 else self.toSections[0] - self.sections[0] + 1
  2488. slce = {"$slice": [skip, limit]}
  2489. projection[self.storage_address()] = slce
  2490. if len(self.index_node.address()) > 1:
  2491. # create dummy key at level of our selection - see above.
  2492. dummy_limiter = ".".join(["chapter"] + self.index_node.address()[1:-1] + ["hacky_dummy_key"])
  2493. projection[dummy_limiter] = 1
  2494. return projection
  2495. def condition_query(self, lang=None):
  2496. """
  2497. Return condition to select only versions with content at the location of this Ref.
  2498. Usage:
  2499. ::
  2500. VersionSet(oref.condition_query(lang))
  2501. Can be combined with :meth:`part_projection` to only return the content indicated by this ref:
  2502. ::
  2503. VersionSet(oref.condition_query(lang), proj=oref.part_projection())
  2504. :return: dict containing a query in the format expected by VersionSet
  2505. """
  2506. d = {
  2507. "title": self.index.title,
  2508. }
  2509. if lang:
  2510. d.update({"language": lang})
  2511. condition_addr = self.storage_address()
  2512. if not self.sections:
  2513. d.update({
  2514. condition_addr: {"$exists": True, "$elemMatch": {"$nin": ["", [], 0]}} # any non-empty element will do
  2515. })
  2516. elif not self.is_spanning():
  2517. for s in range(0, len(self.sections) if not self.is_range() else len(self.sections) - 1):
  2518. condition_addr += ".{}".format(self.sections[s] - 1)
  2519. if len(self.sections) == self.index_node.depth and not self.is_range():
  2520. d.update({
  2521. condition_addr: {"$exists": True, "$nin": ["", [], 0]}
  2522. })
  2523. else:
  2524. d.update({
  2525. condition_addr: {"$exists": True, "$elemMatch": {"$nin": ["", [], 0]}}
  2526. })
  2527. else:
  2528. #todo: If this method gets cached, then copies need to be made before the del below.
  2529. parts = []
  2530. refs = self.split_spanning_ref()
  2531. for r in refs:
  2532. q = r.condition_query()
  2533. del q["title"]
  2534. parts.append(q)
  2535. d.update({
  2536. "$or": parts
  2537. })
  2538. return d
  2539. def versionset(self, lang=None):
  2540. """
  2541. :class:`VersionsSet` of :class:`Version` objects that have content for this Ref in lang, projected
  2542. :param lang: "he", "en", or None
  2543. :return: :class:`VersionSet`
  2544. """
  2545. return VersionSet(self.condition_query(lang), proj=self.part_projection())
  2546. def version_list(self):
  2547. """
  2548. A list of available text versions titles and languages matching this ref
  2549. :return list: each list element is an object with keys 'versionTitle' and 'language'
  2550. """
  2551. vlist = []
  2552. for v in VersionSet(self.condition_query(), proj={"versionTitle": 1, "language": 1}):
  2553. vlist.append({
  2554. "versionTitle": v.versionTitle,
  2555. "language": v.language
  2556. })
  2557. return vlist
  2558. """ String Representations """
  2559. def __str__(self):
  2560. return self.uid()
  2561. def __repr__(self): # Wanted to use orig_tref, but repr can not include Unicode
  2562. return self.__class__.__name__ + "('" + str(self.uid()) + "')"
  2563. def old_dict_format(self):
  2564. """
  2565. Outputs the ref in the old format, for code that relies heavily on that format
  2566. """
  2567. #todo: deprecate this.
  2568. d = {
  2569. "ref": self.tref,
  2570. "book": self.book,
  2571. "sections": self.sections,
  2572. "toSections": self.toSections,
  2573. "type": self.type
  2574. }
  2575. d.update(self.index.contents())
  2576. del d["title"]
  2577. return d
  2578. def he_book(self):
  2579. return self.index.get_title(lang="he")
  2580. def _get_normal(self, lang):
  2581. normal = self.index_node.full_title(lang)
  2582. if not normal:
  2583. if lang != "en":
  2584. return self.normal()
  2585. else:
  2586. raise InputError("Failed to get English normal form for ref")
  2587. if len(self.sections) == 0:
  2588. return normal
  2589. if self.type == "Commentary" and not getattr(self.index, "commentaryCategories", None):
  2590. return normal
  2591. normal += u" "
  2592. normal += u":".join(
  2593. [self.index_node.address_class(i).toStr(lang, n) for i, n in enumerate(self.sections)]
  2594. )
  2595. for i in range(len(self.sections)):
  2596. if not self.sections[i] == self.toSections[i]:
  2597. normal += u"-{}".format(
  2598. u":".join(
  2599. [self.index_node.address_class(i + j).toStr(lang, n) for j, n in enumerate(self.toSections[i:])]
  2600. )
  2601. )
  2602. break
  2603. return normal
  2604. def he_normal(self):
  2605. """
  2606. :return string: Normal Hebrew string form
  2607. """
  2608. '''
  2609. 18 June 2015: Removed the special casing for Hebrew Talmud sub daf numerals
  2610. Previously, talmud lines had been normalised as arabic numerals
  2611. '''
  2612. if not self._he_normal:
  2613. self._he_normal = self._get_normal("he")
  2614. return self._he_normal
  2615. def uid(self):
  2616. """
  2617. To handle the fact that default nodes have the same name as their parents
  2618. :return:
  2619. """
  2620. return self.normal() + ("<d>" if self.index_node.is_default() else "")
  2621. def normal(self):
  2622. """
  2623. :return string: Normal English string form
  2624. """
  2625. if not self._normal:
  2626. self._normal = self._get_normal("en")
  2627. return self._normal
  2628. def text(self, lang="en", vtitle=None):
  2629. """
  2630. :param lang: "he" or "en"
  2631. :param vtitle: optional. text title of the Version to get the text from
  2632. :return: :class:`TextChunk` corresponding to this Ref
  2633. """
  2634. return TextChunk(self, lang, vtitle)
  2635. def url(self):
  2636. """
  2637. :return string: normal url form
  2638. """
  2639. if not self._url:
  2640. self._url = self.normal().replace(" ", "_").replace(":", ".")
  2641. # Change "Mishna_Brachot_2:3" to "Mishna_Brachot.2.3", but don't run on "Mishna_Brachot"
  2642. if len(self.sections) > 0:
  2643. last = self._url.rfind("_")
  2644. if last == -1:
  2645. return self._url
  2646. lref = list(self._url)
  2647. lref[last] = "."
  2648. self._url = "".join(lref)
  2649. return self._url
  2650. def noteset(self, public=True, uid=None):
  2651. """
  2652. :return: :class:`NoteSet` for this Ref
  2653. """
  2654. from . import NoteSet
  2655. if public and uid:
  2656. query = {"ref": {"$regex": self.regex()}, "$or": [{"public": True}, {"owner": uid}]}
  2657. elif public:
  2658. query = {"ref": {"$regex": self.regex()}, "public": True}
  2659. elif uid:
  2660. query = {"ref": {"$regex": self.regex()}, "owner": uid}
  2661. else:
  2662. raise InputError("Can not get anonymous private notes")
  2663. return NoteSet(query)
  2664. def linkset(self):
  2665. """
  2666. :return: :class:`LinkSet` for this Ref
  2667. """
  2668. from . import LinkSet
  2669. return LinkSet(self)
  2670. class Library(object):
  2671. """
  2672. Operates as a singleton, through the instance called ``library``.
  2673. Stewards the in-memory and in-cache objects that cover the entire collection of texts.
  2674. Exposes methods to add, remove, or register change of an index record. These are primarily called by the dependencies mechanism on Index Create/Update/Destroy.
  2675. """
  2676. def __init__(self):
  2677. self.langs = ["en", "he"]
  2678. # Map from index key to ref keys
  2679. self._index_ref_map = {}
  2680. # Maps, keyed by language, from index key to array of titles
  2681. self._index_title_maps = {lang:{} for lang in self.langs}
  2682. # Maps, keyed by language, from titles to schema nodes
  2683. self._title_node_maps = {lang:{} for lang in self.langs}
  2684. # Maps, keyed by language, from index key to array of commentary titles
  2685. self._index_title_commentary_maps = {lang:{} for lang in self.langs}
  2686. # Maps, keyed by language, from titles to simple and commentary schema nodes
  2687. self._title_node_with_commentary_maps = {lang:{} for lang in self.langs}
  2688. # Lists of full titles, keys are string generated from a combination of language code, "commentators", "commentary", and "terms". See method `full_title_list()`
  2689. self._full_title_lists = {}
  2690. # Lists of full titles, including simple and commentary texts, keyed by language
  2691. self._full_title_list_jsons = {}
  2692. # Title regex strings & objects, keys are strings generated from a combination of arguments to `all_titles_regex` and `all_titles_regex_string`
  2693. self._title_regex_strings = {}
  2694. self._title_regexes = {}
  2695. # Maps, keyed by language, from term names to text refs
  2696. self._term_ref_maps = {lang:{} for lang in self.langs}
  2697. # Map from index title to index object
  2698. self._indexes = {}
  2699. # old local cache
  2700. self.local_cache = {}
  2701. def get_index(self, bookname):
  2702. """
  2703. Factory - returns either an :class:`Index` object or a :class:`CommentaryIndex` object
  2704. :param string bookname: Name of the book or commentary on book.
  2705. :return:
  2706. """
  2707. # look for result in indices cache
  2708. if not bookname:
  2709. raise BookNameError("No book provided.")
  2710. indx = self._indexes.get(bookname)
  2711. if not indx:
  2712. bookname = (bookname[0].upper() + bookname[1:]).replace("_", " ") #todo: factor out method
  2713. #todo: cache
  2714. node = self.get_schema_node(bookname)
  2715. if node:
  2716. indx = node.index
  2717. else:
  2718. # "commenter" on "book"
  2719. # todo: handle hebrew x on y format (do we need this?)
  2720. pattern = r'(?P<commentor>.*) on (?P<book>.*)'
  2721. m = regex.match(pattern, bookname)
  2722. if m:
  2723. indx = CommentaryIndex(m.group('commentor'), m.group('book'))
  2724. else:
  2725. #simple commentary record
  2726. indx = Index().load({
  2727. "titleVariants": bookname,
  2728. "categories.0": "Commentary"
  2729. })
  2730. if not indx:
  2731. raise BookNameError(u"No book named '{}'.".format(bookname))
  2732. self._indexes[bookname] = indx
  2733. return indx
  2734. def add_index_record(self, index_title = None, index_object = None, rebuild = True):
  2735. """
  2736. Update library title dictionaries and caches with information from provided index.
  2737. Index can be passed with primary title in `index_title` or as an object in `index_object`
  2738. :param title: primary title of index
  2739. :param index: index record
  2740. :param rebuild: Perform a rebuild of derivative objects afterwards?
  2741. :return:
  2742. """
  2743. if index_title:
  2744. index_object = self.get_index(index_title)
  2745. assert index_object, "Library.add_index_record called without index"
  2746. #//TODO: mark for commentary refactor
  2747. title_maps = self._index_title_commentary_maps if index_object.is_commentary() else self._index_title_maps
  2748. try:
  2749. for lang in self.langs:
  2750. title_dict = index_object.nodes.title_dict(lang)
  2751. title_maps[lang][index_object.title] = title_dict.keys()
  2752. self._title_node_with_commentary_maps[lang].update(title_dict)
  2753. if not index_object.is_commentary():
  2754. self._title_node_maps[lang].update(title_dict)
  2755. except IndexSchemaError as e:
  2756. logger.error(u"Error in generating title node dictionary: {}".format(e))
  2757. if rebuild:
  2758. self._reset_derivitative_objects()
  2759. def remove_index_record(self, index_title, rebuild = True):
  2760. """
  2761. Update provided index from library title dictionaries and caches
  2762. :param index_title: primary title of index
  2763. :param rebuild: Perform a rebuild of derivative objects afterwards?
  2764. :return:
  2765. """
  2766. #//TODO: mark for commentary refactor
  2767. #//Keeping commentary branch and simple branch completely separate - should make refactor easier
  2768. for lang in self.langs:
  2769. commentary_titles = self._index_title_commentary_maps[lang].get(index_title)
  2770. simple_titles = self._index_title_maps[lang].get(index_title)
  2771. if simple_titles:
  2772. for key in simple_titles:
  2773. try:
  2774. del self._title_node_with_commentary_maps[lang][key]
  2775. del self._title_node_maps[lang][key]
  2776. except KeyError:
  2777. logger.warning("Tried to delete non-existent title '{}' of index record '{}' from title-node map".format(key, index_title))
  2778. del self._index_title_maps[lang][index_title]
  2779. elif commentary_titles:
  2780. for key in commentary_titles:
  2781. try:
  2782. del self._title_node_with_commentary_maps[lang][key]
  2783. except KeyError:
  2784. logger.warning("Tried to delete non-existent title '{}' of index record '{}' from title-node map".format(key, index_title))
  2785. del self._index_title_commentary_maps[lang][index_title]
  2786. else:
  2787. logger.error("Could not find entry for index '{}' in index-title map".format(index_title))
  2788. return
  2789. if rebuild:
  2790. self._reset_derivitative_objects()
  2791. def refresh_index_record(self, index_title):
  2792. """
  2793. Update library title dictionaries and caches for provided index
  2794. :param title: primary title of index
  2795. :return:
  2796. """
  2797. self.remove_index_record(index_title, rebuild=False)
  2798. self.add_index_record(index_title, rebuild=False)
  2799. self._reset_derivitative_objects()
  2800. def _reset_derivitative_objects(self):
  2801. self._full_title_lists = {}
  2802. self._full_title_list_jsons = {}
  2803. self._title_regex_strings = {}
  2804. self._title_regexes = {}
  2805. def build_all_title_node_dicts(self):
  2806. # Rework get_index_forest() code here to only run once
  2807. # simple texts
  2808. forest = [i.nodes for i in IndexSet() if not i.is_commentary()]
  2809. self._title_node_maps = {lang : {} for lang in self.langs}
  2810. for tree in forest:
  2811. try:
  2812. for lang in self.langs:
  2813. self._title_node_maps[lang].update(tree.title_dict(lang))
  2814. except IndexSchemaError as e:
  2815. logger.error(u"Error in generating title node dictionary: {}".format(e))
  2816. # commentary
  2817. commentary_forest = [self.get_index(i).nodes for i in self.get_commentary_version_titles()]
  2818. self._title_node_with_commentary_maps = { lang: self._title_node_maps[lang].copy() for lang in self.langs }
  2819. for tree in commentary_forest:
  2820. try:
  2821. for lang in self.langs:
  2822. self._title_node_with_commentary_map[lang].update(tree.title_dict(lang))
  2823. except IndexSchemaError as e:
  2824. logger.error(u"Error in generating title node dictionary: {}".format(e))
  2825. #todo: the for_js path here does not appear to be in use.
  2826. def all_titles_regex_string(self, lang="en", commentary=False, with_commentary=False, with_terms=False): #, for_js=False):
  2827. """
  2828. :param lang: "en" or "he"
  2829. :param commentary: If true matches ONLY commentary records
  2830. :param with_commentary: If true, overrides `commentary` argument and matches BOTH "x on y" style records and simple records
  2831. Note that matching behavior differs between commentary=True and with_commentary=True.
  2832. commentary=True matches 'title', 'commentor' and 'commentee' named groups.
  2833. with_commentary=True matches only 'title', wether for plain records or commentary records.
  2834. :param with_terms:
  2835. :param for_js:
  2836. :return:
  2837. """
  2838. if lang == "he" and (commentary or with_commentary):
  2839. raise InputError("No support for Hebrew Commentatory Ref Objects")
  2840. key = lang
  2841. key += "_both" if with_commentary else "_commentary" if commentary else ""
  2842. key += "_terms" if with_terms else ""
  2843. re_string = self._title_regex_strings.get(key)
  2844. if not re_string:
  2845. re_string = u""
  2846. simple_books = map(re.escape, self.full_title_list(lang, with_commentators=False, with_commentary=with_commentary, with_terms=with_terms))
  2847. simple_book_part = ur'|'.join(sorted(simple_books, key=len, reverse=True)) # Match longer titles first
  2848. # re_string += ur'(?:^|[ ([{>,-]+)' if for_js else u'' # Why don't we check for word boundaries internally as well?
  2849. # re_string += ur'(?:\u05d5?(?:\u05d1|\u05de|\u05dc|\u05e9|\u05d8|\u05d8\u05e9)?)' if for_js and lang == "he" else u'' # likewise leading characters in Hebrew?
  2850. # re_string += ur'(' if for_js else
  2851. re_string = ur'(?P<title>'
  2852. if not commentary:
  2853. re_string += simple_book_part
  2854. else:
  2855. first_part = ur'|'.join(map(re.escape, self.get_commentator_titles(with_variants=True)))
  2856. # if for_js:
  2857. # re_string += ur"(" + first_part + ur") on (" + simple_book_part + ur")"
  2858. # else:
  2859. re_string += ur"(?P<commentor>" + first_part + ur") on (?P<commentee>" + simple_book_part + ur")"
  2860. re_string += ur')'
  2861. re_string += ur'($|[:., <]+)'
  2862. self._title_regex_strings[key] = re_string
  2863. return re_string
  2864. #WARNING: Do NOT put the compiled re2 object into redis. It gets corrupted.
  2865. def all_titles_regex(self, lang="en", commentary=False, with_commentary=False, with_terms=False):
  2866. """
  2867. :return: A regular expression object that will match any known title in the library in the provided language
  2868. :param lang: "en" or "he"
  2869. :param bool commentary: Default False.
  2870. If True, matches "X on Y" style commentary records only.
  2871. If False matches simple records only.
  2872. :param with_commentary: If true, overrides `commentary` argument and matches BOTH "x on y" style records and simple records
  2873. Note that matching behavior differs between commentary=True and with_commentary=True.
  2874. commentary=True matches 'title', 'commentor' and 'commentee' named groups.
  2875. with_commentary=True matches only 'title', wether for plain records or commentary records.
  2876. :param bool with_terms: Default False. If True, include shared titles ('terms')
  2877. :raise: InputError: if lang == "he" and commentary == True
  2878. Uses re2 if available. See https://github.com/Sefaria/Sefaria-Project/wiki/Regular-Expression-Engines
  2879. """
  2880. key = "all_titles_regex_" + lang
  2881. key += "_both" if with_commentary else "_commentary" if commentary else ""
  2882. key += "_terms" if with_terms else ""
  2883. reg = self._title_regexes.get(key)
  2884. if not reg:
  2885. re_string = self.all_titles_regex_string(lang, commentary, with_commentary, with_terms)
  2886. try:
  2887. reg = re.compile(re_string, max_mem=256 * 1024 * 1024)
  2888. except TypeError:
  2889. reg = re.compile(re_string)
  2890. self._title_regexes[key] = reg
  2891. return reg
  2892. def full_title_list(self, lang="en", with_commentators=True, with_commentary=False, with_terms=False):
  2893. """
  2894. :return: list of strings of all possible titles
  2895. :param lang: "he" or "en"
  2896. :param with_commentators: if True, includes the commentator names, with variants, but not the cross-product with books.
  2897. :param with_commentary: if True, includes all existing "X on Y" type commentary records
  2898. :param with_terms: if True, includes shared titles ('terms')
  2899. """
  2900. key = lang
  2901. key += "_commentators" if with_commentators else ""
  2902. key += "_commentary" if with_commentary else ""
  2903. key += "_terms" if with_terms else ""
  2904. # titles = scache.get_cache_elem(key)
  2905. titles = self._full_title_lists.get(key)
  2906. if not titles:
  2907. titles = self.get_title_node_dict(lang, with_commentary=with_commentary).keys()
  2908. if with_terms:
  2909. titles += self.get_term_dict(lang).keys()
  2910. if with_commentators:
  2911. titles += self.get_commentator_titles(lang, with_variants=True)
  2912. self._full_title_lists[key] = titles
  2913. # scache.set_cache_elem(key, titles)
  2914. return titles
  2915. def ref_list(self):
  2916. """
  2917. :return: list of all section-level Refs in the library
  2918. """
  2919. from version_state import VersionStateSet
  2920. return VersionStateSet().all_refs()
  2921. def get_term_dict(self, lang="en"):
  2922. """
  2923. :return: dict of shared titles that have an explicit ref
  2924. :param lang: "he" or "en"
  2925. """
  2926. # key = "term_dict_" + lang
  2927. # term_dict = self.local_cache.get(key)
  2928. term_dict = self._term_ref_maps.get(lang)
  2929. # if not term_dict:
  2930. # term_dict = scache.get_cache_elem(key)
  2931. # self.local_cache[key] = term_dict
  2932. if not term_dict:
  2933. term_dict = {}
  2934. terms = TermSet({"$and":[{"ref": {"$exists":True}},{"ref":{"$nin":["",[]]}}]})
  2935. for term in terms:
  2936. for title in term.get_titles(lang):
  2937. term_dict[title] = term.ref
  2938. # scache.set_cache_elem(key, term_dict)
  2939. # self.local_cache[key] = term_dict
  2940. self._term_ref_maps[lang] = term_dict
  2941. return term_dict
  2942. #todo: no usages?
  2943. def get_content_nodes(self, with_commentary=False):
  2944. """
  2945. :return: list of all content nodes in the library
  2946. :param bool with_commentary: If True, returns "X on Y" type titles as well
  2947. """
  2948. nodes = []
  2949. forest = self.get_index_forest(with_commentary=with_commentary)
  2950. for tree in forest:
  2951. nodes += tree.get_leaf_nodes()
  2952. return nodes
  def get_index_forest(self, with_commentary=False):
      """
      Build the list of root nodes, one per Index record.

      Every record of ``IndexSet()`` that is not a commentary contributes its
      ``nodes``.  With ``with_commentary`` set, each commentary version title
      is additionally resolved through ``get_index`` and its ``nodes``
      appended; titles that raise ``BookNameError`` are skipped.

      :param bool with_commentary: If True, returns "X on Y" type titles as well
      :return: list of root Index nodes.
      """
      #todo: speed: does it matter that this skips the index cache?
      forest = []
      for index in IndexSet():
          if index.is_commentary():
              continue
          forest.append(index.nodes)

      if not with_commentary:
          return forest

      for commentary_title in self.get_commentary_version_titles():
          try:
              forest.append(self.get_index(commentary_title).nodes)
          except BookNameError:
              # TEMPORARY - filter out complex texts
              continue
      return forest
  def get_title_node_dict(self, lang="en", with_commentary=False):
      """
      Return the title -> node dictionary for one language.

      Two per-language caches are kept on the instance:
      ``_title_node_with_commentary_maps`` (used when ``with_commentary`` is
      True) and ``_title_node_maps`` (used otherwise).  When the selected
      cache has no non-empty entry for ``lang``, the dictionary is rebuilt
      from ``get_index_forest`` and stored back in that same cache.

      :param lang: "he" or "en"
      :param bool with_commentary: if true, includes "X on Y" types nodes
      :return: dictionary of string titles and the nodes that they point to.
          Does not include bare commentator names, like *Rashi*.
      """
      #//TODO: mark for commentary refactor
      if with_commentary:
          cache = self._title_node_with_commentary_maps
      else:
          cache = self._title_node_maps

      title_dict = cache.get(lang)
      #todo: Keep this path, or isolate creation to __init__ and refresh methods?
      if not title_dict:
          # Start from a fresh dict: the cache lookup yields None on a miss.
          title_dict = {}
          for tree in self.get_index_forest(with_commentary=with_commentary):
              try:
                  title_dict.update(tree.title_dict(lang))
              except IndexSchemaError as e:
                  # A broken schema should not prevent the other trees from loading.
                  logger.error(u"Error in generating title node dictionary: {}".format(e))
          cache[lang] = title_dict
      return title_dict
  2997. #todo: handle terms
  def get_schema_node(self, title, lang=None, with_commentary=False):
      """
      Look up the node registered under ``title`` in the title/node dictionary.

      Underscores in ``title`` are treated as spaces.  When ``lang`` is not
      given it is chosen with ``is_hebrew(title)``.

      :param string title:
      :param lang: "en" or "he"
      :param bool with_commentary: passed through to ``get_title_node_dict``
      :return: a particular SchemaNode that matches the provided title and
          language, or None when the title is not in the dictionary
      :rtype: :class:`sefaria.model.schema.SchemaNode`
      """
      if not lang:
          if is_hebrew(title):
              lang = "he"
          else:
              lang = "en"
      spaced_title = title.replace("_", " ")
      nodes_by_title = self.get_title_node_dict(lang, with_commentary=with_commentary)
      return nodes_by_title.get(spaced_title)
  def get_text_titles_json(self, lang="en"):
      """
      Return the full title list (commentary included) serialized as JSON.

      The serialized string is kept per language in
      ``self._full_title_list_jsons`` and reused while it is non-empty.

      :param lang: language passed to ``full_title_list``
      :return: JSON of full texts list, (cached)
      """
      cached = self._full_title_list_jsons.get(lang)
      if cached:
          return cached

      titles = self.full_title_list(lang=lang, with_commentary=True)
      serialized = json.dumps(titles)
      self._full_title_list_jsons[lang] = serialized
      return serialized
  def get_text_categories(self):
      """
      Return the distinct values of the "categories" field over all Index records.

      :return: List of all known text categories.
      """
      every_index = IndexSet()
      return every_index.distinct("categories")
  def get_indexes_in_category(self, category, include_commentary=False, full_records=False):
      """
      Find the Index records filed under ``category``.

      :param string category: Name of category
      :param bool include_commentary: If true includes records of Commentary and Targum;
          otherwise records that also carry the "Commentary", "Commentary2"
          or "Targum" category are excluded
      :param bool full_records: If True will return the actual :class:`IndexSet`,
          otherwise just the titles
      :return: :class:`IndexSet` of :class:`Index` records in the specified category,
          or the distinct "title" values of that set when ``full_records`` is False
      """
      if include_commentary:
          query = {"categories": category}
      else:
          excluded = ("Commentary", "Commentary2", "Targum")
          clauses = [{"categories": category}]
          clauses.extend({"categories": {"$ne": name}} for name in excluded)
          query = {"$and": clauses}

      matching = IndexSet(query)
      if full_records:
          return matching
      return matching.distinct("title")
  def get_commentator_titles(self, lang="en", with_variants=False, with_commentary2=False):
      """
      List commentator names taken from the Index records.

      Names come from the records whose first category is "Commentary".
      With ``with_commentary2`` the records whose first category is
      "Commentary2" are added, keeping only the part of each value that
      precedes " on " (or its Hebrew counterpart).

      :param lang: "he" or "en"
      :param with_variants: If True, includes titles variants along with the primary titles.
      :param with_commentary2: If True, also derive names from "Commentary2" records
      :return: List of titles
      """
      #//TODO: mark for commentary refactor
      field_for = {
          ("en", False): "title",
          ("en", True): "titleVariants",
          ("he", False): "heTitle",
          ("he", True): "heTitleVariants",
      }
      commentary_indexes = IndexSet({"categories.0": "Commentary"})
      field = field_for[(lang, with_variants)]
      names = commentary_indexes.distinct(field)

      if with_commentary2:
          derived = []
          for full_title in IndexSet({"categories.0": "Commentary2"}).distinct(field):
              before_on = full_title.split(" on ")[0]
              derived.append(before_on.split(u" על ")[0])
          names = names + derived
      return names
  def get_commentary_versions(self, commentators=None, with_commentary2=False):
      """
      Find the Version records of commentaries.

      Versions are matched by title against ``^(<name>|<name>|...) on ``.
      With ``with_commentary2`` the titles of "Commentary2" Index records are
      matched as well, so that texts without an "X on Y" title (e.g.,
      "Rambam's Introduction to the Mishnah") are included.

      :param string|list commentators: A single commentator name, or a list of commentator names.
      :param bool with_commentary2: If True, also include Commentary2 texts
      :return: :class:`VersionSet` of :class:`Version` records for the specified commentators
          If no commentators are provided, all commentary Versions will be returned.
      """
      if isinstance(commentators, basestring):
          commentators = [commentators]

      # Remember what the caller asked for before falling back to the full
      # commentator list; the Commentary2 lookup below depends on it.
      requested = list(commentators) if commentators else []
      names = requested or self.get_commentator_titles(with_commentary2=with_commentary2)

      # Names are literal text, not patterns: escape them before building the regex.
      alternatives = "|".join(regex.escape(name) for name in names)
      commentary_re = u"^({}) on ".format(alternatives)
      query = {"title": {"$regex": commentary_re}}

      if with_commentary2:
          # Handle Commentary2 texts that don't have "X on Y" titles (e.g., "Rambam's Introduction to the Mishnah")
          commentary2_query = {"categories.0": "Commentary2"}
          if requested:
              # Only an explicit request narrows the Commentary2 titles.
              commentary2_query["categories.2"] = {"$in": requested}
          titles = IndexSet(commentary2_query).distinct("title")
          query = {"$or": [query, {"title": {"$in": titles}}]}
      return VersionSet(query)
  def get_commentary_version_titles(self, commentators=None, with_commentary2=False):
      """
      Return the distinct titles of the Versions found by ``get_commentary_versions``.

      :param string|list commentators: A single commentator name, or a list of commentator names.
      :param bool with_commentary2: passed through to ``get_commentary_versions``
      :return: list of titles of :class:`Version` records for the specified commentators
          If no commentators are provided, all commentary Versions will be returned.
      """
      versions = self.get_commentary_versions(commentators, with_commentary2=with_commentary2)
      return versions.distinct("title")
  def get_commentary_versions_on_book(self, book=None, with_commentary2=False):
      """
      Find the Version records of commentaries written on one book.

      Versions are matched by title against
      ``^(<commentator>|<commentator>|...) on <book>``.  The pattern is not
      anchored at the end, so longer titles starting with ``<book>`` match too.

      :param string book: The primary name of a book
      :param bool with_commentary2: passed through to ``get_commentator_titles``
      :return: :class:`VersionSet` of :class:`Version` records that comment on the provided book
      :raises AssertionError: if ``book`` is empty or None
      """
      assert book
      commentators = self.get_commentator_titles(with_commentary2=with_commentary2)
      # Titles are literal text, not patterns: escape them so that characters
      # such as "(" or "." in a name cannot change or break the regex.
      alternatives = "|".join(regex.escape(name) for name in commentators)
      commentary_re = u"^({}) on {}".format(alternatives, regex.escape(book))
      return VersionSet({"title": {"$regex": commentary_re}})
  def get_commentary_version_titles_on_book(self, book, with_commentary2=False):
      """
      Return the distinct titles of the Versions found by ``get_commentary_versions_on_book``.

      :param string book: The primary name of a book
      :param bool with_commentary2: passed through to ``get_commentary_versions_on_book``
      :return: list of titles of :class:`Version` records that comment on the provided book
      """
      versions = self.get_commentary_versions_on_book(book, with_commentary2=with_commentary2)
      return versions.distinct("title")
  def get_titles_in_string(self, s, lang=None):
      """
      Returns the titles found in the string.

      The string is scanned with the pattern from ``all_titles_regex`` and
      the ``title`` group of every match is collected, in match order.

      :param s: The string to search
      :param lang: "en" or "he"; chosen with ``is_hebrew(s)`` when not given
      :return list: titles found in the string (None for any other ``lang``)
      """
      if not lang:
          lang = "he" if is_hebrew(s) else "en"

      if lang == "en":
          #todo: combine into one regex
          title_pattern = self.all_titles_regex(lang, with_commentary=True)
      elif lang == "he":
          title_pattern = self.all_titles_regex(lang, commentary=False)
      else:
          return None

      found = []
      for hit in title_pattern.finditer(s):
          found.append(hit.group('title'))
      return found
  def get_refs_in_string(self, st, lang=None):
      """
      Returns a list of Ref objects derived from string

      For Hebrew, each distinct title found in the string is handed to
      ``_build_all_refs_from_string`` together with the whole string.  For
      any other language, every title match is handed to
      ``_build_ref_from_string`` together with the string sliced from the
      start of that match.  Titles whose builder raises ``AssertionError``
      (and, on the non-Hebrew path, ``InputError``) are logged and skipped.

      :param string st: the input string
      :param lang: "he" or "en"; chosen with ``is_hebrew(st)`` when None
      :return: list of :class:`Ref` objects
      """
      # todo: only match titles of content nodes
      found = []
      if lang is None:
          lang = "he" if is_hebrew(st) else "en"

      if lang != "he":  # lang == "en"
          title_pattern = self.all_titles_regex(lang, with_commentary=True)
          for hit in title_pattern.finditer(st):
              title = hit.group('title')
              if not title:
                  continue
              remainder = st[hit.start():]  # Slice string from title start
              try:
                  built = self._build_ref_from_string(title, remainder)
              except AssertionError:
                  logger.info(u"Skipping Schema Node: {}".format(title))
              except InputError as e:
                  logger.info(u"Input Error searching for refs in string: {}".format(e))
              else:
                  found.extend(built)
          return found

      # Each title is processed once, however often it occurs in the string.
      distinct_titles = dict.fromkeys(self.get_titles_in_string(st, lang), 1)
      for title in distinct_titles:
          try:
              built = self._build_all_refs_from_string(title, st)
          except AssertionError:
              logger.info(u"Skipping Schema Node: {}".format(title))
          else:
              found.extend(built)
      return found
  3143. # do we want to move this to the schema node? We'd still have to pass the title...
  def get_regex_string(self, title, lang, for_js=False):
      """
      Build the (uncompiled) regular expression string that matches a
      reference to ``title``.

      For English, or whenever ``for_js`` is set, the string comes from
      ``node.full_regex``.  For Hebrew it is a verbose-mode pattern that only
      matches a title and address found between an opening "(" or "{" and a
      closing ")" or "}".

      :param title: title used to look up the schema node
      :param lang: "en" or "he"
      :param for_js: build a pattern without look behinds
      :return: regex string (None for a ``lang`` other than "en"/"he" when
          ``for_js`` is false)
      :raises AssertionError: if the node found for ``title`` is not a JaggedArrayNode
      """
      node = self.get_schema_node(title, lang, with_commentary=True)
      assert isinstance(node, JaggedArrayNode)  # Assumes that node is a JaggedArrayNode

      if lang == "en" or for_js:  # Javascript doesn't support look behinds.
          return node.full_regex(title, lang, for_js=for_js, match_range=for_js,
                                 compiled=False, anchored=(not for_js))
      if lang != "he":
          return None

      # The literals below are not raw strings, hence the doubled backslash.
      lookbehind = u"""(?<= # look behind for opening brace
          [({] # literal '(', brace,
          [^})]* # anything but a closing ) or brace
          )
          """
      lookahead = u"""
          (?=\\W|$) # look ahead for non-word char
          (?= # look ahead for closing brace
          [^({]* # match of anything but an opening '(' or brace
          [)}] # zero-width: literal ')' or brace
          )"""
      return (lookbehind
              + regex.escape(title)
              + node.after_title_delimiter_re
              + node.address_regex(lang, for_js=for_js, match_range=for_js)
              + lookahead)
  3160. #todo: handle ranges in inline refs
  def _build_ref_from_string(self, title=None, st=None, lang="en"):
      """
      Build a Ref object given a title and a string.  The title is assumed to be at position 0 in the string.
      This is used primarily for English matching.  Hebrew matching is done with _build_all_refs_from_string()

      :param title: The title used in the text to refer to this Index node
      :param st: The source text for this reference
      :param lang: language used for the node lookup, the regex and address parsing
      :return: list holding the single Ref built from the match; an empty
          list when the regex cannot be created, nothing matches at the
          start of ``st``, or Ref construction raises InputError
      :raises AssertionError: if the node found for ``title`` is not a JaggedArrayNode
      """
      node = self.get_schema_node(title, lang, with_commentary=True)
      assert isinstance(node, JaggedArrayNode)  # Assumes that node is a JaggedArrayNode

      try:
          pattern = self.get_regex_string(title, lang)
      except AttributeError as e:
          logger.warning(u"Library._build_ref_from_string() failed to create regex for: {}.  {}".format(title, e))
          return []

      matched = regex.compile(pattern, regex.VERBOSE).match(st)
      if not matched:
          return []

      # Named groups a0, a1, ... hold the address captured at each depth.
      captured = matched.groupdict()
      sections = []
      for level in range(node.depth):
          raw_address = captured.get(u"a{}".format(level))
          if raw_address is not None:
              sections.append(node._addressTypes[level].toNumber(lang, raw_address))

      ref_fields = {
          "tref": matched.group(),
          "book": node.full_title("en"),
          "index_node": node,
          "index": node.index,
          "type": node.index.categories[0],
          "sections": sections,
          "toSections": sections
      }
      try:
          return [Ref(_obj=ref_fields)]
      except InputError:
          return []
  3200. #todo: handle ranges in inline refs
  def _build_all_refs_from_string(self, title=None, st=None, lang="he"):
      """
      Build all Ref objects for title found in string.  By default, only match what is found between braces (as in Hebrew).
      This is used primarily for Hebrew matching.  English matching uses _build_ref_from_string()

      :param title: The title used in the text to refer to this Index node
      :param st: The source text for this reference
      :param lang: language used for the node lookup, the regex and address parsing
      :return: list of Refs, one per match for which Ref construction
          succeeds; empty when the regex cannot be created
      :raises AssertionError: if the node found for ``title`` is not a JaggedArrayNode
      """
      node = self.get_schema_node(title, lang)
      assert isinstance(node, JaggedArrayNode)  # Assumes that node is a JaggedArrayNode

      built = []
      try:
          pattern = self.get_regex_string(title, lang)
      except AttributeError as e:
          logger.warning(u"Library._build_all_refs_from_string() failed to create regex for: {}.  {}".format(title, e))
          return built

      compiled = regex.compile(pattern, regex.VERBOSE)
      for hit in compiled.finditer(st):
          # Named groups a0, a1, ... hold the address captured at each depth.
          captured = hit.groupdict()
          sections = []
          for level in range(node.depth):
              raw_address = captured.get(u"a{}".format(level))
              if raw_address is not None:
                  sections.append(node._addressTypes[level].toNumber(lang, raw_address))

          ref_fields = {
              "tref": hit.group(),
              "book": node.full_title("en"),
              "index_node": node,
              "index": node.index,
              "type": node.index.categories[0],
              "sections": sections,
              "toSections": sections
          }
          try:
              built.append(Ref(_obj=ref_fields))
          except InputError:
              # Matches that do not form a valid Ref are skipped.
              continue
      return built
  # Module-level Library instance, created when this module is imported.
  library = Library()