# -*- coding: utf-8 -*-
"""
summaries.py - create and manage Table of Contents document for all texts

Writes to MongoDB Collection: summaries
"""
import json
from datetime import datetime
from pprint import pprint

import sefaria.system.cache as scache
from sefaria.system.database import db
from sefaria.utils.hebrew import hebrew_term
from model import *
from sefaria.system.exceptions import BookNameError

# Giant list ordering of categories.
# Indentation and inclusion of duplicate categories (like "Seder Moed")
# is for readability only. The table of contents will follow this structure.
ORDER = [
    "Tanach",
        "Torah",
            "Genesis",
            "Exodus",
            "Leviticus",
            "Numbers",
            "Deuteronomy",
        "Prophets",
        "Writings",
        "Targum",
            'Onkelos Genesis',
            'Onkelos Exodus',
            'Onkelos Leviticus',
            'Onkelos Numbers',
            'Onkelos Deuteronomy',
            'Targum Jonathan on Genesis',
            'Targum Jonathan on Exodus',
            'Targum Jonathan on Leviticus',
            'Targum Jonathan on Numbers',
            'Targum Jonathan on Deuteronomy',
    "Mishnah",
        "Seder Zeraim",
        "Seder Moed",
        "Seder Nashim",
        "Seder Nezikin",
        "Seder Kodashim",
        "Seder Tahorot",
    "Tosefta",
        "Seder Zeraim",
        "Seder Moed",
        "Seder Nashim",
        "Seder Nezikin",
        "Seder Kodashim",
        "Seder Tahorot",
    "Talmud",
        "Bavli",
            "Seder Zeraim",
            "Seder Moed",
            "Seder Nashim",
            "Seder Nezikin",
            "Seder Kodashim",
            "Seder Tahorot",
        "Yerushalmi",
            "Seder Zeraim",
            "Seder Moed",
            "Seder Nashim",
            "Seder Nezikin",
            "Seder Kodashim",
            "Seder Tahorot",
        "Rif",
    "Midrash",
        "Aggadic Midrash",
            "Midrash Rabbah",
        "Halachic Midrash",
    "Halakhah",
        "Mishneh Torah",
            'Introduction',
            'Sefer Madda',
            'Sefer Ahavah',
            'Sefer Zemanim',
            'Sefer Nashim',
            'Sefer Kedushah',
            'Sefer Haflaah',
            'Sefer Zeraim',
            'Sefer Avodah',
            'Sefer Korbanot',
            'Sefer Taharah',
            'Sefer Nezikim',
            'Sefer Kinyan',
            'Sefer Mishpatim',
            'Sefer Shoftim',
        "Shulchan Arukh",
    "Kabbalah",
        "Zohar",
    'Liturgy',
        'Siddur',
        'Piyutim',
    'Philosophy',
    'Parshanut',
    'Chasidut',
    'Musar',
    'Responsa',
        "Rashba",
        "Rambam",
    'Apocrypha',
    'Elucidation',
    'Modern Works',
    'Other',
]

REORDER_RULES = {
    "Commentary2": ["Commentary"],
}
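
# REORDER_RULES maps a top-level category to the category list that should
# replace it when building the ToC. With the rule above, an index whose
# categories begin with "Commentary2" is re-filed under "Commentary", e.g.
# (illustrative example) ["Commentary2", "Rashi"] -> ["Commentary", "Rashi"].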


def get_toc():
    """
    Returns table of contents object from cache,
    DB or by generating it, as needed.
    """
    toc_cache = scache.get_cache_elem('toc_cache')
    if toc_cache:
        return toc_cache

    toc = get_toc_from_db()
    if toc:
        save_toc(toc)
        return toc

    return update_table_of_contents()


def get_toc_json():
    """
    Returns JSON representation of TOC.
    """
    toc_json = scache.get_cache_elem('toc_json_cache')
    if toc_json:
        return toc_json
    toc = get_toc()
    toc_json = json.dumps(toc)
    scache.set_cache_elem('toc_json_cache', toc_json, 600000)
    return toc_json


def save_toc(toc):
    """
    Saves the table of contents object to in-memory cache,
    invalidates texts_list cache.
    """
    scache.set_cache_elem('toc_cache', toc, 600000)
    scache.delete_template_cache("texts_list")
    scache.delete_template_cache("texts_dashboard")
    library.local_cache.pop("category_id_dict", None)


def get_toc_from_db():
    """
    Retrieves the table of contents stored in MongoDB.
    """
    toc = db.summaries.find_one({"name": "toc"})
    return toc["contents"] if toc else None


def save_toc_to_db():
    """
    Saves table of contents to MongoDB.
    (This write can be slow.)
    """
    db.summaries.remove()
    toc_doc = {
        "name": "toc",
        "contents": scache.get_cache_elem('toc_cache'),
        "dateSaved": datetime.now(),
    }
    db.summaries.save(toc_doc)
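
# Note: db.summaries.remove() and db.summaries.save() are legacy PyMongo
# collection methods (removed in PyMongo 4.x); under a modern driver the
# equivalent calls would be delete_many({}) and insert_one(toc_doc).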


def update_table_of_contents():
    """
    Rebuilds the table of contents from every Index record,
    saves it to cache and DB, and returns it.
    """
    toc = []
    sparseness_dict = get_sparesness_lookup()

    # Add an entry for every text we know about
    indices = IndexSet()
    for i in indices:
        if i.is_commentary() or i.categories[0] == "Commentary2":
            # Special case commentary below
            continue
        if i.categories[0] in REORDER_RULES:
            i.categories = REORDER_RULES[i.categories[0]] + i.categories[1:]
        if i.categories[0] not in ORDER:
            i.categories.insert(0, "Other")
        node = get_or_make_summary_node(toc, i.categories)
        text = i.toc_contents()
        text["sparseness"] = sparseness_dict[text["title"]]
        node.append(text)

    # Special handling to list available commentary texts
    commentary_texts = library.get_commentary_version_titles(with_commentary2=True)
    for c in commentary_texts:
        try:
            i = library.get_index(c)
        except BookNameError:
            continue

        if i.categories[0] in REORDER_RULES:
            cats = REORDER_RULES[i.categories[0]] + i.categories[1:]
        else:
            cats = i.categories[:]

        text = i.toc_contents()
        text["sparseness"] = sparseness_dict[text["title"]]

        cats[0], cats[1] = cats[1], cats[0]  # Swap "Commentary" with the top-level category (e.g., "Tanach")
        node = get_or_make_summary_node(toc, cats)
        node.append(text)

    # Recursively sort categories and texts
    toc = sort_toc_node(toc, recur=True)

    save_toc(toc)
    save_toc_to_db()

    return toc
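
# A sketch of the structure update_table_of_contents() produces (field names on
# text leaves come from Index.toc_contents() and may include more than shown):
#
#   [
#       {"category": "Tanach", "heCategory": ..., "contents": [
#           {"category": "Torah", "heCategory": ..., "contents": [
#               {"title": "Genesis", "sparseness": 4, ...},
#               ...
#           ]},
#           ...
#       ]},
#       ...
#   ]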


def update_summaries_on_delete(ref, toc=None):
    """
    Deletes a title from the ToC
    :param ref: really the title of a book in the ToC
    """
    toc = recur_delete_element_from_toc(ref, get_toc())
    save_toc(toc)
    save_toc_to_db()


def recur_delete_element_from_toc(ref, toc):
    for toc_elem in toc:
        # Base element, a text: check if the title matches.
        if 'title' in toc_elem:
            if toc_elem['title'] == ref:
                # If there is a match, flag this element for deletion.
                toc_elem['to_delete'] = True
        # Category: recurse, then prune.
        elif 'category' in toc_elem:
            # First go down the tree, dropping any children flagged for deletion.
            toc_elem['contents'][:] = [x for x in recur_delete_element_from_toc(ref, toc_elem['contents']) if not 'to_delete' in x]
            # On the way back up, flag the category itself if it is now empty.
            if not len(toc_elem['contents']):
                toc_elem['to_delete'] = True
    return toc
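
# recur_delete_element_from_toc() works by flagging rather than removing in
# place: a matching text gets 'to_delete' = True, each category filters out
# flagged children after recursing, and a category left empty is itself
# flagged so its parent removes it on the way back up.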


def update_summaries_on_change(bookname, old_ref=None, recount=True):
    """
    Update text summary docs to account for change or insertion of 'text'
    * recount - whether or not to perform a new count of available text
    """
    index = library.get_index(bookname)

    indx_dict = index.toc_contents()

    if recount:
        #counts.update_full_text_count(bookname)
        VersionState(bookname).refresh()
    toc = get_toc()
    resort_other = False

    if indx_dict["categories"][0] in REORDER_RULES:
        indx_dict["categories"] = REORDER_RULES[indx_dict["categories"][0]] + indx_dict["categories"][1:]

    if indx_dict["categories"][0] != "Commentary":
        if indx_dict["categories"][0] not in ORDER:
            indx_dict["categories"].insert(0, "Other")
            resort_other = True
        node = get_or_make_summary_node(toc, indx_dict["categories"])
        text = add_counts_to_index(indx_dict)
    else:
        commentator = indx_dict["commentator"]
        cats = [indx_dict["categories"][1], "Commentary", commentator]
        node = get_or_make_summary_node(toc, cats)
        text = add_counts_to_index(indx_dict)

    found = False
    test_title = old_ref or text["title"]
    for item in node:
        if item.get("title") == test_title:
            item.update(text)
            found = True
            break
    if not found:
        node.append(text)

    node[:] = sort_toc_node(node)

    # If a new category may have been added to "Other", resort the categories
    if resort_other:
        toc[-1]["contents"] = sort_toc_node(toc[-1]["contents"])

    save_toc(toc)
    save_toc_to_db()
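
# For a commentary index, the ToC path built above is
# [top-level category, "Commentary", commentator], so (illustrative example)
# a Rashi commentary on a Tanach book would be filed under
# ["Tanach", "Commentary", "Rashi"].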


def update_summaries():
    """
    Update all stored documents which summarize known and available texts
    """
    update_table_of_contents()
    scache.reset_texts_cache()


def get_or_make_summary_node(summary, nodes, contents_only=True):
    """
    Returns the node in 'summary' that is named by the list of categories in 'nodes',
    creates the node if it doesn't exist.
    Used recursively on sub-summaries.
    """
    if len(nodes) == 1:
        # Base case, only need to search through one level
        for node in summary:
            if node.get("category") == nodes[0]:
                return node["contents"] if contents_only else node
        # We didn't find it, so let's add it
        summary.append({"category": nodes[0], "heCategory": hebrew_term(nodes[0]), "contents": []})
        return summary[-1]["contents"] if contents_only else summary[-1]

    # Look for the first category, or add it, then recur
    for node in summary:
        if node.get("category") == nodes[0]:
            return get_or_make_summary_node(node["contents"], nodes[1:], contents_only=contents_only)
    summary.append({"category": nodes[0], "heCategory": hebrew_term(nodes[0]), "contents": []})
    return get_or_make_summary_node(summary[-1]["contents"], nodes[1:], contents_only=contents_only)
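
# Example (illustrative): get_or_make_summary_node(toc, ["Talmud", "Bavli", "Seder Moed"])
# walks or creates the nested nodes
#   {"category": "Talmud", ..., "contents": [{"category": "Bavli", ..., "contents": [
#       {"category": "Seder Moed", ..., "contents": []}]}]}
# and returns the innermost "contents" list, ready for text records to be appended.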


def get_sparesness_lookup():
    """
    Returns a dictionary mapping each title to its sparseness score,
    taking the greater of the English and Hebrew scores.
    """
    vss = db.vstate.find({}, {"title": 1, "content._en.sparseness": 1, "content._he.sparseness": 1})
    return {vs["title"]: max(vs["content"]["_en"]["sparseness"], vs["content"]["_he"]["sparseness"]) for vs in vss}
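
# Note: the dict comprehension above assumes every vstate document carries both
# content._en.sparseness and content._he.sparseness; a document missing either
# field would raise a KeyError here.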


def add_counts_to_index(indx_dict):
    """
    Returns a dictionary which decorates `indx_dict` with a sparseness score.
    """
    vs = StateNode(indx_dict["title"], meta=True)
    indx_dict["sparseness"] = max(vs.get_sparseness("he"), vs.get_sparseness("en"))
    return indx_dict


'''
#not currently used
#todo: category counts
def add_counts_to_category(cat, parents=[]):
    """
    Recursively annotate category 'cat' as well as any subcategories with count info.
    - parents - optionally specify parent categories so that, e.g., Seder Zeraim in Mishnah
    can be differentiated from Seder Zeraim in Talmud.

    Adds the fields to cat:
    * availableCounts
    * textComplete
    * percentAvailable
    * num_texts
    """
    cat_list = parents + [cat["category"]]

    # Recur on any subcategories
    for subcat in cat["contents"]:
        if "category" in subcat:
            add_counts_to_category(subcat, parents=cat_list)

    counts_doc = counts.get_category_count(cat_list) or counts.count_category(cat_list)
    cat.update(counts_doc)

    # Count texts in this category by summing sub counts and counting texts
    cat["num_texts"] = 0
    for item in cat["contents"]:
        if "category" in item:
            # add the count of a subcategory
            cat["num_texts"] += item["num_texts"]
        elif "title" in item:
            # add 1 for each individual text
            cat["num_texts"] += 1
'''


def node_sort_key(a):
    """
    Sort key for texts/categories, used by sort_toc_node() below.
    """
    if "category" in a:
        try:
            return ORDER.index(a["category"])
        except ValueError:
            return 'zz' + a["category"]
    elif "title" in a:
        try:
            return ORDER.index(a["title"])
        except ValueError:
            if "order" in a:
                return a["order"][0]
            else:
                return a["title"]

    return None
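
# node_sort_key() returns ints for entries found in ORDER and strings otherwise
# (prefixing unlisted categories with 'zz' so they land after alphabetized
# titles). Mixing int and str keys in a single sort relies on Python 2
# comparison rules, where ints always sort before strings; on Python 3 this
# would raise a TypeError.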


def node_sort_sparse(a):
    if "category" in a or "order" in a:
        # Keep categories or texts with explicit orders at top
        score = -4
    else:
        score = -a.get('sparseness', 1)

    return score


def sort_toc_node(node, recur=False):
    """
    Sort the texts and categories in node according to:
    1. the order of categories and texts listed in the global var 'ORDER'
    2. the 'order' field on a text
    3. alphabetically

    If 'recur', call sort_toc_node on each category in 'node' as well.
    """
    node = sorted(node, key=node_sort_key)
    node = sorted(node, key=node_sort_sparse)

    if recur:
        for cat in node:
            if "category" in cat:
                cat["contents"] = sort_toc_node(cat["contents"], recur=True)

    return node
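
# The two sorted() calls above form a stable two-key sort: the second pass
# groups entries by node_sort_sparse() (categories and explicitly ordered
# texts first, then remaining texts by descending 'sparseness' score), while
# within each group the ordering from the first pass by node_sort_key() is
# preserved, since Python's sort is stable.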


def get_texts_summaries_for_category(category):
    """
    Returns the list of text records in the table of contents corresponding to "category".
    """
    toc = get_toc()
    matched_category = find_category_node(category, toc)
    if matched_category:
        return extract_text_records_from_toc(matched_category["contents"])


def find_category_node(category, toc):
    """
    Returns the ToC node for 'category', searching 'toc' depth-first,
    or None if no category by that name is found.
    """
    matched_category_elem = None
    for elem in toc:
        if "category" in elem:
            if elem["category"] == category:
                matched_category_elem = elem
                break
            else:
                matched_category_elem = find_category_node(category, elem["contents"])
                if matched_category_elem:
                    break
    return matched_category_elem


def extract_text_records_from_toc(toc):
    """
    Returns a flat list of all text records in 'toc', descending into subcategories.
    """
    summary = []
    for elem in toc:
        if "category" in elem:
            summary += extract_text_records_from_toc(elem["contents"])
        else:
            summary += [elem]
    return summary


def flatten_toc(toc, include_categories=False, categories_in_titles=False, version_granularity=False):
    """
    Returns an array of strings which corresponds to each category and text in the
    Table of Contents in order.

    - categories_in_titles: whether to include each category preceding a text title,
        e.g., "Tanach > Torah > Genesis".
    - version_granularity: whether to include a separate entry for every text version.
    """
    results = []
    for x in toc:
        name = x.get("category", None) or x.get("title", None)
        if "category" in x:
            if include_categories:
                results += [name]
            subcats = flatten_toc(x["contents"], categories_in_titles=categories_in_titles)
            if categories_in_titles:
                subcats = ["%s > %s" % (name, y) for y in subcats]
            results += subcats

        elif "title" in x:
            if not version_granularity:
                results += [name]
            else:
                #versions = texts.get_version_list(name)
                versions = Ref(name).version_list()
                for v in versions:
                    lang = {"he": "Hebrew", "en": "English"}[v["language"]]
                    results += ["%s > %s > %s.json" % (name, lang, v["versionTitle"])]

    return results
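
# Example output shapes from flatten_toc() (version title is illustrative):
#   default:                    "Genesis"
#   categories_in_titles=True:  "Tanach > Torah > Genesis"
#   version_granularity=True:   "Genesis > English > Some Version Title.json"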


def category_id_dict(toc=None, cat_head="", code_head=""):
    """
    Returns a dict mapping 'Category/Subcategory/...' paths (and text titles)
    in the ToC to sortable position codes. When called with no arguments, the
    result for the full ToC is memoized in library.local_cache.
    """
    if toc is None:
        d = library.local_cache.get("category_id_dict")
        if not d:
            d = category_id_dict(get_toc())
            library.local_cache["category_id_dict"] = d
        return d

    d = {}
    for i, c in enumerate(toc):
        name = c["category"] if "category" in c else c["title"]
        if cat_head:
            key = "/".join([cat_head, name])
            val = code_head + format(i, '02')
        else:
            key = name
            val = "A" + format(i, '02')

        d[key] = val
        if "contents" in c:
            d.update(category_id_dict(c["contents"], key, val))

    return d
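
# Sketch of the mapping category_id_dict() builds (positions depend on the
# current ToC ordering, so the codes shown are illustrative):
#   {"Tanach": "A00", "Tanach/Torah": "A0000", "Tanach/Torah/Genesis": "A000000", ...}
# Top-level entries get "A" plus a two-digit index; each nesting level appends
# two more digits to its parent's code.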