hebrew.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531
  1. # -*- coding: utf-8 -*-
  2. """
  3. hebrew.py - functions relating to reading and generating Hebrew numerals.
  4. Issues:
  5. Numbers like 1 million are ambiguous
  6. Numbers like 2000 are ambiguous
  7. Okay to construct 15/16 and then make tet-vav/etc?
  8. """
  9. import re
  10. import regex
  11. import math
# Punctuation marks used when reading and writing Hebrew numerals.
# GERESH (U+05F3): separates thousands groups and follows a single-letter numeral.
GERESH = u"\u05F3"
# GERSHAYIM (U+05F4): inserted before the last letter of a multi-letter numeral.
GERSHAYIM = u"\u05F4"
  15. def heb_to_int(unicode_char):
  16. """Converts a single Hebrew unicode character into its Hebrew numerical equivalent."""
  17. hebrew_numerals = {
  18. u"\u05D0": 1,
  19. u"\u05D1": 2,
  20. u"\u05D2": 3,
  21. u"\u05D3": 4,
  22. u"\u05D4": 5,
  23. u"\u05D5": 6,
  24. u"\u05D6": 7,
  25. u"\u05D7": 8,
  26. u"\u05D8": 9,
  27. u"\u05D9": 10,
  28. u"\u05DB": 20,
  29. u"\u05DC": 30,
  30. u"\u05DE": 40,
  31. u"\u05E0": 50,
  32. u"\u05E1": 60,
  33. u"\u05E2": 70,
  34. u"\u05E4": 80,
  35. u"\u05E6": 90,
  36. u"\u05E7": 100,
  37. u"\u05E8": 200,
  38. u"\u05E9": 300,
  39. u"\u05EA": 400, # u"\u05F3": "'", # Hebrew geresh # u"\u05F4": '"', # Hebrew gershayim # u"'": "'",
  40. u"\u05DA": 20, # khaf sofit
  41. u"\u05DD": 40, # mem sofit
  42. u"\u05DF": 50, # nun sofit
  43. u"\u05E3": 80, # peh sofit
  44. u"\u05E5": 90, # tzadi sofit
  45. }
  46. if unicode_char not in hebrew_numerals.keys():
  47. raise KeyError, u"Invalid Hebrew numeral character {}".format(unicode_char)
  48. else:
  49. return hebrew_numerals[unicode_char]
  50. def split_thousands(n, littleendian=True):
  51. """
  52. Takes a string representing a Hebrew numeral, returns a tuple of the component thousands
  53. places. Requires a geresh (apostrophe or '\u05F3') to indicate thousands.
  54. Ignores single geresh at end for numbers < 10.
  55. Default returns the smallest thousands group first in the tuple (little-endian). Can be changed
  56. to big-endian by setting littleendian=False.
  57. """
  58. # Ignore geresh on digit < 10, if present
  59. if n[-1] == GERESH or n[-1] == "'":
  60. n = n[:-1]
  61. #assume that two single quotes in a row should be a double quote. '' -> "
  62. n = n.replace(GERESH, "'").replace("''", "\"")
  63. ret = n.split("'")
  64. if littleendian:
  65. return reversed(ret)
  66. else:
  67. return ret
  68. def heb_string_to_int(n):
  69. '''
  70. Takes a single thousands block of Hebrew characters, and returns the integer value of
  71. that set of characters, ignoring thousands order of magnitude.
  72. >>> heb_string_to_int(u'\u05ea\u05e9\u05e1\u05d3') # = u'תשסד'
  73. 764
  74. '''
  75. n = re.sub(u'[\u05F4"]', '', n) # remove gershayim
  76. return sum(map(heb_to_int, n))
  77. def decode_hebrew_numeral(n):
  78. """
  79. Takes any string representing a Hebrew numeral and returns it integer value.
  80. >>> decode_hebrew_numeral(u'ה׳תשס״ד')
  81. 5764
  82. """
  83. t = map(heb_string_to_int, split_thousands(n)) # split and convert to numbers
  84. t = map(lambda (E, num): pow(10, 3 * E) * num, enumerate(t)) # take care of thousands and add
  85. return sum(t)
  86. ########## ENCODING #############
  87. def chunks(l, n):
  88. """
  89. Yield successive n-sized chunks from l.
  90. """
  91. for i in xrange(0, len(l), n):
  92. yield l[i:i + n]
  93. def int_to_heb(integer):
  94. """
  95. Converts an integer that can be expressed by a single Hebrew character (1..9, 10..90, 100.400)
  96. and returns the Hebrew character that represents that integer.
  97. Also accepts values divisible by 100 from 500 to 1100.
  98. >> int_to_heb(10) #This fails as a doctest. The yud isn't seen as u'\u05d9'
  99. י
  100. >> int_to_heb(800) #TavTav is not seen as u'\u05ea\u05ea'
  101. תת
  102. """
  103. hebrew_numerals = {
  104. 0: u"",
  105. 1: u"\u05D0",
  106. 2: u"\u05D1",
  107. 3: u"\u05D2",
  108. 4: u"\u05D3",
  109. 5: u"\u05D4",
  110. 6: u"\u05D5",
  111. 7: u"\u05D6",
  112. 8: u"\u05D7",
  113. 9: u"\u05D8",
  114. 10: u"\u05D9",
  115. 15: u"\u05D8\u05D5", # Will not be hit when used with break_int_magnitudes
  116. 16: u"\u05D8\u05D6", # Will not be hit when used with break_int_magnitudes
  117. 20: u"\u05DB",
  118. 30: u"\u05DC",
  119. 40: u"\u05DE",
  120. 50: u"\u05E0",
  121. 60: u"\u05E1",
  122. 70: u"\u05E2",
  123. 80: u"\u05E4",
  124. 90: u"\u05E6",
  125. 100: u"\u05E7",
  126. 200: u"\u05E8",
  127. 300: u"\u05E9",
  128. 400: u"\u05EA",
  129. }
  130. # Fill in hebrew_numeral mappings up to 1100
  131. for num in range(500, 1200, 100):
  132. hebrew_numerals[num] = hebrew_numerals[400] * (num // 400) + hebrew_numerals[num % 400]
  133. if integer > 1100:
  134. raise KeyError, "Asked to convert individual integer {} above 1100; too large.".format(integer)
  135. else:
  136. return hebrew_numerals[integer]
  137. def break_int_magnitudes(n, start=None):
  138. """break_int_magnitudes(n, start=None)
  139. Accepts an integer and an optional integer (multiple of 10) for at what order of
  140. magnitude to start breaking apart the integer. If no option "start" is provided,
  141. function will determine the size of the input integer and start that the largest order
  142. of magnitude.
  143. Returns a big-endian list of the various orders of magnitude, by 10s, broken apart.
  144. >>> break_int_magnitudes(1129, 100)
  145. [1100, 20, 9]
  146. >>> break_int_magnitudes(2130)
  147. [2000, 100, 30, 0]
  148. >>> break_int_magnitudes(15000)
  149. [10000, 5000, 0, 0, 0]
  150. """
  151. if type(n) is not int:
  152. raise TypeError, "Argument 'n' must be int, {} provided.".format(type(n))
  153. # if n == 0:
  154. # return [0]
  155. # Set a default for 'start' if none specified
  156. if start is not None:
  157. if not (start % 10 == 0 or start == 1):
  158. raise TypeError, "Argument 'start' must be 1 or divisible by 10, {} provided.".format(start)
  159. else:
  160. start = 10 ** int(math.log10(n))
  161. if start == 1:
  162. return [n]
  163. else:
  164. return [n // start * start] + break_int_magnitudes(n - n // start * start, start=start / 10)
  165. def sanitize(input_string, punctuation=True):
  166. """sanitize(input_string, punctuation=True)
  167. Takes a Hebrew number input string and applies appropriate formatting and changes. This function
  168. includes any special cases, like 15 and 16.
  169. Optional addition of gershayim or geresh at end where appropriate with "punctuation" arg.
  170. Thousands geresh will be added regardless from previous functions.
  171. Note that high numbers may appear oddly due to lack of convention. For example,
  172. the sanitized version of 15000 will appear as טו׳.
  173. """
  174. # deal with 15 and 16
  175. # Should we support numbers like 15,000? Would that look like tet-vav-geresh?
  176. # if input_string[-2:] in (encode_small_hebrew_numeral(15), encode_small_hebrew_numeral(16)):
  177. # input_string = input_string[:-2] + int_to_heb(heb_string_to_int(input_string[-2:]))
  178. # This takes care of all instances of 15/16, even in the thousands
  179. replacement_pairs = (
  180. (u'\u05d9\u05d4', u'\u05d8\u05d5'), #15
  181. (u'\u05d9\u05d5', u'\u05d8\u05d6'), #16
  182. (u'\u05e8\u05e2\u05d4', u'\u05e2\u05e8\u05d4'), #275
  183. (u'\u05e8\u05e2\u05d1', u'\u05e2\u05e8\u05d1'), #272
  184. (u'\u05e8\u05e2', u'\u05e2\u05e8'), #270
  185. )
  186. for wrong, right in replacement_pairs:
  187. input_string = re.sub(wrong, right, input_string)
  188. if punctuation:
  189. # add gershayim at end
  190. if len(input_string) > 1:
  191. if GERESH not in input_string[-2:]:
  192. input_string = input_string[:-1] + GERSHAYIM + input_string[-1:]
  193. else:
  194. # or, add single geresh at end
  195. input_string += GERESH
  196. return input_string
  197. def encode_small_hebrew_numeral(n):
  198. """
  199. Takes an integer under 1200 and returns a string encoding it as a Hebrew numeral.
  200. """
  201. if n >= 1200:
  202. raise ValueError, "Tried to encode small numeral >= 1200."
  203. else:
  204. return u''.join(map(int_to_heb, break_int_magnitudes(n, 100)))
  205. def encode_hebrew_numeral(n, punctuation=True):
  206. """encode_hebrew_numeral(n, punctuation=True)
  207. Takes an integer and returns a string encoding it as a Hebrew numeral.
  208. Optional "punctuation" argument adds gershayim between last two characters
  209. or final geresh.
  210. Under 1200, will use taf-taf-shin, etc.
  211. Above 1200, will use aleph + geresh for thousands.
  212. This function is not intended for numbers 1,000,000 or more, as there is not currently
  213. an established convention and there can be ambiguity. This can be the same for numbers like
  214. 2000 (which would be displayed as bet-geresh) and should instead possibly use words, like "bet elef."
  215. """
  216. if n < 1200:
  217. ret = encode_small_hebrew_numeral(n)
  218. else:
  219. # Break into magnitudes, then break into thousands buckets, big-endian
  220. ret = list(chunks(list(reversed(break_int_magnitudes(n))), 3))
  221. # Eliminate the orders of magnitude in preparation for being encoded
  222. ret = map(lambda (x, y): int(sum(y) * pow(10, -3 * x)), enumerate(ret))
  223. # encode and join together, separating thousands with geresh
  224. ret = GERESH.join(map(encode_small_hebrew_numeral, reversed(ret)))
  225. ret = sanitize(ret, punctuation)
  226. return ret
  227. def encode_hebrew_daf(daf):
  228. """
  229. Turns a daf string ("21a") to a hebrew daf string ("כא.")
  230. """
  231. daf, amud = daf[:-1], daf[-1]
  232. amud_mark = {"a": ".", "b": ":"}[amud]
  233. return encode_hebrew_numeral(int(daf), punctuation=False) + amud_mark
  234. def strip_nikkud(rawString):
  235. return rawString.replace(r"[\u0591-\u05C7]", "");
  236. #todo: rewrite to handle edge case of hebrew words in english texts, and latin characters in Hebrew text
  237. def is_hebrew(s):
  238. if regex.search(u"\p{Hebrew}", s):
  239. return True
  240. return False
  241. def strip_cantillation(text, strip_vowels=False):
  242. if strip_vowels:
  243. strip_regex = re.compile(ur"[\u0591-\u05bd\u05bf-\u05c5\u05c7]", re.UNICODE)
  244. else:
  245. strip_regex = re.compile(ur"[\u0591-\u05af\u05bd\u05bf\u05c0\u05c4\u05c5]", re.UNICODE)
  246. return strip_regex.sub('', text)
  247. def has_cantillation(text, detect_vowels=False):
  248. if detect_vowels:
  249. rgx = re.compile(ur"[\u0591-\u05bd\u05bf-\u05c5\u05c7]", re.UNICODE)
  250. else:
  251. rgx = re.compile(ur"[\u0591-\u05af\u05bd\u05bf\u05c0\u05c4\u05c5]", re.UNICODE)
  252. return bool(rgx.search(text))
  253. def hebrew_plural(s):
  254. """
  255. Hebrew friendly plurals
  256. """
  257. known = {
  258. "Daf": "Dappim",
  259. "Mitzvah": "Mitzvot",
  260. "Mitsva": "Mitzvot",
  261. "Mesechet": "Mesechtot",
  262. "Perek": "Perokim",
  263. "Siman": "Simanim",
  264. "Seif": "Seifim",
  265. "Se'if": "Se'ifim",
  266. "Mishnah": "Mishnayot",
  267. "Mishna": "Mishnayot",
  268. "Chelek": "Chelekim",
  269. "Parasha": "Parshiot",
  270. "Parsha": "Parshiot",
  271. "Pasuk": "Psukim",
  272. "Midrash": "Midrashim",
  273. "Teshuva": "Teshuvot",
  274. "Aliyah": "Aliyot",
  275. "Tikun": "Tikunim",
  276. }
  277. return known[s] if s in known else str(s) + "s"
  278. def hebrew_term(s):
  279. """
  280. Simple translations for common Hebrew words
  281. """
  282. categories = {
  283. "Torah": u"תורה",
  284. "Tanach": u'תנ"ך',
  285. "Tanakh": u'תנ"ך',
  286. "Prophets": u"נביאים",
  287. "Writings": u"כתובים",
  288. "Commentary": u"מפרשים",
  289. "Targum": u"תרגומים",
  290. "Mishnah": u"משנה",
  291. "Tosefta": u"תוספתא",
  292. "Talmud": u"תלמוד",
  293. "Bavli": u"בבלי",
  294. "Yerushalmi": u"ירושלמי",
  295. "Rif": u'רי"ף',
  296. "Kabbalah": u"קבלה",
  297. "Halakha": u"הלכה",
  298. "Halakhah": u"הלכה",
  299. "Midrash": u"מדרש",
  300. "Aggadic Midrash": u"מדרש אגדה",
  301. "Halachic Midrash": u"מדרש הלכה",
  302. "Midrash Rabbah": u"מדרש רבה",
  303. "Responsa": u'שו"ת',
  304. "Other": u"אחר",
  305. "Siddur": u"סידור",
  306. "Liturgy": u"תפילה",
  307. "Piyutim": u"פיוטים",
  308. "Musar": u"ספרי מוסר",
  309. "Chasidut": u"חסידות",
  310. "Parshanut": u"פרשנות",
  311. "Philosophy": u"מחשבת ישראל",
  312. "Maharal": u'מהר"ל מפראג',
  313. "Apocrypha": u"ספרים חיצונים",
  314. "Seder Zeraim": u"סדר זרעים",
  315. "Seder Moed": u"סדר מועד",
  316. "Seder Nashim": u"סדר נשים",
  317. "Seder Nezikin": u"סדר נזיקין",
  318. "Seder Kodashim": u"סדר קדשים",
  319. "Seder Toharot": u"סדר טהרות",
  320. "Seder Tahorot": u"סדר טהרות",
  321. "Dictionary": u"מילון",
  322. "Early Jewish Thought": u"מחשבת ישראל קדומה",
  323. "Minor Tractates": u"מסכתות קטנות",
  324. "Rosh": u'רא"ש',
  325. "Maharsha": u'מהרשא',
  326. "Rashba": u'רשב"א',
  327. "Rambam": u'רמב"ם',
  328. "Radbaz": u'רדב"ז',
  329. "Tosafot Yom Tov": u"תוספות יום טוב",
  330. "Chidushei Halachot": u"חידושי הלכות",
  331. "Chidushei Agadot": u"חידושי אגדות",
  332. "Tiferet Shmuel": u"תפארת שמואל",
  333. "Korban Netanel": u"קרבן נתנאל",
  334. "Pilpula Charifta": u"פילפולא חריפתא",
  335. "Divrey Chamudot": u"דברי חמודות",
  336. "Maadaney Yom Tov": u"מעדני יום טוב",
  337. "Modern Works": u"יצירות מודרניות",
  338. }
  339. pseudo_categories = {
  340. "Mishneh Torah": u"משנה תורה",
  341. 'Introduction': u"הקדמה",
  342. 'Sefer Madda': u"ספר מדע",
  343. 'Sefer Ahavah': u"ספר אהבה",
  344. 'Sefer Zemanim': u"ספר זמנים",
  345. 'Sefer Nashim': u"ספר נשים",
  346. 'Sefer Kedushah': u"ספר קדושה",
  347. 'Sefer Haflaah': u"ספר הפלאה",
  348. 'Sefer Zeraim': u"ספר זרעים",
  349. 'Sefer Avodah': u"ספר עבודה",
  350. 'Sefer Korbanot': u"ספר קורבנות",
  351. 'Sefer Taharah': u"ספר טהרה",
  352. 'Sefer Nezikim': u"ספר נזיקין",
  353. 'Sefer Kinyan': u"ספר קניין",
  354. 'Sefer Mishpatim': u"ספר משפטים",
  355. 'Sefer Shoftim': u"ספר שופטים",
  356. "Shulchan Arukh": u"שולחן ערוך",
  357. }
  358. section_names = {
  359. "Chapter": u"פרק",
  360. "Chapters": u"פרקים",
  361. "Perek": u"פרק",
  362. "Line": u"שורה",
  363. "Daf": u"דף",
  364. "Paragraph": u"פסקה",
  365. "Parsha": u"פרשה",
  366. "Parasha": u"פרשה",
  367. "Parashah": u"פרשה",
  368. "Seif": u"סעיף",
  369. "Se'if": u"סעיף",
  370. "Siman": u"סימן",
  371. "Section": u"חלק",
  372. "Verse": u"פסוק",
  373. "Sentence": u"משפט",
  374. "Sha'ar": u"שער",
  375. "Gate": u"שער",
  376. "Comment": u"פירוש",
  377. "Phrase": u"ביטוי",
  378. "Mishna": u"משנה",
  379. "Chelek": u"חלק",
  380. "Helek": u"חלק",
  381. "Year": u"שנה",
  382. "Masechet": u"מסכת",
  383. "Massechet": u"מסכת",
  384. "Letter": u"אות",
  385. "Halacha": u"הלכה",
  386. "Seif Katan": u"סעיף קטן",
  387. "Se'if Katan": u"סעיף קטן",
  388. "Volume": u"כרך",
  389. "Book": u"ספר",
  390. "Shar": u"שער",
  391. "Seder": u"סדר",
  392. "Part": u"חלק",
  393. "Pasuk": u"פסוק",
  394. "Sefer": u"ספר",
  395. "Teshuva": u"תשובה",
  396. "Teshuvot": u"תשובות",
  397. "Tosefta": u"תוספתא",
  398. "Halakhah": u"הלכה",
  399. "Kovetz": u"קובץ",
  400. "Path": u"נתיב",
  401. "Parshah": u"פרשה",
  402. "Midrash": u"מדרש",
  403. "Mitzvah": u"מצוה",
  404. "Tefillah": u"תפילה",
  405. "Torah": u"תורה",
  406. "Perush": u"פירוש",
  407. "Peirush": u"פירוש",
  408. "Aliyah": u"עלייה",
  409. "Tikkun": u"תיקון",
  410. "Tikkunim": u"תיקונים",
  411. "Hilchot": u"הילכות",
  412. "Topic": u"נושא",
  413. "Contents": u"תוכן"
  414. }
  415. words = dict(categories.items() + pseudo_categories.items() + section_names.items())
  416. if s in words:
  417. return words[s]
  418. # If s is a text title, look for a stored Hebrew title
  419. try:
  420. from sefaria.model import library, IndexSet
  421. from sefaria.system.exceptions import BookNameError
  422. i = library.get_index(s)
  423. return i.get_title("he")
  424. except BookNameError:
  425. pass
  426. return s
  427. # def main():
  428. # t = u"ההתשסטו"
  429. # return [index for index, (f, s) in enumerate(zip(t, t[1:])) if f < s and heb_to_int(s) >= 100]
  430. # t = u"ההתשסטו"
  431. # if __name__ == '__main__':
  432. # print main().__repr__()