Koster committed on
Commit
b5f1359
·
verified ·
1 Parent(s): 2a9a49d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +2 -0
  2. LanguageTool.py +14 -0
  3. README.md +3 -9
  4. ScriptureReference.py +386 -0
  5. TrainingData.py +84 -0
  6. TranslationNoteFinder.py +252 -0
  7. TranslationNoteFinderLLMOnly.py +212 -0
  8. __pycache__/LanguageTool.cpython-312.pyc +0 -0
  9. __pycache__/ScriptureReference.cpython-312.pyc +0 -0
  10. __pycache__/ScriptureReference.cpython-39.pyc +0 -0
  11. __pycache__/TrainingData.cpython-312.pyc +0 -0
  12. __pycache__/TrainingData.cpython-39.pyc +0 -0
  13. __pycache__/TranslationNoteFinder.cpython-312.pyc +0 -0
  14. __pycache__/TranslationNoteFinder.cpython-39.pyc +0 -0
  15. __pycache__/TranslationNoteFinderLLMOnly.cpython-312.pyc +0 -0
  16. __pycache__/nltk.cpython-312.pyc +0 -0
  17. __pycache__/romanize.cpython-312.pyc +0 -0
  18. __pycache__/romanize.cpython-39.pyc +0 -0
  19. __pycache__/tfidf.cpython-312.pyc +0 -0
  20. __pycache__/tfidf.cpython-39.pyc +0 -0
  21. flagged/log.csv +2 -0
  22. highlightNote.css +7 -0
  23. highlightNote.js +10 -0
  24. main.py +22 -0
  25. main_gradio.py +72 -0
  26. main_gradio_js.py +107 -0
  27. romanize.py +43 -0
  28. tests/english_note_to_hindi.py +35 -0
  29. tests/find_greek_in_hindi.py +69 -0
  30. tests/guidance-ai-readme.md +731 -0
  31. tests/nltk-test.py +41 -0
  32. tests/test.py +36 -0
  33. tests/test2.py +3 -0
  34. tests/tfidf.py +152 -0
  35. tests/tsv_parse +27 -0
  36. translation_notes.json +32 -0
  37. translation_notes/tn_ROM.tsv +0 -0
  38. uroman-1.2.8/.gitignore +35 -0
  39. uroman-1.2.8/LICENSE.txt +11 -0
  40. uroman-1.2.8/README.md +163 -0
  41. uroman-1.2.8/README.txt +141 -0
  42. uroman-1.2.8/bin/de-accent.pl +201 -0
  43. uroman-1.2.8/bin/string-distance.pl +99 -0
  44. uroman-1.2.8/bin/uroman-quick.pl +58 -0
  45. uroman-1.2.8/bin/uroman-tsv.sh +28 -0
  46. uroman-1.2.8/bin/uroman.pl +138 -0
  47. uroman-1.2.8/data/Chinese_to_Pinyin.txt +0 -0
  48. uroman-1.2.8/data/Scripts.txt +135 -0
  49. uroman-1.2.8/data/UnicodeData.txt +0 -0
  50. uroman-1.2.8/data/UnicodeDataOverwrite.txt +442 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.gguf
2
+ bibles/*
LanguageTool.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langdetect import detect
2
+ import pycountry
3
+ import langid
4
+
5
class Lang:
    """Detect the language of a text sample.

    When `options` (a list of ISO 639-1 codes) is supplied, classification is
    restricted to those candidates via langid; otherwise langdetect is run on
    the first 1000 characters. Exposes:
        lang_code: the detected ISO 639-1 code.
        lang_name: the English language name, or the raw code if unknown.
    """

    def __init__(self, text, options=None):
        if options:
            langid.set_languages(options)  # ISO 639-1 codes
            self.lang_code, _ = langid.classify(text)
        else:
            # A 1000-char sample keeps langdetect fast on large inputs.
            self.lang_code = detect(text[:1000])

        # langdetect can return region-tagged codes (e.g. 'zh-cn') that
        # pycountry does not know as alpha_2; the original unconditional
        # `.name` raised AttributeError on the resulting None. Fall back to
        # the raw code instead of crashing.
        language = pycountry.languages.get(alpha_2=self.lang_code)
        self.lang_name = language.name if language else self.lang_code
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Translation Note Alignment
3
- emoji: 💻
4
- colorFrom: indigo
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.20.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Translation_Note_Alignment
3
+ app_file: main_gradio_js.py
 
 
4
  sdk: gradio
5
+ sdk_version: 4.19.2
 
 
6
  ---
 
 
ScriptureReference.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from functools import cache, cached_property
3
+
4
+ class ScriptureReference:
5
+ verse_ones = [
6
+ 1, 1534, 2747, 3606, 4895, 5854, 6512, 7130, 7215, 8026, 8721, 9538, 10257, 11200, 12022, 12302, 12707, 12874, 13944,
7
+ 16471, 17386, 17608, 17725, 19016, 20380, 20534, 21807, 22164, 22361, 22434, 22580, 22601, 22649, 22754, 22801, 22857,
8
+ 22910, 22948, 23159, 23214, 24285, 24963, 26114, 26993, 27999, 28432, 28869, 29125, 29274, 29429, 29533, 29628, 29717,
9
+ 29764, 29877, 29960, 30006, 30031, 30334, 30442, 30547, 30608, 30713, 30726, 30741, 30766, 31171
10
+ ]
11
+
12
+ book_codes = {
13
+ 'GEN': {
14
+ 'codes': ['Gen', 'Gn', '1M'],
15
+ 'verses': [31, 25, 24, 26, 32, 22, 24, 22, 29, 32, 32, 20, 18, 24, 21, 16, 27, 33, 38, 18, 34, 24, 20, 67, 34,
16
+ 35, 46, 22, 35, 43, 55, 32, 20, 31, 29, 43, 36, 30, 23, 23, 57, 38, 34, 34, 28, 34, 31, 22, 33, 26]
17
+ },
18
+ 'EXO': {
19
+ 'codes': ['Ex', '2M'],
20
+ 'verses': [22, 25, 22, 31, 23, 30, 25, 32, 35, 29, 10, 51, 22, 31, 27, 36, 16, 27, 25, 26, 36, 31, 33, 18, 40,
21
+ 37, 21, 43, 46, 38, 18, 35, 23, 35, 35, 38, 29, 31, 43, 38]
22
+ },
23
+ 'LEV': {
24
+ 'codes': ['Lev', 'Lv', '3M'],
25
+ 'verses': [17, 16, 17, 35, 19, 30, 38, 36, 24, 20, 47, 8, 59, 57, 33, 34, 16, 30, 37, 27, 24, 33, 44, 23, 55,
26
+ 46, 34]
27
+ },
28
+ 'NUM': {
29
+ 'codes': ['Nm', 'Nu', '4M'],
30
+ 'verses': [54, 34, 51, 49, 31, 27, 89, 26, 23, 36, 35, 16, 33, 45, 41, 50, 13, 32, 22, 29, 35, 41, 30, 25, 18,
31
+ 65, 23, 31, 40, 16, 54, 42, 56, 29, 34, 13]
32
+ },
33
+ 'DEU': {
34
+ 'codes': ['Deut', 'Dt', '5M'],
35
+ 'verses': [46, 37, 29, 49, 33, 25, 26, 20, 29, 22, 32, 32, 18, 29, 23, 22, 20, 22, 21, 20, 23, 30, 25, 22, 19,
36
+ 19, 26, 68, 29, 20, 30, 52, 29, 12]
37
+ },
38
+ 'JOS': {
39
+ 'codes': ['Josh', 'Jos'],
40
+ 'verses': [18, 24, 17, 24, 15, 27, 26, 35, 27, 43, 23, 24, 33, 15, 63, 10, 18, 28, 51, 9, 45, 34, 16, 33]
41
+ },
42
+ 'JDG': {
43
+ 'codes': ['Jdg', 'Judg'],
44
+ 'verses': [36, 23, 31, 24, 31, 40, 25, 35, 57, 18, 40, 15, 25, 20, 20, 31, 13, 31, 30, 48, 25]
45
+ },
46
+ 'RUT': {
47
+ 'codes': ['Ru', 'Rth'],
48
+ 'verses': [22, 23, 18, 22]
49
+ },
50
+ '1SA': {
51
+ 'codes': ['1Sam', '1Sm'],
52
+ 'verses': [28, 36, 21, 22, 12, 21, 17, 22, 27, 27, 15, 25, 23, 52, 35, 23, 58, 30, 24, 42, 15, 23, 29, 22, 44,
53
+ 25, 12, 25, 11, 31, 13]
54
+ },
55
+ '2SA': {
56
+ 'codes': ['2Sam', '2Sm'],
57
+ 'verses': [27, 32, 39, 12, 25, 23, 29, 18, 13, 19, 27, 31, 39, 33, 37, 23, 29, 33, 43, 26, 22, 51, 39, 25]
58
+ },
59
+ '1KI': {
60
+ 'codes': ['1Kg', '1K'],
61
+ 'verses': [53, 46, 28, 34, 18, 38, 51, 66, 28, 29, 43, 33, 34, 31, 34, 34, 24, 46, 21, 43, 29, 53]
62
+ },
63
+ '2KI': {
64
+ 'codes': ['2Kg', '2K'],
65
+ 'verses': [18, 25, 27, 44, 27, 33, 20, 29, 37, 36, 21, 21, 25, 29, 38, 20, 41, 37, 37, 21, 26, 20, 37, 20, 30]
66
+ },
67
+ '1CH': {
68
+ 'codes': ['1Ch'],
69
+ 'verses': [54, 55, 24, 43, 26, 81, 40, 40, 44, 14, 47, 40, 14, 17, 29, 43, 27, 17, 19, 8, 30, 19, 32, 31, 31,
70
+ 32, 34, 21, 30]
71
+ },
72
+ '2CH': {
73
+ 'codes': ['2Ch'],
74
+ 'verses': [17, 18, 17, 22, 14, 42, 22, 18, 31, 19, 23, 16, 22, 15, 19, 14, 19, 34, 11, 37, 20, 12, 21, 27, 28,
75
+ 23, 9, 27, 36, 27, 21, 33, 25, 33, 27, 23]
76
+ },
77
+ 'EZR': {
78
+ 'codes': ['Ezr'],
79
+ 'verses': [11, 70, 13, 24, 17, 22, 28, 36, 15, 44]
80
+ },
81
+ 'NEH': {
82
+ 'codes': ['Neh'],
83
+ 'verses': [11, 20, 32, 23, 19, 19, 73, 18, 38, 39, 36, 47, 31]
84
+ },
85
+ 'EST': {
86
+ 'codes': ['Est'],
87
+ 'verses': [22, 23, 15, 17, 14, 14, 10, 17, 32, 3]
88
+ },
89
+ 'JOB': {
90
+ 'codes': ['Jb', 'Job'],
91
+ 'verses': [22, 13, 26, 21, 27, 30, 21, 22, 35, 22, 20, 25, 28, 22, 35, 22, 16, 21, 29, 29, 34, 30, 17, 25, 6,
92
+ 14, 23, 28, 25, 31, 40, 22, 33, 37, 16, 33, 24, 41, 30, 24, 34, 17]
93
+ },
94
+ 'PSA': {
95
+ 'codes': ['Ps'],
96
+ 'verses': [6, 12, 8, 8, 12, 10, 17, 9, 20, 18, 7, 8, 6, 7, 5, 11, 15, 50, 14, 9, 13, 31, 6, 10, 22, 12, 14, 9,
97
+ 11, 12, 24, 11, 22, 22, 28, 12, 40, 22, 13, 17, 13, 11, 5, 26, 17, 11, 9, 14, 20, 23, 19, 9, 6, 7,
98
+ 23, 13, 11, 11, 17, 12, 8, 12, 11, 10, 13, 20, 7, 35, 36, 5, 24, 20, 28, 23, 10, 12, 20, 72, 13, 19,
99
+ 16, 8, 18, 12, 13, 17, 7, 18, 52, 17, 16, 15, 5, 23, 11, 13, 12, 9, 9, 5, 8, 28, 22, 35, 45, 48, 43,
100
+ 13, 31, 7, 10, 10, 9, 8, 18, 19, 2, 29, 176, 7, 8, 9, 4, 8, 5, 6, 5, 6, 8, 8, 3, 18, 3, 3, 21, 26,
101
+ 9, 8, 24, 13, 10, 7, 12, 15, 21, 10, 20, 14, 9, 6]
102
+ },
103
+ 'PRO': {
104
+ 'codes': ['Pr'],
105
+ 'verses': [33, 22, 35, 27, 23, 35, 27, 36, 18, 32, 31, 28, 25, 35, 33, 33, 28, 24, 29, 30, 31, 29, 35, 34, 28,
106
+ 28, 27, 28, 27, 33, 31]
107
+ },
108
+ 'ECC': {
109
+ 'codes': ['Ec', 'Qoh'],
110
+ 'verses': [18, 26, 22, 16, 20, 12, 29, 17, 18, 20, 10, 14]
111
+ },
112
+ 'SNG': {
113
+ 'codes': ['Sos', 'Song'],
114
+ 'verses': [17, 17, 11, 16, 16, 13, 13, 14]
115
+ },
116
+ 'ISA': {
117
+ 'codes': ['Isa'],
118
+ 'verses': [31, 22, 26, 6, 30, 13, 25, 22, 21, 34, 16, 6, 22, 32, 9, 14, 14, 7, 25, 6, 17, 25, 18, 23, 12, 21,
119
+ 13, 29, 24, 33, 9, 20, 24, 17, 10, 22, 38, 22, 8, 31, 29, 25, 28, 28, 25, 13, 15, 22, 26, 11, 23,
120
+ 15, 12, 17, 13, 12, 21, 14, 21, 22, 11, 12, 19, 12, 25, 24]
121
+ },
122
+ 'JER': {
123
+ 'codes': ['Jer', 'Jr'],
124
+ 'verses': [19, 37, 25, 31, 31, 30, 34, 22, 26, 25, 23, 17, 27, 22, 21, 21, 27, 23, 15, 18, 14, 30, 40, 10, 38,
125
+ 24, 22, 17, 32, 24, 40, 44, 26, 22, 19, 32, 21, 28, 18, 16, 18, 22, 13, 30, 5, 28, 7, 47, 39, 46, 64,
126
+ 34]
127
+ },
128
+ 'LAM': {
129
+ 'codes': ['Lam', 'Lm'],
130
+ 'verses': [22, 22, 66, 22, 22]
131
+ },
132
+ 'EZK': {
133
+ 'codes': ['Ezek', 'Ezk'],
134
+ 'verses': [28, 10, 27, 17, 17, 14, 27, 18, 11, 22, 25, 28, 23, 23, 8, 63, 24, 32, 14, 49, 32, 31, 49, 27, 17, 21,
135
+ 36, 26, 21, 26, 18, 32, 33, 31, 15, 38, 28, 23, 29, 49, 26, 20, 27, 31, 25, 24, 23, 35]
136
+ },
137
+ 'DAN': {
138
+ 'codes': ['Dn', 'Dan'],
139
+ 'verses': [21, 49, 30, 37, 31, 28, 28, 27, 27, 21, 45, 13]
140
+ },
141
+ 'HOS': {
142
+ 'codes': ['Hos', 'Hs'],
143
+ 'verses': [11, 23, 5, 19, 15, 11, 16, 14, 17, 15, 12, 10, 14, 9]
144
+ },
145
+ 'JOL': {
146
+ 'codes': ['Joel', 'Jl'],
147
+ 'verses': [20, 32, 21]
148
+ },
149
+ 'AMO': {
150
+ 'codes': ['Am'],
151
+ 'verses': [15, 16, 15, 13, 27, 14, 17, 14, 15]
152
+ },
153
+ 'OBA': {
154
+ 'codes': ['Ob'],
155
+ 'verses': [21]
156
+ },
157
+ 'JON': {
158
+ 'codes': ['Jon'],
159
+ 'verses': [17, 10, 10, 11]
160
+ },
161
+ 'MIC': {
162
+ 'codes': ['Mi', 'Mc'],
163
+ 'verses': [16, 13, 12, 13, 15, 16, 20]
164
+ },
165
+ 'NAM': {
166
+ 'codes': ['Na'],
167
+ 'verses': [15, 13, 19]
168
+ },
169
+ 'HAB': {
170
+ 'codes': ['Hab'],
171
+ 'verses': [17, 20, 19]
172
+ },
173
+ 'ZEP': {
174
+ 'codes': ['Zep', 'Zp'],
175
+ 'verses': [18, 15, 20]
176
+ },
177
+ 'HAG': {
178
+ 'codes': ['Hag', 'Hg'],
179
+ 'verses': [15, 23]
180
+ },
181
+ 'ZEC': {
182
+ 'codes': ['Zc', 'Zec'],
183
+ 'verses': [21, 13, 10, 14, 11, 15, 14, 20, 12, 21, 17, 14, 20, 9, 15, 21]
184
+ },
185
+ 'MAL': {
186
+ 'codes': ['Mal', 'Ml'],
187
+ 'verses': [14, 17, 18, 6]
188
+ },
189
+ 'MAT': {
190
+ 'codes': ['Mt', 'Mat'],
191
+ 'verses': [25, 23, 17, 25, 48, 34, 29, 34, 38, 42, 30, 50, 58, 36, 39, 28, 30, 34, 34, 46, 30, 46, 39, 28, 34,
192
+ 31, 46, 46, 38, 71, 66, 20]
193
+ },
194
+ 'MRK': {
195
+ 'codes': ['Mk', 'Mar'],
196
+ 'verses': [45, 28, 35, 41, 43, 56, 29, 38, 50, 52, 33, 44, 37, 72, 47, 20]
197
+ },
198
+ 'LUK': {
199
+ 'codes': ['Lk', 'Lu'],
200
+ 'verses': [80, 52, 38, 44, 39, 49, 50, 56, 62, 42, 54, 59, 35, 35, 32, 31, 37, 43, 48, 47, 38, 71, 56, 39, 49,
201
+ 57, 80, 55, 28, 35, 32, 31, 37, 50, 26, 46, 51, 66, 53, 59, 37, 35, 50, 40, 46, 51, 69, 53, 56, 20]
202
+ },
203
+ 'JHN': {
204
+ 'codes': ['Jn', 'Joh', 'Jhn'],
205
+ 'verses': [51, 25, 36, 54, 47, 71, 53, 59, 41, 42, 57, 50, 38, 31, 27, 33, 26, 40, 42, 31, 25]
206
+ },
207
+ 'ACT': {
208
+ 'codes': ['Ac'],
209
+ 'verses': [26, 47, 26, 37, 42, 15, 60, 40, 43, 48, 30, 25, 52, 28, 41, 40, 34, 28, 40, 38, 40, 30, 35, 27, 27,
210
+ 32, 44, 31]
211
+ },
212
+ 'ROM': {
213
+ 'codes': ['Ro', 'Rm'],
214
+ 'verses': [32, 29, 31, 25, 21, 23, 25, 39, 33, 21, 36, 21, 14, 23, 33, 27]
215
+ },
216
+ '1CO': {
217
+ 'codes': ['1Co'],
218
+ 'verses': [31, 16, 23, 21, 13, 20, 40, 13, 27, 33, 34, 31, 13, 40, 58, 24]
219
+ },
220
+ '2CO': {
221
+ 'codes': ['2Co'],
222
+ 'verses': [24, 17, 18, 18, 21, 18, 16, 24, 15, 18, 33, 21, 14]
223
+ },
224
+ 'GAL': {
225
+ 'codes': ['Gal', 'Gl'],
226
+ 'verses': [24, 21, 29, 31, 26, 18]
227
+ },
228
+ 'EPH': {
229
+ 'codes': ['Ep'],
230
+ 'verses': [23, 22, 21, 32, 33, 24]
231
+ },
232
+ 'PHP': {
233
+ 'codes': ['Php', 'Philip'],
234
+ 'verses': [30, 30, 21, 23]
235
+ },
236
+ 'COL': {
237
+ 'codes': ['Col'],
238
+ 'verses': [29, 23, 25, 18]
239
+ },
240
+ '1TH': {
241
+ 'codes': ['1Th'],
242
+ 'verses': [10, 20, 13, 18, 28]
243
+ },
244
+ '2TH': {
245
+ 'codes': ['2Th'],
246
+ 'verses': [12, 17, 18]
247
+ },
248
+ '1TI': {
249
+ 'codes': ['1Ti', '1Tm'],
250
+ 'verses': [20, 15, 16, 16, 25, 21, 25]
251
+ },
252
+ '2TI': {
253
+ 'codes': ['2Ti', '2Tm'],
254
+ 'verses': [18, 26, 17, 22]
255
+ },
256
+ 'TIT': {
257
+ 'codes': ['Tit'],
258
+ 'verses': [16, 15, 15]
259
+ },
260
+ 'PHM': {
261
+ 'codes': ['Phile', 'Phm'],
262
+ 'verses': [25]
263
+ },
264
+ 'HEB': {
265
+ 'codes': ['Hb', 'Heb'],
266
+ 'verses': [14, 18, 19, 16, 14, 20, 28, 13, 28, 39, 40, 29, 25]
267
+ },
268
+ 'JAS': {
269
+ 'codes': ['Ja', 'Jm'],
270
+ 'verses': [27, 26, 18, 17, 20]
271
+ },
272
+ '1PE': {
273
+ 'codes': ['1Pe', '2Pt'],
274
+ 'verses': [25, 25, 22, 19, 14]
275
+ },
276
+ '2PE': {
277
+ 'codes': ['2Pe', '2Pt'],
278
+ 'verses': [21, 22, 18]
279
+ },
280
+ '1JN': {
281
+ 'codes': ['1Jn', '1Jo', '1Jh'],
282
+ 'verses': [10, 29, 24, 21, 21]
283
+ },
284
+ '2JN': {
285
+ 'codes': ['2Jn', '2Jo', '2Jh'],
286
+ 'verses': [13]
287
+ },
288
+ '3JN': {
289
+ 'codes': ['3Jn', '3Jo', '3Jh'],
290
+ 'verses': [14]
291
+ },
292
+ 'JUD': {
293
+ 'codes': ['Ju', 'Jd'],
294
+ 'verses': [25]
295
+ },
296
+ 'REV': {
297
+ 'codes': ['Rev', 'Rv'],
298
+ 'verses': [20, 29, 22, 18, 14, 20, 17, 18, 20, 15, 23, 19, 21, 18, 18, 24, 22, 21, 21, 15, 27, 21]
299
+ }
300
+ }
301
+
302
+
303
    def __init__(self, reference):
        """Store the raw reference string and eagerly parse it.

        Args:
            reference: free-form scripture reference, e.g. "1Jn 5:7-8".
        """
        self.reference = reference
        # Parsed once up front; exposed read-only via the structured_ref property.
        self._structured_ref = self.parse_scripture_reference(reference)
306
+
307
+ @classmethod
308
+ def parse_scripture_reference(cls, input_ref):
309
+ normalized_input = re.sub(r"\s+", "", input_ref).upper()
310
+ regex = re.compile(r"^(\d)?(\D+)(\d+)?(?::(\d+))?(?:-(\d+)?(?::(\d+))?)?$")
311
+ match = regex.match(normalized_input)
312
+
313
+ if not match:
314
+ return {'bookCode': '', 'startChapter': 0, 'endChapter': 0, 'startVerse': 0, 'endVerse': 0}
315
+
316
+ bookPrefix, bookName, startChapter, startVerse, endChapterOrVerse, endVerse = match.groups()
317
+
318
+ fullBookName = f"{bookPrefix or ''}{bookName}".upper()
319
+ bookCode = ''
320
+ for code, book in cls.book_codes.items():
321
+ if any(fullBookName.startswith(name.upper()) for name in book['codes']):
322
+ bookCode = code
323
+ break
324
+
325
+ startChap = int(startChapter) if startChapter else 0
326
+ endChap = int(endChapterOrVerse) if endChapterOrVerse and endVerse else startChap
327
+ startVer = int(startVerse) if startVerse else 0
328
+ endVer = int(endVerse) if endVerse else int(endChapterOrVerse) if endChapterOrVerse and not endVerse else startVer
329
+
330
+ if startVer != 0 and endVer == 0:
331
+ endVer = startVer
332
+
333
+ return {
334
+ 'bookCode': bookCode,
335
+ 'startChapter': startChap,
336
+ 'endChapter': endChap,
337
+ 'startVerse': startVer,
338
+ 'endVerse': endVer,
339
+ }
340
+
341
+ @property
342
+ @cache
343
+ def structured_ref(self):
344
+ return self._structured_ref
345
+
346
+
347
+ @property
348
+ @cache
349
+ def line_number(self):
350
+ book_code = self.structured_ref['bookCode']
351
+ start_chapter = self.structured_ref['startChapter']
352
+ start_verse = self.structured_ref['startVerse']
353
+
354
+ # Find the index of the book to get the starting line number
355
+ book_index = list(self.book_codes.keys()).index(book_code)
356
+ start_line_of_book = self.verse_ones[book_index]
357
+
358
+ # Calculate the number of verses before the specified chapter
359
+ verses_before = sum(self.book_codes[book_code]['verses'][:start_chapter - 1])
360
+
361
+ # Calculate the line number of the verse
362
+ line_number = start_line_of_book + verses_before + start_verse - 1
363
+
364
+ return line_number
365
+
366
+ # Override eq method to allow comparison of ScriptureReference objects based on line number
367
+ # def __eq__(self, other):
368
+ # return self.line_number == other.line_number
369
+
370
+ # def __hash__(self):
371
+ # book_code = self.structured_ref['bookCode']
372
+ # start_chapter = self.structured_ref['startChapter']
373
+ # start_verse = self.structured_ref['startVerse']
374
+ # book_index = list(self.book_codes.keys()).index(book_code)
375
+ # start_line_of_book = self.verse_ones[book_index]
376
+ # verses_before = sum(self.book_codes[book_code]['verses'][:start_chapter - 1])
377
+ # line_number = start_line_of_book + verses_before + start_verse - 1
378
+ # return hash(line_number)
379
+
380
+
381
+
382
+ # # Example usage:
383
+ # reference = "1Jn 5:7-8"
384
+ # scripture_ref = ScriptureReference(reference)
385
+ # print("Structured Reference:", scripture_ref.get_structured_ref())
386
+ # print("Line Number:", scripture_ref.line_number)
TrainingData.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ greek_to_lang = {
2
+ 'Hindi':'''
3
+ What is a good translation of πίστις into Hindi? विश्वास
4
+ What is a good translation of χάρις into Hindi? अनुग्रह
5
+ What is a good translation of σωτηρία into Hindi? उद्धार
6
+ What is a good translation of εὐαγγέλιον into Hindi? सुसमाचार
7
+ What is a good translation of ἀπόστολος into Hindi? प्रेरित
8
+ What is a good translation of ἀγάπη into Hindi? प्रेम
9
+ What is a good translation of ἐκκλησία into Hindi? कलीसिया
10
+ What is a good translation of ἁμαρτία into Hindi? पाप
11
+ What is a good translation of μετάνοια into Hindi? पश्चाताप
12
+ What is a good translation of κύριος into Hindi? प्रभु
13
+ What is a good translation of ἅγιον πνεῦμα into Hindi? पवित्र आत्मा
14
+ What is a good translation of ἀνάστασις into Hindi? पुनरुत्थान
15
+ What is a good translation of ζωὴ αἰώνιος into Hindi? अनन्त जीवन
16
+ What is a good translation of βασιλεία τοῦ Θεοῦ into Hindi? परमेश्वर का राज्य
17
+ What is a good translation of μαθητής into Hindi? शिष्य
18
+ What is a good translation of δύναμις into Hindi? चमत्कार
19
+ What is a good translation of ἐντολή into Hindi? आज्ञा
20
+ What is a good translation of δικαιοσύνη into Hindi? धार्मिकता
21
+ What is a good translation of εἰρήνη into Hindi? शांति
22
+ What is a good translation of ὁ ἔσχατος δεῖπνον into Hindi? अंतिम भोज
23
+ What is a good translation of υἱὸς τοῦ Θεοῦ into Hindi? परमेश्वर का पुत्र
24
+ What is a good translation of βασιλεὺς τῶν βασιλευόντων into Hindi? राजाओं का राजा
25
+ What is a good translation of ἀμνὸς τοῦ Θεοῦ into Hindi? परमेश्वर का मेमना
26
+ What is a good translation of καρπὸς τοῦ Πνεύματος into Hindi? आत्मा का फल
27
+ What is a good translation of δωρεὰ τοῦ Ἁγίου Πνεύματος into Hindi? पवित्र आत्मा की देन
28
+ What is a good translation of ὁ καλὸς ποιμήν into Hindi? भला चरवाहा
29
+ What is a good translation of ἡ μεγάλη ἐντολή into Hindi? महान आज्ञा
30
+ What is a good translation of γεννηθῆναι ἄνωθεν into Hindi? पुनर्जन्म
31
+ What is a good translation of ἡ προσευχὴ τοῦ Κυρίου into Hindi? प्रभु की प्रार्थना
32
+ What is a good translation of Ὄρος τῶν Ἐλαιῶν into Hindi? जैतून का पहाड़
33
+ What is a good translation of ὕδωρ ζῶν into Hindi? जीवित जल
34
+ What is a good translation of ἄρτος τῆς ζωῆς into Hindi? जीवन की रोटी
35
+ What is a good translation of φῶς τοῦ κόσμου into Hindi? संसार का प्रकाश
36
+ What is a good translation of ποτήριον τῆς ὀργῆς into Hindi? क्रोध का प्याला
37
+ What is a good translation of σφραγὶς τοῦ Θεοῦ into Hindi? परमेश्वर की मुहर
38
+ What is a good translation of καινὴ διαθήκη into Hindi? नया नियम
39
+ What is a good translation of πανοπλία τοῦ Θεοῦ into Hindi? परमेश्वर का कवच
40
+ What is a good translation of θρόνος τῆς χάριτος into Hindi? अनुग्रह का सिंहासन
41
+ ''',
42
+ 'Spanish':'''
43
+ What is a good translation of Χριστός into Spanish? Cristo
44
+ What is a good translation of πίστις into Spanish? fe
45
+ What is a good translation of χάρις into Spanish? Gracia
46
+ What is a good translation of σωτηρία into Spanish? Salvación
47
+ What is a good translation of εὐαγγέλιον into Spanish? Evangelio
48
+ What is a good translation of ἀπόστολος into Spanish? Apóstol
49
+ What is a good translation of ἀγάπη into Spanish? Amor
50
+ What is a good translation of ἐκκλησία into Spanish? Iglesia
51
+ What is a good translation of ἁμαρτία into Spanish? Pecado
52
+ What is a good translation of μετάνοια into Spanish? Arrepentimiento
53
+ What is a good translation of κύριος into Spanish? Señor
54
+ What is a good translation of ἅγιον πνεῦμα into Spanish? Espíritu Santo
55
+ What is a good translation of ἀνάστασις into Spanish? Resurrección
56
+ What is a good translation of ζωὴ αἰώνιος into Spanish? Vida Eterna
57
+ What is a good translation of βασιλεία τοῦ Θεοῦ into Spanish? Reino de Dios
58
+ What is a good translation of μαθητής into Spanish? Discípulo
59
+ What is a good translation of δύναμις into Spanish? Poder
60
+ What is a good translation of ἐντολή into Spanish? Mandamiento
61
+ What is a good translation of δικαιοσύνη into Spanish? Justicia
62
+ What is a good translation of εἰρήνη into Spanish? Paz
63
+ What is a good translation of ὁ ἔσχατος δεῖπνον into Spanish? Última Cena
64
+ What is a good translation of υἱὸς τοῦ Θεοῦ into Spanish? Hijo de Dios
65
+ What is a good translation of βασιλεὺς τῶν βασιλευόντων into Spanish? Rey de reyes
66
+ What is a good translation of ἀμνὸς τοῦ Θεοῦ into Spanish? Cordero de Dios
67
+ What is a good translation of καρπὸς τοῦ Πνεύματος into Spanish? Fruto del Espíritu
68
+ What is a good translation of δωρεὰ τοῦ Ἁγίου Πνεύματος into Spanish? Don del Espíritu Santo
69
+ What is a good translation of ὁ καλὸς ποιμήν into Spanish? Buen Pastor
70
+ What is a good translation of ἡ μεγάλη ἐντολή into Spanish? Gran Mandamiento
71
+ What is a good translation of γεννηθῆναι ἄνωθεν into Spanish? Nacer de nuevo
72
+ What is a good translation of ἡ προσευχὴ τοῦ Κυρίου into Spanish? Oración del Señor
73
+ What is a good translation of Ὄρος τῶν Ἐλαιῶν into Spanish? Monte de los Olivos
74
+ What is a good translation of ὕδωρ ζῶν into Spanish? Agua Viva
75
+ What is a good translation of ἄρτος τῆς ζωῆς into Spanish? Pan de Vida
76
+ What is a good translation of φῶς τοῦ κόσμου into Spanish? Luz del Mundo
77
+ What is a good translation of ποτήριον τῆς ὀργῆς into Spanish? Copa de la Ira
78
+ What is a good translation of σφραγὶς τοῦ Θεοῦ into Spanish? Sello de Dios
79
+ What is a good translation of καινὴ διαθήκη into Spanish? Nuevo Pacto
80
+ What is a good translation of πανοπλία τοῦ Θεοῦ into Spanish? Armadura de Dios
81
+ What is a good translation of θρόνος τῆς χάριτος into Spanish? Trono de Gracia
82
+ What is a good translation of βιβλίον τῆς ζωῆς into Spanish? Libro de la Vida
83
+ '''
84
+ }
TranslationNoteFinder.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import csv
3
+ import re
4
+ from langdetect import detect
5
+ import pycountry
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from guidance import models, gen, select, instruction, system, user, assistant # use llama-cpp-python==0.2.26
8
+ import openai
9
+ from romanize import uroman
10
+ from ScriptureReference import ScriptureReference as SR
11
+ import stanza
12
+ import difflib
13
+ import requests
14
+ from TrainingData import greek_to_lang
15
+
16
+
17
+ class TranslationNoteFinder:
18
+ verses = SR.verse_ones
19
+
20
+ greek_bible_path = 'bibles/grc-grctcgnt.txt'
21
+
22
+ # Bibles in various languages can be downloaded from https://github.com/BibleNLP/ebible/tree/main/corpus
23
+ # lang_code follows ISO 639-1 standard
24
+ def __init__(self, bible_text_path, api_key, model_path=None, lang_code=None):
25
+
26
+ # Load Bibles
27
+ self.verses = TranslationNoteFinder.verses
28
+ self.greek_bible_text = self.load_bible(self.greek_bible_path)
29
+ self.target_bible_text = self.load_bible(bible_text_path)
30
+ first_line_nt = self.target_bible_text.splitlines()[23213]
31
+
32
+ # Auto-detect language of target Bible text (occassionally incorrect, so lang_code can be passed in)
33
+ if lang_code:
34
+ self.language = lang_code
35
+ self.lang_name = pycountry.languages.get(alpha_2=self.language).name
36
+ print(f'Language of target Bible text: {self.lang_name}')
37
+ else:
38
+ self.language = detect(first_line_nt)
39
+ self.lang_name = pycountry.languages.get(alpha_2=self.language).name
40
+ print(f'Detected language of target Bible text: {self.lang_name}')
41
+
42
+ # Local model currently not in use
43
+ if model_path:
44
+ self.model_path = model_path
45
+
46
+ # Download target language data for use in tokenizer
47
+ stanza.download(self.language)
48
+ self.nlp = stanza.Pipeline(lang=self.language, processors='tokenize')
49
+
50
+ # Assign instance variables
51
+ self.target_bible_text = self.load_bible(bible_text_path)
52
+ self.api_key = api_key
53
+
54
+ # Get tf-idf vectorizer, matrix for target Bible text
55
+ self.tfidf_vectorizer, self.tfidf_matrix = self.create_tfidf_vectorizer_matrix()
56
+
57
+
58
+ def parse_tsv_to_json(self, file_content, book_abbrev):
59
+ result = [] # Initialize an empty list to store the dictionaries.
60
+
61
+ # Turn tsv content into reader
62
+ tsv_reader = csv.reader(file_content.splitlines(), delimiter='\t')
63
+
64
+ for row in tsv_reader:
65
+ # Check if the row contains a Greek term (non-empty) in the expected position.
66
+ if row and len(row) > 3 and row[4].strip():
67
+ # Construct a dictionary for the current row.
68
+ entry = {
69
+ "source_term": row[4].strip(),
70
+ "translation_note": row[6].strip(),
71
+ "verse": book_abbrev + row[0].strip()
72
+ }
73
+ # Append the dictionary to the result list.
74
+ result.append(entry)
75
+
76
+ return result
77
+
78
+
79
+ def load_translation_notes(self, book_abbrev):
80
+ # If filepath ends with json
81
+ translation_notes_path = f'https://git.door43.org/unfoldingWord/en_tn/raw/branch/master/tn_{book_abbrev}.tsv'
82
+ response = requests.get(translation_notes_path)
83
+ if response.status_code == 200:
84
+ translation_notes_raw = response.text
85
+ else:
86
+ translation_notes_raw = ''
87
+
88
+ translation_notes = self.parse_tsv_to_json(translation_notes_raw, book_abbrev)
89
+
90
+ return translation_notes
91
+
92
+
93
+ def load_bible(self, bible_path):
94
+ # Check if the path starts with "http://" or "https://"
95
+ if bible_path.startswith('http'):
96
+ # Use requests to fetch the Bible text from the URL
97
+ response = requests.get(bible_path)
98
+ # Check if the request was successful
99
+ if response.status_code == 200:
100
+ bible_text = response.text
101
+ else:
102
+ bible_text = '' # Or handle errors as needed
103
+ else:
104
+ # Load the Bible text from a local file
105
+ with open(bible_path, 'r', encoding='utf-8') as file:
106
+ bible_text = file.read()
107
+ return bible_text
108
+
109
+
110
+ # Transforms loaded Bible text from file into a list of documents/books (prep for tf-idf)
111
+ # i.e., documents = [Genesis content, Exodus content, ...]
112
+ def segment_corpus(self, bible_text):
113
+ documents = []
114
+ current_document = []
115
+ verse_lines = bible_text.splitlines()
116
+ for i, line in enumerate(verse_lines, start=1):
117
+ if i in self.verses:
118
+ if current_document:
119
+ joined_doc_string = " ".join(current_document)
120
+ documents.append(joined_doc_string)
121
+ current_document = []
122
+ current_document.append(line.strip())
123
+ # Add the last document
124
+ if current_document:
125
+ joined_doc_string = " ".join(current_document)
126
+ documents.append(joined_doc_string)
127
+ return documents
128
+
129
+
130
+ # A method created for the tokenizer arg of the TfidfVectorizer class constructor
131
+ # See create_tfidf_vectorizer_matrix method
132
+ def stanza_tokenizer(self, text):
133
+ # Use the Stanza pipeline to process the text
134
+ doc = self.nlp(text)
135
+ # Extract tokens from the Stanza Document object
136
+ tokens = [word.text for sent in doc.sentences for word in sent.words]
137
+ return tokens
138
+
139
+
140
+ # Create a tf-idf vectorizer and matrix for the target Bible text
141
+ def create_tfidf_vectorizer_matrix(self):
142
+ tfidf_vectorizer = TfidfVectorizer(tokenizer=self.stanza_tokenizer, ngram_range=(1, 10))
143
+ segmented_corpus = self.segment_corpus(self.target_bible_text)
144
+ tfidf_matrix = tfidf_vectorizer.fit_transform(segmented_corpus)
145
+ return tfidf_vectorizer, tfidf_matrix
146
+
147
+
148
+ # Use the tf-idf matrix to get the tf-idf scores for the features (n-grams) of a specific book
149
+ def get_tfidf_book_features(self, book_code):
150
+ book_index = list(SR.book_codes.keys()).index(book_code)
151
+ feature_names = self.tfidf_vectorizer.get_feature_names_out()
152
+ dense = self.tfidf_matrix[book_index].todense()
153
+ document_tfidf_scores = dense.tolist()[0]
154
+ feature_scores = dict(zip(feature_names, document_tfidf_scores))
155
+
156
+ # Filter out zero scores
157
+ filtered_feature_scores = {feature: score for feature, score in feature_scores.items() if score > 0}
158
+ # Sort by score in descending order (just because...)
159
+ sorted_feature_scores = dict(sorted(filtered_feature_scores.items(), key=lambda item: item[1], reverse=True))
160
+ return sorted_feature_scores
161
+
162
+
163
+ # For each translation note in verse, use difflib to select the verse ngram which best matches the AI-translated Greek term
164
+ def best_ngram_for_note(self, note, verse_ngrams, language):
165
+ # local_llm = models.LlamaCpp(self.model_path, n_gpu_layers=1) # n_ctx=4096 to increase prompt size from 512 tokens
166
+
167
+ openai_llm = models.OpenAI("gpt-4", api_key=self.api_key) # To use OPENAI_API_KEY environment variable, omit api_key argument
168
+ openai_lm = openai_llm
169
+
170
+ print(f'All ngrams in verse guidance is selecting from: {[key for key in verse_ngrams.keys()]}')
171
+ # print(f'All ngrams in verse guidance is selecting from: {[uroman(key) for key in verse_ngrams.keys()]}')
172
+ source_term = note['source_term'].strip()
173
+ # source_term = uroman(note['source_term']).strip()
174
+
175
+ with system():
176
+ openai_lm += f'You are an expert at translating from Greek into {language}.'
177
+ openai_lm += 'When asked to translate, provide only the translation of the term. Nothing else. Do not provide any additional information or context.'
178
+ openai_lm += 'Be extrememly succinct in your translations.'
179
+ openai_lm += 'You must choose only from the list of translation options you are given. Choose the single best option.'
180
+ # with instruction():
181
+ with user():
182
+ openai_lm += f'What is a good translation of {source_term} from Greek into {language} and is found here: {verse_ngrams.keys()}?'
183
+ with assistant():
184
+ openai_lm += gen('openai_translation', stop='.')
185
+ print(f'OpenAI translation: {openai_lm["openai_translation"]}')
186
+
187
+ try:
188
+ ngram = difflib.get_close_matches(openai_lm["openai_translation"].strip(), verse_ngrams.keys(), n=1, cutoff=0.3)[0]
189
+ except IndexError:
190
+ ngram = "No close match found"
191
+
192
+
193
+ print(f'Best ngram found for note: {ngram}')
194
+ return ngram
195
+
196
+
197
    def verse_notes(self, verse_ref):
        """Collect translation notes applicable to one verse, aligned to target-language n-grams.

        Args:
            verse_ref: free-form scripture reference string (parsed by ScriptureReference).

        Returns:
            dict with 'target_verse_text', 'verse_ref' (structured dict),
            'line_number', and 'ngrams' — one entry per matched note, holding
            the chosen n-gram, its start/end character positions in the target
            verse, the Greek source term, and the note text.
        """
        # Get the Greek form of the verse
        v_ref = SR(verse_ref)
        gk_verse_text = self.greek_bible_text.splitlines()[v_ref.line_number - 1]

        # Get all relevant translation notes for the verse (based on Greek terms found in Greek verse)
        # with open('translation_notes.json', 'r', encoding='utf-8') as file:
        #     translation_notes = json.load(file)
        translation_notes_in_verse = []
        print(f'Let\'s see if there are any translation notes for this verse: \n\t {gk_verse_text}')
        translation_notes = self.load_translation_notes(v_ref.structured_ref['bookCode'])
        for note in translation_notes:
            # Keep only notes whose reference resolves to the same verse line.
            note_v_ref = SR(note['verse'])
            if note_v_ref.line_number != v_ref.line_number:
                continue
            print('Note verse:', note_v_ref.structured_ref)
            print(f'Checking for existence of: {note["source_term"]}')
            # Substring check: the note's Greek term must actually occur in
            # this Greek verse text (case-insensitive).
            if note['source_term'].lower() in gk_verse_text.lower():
                translation_notes_in_verse.append(note)
        print(f'Greek terms for all translation notes in verse: {[note["source_term"] for note in translation_notes_in_verse]}')

        # Get the target language form of the verse
        target_verse_text = self.target_bible_text.splitlines()[v_ref.line_number - 1]

        # Find n-grams from the book of the verse which exist in the verse
        bookCode = v_ref.structured_ref['bookCode']
        book_ngrams = self.get_tfidf_book_features(bookCode)
        print(f'First 30 n-grams of the book: {list(book_ngrams.keys())[:30]}')
        verse_ngrams = {feature: score for feature, score in book_ngrams.items() if feature.lower() in target_verse_text.lower()}
        print(f'First five n-grams of the verse along with their scores: {list(verse_ngrams.items())[:5]}')

        # Align each note's Greek term to the best-matching target n-gram.
        ngrams = []
        for note in translation_notes_in_verse:
            ngram = self.best_ngram_for_note(note, verse_ngrams, self.lang_name)
            # NOTE(review): best_ngram_for_note may return the literal
            # "No close match found", in which case find() yields -1 here —
            # callers should treat start_pos == -1 as "no highlight".
            start_pos = target_verse_text.lower().find(ngram.lower())
            end_pos = start_pos + len(ngram)
            source_term = note['source_term']
            trans_note = note['translation_note']
            ngrams.append(
                {
                    'ngram': ngram,
                    'start_pos': start_pos,
                    'end_pos': end_pos,
                    'source_term': source_term,
                    'trans_note': trans_note
                })

        print(f'Verse notes to be returned: {ngrams}')
        return {
            'target_verse_text': target_verse_text,
            'verse_ref': v_ref.structured_ref,
            'line_number': v_ref.line_number,
            'ngrams': ngrams
        }
251
+
252
+
TranslationNoteFinderLLMOnly.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import csv
3
+ import re
4
+ from langdetect import detect
5
+ import pycountry
6
+ from LanguageTool import Lang
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from guidance import models, gen, select, instruction, system, user, assistant # use llama-cpp-python==0.2.26
9
+ import openai
10
+ from romanize import uroman
11
+ from ScriptureReference import ScriptureReference as SR
12
+ import stanza
13
+ import difflib
14
+ import requests
15
+ # from TrainingData import greek_to_lang
16
+
17
+
18
class TranslationNoteFinder:
    """Find unfoldingWord translation notes for a verse and, via an LLM,
    locate the n-gram in a target-language Bible verse that best renders
    each note's source (Greek/Hebrew/English) term.

    Bibles in various languages can be downloaded from
    https://github.com/BibleNLP/ebible/tree/main/corpus
    """

    # 1-based corpus line numbers at which each Bible book begins
    # (used by segment_corpus to split the corpus into books).
    verses = SR.verse_ones

    # lang_code follows the ISO 639-1 standard
    def __init__(self, bible_text_path, api_key, lang_code=None):
        """Load the target Bible text and determine its language.

        bible_text_path -- local path or http(s) URL of an eBible corpus file
        api_key         -- OpenAI API key used by best_ngram_for_note
        lang_code       -- optional ISO 639-1 code; when given, skips
                           auto-detection (which is occasionally incorrect)
        """
        self.verses = TranslationNoteFinder.verses
        # BUG FIX: the Bible text was previously loaded twice (once here and
        # once again after language detection); load it exactly once.
        self.target_bible_text = self.load_bible(bible_text_path)

        if lang_code:
            self.language = lang_code
            self.lang_name = pycountry.languages.get(alpha_2=self.language).name
            print(f'Language of target Bible text: {self.lang_name}')
        else:
            # Line 23214 is assumed to be the first New Testament line in the
            # eBible corpus layout -- TODO confirm against the corpus used.
            first_line_nt = self.target_bible_text.splitlines()[23213]
            self.language = detect(first_line_nt)
            self.lang_name = pycountry.languages.get(alpha_2=self.language).name
            print(f'Detected language of target Bible text: {self.lang_name}')

        self.api_key = api_key

    def parse_tsv_to_json(self, file_content, book_abbrev):
        """Parse a tn_<BOOK>.tsv translation-notes file into a list of dicts.

        Each entry has keys 'source_term' (column 4), 'translation_note'
        (column 6) and 'verse' (book_abbrev + column 0 reference).
        """
        result = []
        tsv_reader = csv.reader(file_content.splitlines(), delimiter='\t')

        for row in tsv_reader:
            # BUG FIX: the row must have at least 7 columns because both
            # row[4] (source term) and row[6] (note) are read below; the
            # previous check `len(row) > 3` could raise IndexError on short
            # rows that still had a non-empty column 4.
            if row and len(row) > 6 and row[4].strip():
                entry = {
                    "source_term": row[4].strip(),
                    "translation_note": row[6].strip(),
                    "verse": book_abbrev + row[0].strip()
                }
                result.append(entry)

        return result

    def load_translation_notes(self, book_abbrev):
        """Fetch the unfoldingWord translation notes TSV for one book and
        return it parsed via parse_tsv_to_json (empty list on HTTP failure)."""
        translation_notes_path = f'https://git.door43.org/unfoldingWord/en_tn/raw/branch/master/tn_{book_abbrev}.tsv'
        response = requests.get(translation_notes_path)
        if response.status_code == 200:
            translation_notes_raw = response.text
        else:
            translation_notes_raw = ''

        return self.parse_tsv_to_json(translation_notes_raw, book_abbrev)

    def load_bible(self, bible_path):
        """Return the full Bible text, fetched from a URL when bible_path
        starts with http(s), otherwise read from a local UTF-8 file."""
        if bible_path.startswith('http'):
            response = requests.get(bible_path)
            if response.status_code == 200:
                bible_text = response.text
            else:
                bible_text = ''  # Or handle errors as needed
        else:
            with open(bible_path, 'r', encoding='utf-8') as file:
                bible_text = file.read()
        return bible_text

    # Transforms loaded Bible text into a list of documents/books (prep for
    # tf-idf), i.e. documents = [Genesis content, Exodus content, ...]
    def segment_corpus(self, bible_text):
        documents = []
        current_document = []
        verse_lines = bible_text.splitlines()
        for i, line in enumerate(verse_lines, start=1):
            # self.verses holds the 1-based line numbers where each book starts
            if i in self.verses:
                if current_document:
                    documents.append(" ".join(current_document))
                    current_document = []
            current_document.append(line.strip())
        # Add the last document
        if current_document:
            documents.append(" ".join(current_document))
        return documents

    # Ask the LLM for a target-language rendering of the note's source term
    # and accept it only if it occurs verbatim in the target verse.
    def best_ngram_for_note(self, note, target_verse_text, language):
        """Return the substring of target_verse_text that translates
        note['source_term'], or '' when the LLM suggestion is not found.

        NOTE(review): a fresh OpenAI session is created on every call;
        caching the model object would avoid repeated setup.
        """
        # To use the OPENAI_API_KEY environment variable, omit api_key argument
        openai_lm = models.OpenAI("gpt-4", api_key=self.api_key)

        source_term = note['source_term'].strip()
        # Can only choose between English, Hebrew, and Greek
        source_lang = Lang(source_term, options=['en', 'he', 'el']).lang_name
        print(f'Source term: {source_term}, \nSource language: {source_lang}')

        with system():
            openai_lm += f'You are an expert at translating between {source_lang} and {language}.'
            openai_lm += f'When asked to translate, provide only the {language} translation of the {source_lang} term found in the {language} verse.'
            # BUG FIX: corrected the misspelling "extrememly" in the prompt.
            openai_lm += 'Nothing else. Do not provide any additional information or context. Be extremely succinct in your translations.'
            openai_lm += f'You must choose only an N-gram which already exists in the {language} verse.'

        with user():
            openai_lm += f'What is a good translation of {source_term} from {source_lang} into {language} and is also found within this verse: {target_verse_text}?'

        with assistant():
            openai_lm += gen('openai_translation', stop='.')
        print(f'OpenAI translation: {openai_lm["openai_translation"]}')

        llm_output = openai_lm["openai_translation"].strip()
        print(f'LLM output: {llm_output}')
        if llm_output in target_verse_text:
            print(f'LLM output found in verse: {llm_output}')
            return llm_output
        else:
            print(f'LLM output not found in verse: {llm_output}')
            return ''

    def verse_notes(self, verse_ref):
        """Return the target verse text plus, for each translation note on the
        verse, the best-matching target-language n-gram and its position.

        Result keys: 'target_verse_text', 'verse_ref', 'line_number',
        'ngrams' (list of dicts with 'ngram', 'start_pos', 'end_pos',
        'source_term', 'trans_note').
        """
        v_ref = SR(verse_ref)

        # Collect the notes whose verse reference resolves to the same
        # corpus line as the requested verse.  Matching by line number alone
        # assumes every note carries a resolvable verse reference.
        translation_notes_in_verse = []
        translation_notes = self.load_translation_notes(v_ref.structured_ref['bookCode'])
        for note in translation_notes:
            note_v_ref = SR(note['verse'])
            if note_v_ref.line_number == v_ref.line_number:
                translation_notes_in_verse.append(note)
        print(f'Source terms for all translation notes in verse: {[note["source_term"] for note in translation_notes_in_verse]}')

        # Get the target language form of the verse
        target_verse_text = self.target_bible_text.splitlines()[v_ref.line_number - 1]

        ngrams = []
        for note in translation_notes_in_verse:
            source_term = note['source_term']
            trans_note = note['translation_note']
            ngram = self.best_ngram_for_note(note, target_verse_text, self.lang_name)
            # find() yields -1 when the LLM produced no usable n-gram
            start_pos = target_verse_text.lower().find(ngram.lower())
            end_pos = start_pos + len(ngram)
            ngrams.append(
                {
                    'ngram': ngram,
                    'start_pos': start_pos,
                    'end_pos': end_pos,
                    'source_term': source_term,
                    'trans_note': trans_note
                })

        print('Verse notes to be returned:')
        print(json.dumps(ngrams, indent=4))
        return {
            'target_verse_text': target_verse_text,
            'verse_ref': v_ref.structured_ref,
            'line_number': v_ref.line_number,
            'ngrams': ngrams
        }
211
+
212
+
__pycache__/LanguageTool.cpython-312.pyc ADDED
Binary file (1.01 kB). View file
 
__pycache__/ScriptureReference.cpython-312.pyc ADDED
Binary file (13.9 kB). View file
 
__pycache__/ScriptureReference.cpython-39.pyc ADDED
Binary file (11.1 kB). View file
 
__pycache__/TrainingData.cpython-312.pyc ADDED
Binary file (7.95 kB). View file
 
__pycache__/TrainingData.cpython-39.pyc ADDED
Binary file (4.54 kB). View file
 
__pycache__/TranslationNoteFinder.cpython-312.pyc ADDED
Binary file (12.8 kB). View file
 
__pycache__/TranslationNoteFinder.cpython-39.pyc ADDED
Binary file (7.28 kB). View file
 
__pycache__/TranslationNoteFinderLLMOnly.cpython-312.pyc ADDED
Binary file (8.84 kB). View file
 
__pycache__/nltk.cpython-312.pyc ADDED
Binary file (607 Bytes). View file
 
__pycache__/romanize.cpython-312.pyc ADDED
Binary file (1.89 kB). View file
 
__pycache__/romanize.cpython-39.pyc ADDED
Binary file (1.11 kB). View file
 
__pycache__/tfidf.cpython-312.pyc ADDED
Binary file (4.14 kB). View file
 
__pycache__/tfidf.cpython-39.pyc ADDED
Binary file (2.87 kB). View file
 
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Language Code,Verse References (comma-separated),Results,flag,username,timestamp
2
+ hi,eph1:3,,,,2024-03-02 19:58:40.869428
highlightNote.css ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
/* Translation-note text: rendered to look clickable (pointer + underline). */
.note {
    cursor: pointer;
    text-decoration: underline;
}
/* Applied to the matched n-gram span inside the verse text on hover. */
.highlight {
    background-color: yellow;
}
highlightNote.js ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Highlight the [startPos, endPos) slice of verseText inside the
 * '.verse-text' element, provided the note element with id noteId exists.
 * Rebuilds the element's innerHTML with the slice wrapped in a
 * <span class="highlight">.
 */
function highlightNote(noteId, verseText, startPos, endPos) {
    if (!document.getElementById(noteId)) {
        return;
    }
    const verseTextElement = document.querySelector('.verse-text');
    if (!verseTextElement) {
        return;
    }
    const highlighted = `<span class="highlight">${verseText.substring(startPos, endPos)}</span>`;
    verseTextElement.innerHTML =
        verseText.substring(0, startPos) + highlighted + verseText.substring(endPos);
}
main.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from TranslationNoteFinder import TranslationNoteFinder


def main():
    """Smoke-test TranslationNoteFinder against a Hindi Bible corpus."""
    api_key = os.getenv('OPENAI_API_KEY')
    tnf = TranslationNoteFinder('translation_notes.json', 'bibles/hin-hin2017.txt', api_key=api_key, lang_code='hi')

    # verse that includes dikaioo
    # BUG FIX: this verse was previously looked up and printed twice.
    print(tnf.verse_notes('rom3:22'))

    # Other interesting test verses:
    # verse that includes en Christo
    # print(tnf.verse_notes('eph1:1'))
    # verse with no translation note matches
    # print(tnf.verse_notes('jn1:8'))
    # verse that includes logos
    # print(tnf.verse_notes('jn1:1'))
    # verse that includes agape
    # print(tnf.verse_notes('1cor13:13'))
    # verse that includes koinonia
    # print(tnf.verse_notes('1jn1:3'))


# Expect ~1 min startup runtime, ~5 sec per verse
if __name__ == '__main__':
    main()
main_gradio.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio import HighlightedText
3
+ from TranslationNoteFinder import TranslationNoteFinder
4
+
5
+ # Updated dictionary mapping language codes to URLs of Bible text files
6
+ bible_urls = {
7
+ 'en': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/eng-kjvcpb.txt',
8
+ 'hi': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/hin-hin2017.txt',
9
+ 'es': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/spa-spabes.txt',
10
+ 'ru': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/rus-russyn.txt'
11
+ }
12
+
13
+ tnf = None
14
+
15
+ def load_resources(api_key, lang_code):
16
+ global tnf
17
+ bible_text_url = bible_urls.get(lang_code)
18
+ # 'translation_notes.json'
19
+ # 'translation_notes/tn_ROM.tsv'
20
+ tnf = TranslationNoteFinder('translation_notes/tn_ROM.tsv', bible_text_url, api_key, lang_code=lang_code)
21
+ return "Language resources loaded successfully.", "", "", ""
22
+
23
def find_notes(verse_ref):
    """Look up translation notes for verse_ref and return display strings:
    (formatted reference, HTML-highlighted verse text, line number,
    HTML-formatted notes).  Requires load_resources() to have run first."""
    global tnf
    if tnf is None:
        return "Please load language resources first.", "", "", ""

    results = tnf.verse_notes(verse_ref)
    verse_ref_formatted = f"{results['verse_ref']['bookCode']} {results['verse_ref']['startChapter']}:{results['verse_ref']['startVerse']}"

    target_text = results['target_verse_text']
    colors = ["yellow", "lightgreen", "lightblue", "pink", "lightgrey", "orange", "purple", "cyan", "magenta", "lime", "teal",
              "maroon", "navy", "olive", "silver", "gold", "coral", "turquoise", "indigo", "violet"]
    ngrams_highlights = {}
    for i, ngram in enumerate(reversed(results['ngrams'])):  # Reverse to not mess up the indices
        start, end = ngram['start_pos'], ngram['end_pos']
        # BUG FIX: wrap the color index so more than len(colors) notes cannot
        # raise IndexError.
        color = colors[i % len(colors)]
        highlight = f"<mark style='background-color:{color};'>{target_text[start:end]}</mark>"
        target_text = target_text[:start] + highlight + target_text[end:]
        # BUG FIX: verse_notes() produces the key 'source_term'; the previous
        # 'greek_term' key does not exist and raised KeyError.
        ngrams_highlights[ngram['source_term']] = color

    line_number = str(results['line_number'])
    # Apply the same highlight colors to the source terms in the notes list
    ngrams_formatted = ""
    for ngram in results['ngrams']:
        term_highlight = f"<span style='background-color:{ngrams_highlights[ngram['source_term']]}'>{ngram['source_term']}</span>"
        ngrams_formatted += f"{term_highlight}: {ngram['trans_note']}<br>"

    # Since HTML component is used, all outputs must be strings
    return verse_ref_formatted, target_text, line_number, ngrams_formatted
51
+
52
+
53
# Adjusting Gradio interface for HTML output
with gr.Blocks() as app:
    # OpenAI key; masked in the UI
    api_key_input = gr.Textbox(label="API Key", type='password')
    with gr.Row():
        lang_dropdown = gr.Dropdown(choices=list(bible_urls.keys()), label="Language Code")
        load_btn = gr.Button("Load Language")
    verse_input = gr.Textbox(label="Verse Reference")
    translate_btn = gr.Button("Translate")

    verse_ref_output = gr.Textbox(label="Verse Reference")
    target_text_output = gr.HTML(label="Target Verse Text")  # Changed to HTML component
    # target_text_output = gr.HighlightedText(label="Target Verse Text")
    line_number_output = gr.Textbox(label="Line Number")
    ngrams_output = gr.HTML(label="N-grams")  # Changed to HTML for formatted output

    # Both buttons reuse the same four output widgets (status or results).
    load_btn.click(fn=load_resources, inputs=[api_key_input, lang_dropdown], outputs=[verse_ref_output, target_text_output, line_number_output, ngrams_output])
    translate_btn.click(fn=find_notes, inputs=verse_input, outputs=[verse_ref_output, target_text_output, line_number_output, ngrams_output])


app.launch()
main_gradio_js.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ # from TranslationNoteFinder import TranslationNoteFinder
3
+ from TranslationNoteFinderLLMOnly import TranslationNoteFinder
4
+
5
# Dictionary mapping ISO 639-1 language codes to raw URLs of eBible corpus
# text files (one verse per line).
bible_urls = {
    'en': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/eng-engkjvcpb.txt',
    'hi': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/hin-hin2017.txt',
    'es': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/spa-spabes.txt',
    'ru': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/rus-russyn.txt'
}

# Shared TranslationNoteFinder instance; populated by load_resources().
tnf = None
14
+
15
def load_resources(api_key, lang_code):
    """Build the shared TranslationNoteFinder for lang_code and report status."""
    global tnf
    # Note sources tried previously:
    # 'translation_notes.json'
    # 'translation_notes/tn_ROM.tsv'
    corpus_url = bible_urls.get(lang_code)
    tnf = TranslationNoteFinder(corpus_url, api_key, lang_code=lang_code)
    return "Language resources loaded successfully.", "", "", ""
22
+
23
+
24
with gr.Blocks(css="highlightNote.css") as app:

    def find_notes(verse_ref):
        """Return (formatted reference, verse text, line number, notes HTML)
        for verse_ref using the globally loaded TranslationNoteFinder."""
        global tnf
        if tnf is None:
            return "Please load language resources first.", "", "", ""

        # Clear the output fields by returning empty strings
        # yield "", "", "", ""

        results = tnf.verse_notes(verse_ref)
        verse_ref_formatted = f"{results['verse_ref']['bookCode']} {results['verse_ref']['startChapter']}:{results['verse_ref']['startVerse']}"

        target_text = results['target_verse_text']
        ngrams_formatted = ""

        line_number = str(results['line_number'])
        # Build one hoverable <span> per note: the inline JS highlights the
        # matching n-gram inside the '.verse-text' element on mouseover and
        # restores the plain verse on mouseout.
        # NOTE(review): target_text is interpolated into HTML/JS without
        # escaping -- quotes or backticks in a verse would break the markup.
        for i, ngram in enumerate(results['ngrams']):
            note_id = f"note_{i}"
            ngram_text = f"""<span id='{note_id}'
                class='note'
                data-verse-text='{target_text}'
                data-start-pos='{ngram['start_pos']}'
                data-end-pos='{ngram['end_pos']}'
                onmouseover="
                    const noteElement = document.getElementById('{note_id}');
                    if (noteElement) {{
                        const highlightedText = `<span class='highlight'>{target_text[ngram['start_pos']:ngram['end_pos']]}</span>`;
                        const verseTextElement = document.querySelector('.verse-text');
                        if (verseTextElement) {{
                            verseTextElement.innerHTML = `{target_text[:ngram['start_pos']]}`
                                + highlightedText
                                + `{target_text[ngram['end_pos']:]}`;
                        }}
                    }}
                "
                onmouseout="document.querySelector('.verse-text').innerHTML = '{target_text}'"
                >
                {ngram['source_term']}: {ngram['trans_note']}
                </span><br><br>"""
            ngrams_formatted += ngram_text

        # Since HTML component is used, all outputs must be strings
        return verse_ref_formatted, target_text, line_number, ngrams_formatted


    # OpenAI key; masked in the UI
    api_key_input = gr.Textbox(label="API Key", type='password')
    with gr.Row():
        lang_dropdown = gr.Dropdown(choices=list(bible_urls.keys()), label="Language Code")
        load_btn = gr.Button("Load Language")
    verse_input = gr.Textbox(label="Verse Reference")
    translate_btn = gr.Button("Translate")

    verse_ref_output = gr.Textbox(label="Verse Reference")
    # elem_classes lets highlightNote.css and the inline JS find this element
    target_text_output = gr.HTML(label="Target Verse Text", elem_classes=["verse-text"])
    line_number_output = gr.Textbox(label="Line Number")
    notes_output = gr.HTML(label="N-grams")  # needs elem_classes?

    load_btn.click(
        fn=load_resources,
        inputs=[
            api_key_input,
            lang_dropdown],
        outputs=[
            verse_ref_output,
            target_text_output,
            line_number_output,
            notes_output
        ]
    )
    translate_btn.click(
        fn=find_notes,
        inputs=verse_input,
        outputs=[
            verse_ref_output,
            target_text_output,
            line_number_output,
            notes_output
        ]
    )


app.launch(share=True)
romanize.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ import requests
4
+ import subprocess
5
+
6
# Download and unpack the uroman release once at import time; skip the
# network round-trip when the release directory already exists from a
# previous run (the original code re-downloaded on every import).
url = 'https://github.com/isi-nlp/uroman/archive/refs/tags/v1.2.8.zip'
zip_filename = 'uroman.zip'
if not os.path.isdir('uroman-1.2.8'):
    response = requests.get(url)
    response.raise_for_status()  # fail loudly instead of unzipping an error page
    with open(zip_filename, 'wb') as zip_file:
        zip_file.write(response.content)

    # Unzipping the downloaded file
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall()
16
+
17
# Function to call the unzipped uroman code
def uroman(input_string, language=None, chart=False):
    """Romanize input_string with the uroman Perl script.

    language -- optional language code passed to uroman's -l flag
    chart    -- when True, pass --chart to emit uroman's romanization chart
    Returns the romanized text, or None when the subprocess fails (the error
    is printed to stdout).
    """
    # BUG FIX: resolve the script relative to the extracted release directory
    # instead of a hard-coded absolute path on one developer's machine.
    script_path = os.path.join('uroman-1.2.8', 'bin', 'uroman.pl')
    command = ["perl", script_path]

    # Add language flag if specified
    if language:
        command.extend(["-l", language])

    # Add chart flag if specified
    if chart:
        command.append("--chart")

    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate(input=input_string.encode())

    if process.returncode != 0:
        # There was an error
        print(f"Error code {process.returncode}: {stderr.decode()}")
        return None

    # Return the output as a string
    return stdout.decode()

# Example usage
# print(uroman("わたしはにほんじんです"))
43
+
tests/english_note_to_hindi.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import guidance
from guidance import models, select, gen, system, user, assistant, one_or_more
from openai import OpenAI


# Local Neural Chat model; one layer offloaded to GPU.
model_path = 'models/neural-chat-7b-v3-3.Q2_K.gguf'
llm = models.LlamaCpp(model_path, n_gpu_layers=1)
# llm = models.OpenAI("gpt-4")


# Sample data: Eph 1:3 in English/Hindi plus one Greek term and its note.
verse = 'Blessed be the God and Father of our Lord Jesus Christ, who has blessed us in Christ with every spiritual blessing in the heavenly places,'
hin_verse = 'हमारे प्रभु यीशु मसीह का पिता और परमेश्वर धन्य हो। उसने हमें मसीह के रूप में स्वर्ग के क्षेत्र में हर तरह के आशीर्वाद दिये हैं।'
greek_term = 'ἐν Χριστῷ'
translation_note = 'illustrates the intimate union between believers and Christ. The preposition ἐν (in) goes beyond physical location, indicating a profound spiritual reality. Translators need to convey the concept of being "in Christ" as being part of a new creation, identity, and living within the sphere of Christ\'s influence and lordship.'
note = 'hey'

# OpenAI chat-style implementation (kept for reference):

# with system():
#     lm = llm + "You are an expert at translating into Hindi."

# with user():
#     lm += "Translate the following translation note into Hindi: \n" + translation_note

# with assistant():
#     lm += gen(max_tokens=1000)

# print(lm)

# Neural Chat implementation: single prompt, capture output as 'hin_note'.

prompt = f"Translate the following into Hindi:\n {translation_note}"
lm = llm + prompt + gen('hin_note', max_tokens=400)
print(lm)
print(f"Translation note: {lm['hin_note']}")
tests/find_greek_in_hindi.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import guidance
from guidance import models, gen, system, user, assistant, instruction
from openai import OpenAI
import string
from itertools import islice
from romanize import uroman
from tfidf import analyze_verse_in_corpus



llm = models.OpenAI("gpt-3.5-turbo-instruct")

# Sample data: Eph 1:3 in English/Hindi, a Greek term, and its note.
eng_verse = 'Blessed be the God and Father of our Lord Jesus Christ, who has blessed us in Christ with every spiritual blessing in the heavenly places,'
hin_verse = 'हमारे प्रभु यीशु मसीह का पिता और परमेश्वर धन्य हो। उसने हमें मसीह के रूप में स्वर्ग के क्षेत्र में हर तरह के आशीर्वाद दिये हैं।'
greek_term = 'ἐν Χριστῷ'
translation_note = 'illustrates the intimate union between believers and Christ. The preposition ἐν (in) goes beyond physical location, indicating a profound spiritual reality. Translators need to convey the concept of being "in Christ" as being part of a new creation, identity, and living within the sphere of Christ\'s influence and lordship.'

from guidance import models, select

model_path = 'models/neural-chat-7b-v3-3.Q2_K.gguf'
# llm = models.LlamaCpp(model_path, n_gpu_layers=1)

# Sanity check that guidance's instruction/select machinery works at all.
lm = llm
with instruction():
    lm += "What is a popular flavor?"
    lm += select(['chocolate', 'vanilla', 'strawberry'], name='flavor')
print(lm['flavor'])
# print(uroman(greek_term))

language = 'Greek'
romanize = False

# LLM-based translation attempt (kept for reference):
# lm = llm
# with instruction():
#     lm += f'The best translation of {uroman(greek_term)} from Romanized Greek into {language} is '
#     # lm += select(['fat albert', 'in heavenly places', 'not found'], name='translation')
#     # Generate only english letters from lm
#     lm += gen('translation', stop='.')
# translation = lm['translation']

# Use the raw Greek term directly instead of an LLM translation.
translation = greek_term

if romanize:
    translation = uroman(translation)

# Remove punctuation
translation = translation.translate(str.maketrans('', '', string.punctuation)).lower()
print(translation)

# Pick the corpus file matching the chosen language.
if language == 'English':
    file_path = 'bibles/eng-engkjvcpb.txt'
if language == 'Hindi':
    file_path = 'bibles/hin-hin2017.txt'
if language == 'Greek':
    file_path = 'bibles/grc-grctcgnt.txt'

interested_line = 29276  # Example line (verse) number
verse_scores = analyze_verse_in_corpus(file_path, interested_line, romanize=romanize)

# verse_scores is a dictionary with n-grams as keys and their respective
# TF-IDF scores as values in descending order.
# Print n-grams and respective scores in the verse in descending score order
for ngram, score in verse_scores.items():
    print(f"{ngram}: {score:.4f}")

# If any of the n-grams contains 'translation', print the first (highest-
# scoring, since verse_scores is descending) matching n-gram and its score.
for ngram, score in verse_scores.items():
    if translation in ngram:
        print(f"The n-gram '{ngram}' has the highest score of {score:.4f} in the verse.")
        break
tests/guidance-ai-readme.md ADDED
@@ -0,0 +1,731 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="right"><a href="https://guidance.readthedocs.org"><img src="https://readthedocs.org/projects/guidance/badge/?version=latest&style=flat" /></a></div>
2
+ <div align="center"><picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset="docs/figures/guidance_logo_blue_dark.svg">
4
+ <img alt="guidance" src="docs/figures/guidance_logo_blue.svg" width=300">
5
+ </picture></div>
6
+ <br/>
7
+
8
+
9
+ > *Note that v0.1 is a dramatically new version developed while releases had to be paused over the summer. If you are looking for the old version based on handlebars, you can use v0.0.64, but you should instead try porting over to the much better new version :)*
10
+
11
+ **`guidance`** is a programming paradigm that offers superior control and efficiency compared to conventional prompting and chaining. It allows users to constrain generation (e.g. with regex and CFGs) as well as to interleave control (conditional, loops) and generation seamlessly. Here are some important features:
12
+
13
+ 1. **Pure, beautiful python** with additional LM functionality. E.g. here is [basic generation](#basic-generation):
14
+ ```python
15
+ from guidance import models, gen
16
+
17
+ # load a model (could be Transformers, LlamaCpp, VertexAI, OpenAI...)
18
+ llama2 = models.LlamaCpp(path)
19
+
20
+ # append text or generations to the model
21
+ llama2 + f'Do you want a joke or a poem? ' + gen(stop='.')
22
+ ```
23
+ <img alt="Do you want a joke or a poem? I'll give you a poem" src="docs/figures/simple_gen_llama2_7b.png" width="354">
24
+
25
+ 2. [**Constrained generation**](#constrained-generation) with [selects](#select-basic), [regular expressions](#regular-expressions), and [context-free grammars](#context-free-grammars).
26
+ ```python
27
+ from guidance import select
28
+
29
+ # a simple select between two options
30
+ llama2 + f'Do you want a joke or a poem? A ' + select(['joke', 'poem'])
31
+ ```
32
+ <img alt="Do you want a joke or a poem? A poem" src="docs/figures/simple_select_llama2_7b.png" width="277">
33
+
34
+ 3. **Rich templates with f-strings**:
35
+ ```python
36
+ llama2 + f'''\
37
+ Do you want a joke or a poem? A {select(['joke', 'poem'])}.
38
+ Okay, here is a one-liner: "{gen(stop='"')}"
39
+ '''
40
+ ```
41
+ <img width="358" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/486ca968-89b1-4c02-b914-3b9714fe5890"><br>
42
+
43
+ 4. [**Stateful control + generation**](#stateful-control--generation) makes it easy to interleave prompting / logic / generation, no need for intermediate parsers:
44
+ ```python
45
+ # capture our selection under the name 'answer'
46
+ lm = llama2 + f"Do you want a joke or a poem? A {select(['joke', 'poem'], name='answer')}.\n"
47
+
48
+ # make a choice based on the model's previous selection
49
+ if lm["answer"] == "joke":
50
+ lm += f"Here is a one-line joke about cats: " + gen('output', stop='\n')
51
+ else:
52
+ lm += f"Here is a one-line poem about dogs: " + gen('output', stop='\n')
53
+ ```
54
+ <img width="393" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/66d47ce7-1d5a-4dbd-b676-66b9c1094184"><br>
55
+
56
+
57
+ 5. **Abstract chat interface** that uses the correct special tokens for any chat model:
58
+ ```python
59
+ from guidance import user, assistant
60
+
61
+ # load a chat model
62
+ chat_lm = models.LlamaCppChat(path)
63
+
64
+ # wrap with chat block contexts
65
+ with user():
66
+ lm = chat_lm + 'Do you want a joke or a poem?'
67
+
68
+ with assistant():
69
+ lm += f"A {select(['joke', 'poem'])}."
70
+ ```
71
+ <img width="331" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/89c3e0e2-ed0a-4715-8366-2efca74b7b71"><br>
72
+
73
+ 6. **Easy to write reusable components**
74
+ ```python
75
+ import guidance
76
+
77
+ @guidance
78
+ def one_line_thing(lm, thing, topic):
79
+ lm += f'Here is a one-line {thing} about {topic}: ' + gen(stop='\n')
80
+ return lm # return our updated model
81
+
82
+ # pick either a joke or a poem
83
+ lm = llama2 + f"Do you want a joke or a poem? A {select(['joke', 'poem'], name='thing')}.\n"
84
+
85
+ # call our guidance function
86
+ lm += one_line_thing(lm['thing'], 'cats')
87
+ ```
88
+ <img width="386" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/60071680-8bbb-4fa5-a298-613d4fd55fa7"><br>
89
+
90
+ 7. **A library of pre-built components**, e.g. substring:
91
+ ```python
92
+ from guidance import substring
93
+
94
+ # define a set of possible statements
95
+ text = 'guidance is awesome. guidance is so great. guidance is the best thing since sliced bread.'
96
+
97
+ # force the model to make an exact quote
98
+ llama2 + f'Here is a true statement about the guidance library: "{substring(text)}"'
99
+ ```
100
+ <img width="589" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/9a7178ad-ed73-4e6b-b418-f9d2a3a76b88"><br>
101
+
102
+ 8. [**Easy tool use**](#automatic-interleaving-of-control-and-generation-tool-use), where the model stops generation when a tool is called, calls the tool, then resumes generation. For example, here is a simple version of a calculator, via four separate 'tools':
103
+ ```python
104
+ @guidance
105
+ def add(lm, input1, input2):
106
+ lm += f' = {int(input1) + int(input2)}'
107
+ return lm
108
+ @guidance
109
+ def subtract(lm, input1, input2):
110
+ lm += f' = {int(input1) - int(input2)}'
111
+ return lm
112
+ @guidance
113
+ def multiply(lm, input1, input2):
114
+ lm += f' = {float(input1) * float(input2)}'
115
+ return lm
116
+ @guidance
117
+ def divide(lm, input1, input2):
118
+ lm += f' = {float(input1) / float(input2)}'
119
+ return lm
120
+ ```
121
+ Now we call `gen` with these tools as options. Notice how generation is stopped and restarted automatically:
122
+ ```python
123
+ lm = llama2 + '''\
124
+ 1 + 1 = add(1, 1) = 2
125
+ 2 - 3 = subtract(2, 3) = -1
126
+ '''
127
+ lm + gen(max_tokens=15, tools=[add, subtract, multiply, divide])
128
+ ```
129
+ <img width="201" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/646e1a7d-0206-419b-8206-1d835c3a0e0a"><br>
130
+
131
+ 9. **Speed**: In contrast to chaining, `guidance` programs are the equivalent of a single LLM call. More so, whatever non-generated text that gets appended is batched, so that `guidance` programs are **faster** than having the LM generate intermediate text when you have a set structure.
132
+
133
+ 10. **Token healing**: Users deal with text (or bytes) rather than tokens, and thus don't have to worry about [perverse token boundaries issues](https://towardsdatascience.com/the-art-of-prompt-design-prompt-boundaries-and-token-healing-3b2448b0be38) such as 'prompt ending in whitespace'.
134
+
135
+ 11. **Streaming support**, also integrated with jupyter notebooks:
136
+ ```python
137
+ lm = llama2 + 'Here is a cute 5-line poem about cats and dogs:\n'
138
+ for i in range(5):
139
+ lm += f"LINE {i+1}: " + gen(temperature=0.8, suffix="\n")
140
+ ```
141
+ <img src="docs/figures/simple_streaming_example.gif" width="337">
142
+
143
+ 13. **High compatibility:** works with Transformers, llama.cpp, VertexAI, OpenAI. Users can write one guidance program and execute it on many backends. (note that the most powerful control features require endpoint integration, and for now work best with Transformers and llama.cpp).
144
+ ```python
145
+ gpt = models.OpenAI("gpt-3.5-turbo")
146
+
147
+ with user():
148
+ lm = gpt + "What is the capital of France?"
149
+
150
+ with assistant():
151
+ lm += gen("capital")
152
+
153
+ with user():
154
+ lm += "What is one short surprising fact about it?"
155
+
156
+ with assistant():
157
+ lm += gen("fact")
158
+ ```
159
+ <img width="645" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/f31ed7b8-1868-44d2-b14c-4842b0a40e5c"><br>
160
+
161
+ 14. **Multi-modal support.**
162
+ ```python
163
+ from guidance import image
164
+
165
+ gemini = models.VertexAI("gemini-pro-vision")
166
+
167
+ with user():
168
+ lm = gemini + "What is this a picture of?" + image("longs_peak.jpg")
169
+
170
+ with assistant():
171
+ lm += gen("answer")
172
+ ```
173
+ <img width="673" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/6450d05d-52e9-4ef5-b280-8b57e733d46d">
174
+
175
+
176
+
177
+ ## Table of Contents
178
+ * [Install](#install)
179
+ * [Loading models](#loading-models)
180
+ * [llama.cpp](#llamacpp)
181
+ * [transformers](#transformers)
182
+ * [Vertex](#vertex-ai)
183
+ * [OpenAI](#openai)
184
+ * [Example notebooks](#example-notebooks)
185
+ * [Basic generation](#basic-generation)
186
+ * [Constrained Generation](#constrained-generation)
187
+ * [Select (basic)](#select-basic)
188
+ * [Regular expressions](#regular-expressions)
189
+ * [Regex to constrain generation](#regex-to-constrain-generation)
190
+ * [Regex as stopping criterion](#regex-as-stopping-criterion)
191
+ * [Context-free grammars](#context-free-grammars)
192
+ * [Stateful control + generation](#stateful-control--generation)
193
+ * [State in immutable objects](#state-in-immutable-objects)
194
+ * [Stateful guidance functions](#stateful-guidance-functions)
195
+ * [Example: ReAct](#example-react)
196
+ * [Example: Changing intermediate step of a Chat session](#example-changing-intermediate-step-of-a-chat-session)
197
+ * [Automatic interleaving of control and generation: tool use](#automatic-interleaving-of-control-and-generation-tool-use)
198
+ * [Gsm8k example](#gsm8k-example)
199
+ * [Automatic call grammar for @guidance functions](#automatic-call-grammar-for-guidance-functions)
200
+ * [Text, not tokens](#text-not-tokens)
201
+ * [Fast](#fast)
202
+ * [Integrated stateful control is faster](#integrated-stateful-control-is-faster)
203
+ * [Guidance acceleration](#guidance-acceleration)
204
+
205
+ ## Install
206
+ ```bash
207
+ pip install guidance
208
+ ```
209
+ ## Loading models
210
+ ### llama.cpp
211
+ Install the python bindings:
212
+ ```bash
213
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
214
+ ```
215
+ Loading the model:
216
+ ```python
217
+ from guidance import models
218
+ lm = models.LlamaCpp(path_to_model, n_gpu_layers=-1)
219
+ ```
220
+
221
+ ### Transformers
222
+ Install transformers:
223
+ ```python
224
+ from guidance import models
225
+ lm = models.Transformers(model_name_or_path)
226
+ ```
227
+
228
+ ### Vertex AI
229
+ Remote endpoints that don't have explicit guidance integration are run "optimistically". This means that all the text that can be forced is given to the model as a prompt (or chat context) and then the model is run in streaming mode without hard constraints (since the remote API doesn't support them). If the model ever violates the constraints then the model stream is stopped and we optionally try it again at that point. This means that all the API-supported controls work as expected, and more complex controls/parsing that is not supported by the API work if the model stays consistent with the program.
230
+ ```python
231
+ palm2 = models.VertexAI("text-bison@001")
232
+
233
+ with instruction():
234
+ lm = palm2 + "What is one funny fact about Seattle?"
235
+
236
+ lm + gen("fact", max_tokens=100)
237
+ ```
238
+ <img width="635" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/693ae08f-68f7-4368-bd25-19afc9bfc0a5"><br>
239
+
240
+ ### OpenAI
241
+ OpenAI endpoints don't have direct support for guidance grammars, but through optimistic running we can still control them in ways that match the model type:
242
+
243
+ *Legacy completion models:*
244
+ ```python
245
+ curie = models.OpenAI("text-curie-001")
246
+
247
+ curie + "The smallest cats are" + gen(stop=".")
248
+ ```
249
+ <img width="263" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/116a906c-ea77-4a13-a83a-682029d5e5c8"><br>
250
+
251
+ *Instruct tuned models:*
252
+ ```python
253
+ gpt_instruct = models.OpenAI("gpt-3.5-turbo-instruct")
254
+
255
+ with instruction():
256
+ lm = gpt_instruct + "What are the smallest cats?"
257
+
258
+ lm += gen(stop=".")
259
+ ```
260
+ <img width="574" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/56a53ce1-89f5-4e9d-bdb8-86fb3eebf309"><br>
261
+
262
+ *Chat models:*
263
+ ```python
264
+ gpt = models.OpenAI("gpt-3.5-turbo")
265
+
266
+ with system():
267
+ lm = gpt + "You are a cat expert."
268
+
269
+ with user():
270
+ lm += "What are the smallest cats?"
271
+
272
+ with assistant():
273
+ lm += gen("answer", stop=".")
274
+ ```
275
+ <img width="367" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/46102f0f-37dc-4bb1-99b7-e5895bdee772"><br>
276
+
277
+
278
+
279
+ ## Example notebooks
280
+ We are working on updating our example notebooks. The following ones have been updated:
281
+ - [Basic tutorial](notebooks/tutorials/intro_to_guidance.ipynb)
282
+ - [Chatbot with search](notebooks/chat_with_search.ipynb)
283
+
284
+ More coming soon
285
+
286
+ ## Basic generation
287
+ An `lm` object is immutable, so you change it by creating new copies of it. By default, when you append things to `lm`, it creates a copy, e.g.:
288
+ ```python
289
+ from guidance import models, gen, select
290
+ llama2 = models.LlamaCpp(model)
291
+
292
+ # llama2 is not modified, `lm` is a copy of `llama2` with 'This is a prompt' appended to its state
293
+ lm = llama2 + 'This is a prompt'
294
+ ```
295
+ <img width="124" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/c1e96b2b-8f4a-44ee-a8f4-a694a8d7784b"><br>
296
+
297
+ You can append _generation_ calls to model objects, e.g.
298
+ ```python
299
+ lm = llama2 + 'This is a prompt' + gen(max_tokens=10)
300
+ ```
301
+ <img width="267" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/d2e5ed34-ba9d-4bdd-872d-2b76f8e3cf85"><br>
302
+
303
+ You can also interleave generation calls with plain text, or control flows:
304
+ ```python
305
+ # Note how we set stop tokens
306
+ lm = llama2 + 'I like to play with my ' + gen(stop=' ') + ' in' + gen(stop=['\n', '.', '!'])
307
+ ```
308
+ <img width="279" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/2d47fd65-1982-4dd8-9ba9-a01e62fba455"><br>
309
+
310
+ ## Constrained Generation
311
+ ### Select (basic)
312
+ `select` constrains generation to a set of options:
313
+ ```python
314
+ lm = llama2 + 'I like the color ' + select(['red', 'blue', 'green'])
315
+ ```
316
+ <img width="137" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/f0b97629-78a9-439d-90b2-06af31fdc40e"><br>
317
+
318
+ ### Regular expressions
319
+ `gen` has optional arguments `regex` and `stop_regex`, which allow generation (and stopping, respectively) to be controlled by a regex.
320
+
321
+ #### Regex to constrain generation
322
+ Unconstrained:
323
+
324
+ ```python
325
+ lm = llama2 + 'Question: Luke has ten balls. He gives three to his brother.\n'
326
+ lm += 'How many balls does he have left?\n'
327
+ lm += 'Answer: ' + gen(stop='\n')
328
+ ```
329
+ <img width="405" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/55fb66ea-a717-417a-8a70-14c46eba4c66"><br>
330
+
331
+ Constrained by regex:
332
+
333
+ ```python
334
+ lm = llama2 + 'Question: Luke has ten balls. He gives three to his brother.\n'
335
+ lm += 'How many balls does he have left?\n'
336
+ lm += 'Answer: ' + gen(regex='\d+')
337
+ ```
338
+ <img width="404" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/b45a5a79-55e0-4c15-884a-fba830c0a153"><br>
339
+
340
+
341
+ #### Regex as stopping criterion
342
+ Unconstrained:
343
+ ```python
344
+ lm = llama2 + '19, 18,' + gen(max_tokens=50)
345
+ ```
346
+ <img width="359" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/5dd13454-cc42-4e27-a52c-19a31237891c"><br>
347
+
348
+ Stop with traditional stop text, whenever the model generates the number 7:
349
+ ```python
350
+ lm = llama2 + '19, 18,' + gen(max_tokens=50, stop='7')
351
+ ```
352
+ <img width="73" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/fc96d7c3-381d-4766-8bee-c930669f518a"><br>
353
+
354
+
355
+ Stop whenever the model generates the character `7` without any numbers around it:
356
+ ```python
357
+ lm = llama2 + '19, 18,' + gen(max_tokens=50, stop_regex='[^\d]7[^\d]')
358
+ ```
359
+ <img width="293" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/a657e566-b1a4-447a-82a5-b88977b5fedf"><br>
360
+
361
+
362
+ ### Context-free grammars
363
+ We expose a variety of operators that make it easy to define CFGs, which in turn can be used to constrain generation.
364
+ For example, we can use the `select` operator (it accepts CFGs as options), `zero_or_more` and `one_or_more` to define a grammar for mathematical expressions:
365
+ ```python
366
+ import guidance
367
+ from guidance import one_or_more, select, zero_or_more
368
+ # stateless=True indicates this function does not depend on LLM generations
369
+ @guidance(stateless=True)
370
+ def number(lm):
371
+ n = one_or_more(select(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']))
372
+ # Allow for negative or positive numbers
373
+ return lm + select(['-' + n, n])
374
+
375
+ @guidance(stateless=True)
376
+ def operator(lm):
377
+ return lm + select(['+' , '*', '**', '/', '-'])
378
+
379
+ @guidance(stateless=True)
380
+ def expression(lm):
381
+ # Either
382
+ # 1. A number (terminal)
383
+ # 2. two expressions with an operator and optional whitespace
384
+ # 3. An expression with parentheses around it
385
+ return lm + select([
386
+ number(),
387
+ expression() + zero_or_more(' ') + operator() + zero_or_more(' ') + expression(),
388
+ '(' + expression() + ')'
389
+ ])
390
+ ```
391
+
392
+ The `@guidance(stateless=True)` decorator makes it such that a function (e.g. `expression`) lives as a stateless grammar that does not get 'executed' until we call `lm + expression()` or `lm += expression()`. For example, here is an example of _unconstrained_ generation:
393
+ ```python
394
+ # Without constraints
395
+ lm = llama2 + 'Problem: Luke has a hundred and six balls. He then loses thirty six.\n'
396
+ lm += 'Equivalent arithmetic expression: ' + gen(stop='\n') + '\n'
397
+ ```
398
+ <img width="462" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/54af1909-cad4-4fb1-8987-dfdfc02f8f42"><br>
399
+
400
+ Notice how the model wrote the right equation but solved it (incorrectly). If we wanted to constrain the model such that it only writes valid expressions (without trying to solve them), we can just append our grammar to it:
401
+ ```python
402
+ grammar = expression()
403
+ lm = llama2 + 'Problem: Luke has a hundred and six balls. He then loses thirty six.\n'
404
+ lm += 'Equivalent arithmetic expression: ' + grammar + '\n'
405
+ ```
406
+ <img width="460" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/dbda0ff8-8edd-4384-b63d-fc98792e0689"><br>
407
+
408
+ Grammars are very easy to compose. For example, let's say we want a grammar that generates either a mathematical expression or an expression followed by a solution followed by another expression. Creating this grammar is easy:
409
+
410
+ ```python
411
+ from guidance import regex
412
+ grammar = select([expression(), expression() + regex(' = \d+; ') + expression()])
413
+ ```
414
+ We can generate according to it:
415
+ ```python
416
+ llama2 + 'Here is a math expression for two plus two: ' + grammar
417
+ ```
418
+ <img width="346" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/283e6973-0b8d-4153-a82b-9f5db1460da9"><br>
419
+
420
+ ```python
421
+ llama2 + '2 + 2 = 4; 3+3\n' + grammar
422
+ ```
423
+ <img width="109" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/d584a93c-bf24-43d5-8f8d-501e7eb88422"><br>
424
+
425
+ Even if you don't like thinking in terms of recursive grammars, this formalism makes it easy to constrain generation. For example, let's say we have the following one-shot prompt:
426
+ ```python
427
+ @guidance(stateless=True)
428
+ def ner_instruction(lm, input):
429
+ lm += f'''\
430
+ Please tag each word in the input with PER, ORG, LOC, or nothing
431
+ ---
432
+ Input: John worked at Apple.
433
+ Output:
434
+ John: PER
435
+ worked:
436
+ at:
437
+ Apple: ORG
438
+ .:
439
+ ---
440
+ Input: {input}
441
+ Output:
442
+ '''
443
+ return lm
444
+ input = 'Julia never went to Morocco in her life!!'
445
+ llama2 + ner_instruction(input) + gen(stop='---')
446
+ ```
447
+ <img width="465" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/8ecf5ad4-68b8-4e7a-b107-b1a5613e4c68"><br>
448
+
449
+ Notice that the model did not spell the word 'Morocco' correctly. Sometimes the model might also hallucinate a tag that doesn't exist. We can improve this by adding more few-shot examples, etc, but we can also constrain generation to the exact format we want:
450
+ ```python
451
+ import re
452
+
453
+ @guidance(stateless=True)
454
+ def constrained_ner(lm, input):
455
+ # Split into words
456
+ words = [x for x in re.split('([^a-zA-Z0-9])', input) if x and not re.match('\s', x)]
457
+ ret = ''
458
+ for x in words:
459
+ ret += x + ': ' + select(['PER', 'ORG', 'LOC', '']) + '\n'
460
+ return lm + ret
461
+ llama2 + ner_instruction(input) + constrained_ner(input)
462
+ ```
463
+ <img width="462" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/72545093-ef16-479a-b666-bd97c54a5dc7">
464
+
465
+ While `constrained_ner(input)` **is** a grammar that constrains the model generation, it _feels_ like you're just writing normal imperative python code with `+=` and `selects`.
466
+
467
+
468
+ ## Stateful control + generation
469
+ ### State in immutable objects
470
+ Whenever you do `lm + grammar` or `lm + gen`, `lm + select`, etc, you return a new lm object with additional state. For example:
471
+
472
+ ```python
473
+ lm = llama2 + 'This is a prompt' + gen(name='test', max_tokens=10)
474
+ lm += select(['this', 'that'], name='test2')
475
+ lm['test'], lm['test2']
476
+ ```
477
+ <img width="296" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/f0f9d180-6209-40df-9401-40da35d46e1a"><br>
478
+
479
+ ### Stateful `guidance` functions
480
+ The guidance decorator is `@guidance(stateless=False)` by default, meaning that a function with this decorator depends on the lm state to execute (either prior state or state generated within the function). For example:
481
+ ```python
482
+ @guidance(stateless=False)
483
+ def test(lm):
484
+ lm += 'Should I say "Scott"?\n' + select(['yes', 'no'], name='answer') + '\n'
485
+ if lm['answer'] == 'yes':
486
+ lm += 'Scott'
487
+ else:
488
+ lm += 'Not Scott'
489
+ return lm
490
+ llama2 + test()
491
+ ```
492
+ <img width="159" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/5a55496b-aea0-46e9-8de6-b63655027653"><br>
493
+
494
+
495
+ ### Example: ReAct
496
+ A big advantage of stateful control is that you don't have to write any intermediate parsers, and adding follow-up 'prompting' is easy, even if the follow up depends on what the model generates.
497
+ For example, let's say we want to implement the first example of ReAct prompt in [this](https://www.promptingguide.ai/techniques/react), and let's say the valid acts are only 'Search' or 'Finish'. We might write it like this:
498
+ ```python
499
+ @guidance
500
+ def react_prompt_example(lm, question, max_rounds=10):
501
+ lm += f'Question: {question}\n'
502
+ i = 1
503
+ while True:
504
+ lm += f'Thought {i}: ' + gen(suffix='\n')
505
+ lm += f'Act {i}: ' + select(['Search', 'Finish'], name='act')
506
+ lm += '[' + gen(name='arg', suffix=']') + '\n'
507
+ if lm['act'] == 'Finish' or i == max_rounds:
508
+ break
509
+ else:
510
+ lm += f'Observation {i}: ' + search(lm['arg']) + '\n'
511
+ i += 1
512
+ return lm
513
+ ```
514
+ Notice how we don't have to write a parser for Act and argument and hope that the model generates something valid: we enforce it. Notice also that the loop only stops once the model chooses to act with 'Finish' (or once we hit a maximum number of rounds).
515
+
516
+ ### Example: Changing intermediate step of a Chat session
517
+ We can also hide or change some of what the model generates. For example, below we get a Chat model (notice we use special `role` blocks) to name some experts to answer a question, but we always remove 'Ferriss' from the list if he is mentioned:
518
+ ```python
519
+ from guidance import user, system, assistant
520
+ lm = llama2
521
+ query = 'How can I be more productive?'
522
+ with system():
523
+ lm += 'You are a helpful and terse assistant.'
524
+ with user():
525
+ lm += f'I want a response to the following question:\n{query}\n'
526
+ lm += 'Name 3 world-class experts (past or present) who would be great at answering this.'
527
+ with assistant():
528
+ temp_lm = lm
529
+ for i in range(1, 4):
530
+ # This regex only allows strings that look like names (where every word is capitalized)
531
+ # list_append appends the result to a list
532
+ temp_lm += f'{i}. ' + gen(regex='([A-Z][a-z]*\s*)+', suffix='\n',
533
+ name='experts', list_append=True)
534
+ experts = [x for x in temp_lm['experts'] if 'Ferriss' not in x]
535
+ # Notice that even if the model generates 'Ferriss' above,
536
+ # it doesn't get added to `lm`, only to `temp_lm`
537
+ lm += ', '.join(experts)
538
+ with user():
539
+ lm += 'Please answer the question as if these experts had collaborated in writing an anonymous answer.'
540
+ with assistant():
541
+ lm += gen(max_tokens=100)
542
+ ```
543
+ <img width="688" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/d274f8b8-52e7-41a5-9635-b34f70ed50e0"><br>
544
+
545
+ ### Automatic interleaving of control and generation: tool use
546
+ Tool use is a common case of stateful control. To make it easy to do so, `gen` calls take `tools` as an optional argument, where each tool is defined by (1) a grammar that triggers its call and captures the arguments (if any), and (2) the actual tool call. Then, as generation unrolls, whenever the model generates something that matches the grammar of a tool call, it (1) stops generation, (2) calls the tool (which can append whatever it wants to the LM session), and (3) continues generation.
547
+
548
+ For example, here is how we might implement a calculator tool, leveraging our `expression` grammar above:
549
+ ```python
550
+ from guidance import capture, Tool
551
+ @guidance(stateless=True)
552
+ def calculator_call(lm):
553
+ # capture just 'names' the expression, to be saved in the LM state
554
+ return lm + 'calculator(' + capture(expression(), 'tool_args') + ')'
555
+
556
+ @guidance
557
+ def calculator(lm):
558
+ expression = lm['tool_args']
559
+ # You typically don't want to run eval directly for safety reasons
560
+ # Here we are guaranteed to only have mathematical expressions
561
+ lm += f' = {eval(expression)}'
562
+ return lm
563
+ calculator_tool = Tool(calculator_call(), calculator)
564
+ lm = llama2 + 'Here are five expressions:\ncalculator(3 *3) = 33\ncalculator(2 + 1 * 3) = 5\n'
565
+ lm += gen(max_tokens=30, tools=[calculator_tool], stop='\n\n')
566
+ ```
567
+ <img width="201" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/2d9b840a-4fad-4dab-b3e7-20887539b447"><br>
568
+
569
+
570
+ ### Gsm8k example
571
+ Notice that the calculator is just called seamlessly during generation. Here is a more realistic example of the model solving a gsm8k question:
572
+
573
+ ```python
574
+ @guidance
575
+ def math_with_calc(lm, question):
576
+ # Two-shot example
577
+ lm += '''\
578
+ Question: John starts with 2 balls. He then quintupled his number of balls. Then he lost half of them. He then gave 3 to his brother. How many does he have left?
579
+ Reasoning:
580
+ 1. He quintupled his balls. So he has calculator(2 * 5) = 10 balls.
581
+ 1. He lost half. So he has calculator(10 / 2) = 5 balls.
582
+ 3. He gave 3 to his brother. So he has calculator(5 - 3) = 2 balls.
583
+ Answer: 2
584
+
585
+ Question: Jill get 7 dollars a day in allowance. She uses 1 each day to by a bus pass, then gives half away. How much does she have left each day?
586
+ Reasoning:
587
+ 1. She gets 7 dollars a day.
588
+ 1. She spends 1 on a bus pass. So she has calculator(5 - 1) = 6.
589
+ 3. She gives half away. So that makes calculator(6 / 2) = 3.
590
+ Answer: 3
591
+
592
+ '''
593
+ lm += f'Question: {question}\n'
594
+ lm += 'Reasoning:\n' + gen(max_tokens=200, tools=[calculator_tool], stop='Answer')
595
+ # Only numbers or commas
596
+ lm += 'Answer: ' + gen(regex='[-\d,]+')
597
+ return lm
598
+
599
+ question = '''Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?'''
600
+ llama2 + math_with_calc(question)
601
+ ```
602
+ <img width="685" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/0c7b8da0-b295-46cd-a312-604ecfba7b33"><br>
603
+
604
+ ### Automatic call grammar for @guidance functions
605
+ You can also initialize a `Tool` with any `@guidance`-decorated function, and the default call grammar will be like a python call. Here is an example of using multiple such tools in the same `gen` call:
606
+ ```python
607
+ @guidance
608
+ def say_scott(lm, n):
609
+ lm += '\n'
610
+ for _ in range(int(n)):
611
+ lm += 'Scott\n'
612
+ return lm
613
+
614
+ @guidance
615
+ def say_marco(lm, n):
616
+ lm += '\n'
617
+ for _ in range(int(n)):
618
+ lm += 'marco\n'
619
+ return lm
620
+
621
+ tools = [Tool(callable=say_scott), Tool(callable=say_marco)]
622
+ llama2 + '''\
623
+ I am going to call say_scott and say_marco a few times:
624
+ say_scott(1)
625
+ Scott
626
+ ''' + gen(max_tokens=20, tools=tools)
627
+ ```
628
+ <img width="395" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/8025699b-59a1-4a3f-8b1e-a895a54924e2"><br>
629
+
630
+
631
+ ## Text, not tokens
632
+ The standard greedy tokenizations used by most language models introduce a variety of subtle and powerful biases, which can have all kinds of unintended consequences for your prompts.
633
+ For example, take the following prompt, given to gpt-2 (standard greedy tokenization):
634
+
635
+ hf_gen(prompt, max_tokens=10)
636
+ ```python
637
+ from transformers import pipeline
638
+ pipe = pipeline("text-generation", model="gpt2")
639
+ def hf_gen(prompt, max_tokens=100):
640
+ return pipe(prompt, do_sample=False, max_length=max_tokens, return_full_text=False)[0]['generated_text']
641
+
642
+ prompt = 'http:'
643
+ hf_gen(prompt, max_tokens=10)
644
+ ```
645
+ <img width="198" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/a0fe3e81-89e0-4b4a-8981-edf8b1a8a723"><br>
646
+
647
+
648
+ Notice how the output generated by the LLM does not complete the URL with the obvious next characters (two forward slashes). It instead creates an invalid URL string with a space in the middle. Why? Because the string `://` is its own token, and so once the model sees a colon by itself, it assumes that the next characters cannot be `//`; otherwise, the tokenizer would not have used `:`, and instead would have used `://`. This is why there are warnings about ending prompts in whitespace, but the problem is way more pervasive than that: any boundary that may span multiple tokens will cause problems, e.g. notice how a partial word causes incorrect completion:
649
+
650
+ ```python
651
+ prompt = 'John is a'
652
+ hf_gen(prompt, max_tokens=5)
653
+ ```
654
+ <img width="133" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/44906e57-c4ca-4dc3-a1c3-2fdba040259b"><br>
655
+
656
+
657
+ ```python
658
+ prompt = 'John is a fo'
659
+ hf_gen(prompt, max_tokens=5)
660
+ ```
661
+ <img width="52" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/df649320-ec8e-468a-bb2f-e1994f16c9b6"><br>
662
+
663
+ While problematic enough for normal prompts, these problems would be a disaster in the kinds of prompts we wrote in this readme, where there is interleaving of prompting and generation happening multiple times (and thus multiple opportunities for problems). This is why `guidance` implements [token healing](https://towardsdatascience.com/the-art-of-prompt-design-prompt-boundaries-and-token-healing-3b2448b0be38), a feature that deals with prompt boundaries automatically, allowing users to just think in terms of **text** rather than tokens. For example:
664
+
665
+ ```python
666
+ from guidance import models
667
+ gpt = models.Transformers('gpt2')
668
+ prompt = 'http:'
669
+ gpt + prompt + gen(max_tokens=10)
670
+ ```
671
+ <img width="244" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/c9f26a58-52f2-457c-958a-e048f68eb388"><br>
672
+
673
+
674
+
675
+ ```python
676
+ prompt = 'John is a fo'
677
+ gpt + prompt + gen(max_tokens=2)
678
+ ```
679
+ <img width="186" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/bc5e4cd4-9b82-4c09-9db2-9e890dad1d69"><br>
680
+
681
+ ## Fast
682
+ ### Integrated stateful control is faster
683
+ We have full control of the decoding loop in our integration with `transformers` and `llamacpp`, allowing us to add control and additional prompt without any extra cost.
684
+ If instead we're calling a server, we pay the extra cost of making additional requests, which might be ok if the server has caching, but quickly becomes impractical if the server does not have fine-grained caching. For example, note again the output from the [gsm8k example with calculator](#gsm8k-example) above:
685
+
686
+ <img width="624" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/2c75b0f2-6997-43d9-b10e-cb9f6f2e2de5">
687
+
688
+ Every time we call `calculator`, we have to stop generation, append the result to the prompt, and resume generation. To avoid slowing down after the first call, a server would need to keep the KV cache up to '3 for breakfast. So she has calculator(16 - 3)', then roll forward generation from that point on. Even servers that _do_ have caching often don't have a way to guarantee state is preserved at each stop and start, and so users pay a significant overhead at each interruption. The normal approach of considering everything as a new prompt would cause significant slowdowns every time `calculator` is called.
689
+
690
+ ### Guidance acceleration
691
+ In addition to the benefit above, `guidance` calls are often **faster** than running equivalent prompts the traditional way, because we can batch any additional text that is added by the user as execution unrolls (rather than generating it). Take the example below, where we generate a json with a GGUF compressed `llama2` 7B executed using llama.cpp:
692
+ ```python
693
+ @guidance
694
+ def character_maker(lm, id, description, valid_weapons):
695
+ lm += f"""\
696
+ The following is a character profile for an RPG game in JSON format.
697
+ ```json
698
+ {{
699
+ "id": "{id}",
700
+ "description": "{description}",
701
+ "name": "{gen('name', stop='"')}",
702
+ "age": {gen('age', regex='[0-9]+', stop=',')},
703
+ "armor": "{select(options=['leather', 'chainmail', 'plate'], name='armor')}",
704
+ "weapon": "{select(options=valid_weapons, name='weapon')}",
705
+ "class": "{gen('class', stop='"')}",
706
+ "mantra": "{gen('mantra', stop='"')}",
707
+ "strength": {gen('strength', regex='[0-9]+', stop=',')},
708
+ "items": ["{gen('item', list_append=True, stop='"')}", "{gen('item', list_append=True, stop='"')}", "{gen('item', list_append=True, stop='"')}"]
709
+ }}```"""
710
+ return lm
711
+ a = time.time()
712
+ lm = llama2 + character_maker(1, 'A nimble fighter', ['axe', 'sword', 'bow'])
713
+ time.time() - a
714
+ ```
715
+ <img width="480" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/85b5a181-6e6a-4582-9203-730f49353aeb"><br>
716
+
717
+ Everything that is not green is not actually generated by the model, and is thus batched (much faster). This prompt takes about 1.2 seconds on an A100 GPU. Now, if we let the model generate everything (as in the roughly equivalent prompt below), it takes roughly `2.6` seconds (not only is it slower, we also have less control over generation).
718
+ ```python
719
+ @guidance
720
+ def character_maker2(lm, id, description):
721
+ lm += f"""\
722
+ The following is a character profile for an RPG game in JSON format. It has fields 'id', 'description', 'name', 'age', 'armor', weapon', 'class', 'mantra', 'strength', and 'items (just the names of 3 items)'
723
+ please set description to '{description}'
724
+ ```json""" + gen(stop='```')
725
+ return lm
726
+ a = time.time()
727
+ lm = llama2 + character_maker2(1, 'A nimble fighter')
728
+ time.time() - a
729
+ ```
730
+ <img width="586" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/9c55500d-4c90-4f42-9343-43aa2a25efa4"><br>
731
+
tests/nltk-test.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import nltk
from nltk import FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Step 0: Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Load your Bible text
with open('bibles/eng-engkjvcpb.txt', 'r', encoding='utf-8') as file:
    bible_text = file.read()

# Step 2: Preprocess the text
tokens = word_tokenize(bible_text.lower())  # Tokenize and normalize case
# Remove punctuation only.  NOTE: stop words are deliberately NOT removed --
# the analysis below looks up the bigram ('in', 'christ'), and 'in' is an
# English stop word, so filtering would destroy the very n-gram of interest.
stop_words = set(stopwords.words('english'))  # kept available for other experiments
tokens = [token for token in tokens if token not in string.punctuation]

# Step 3: Generate 2-, 3- and 4-grams over the token stream
all_ngrams = []
for n in range(2, 5):
    all_ngrams.extend(ngrams(tokens, n))

# Step 4: Analyze frequency of n-grams
freq_dist = FreqDist(all_ngrams)
most_common_ngrams = freq_dist.most_common(30)  # Adjust the number to get more or fewer common n-grams

# Display the most common n-grams
for ngram, occurrence in most_common_ngrams:
    print("{}: {}".format(' '.join(ngram), occurrence))

# Report occurrences and rank of the bigram 'in christ'.
# BUG FIX: the rank lookup previously ran unconditionally and raised
# ValueError when ('in', 'christ') was absent from the corpus; it now
# runs only inside the membership guard.
target = ('in', 'christ')
if target in freq_dist:
    ngram_rank = sorted(freq_dist, key=freq_dist.get, reverse=True).index(target) + 1
    print("The n-gram 'in christ' occurs {} times and is ranked {} among the most common n-grams.".format(freq_dist[target], ngram_rank))
41
+
tests/test.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from guidance import models, select

import nltk
from nltk import FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')

from romanize import uroman

llm = models.LlamaCpp('models/neural-chat-7b-v3-3.Q2_K.gguf', n_gpu_layers=1)

hin_str = 'हमारे परमेश्‍वर और प्रभु यीशु मसीह के पिता का धन्यवाद हो कि उसने हमें मसीह में स्वर्गीय स्थानों में सब प्रकार की आत्मिक आशीष* दी है।'
hin_str = uroman(hin_str)  # romanize so the n-grams are Latin-script
greek_term = "ἐν Χριστῷ"
greek_term = uroman(greek_term)

# Build candidate 2- and 3-gram phrases from the romanized verse.
tokens = word_tokenize(hin_str.lower())  # Tokenize and normalize case
tokens = [token for token in tokens if token not in string.punctuation]
all_ngrams = []
for n in range(2, 4):
    all_ngrams.extend(ngrams(tokens, n))

# BUG FIX 1: join *all* tokens of each n-gram with spaces.  The old
# `x[0] + x[1]` concatenated only the first two tokens with no separator,
# silently truncating every trigram.
all_ngrams = [' '.join(x) for x in all_ngrams]
# BUG FIX 2: removed a stray `all_ngrams = []` that wiped the candidate
# list immediately after it was built, leaving select() with no options.
print(all_ngrams)

lm = llm
lm += f'The best translation of {greek_term} from Greek into Hindi is '
lm += select(all_ngrams, name='ngram')

print(lm['ngram'])
tests/test2.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Quick sanity check: Python's `in` operator finds a Devanagari substring
# (including one spanning a word boundary) inside a Devanagari verse.
verse = "हमारे परमेश्‍वर और प्रभु यीशु मसीह के पिता का धन्यवाद हो कि उसने हमें मसीह में स्वर्गीय स्थानों में सब प्रकार की आत्मिक आशीष* दी है।"
n_gram = 'मसीह में'
is_present = n_gram in verse
print(is_present)  # This would print True
tests/tfidf.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ import pandas as pd
3
+ import numpy as np
4
+ from itertools import islice
5
+ from romanize import uroman
6
+
7
+
8
# Cumulative 1-based line numbers in the Bible corpus file at which each
# successive document (presumably each biblical book) begins.
# NOTE(review): looks like book-start boundaries for segment_corpus() below —
# confirm against the corpus file actually used.
verses = [
    1,
    1534,
    2747,
    3606,
    4895,
    5854,
    6512,
    7130,
    7215,
    8026,
    8721,
    9538,
    10257,
    11200,
    12022,
    12302,
    12707,
    12874,
    13944,
    16471,
    17608,
    17725,
    19016,
    20380,
    20534,
    21807,
    22164,
    22361,
    22434,
    22580,
    22601,
    22649,
    22754,
    22857,
    22910,
    22948,
    23159,
    23214,
    24285,
    24963,
    26114,
    26993,
    27999,
    28432,
    28869,
    29125,
    29274,
    29429,
    29533,
    29628,
    29717,
    29764,
    29877,
    29960,
    30006,
    30031,
    30334,
    30442,
    30547,
    30608,
    30713,
    30726,
    30741,
    30766,
    31171
]

# Adjust verses to be zero-indexed for Python
# NOTE(review): segment_corpus() enumerates file lines starting at 1 while
# these values are shifted to 0-based here — possible off-by-one; confirm
# the intended alignment between the two.
verses = [x-1 for x in verses]
78
+
79
# Function to extract the verse of interest from the corpus
def extract_interested_verse(file_path, line_number, romanize=False):
    """Return the stripped text of the 0-based `line_number`-th line of
    `file_path` (optionally romanized via uroman), or None if the file has
    fewer lines than that."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            if index != line_number:
                continue
            text = raw_line.strip()
            return uroman(text) if romanize else text
    return None
89
+
90
+
91
# Function to segment the corpus into documents based on the verses list
def segment_corpus(file_path, romanize=False, boundaries=None):
    """Split the corpus file into documents at the given boundary line numbers.

    File lines are enumerated from 1; whenever a line number appears in
    `boundaries`, the lines accumulated so far are flushed as one document
    and a new document starts with that line.  The trailing lines form the
    last document.

    file_path -- path to the corpus text file (one verse per line).
    romanize  -- if True, each joined document is passed through uroman.
    boundaries -- iterable of boundary line numbers; defaults to the
                  module-level `verses` list (backward compatible).

    NOTE(review): `verses` is shifted to 0-based at module level while
    enumeration here starts at 1 — confirm the intended alignment.
    """
    if boundaries is None:
        boundaries = verses
    # Set gives O(1) membership per line instead of an O(n) list scan.
    boundary_set = set(boundaries)
    documents = []
    current_document = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file, start=1):
            if i in boundary_set and current_document:
                joined_doc_string = " ".join(current_document)
                if romanize:
                    joined_doc_string = uroman(joined_doc_string)
                documents.append(joined_doc_string)
                current_document = []
            current_document.append(line.strip())
    # Don't forget to add the last document
    if current_document:
        joined_doc_string = " ".join(current_document)
        if romanize:
            joined_doc_string = uroman(joined_doc_string)
        documents.append(joined_doc_string)
    return documents
112
+
113
# Function to perform TF-IDF on the corpus and extract scores for a specific verse
def analyze_verse_in_corpus(file_path, interested_line, romanize=False):
    """Score the 2-4-gram phrases of one verse by TF-IDF over the whole corpus.

    file_path       -- corpus text file, one verse per line.
    interested_line -- line number of the verse of interest.
    romanize        -- if True, documents and verse are romanized via uroman.

    Returns a dict of {ngram: tfidf_score} for the verse's n-grams, sorted
    by descending score, or the string "Verse not found." when the verse
    line cannot be read.
    """
    documents = segment_corpus(file_path, romanize=romanize)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 4))
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Identify the document index for the interested line.
    # NOTE(review): raises StopIteration when interested_line is at or past
    # the last boundary in `verses`, and mixes the 0-based `verses` values
    # with a line number whose base is unclear — confirm the indexing
    # convention before relying on this.
    document_index = next(i for i, v in enumerate(verses) if v > interested_line) - 1

    # Extract TF-IDF scores for the document containing the interested line
    scores = np.array(tfidf_matrix[document_index].todense()).flatten()
    scores_dict = dict(zip(feature_names, scores))

    # Extract the interested verse text (extract_interested_verse counts
    # lines from 0, hence the -1 here).
    interested_verse = extract_interested_verse(file_path, interested_line - 1, romanize=romanize)

    # Map n-grams in verse to their TF-IDF scores
    if interested_verse:
        # A throwaway vectorizer fit on just the verse enumerates its n-grams.
        tfidf_vectorizer_verse = TfidfVectorizer(ngram_range=(2, 4))
        tfidf_vectorizer_verse.fit([interested_verse])
        verse_ngrams = tfidf_vectorizer_verse.get_feature_names_out()
        verse_scores = {ngram: scores_dict.get(ngram, 0) for ngram in verse_ngrams}
        # Get ngrams and respective scores in the verse in descending score order
        sorted_verse_scores = dict(sorted(verse_scores.items(), key=lambda item: item[1], reverse=True))
        return sorted_verse_scores
    else:
        return "Verse not found."
141
+
142
+
143
+ # file_path = 'bibles/eng-engkjvcpb.txt'
144
+ # interested_line = 29276 # Example line number
145
+ # verse_scores = analyze_verse_in_corpus(file_path, kjv_verses, interested_line)
146
+
147
+ # Print or return the results
148
+ # print(verse_scores)
149
+
150
+ # Print ngrams and respective scores in the verse in descending score order
151
+ # for ngram, score in islice(sorted_verse_scores.items(), 30):
152
+ # print(f"{ngram}: {score:.4f}")
tests/tsv_parse ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+
3
def parse_tsv_to_json(filepath, book_abbrev):
    """Parse a translation-notes TSV file into a list of note dicts.

    filepath    -- path to the tab-separated notes file.
    book_abbrev -- book abbreviation prefixed to each row's reference
                   (column 0) to form the "verse" value, e.g. "rom".

    Each row with a non-empty Greek-term column (index 4) yields
    {"greek_term", "translation_note", "verse"}; other rows are skipped.
    """
    result = []  # Initialize an empty list to store the dictionaries.

    with open(filepath, mode='r', encoding='utf-8') as file:
        tsv_reader = csv.reader(file, delimiter='\t')

        for row in tsv_reader:
            # Check if the row contains a Greek term (non-empty) in the expected position.
            # BUG FIX: the guard was `len(row) > 3`, but the body indexes
            # row[4] and row[6], so rows with 4-6 columns raised IndexError.
            if row and len(row) > 6 and row[4].strip():
                # Construct a dictionary for the current row.
                entry = {
                    "greek_term": row[4].strip(),
                    "translation_note": row[6].strip(),
                    "verse": book_abbrev + row[0].strip()
                }
                # Append the dictionary to the result list.
                result.append(entry)

    return result
22
+
23
# Example usage: parse the Romans translation notes shipped with this repo
# (path is relative to the repository root) and show a sample of the output.
result = parse_tsv_to_json('./translation_notes/tn_ROM.tsv', 'rom')

# Print first 5 entries
print(result[:5])
translation_notes.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "greek_term": "λόγος",
4
+ "translation_note": "often translated as 'word,' 'λόγος' in the Johannine prologue conveys a complex concept that includes divine revelation, reason, and creative power. In the context of John 1:1, it refers to Jesus as the pre-existent divine Word through whom all things were made and who reveals God to humanity. Translators should capture the multi-faceted nature of 'λόγος' as both communication and the personification of divine wisdom and presence.",
5
+ "verse": ""
6
+ },
7
+ {
8
+ "greek_term": "ἀγάπη",
9
+ "translation_note": "represents a form of love that is selfless, sacrificial, and unconditional, often distinguishing divine love from other types of love. In the New Testament, 'ἀγάπη' describes God's love for humanity and the love believers are called to have for one another. Translators should convey the depth and sacrificial nature of this love, differentiating it from feelings or affection.",
10
+ "verse": ""
11
+ },
12
+ {
13
+ "greek_term": "κοινωνία",
14
+ "translation_note": "translates to 'fellowship' or 'communion,' but encompasses much more than mere social interaction. 'κοινωνία' in the New Testament implies a deep, spiritual connection among believers, rooted in their shared participation in Christ and the Holy Spirit. It involves mutual support, sharing, and a common commitment to Christ's mission. Translators need to convey the richness of this fellowship as an expression of shared life and unity in the Spirit.",
15
+ "verse": ""
16
+ },
17
+ {
18
+ "greek_term": "δικαιοσύνη",
19
+ "translation_note": "often rendered as 'righteousness,' this term reflects a status of being in right relationship with God, conforming to His standards and will. In the New Testament, 'δικαιοσύνη' is closely related to justification through faith in Christ, signifying not only ethical behavior but also a legal standing of acquittal and acceptance by God. Translators should highlight the forensic aspect of righteousness as well as its ethical implications, emphasizing its source in God's grace through faith.",
20
+ "verse": ""
21
+ },
22
+ {
23
+ "greek_term": "ἐν Χριστῷ",
24
+ "translation_note": "illustrates the intimate union between believers and Christ. The preposition ἐν (in) goes beyond physical location, indicating a profound spiritual reality. Translators need to convey the concept of being 'in Christ' as being part of a new creation, identity, and living within the sphere of Christ's influence and lordship.",
25
+ "verse": ""
26
+ },
27
+ {
28
+ "greek_term": "διὰ πίστεως Ἰησοῦ Χριστοῦ",
29
+ "translation_note": "Here, faith in Jesus Christ is a possessive form that indicates faith that is associated with Jesus Christ. This could refer to: (1) trust in Jesus Christ. Alternate translation: “by trusting in Jesus Christ” or “by believing in Jesus Christ” (2) the faithfulness of Jesus Christ. Alternate translation: “through the faithfulness that Jesus Christ possesses” or “through how faithful Jesus Christ is”",
30
+ "verse": ""
31
+ }
32
+ ]
translation_notes/tn_ROM.tsv ADDED
The diff for this file is too large to render. See raw diff
 
uroman-1.2.8/.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !Build/
2
+ .last_cover_stats
3
+ /META.yml
4
+ /META.json
5
+ /MYMETA.*
6
+ *.o
7
+ *.pm.tdy
8
+ *.bs
9
+
10
+ # Devel::Cover
11
+ cover_db/
12
+
13
+ # Devel::NYTProf
14
+ nytprof.out
15
+
16
+ # Dizt::Zilla
17
+ /.build/
18
+
19
+ # Module::Build
20
+ _build/
21
+ Build
22
+ Build.bat
23
+
24
+ # Module::Install
25
+ inc/
26
+
27
+ # ExtUtils::MakeMaker
28
+ /blib/
29
+ /_eumm/
30
+ /*.gz
31
+ /Makefile
32
+ /Makefile.old
33
+ /MANIFEST.bak
34
+ /pm_to_blib
35
+ /*.zip
uroman-1.2.8/LICENSE.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (C) 2015-2020 Ulf Hermjakob, USC Information Sciences Institute
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ Any publication of projects using uroman shall acknowledge its use: "This project uses the universal romanizer software 'uroman' written by Ulf Hermjakob, USC Information Sciences Institute (2015-2020)".
8
+ Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track.
9
+
10
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11
+
uroman-1.2.8/README.md ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # URoman
2
+
3
+ *uroman* is a *universal romanizer*. It converts text in any script to the Latin alphabet.
4
+
5
+ Version: 1.2.8
6
+ Release date: April 23, 2021
7
+ Author: Ulf Hermjakob, USC Information Sciences Institute
8
+
9
+
10
+ ### Usage
11
+ ```bash
12
+ $ uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
13
+ where the optional <lang-code> is a 3-letter languages code, e.g. ara, bel, bul, deu, ell, eng, fas,
14
+ grc, ell, eng, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
15
+ --chart specifies chart output (in JSON format) to represent alternative romanizations.
16
+ --no-cache disables caching.
17
+ ```
18
+ ### Examples
19
+ ```bash
20
+ $ bin/uroman.pl < text/zho.txt
21
+ $ bin/uroman.pl -l tur < text/tur.txt
22
+ $ bin/uroman.pl -l heb --chart < text/heb.txt
23
+ $ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
24
+ ```
25
+
26
+ Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
27
+ Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
28
+ Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or
29
+ Yiddish will improve romanization for those languages as some letters in those
30
+ languages have different sound values from other languages using the same script
31
+ (French, Russian, Hebrew respectively).
32
+ No effect for other languages in this version.
33
+
34
+ ### Bibliography
35
+ Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. ACL-2018 Best Demo Paper Award. [Paper in ACL Anthology](https://www.aclweb.org/anthology/P18-4003) | [Poster](https://www.isi.edu/~ulf/papers/poster-uroman-acl2018.pdf) | [BibTex](https://www.aclweb.org/anthology/P18-4003.bib)
36
+
37
+ ### Change History
38
+ Changes in version 1.2.8
39
+ * Updated to Unicode 13.0 (2021), which supports several new scripts (10% larger UnicodeData.txt).
40
+ * Improved support for Georgian.
41
+ * Preserve various symbols (as opposed to mapping to the symbols' names).
42
+ * Various small improvements.
43
+
44
+ Changes in version 1.2.7
45
+ * Improved support for Pashto.
46
+
47
+ Changes in version 1.2.6
48
+ * Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
49
+ * Added support for English Braille.
50
+ * Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
51
+ reflecting a casual style that many native speakers of those languages use
52
+ when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
53
+ rather than phonetically motivated combinations of letters (e.g. "sh").
54
+ * When a line starts with "::lcode xyz ", the new uroman version will switch to
55
+ that language for that line. This is used for the new reference test file.
56
+ * Various small improvements.
57
+
58
+ Changes in version 1.2.5
59
+ * Improved support for Armenian and eight languages using Cyrillic scripts.
60
+ -- For Serbian and Macedonian, which are often written in both Cyrillic
61
+ and Latin scripts, uroman will map both official versions to the same
62
+ romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
63
+ properly reflects the pronunciation of the city's name).
64
+ For both Serbian and Macedonian, casual writers often use a simplified
65
+ Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
66
+ and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
67
+ other such pairs. The casual romanization can be simulated by using
68
+ alternative uroman language codes "srp2" and "mkd2", which romanize
69
+ both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
70
+ * Various small improvements.
71
+
72
+ Changes in version 1.2.4
73
+ * Bug-fix that generated two empty lines for each empty line in cache mode.
74
+
75
+ Changes in version 1.2
76
+ * Run-time improvement based on (1) token-based caching and (2) shortcut
77
+ romanization (identity) of ASCII strings for default 1-best (non-chart)
78
+ output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
79
+ large size texts.
80
+ * Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
81
+ languages.
82
+ * Richer lattice structure (more alternatives) for "Romanization" of English
83
+ to support better matching to romanizations of other languages.
84
+ Changes output only when --chart option is specified. No change in output for
85
+ default 1-best output, which for ASCII characters is always the input string.
86
+
87
+ Changes in version 1.1 (major upgrade)
88
+ * Offers chart output (in JSON format) to represent alternative romanizations.
89
+ -- Location of first character is defined to be "line: 1, start:0, end:0".
90
+ * Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
91
+ * Improved web-interface at http://www.isi.edu/~ulf/uroman.html
92
+ -- Shows corresponding original and romanization text in red
93
+ when hovering over a text segment.
94
+ -- Shows alternative romanizations when hovering over romanized text
95
+ marked by dotted underline.
96
+ -- Added right-to-left script detection and improved display for right-to-left
97
+ script text (as determined line by line).
98
+ -- On-page support for some scripts that are often not pre-installed on users'
99
+ computers (Burmese, Egyptian, Klingon).
100
+
101
+ Changes in version 1.0 (major upgrade)
102
+ * Upgraded principal internal data structure from string to lattice.
103
+ * Improvements mostly in vowelization of South and Southeast Asian languages.
104
+ * Vocalic 'r' more consistently treated as vowel (no additional vowel added).
105
+ * Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
106
+ * Japanese Katakana middle dots now mapped to ASCII space.
107
+ * Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
108
+ * Some corrections regarding analysis of Chinese numbers.
109
+ * Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
110
+ * Zero-width characters dropped, except line/sentence-initial byte order marks.
111
+ * Spaces normalized to ASCII space.
112
+ * Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
113
+ * Tested against previous version of uroman with a new uroman visual diff tool.
114
+ * Almost an order of magnitude faster.
115
+
116
+ Changes in version 0.7 (minor upgrade)
117
+ * Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
118
+ Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
119
+ Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
120
+ or Chinese characters in Uyghur texts.
121
+
122
+ Changes in version 0.6 (minor upgrade)
123
+ * Added support for two letter characters used in Uzbek:
124
+ (1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
125
+ (2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
126
+ Both are now mapped to "'" (plain ASCII apostrophe).
127
+ * Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
128
+ even when they are not preceded by "ئ" (yeh with hamza above).
129
+ * Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
130
+ ("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
131
+ * Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
132
+ However, it is strongly recommended to normalize any presentation form Arabic letters
133
+ to their non-presentation form before calling uroman.
134
+ * Added force flush directive ($|=1;).
135
+
136
+ Changes in version 0.5 (minor upgrade)
137
+ * Improvements for Uyghur (make sure to use language option: -l uig)
138
+
139
+ Changes in version 0.4 (minor upgrade)
140
+ * Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
141
+ * Minor change for Arabic (added "alef+fathatan" = "an")
142
+
143
+ New features in version 0.3
144
+ * Covers Mandarin (Chinese)
145
+ * Improved romanization for numerous languages
146
+ * Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
147
+ * Maps from native digits to Western numbers
148
+ * Faster for South Asian languages
149
+
150
+ ### Other features
151
+ * Web interface: http://www.isi.edu/~ulf/uroman.html
152
+ * Vowelization is provided when locally computable, e.g. for many South Asian languages and Tibetan.
153
+
154
+ ### Limitations
155
+ * The current version of uroman has a few limitations, some of which we plan to address in future versions.
156
+ For Japanese, *uroman* currently romanizes hiragana and katakana as expected, but kanji are interpreted as Chinese characters and romanized as such.
157
+ For Egyptian hieroglyphs, only single-sound phonetic characters and numbers are currently romanized.
158
+ For Linear B, only phonetic syllabic characters are romanized.
159
+ For some other extinct scripts such as cuneiform, no romanization is provided.
160
+ * A romanizer is not a full transliterator. For example, this version of
161
+ uroman does not vowelize text that lacks explicit vowelization such as
162
+ normal text in Arabic and Hebrew (without diacritics/points).
163
+
uroman-1.2.8/README.txt ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ uroman version 1.2.8
2
+ Release date: April 23, 2021
3
+ Author: Ulf Hermjakob, USC Information Sciences Institute
4
+
5
+ uroman is a universal romanizer. It converts text in any script to the Latin alphabet.
6
+
7
+ Usage: uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
8
+ where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
9
+ grc, ell, eng, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
10
+ --chart specifies chart output (in JSON format) to represent alternative romanizations.
11
+ --no-cache disables caching.
12
+ Examples: bin/uroman.pl < text/zho.txt
13
+ bin/uroman.pl -l tur < text/tur.txt
14
+ bin/uroman.pl -l heb --chart < text/heb.txt
15
+ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
16
+
17
+ Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
18
+ Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
19
+ Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or Yiddish
20
+ will improve romanization for those languages as some letters in those languages
21
+ have different sound values from other languages using the same script.
22
+ No effect for other languages in this version.
23
+
24
+ Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. [Best Demo Paper Award]
25
+
26
+ Changes in version 1.2.8
27
+ * Improved support for Georgian.
28
+ * Updated UnicodeData.txt to version 13 (2021) with several new scripts (10% larger).
29
+ * Preserve various symbols (as opposed to mapping to the symbols' names).
30
+ * Various small improvements.
31
+ Changes in version 1.2.7
32
+ * Improved support for Pashto.
33
+ Changes in version 1.2.6
34
+ * Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
35
+ * Added support for English Braille.
36
+ * Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
37
+ reflecting a casual style that many native speakers of those languages use
38
+ when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
39
+ rather than phonetically motivated combinations of letters (e.g. "sh").
40
+ * When a line starts with "::lcode xyz ", the new uroman version will switch to
41
+ that language for that line. This is used for the new reference test file.
42
+ * Various small improvements.
43
+ Changes in version 1.2.5
44
+ * Improved support for Armenian and eight languages using Cyrillic scripts.
45
+ -- For Serbian and Macedonian, which are often written in both Cyrillic
46
+ and Latin scripts, uroman will map both official versions to the same
47
+ romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
48
+ properly reflects the pronunciation of the city's name).
49
+ For both Serbian and Macedonian, casual writers often use a simplified
50
+ Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
51
+ and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
52
+ other such pairs. The casual romanization can be simulated by using
53
+ alternative uroman language codes "srp2" and "mkd2", which romanize
54
+ both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
55
+ * Various small improvements.
56
+ Changes in version 1.2.4
57
+ * Added support for Tifinagh (a script used for Berber languages).
58
+ * Bug-fix that generated two empty lines for each empty line in cache mode.
59
+ Changes in version 1.2.3
60
+ * Exclude emojis, dingbats, many other pictographs from being romanized (e.g. to "face")
61
+ Changes in version 1.2
62
+ * Run-time improvement based on (1) token-based caching and (2) shortcut
63
+ romanization (identity) of ASCII strings for default 1-best (non-chart)
64
+ output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
65
+ large size texts.
66
+ * Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
67
+ languages.
68
+ * Richer lattice structure (more alternatives) for "Romanization" of English
69
+ to support better matching to romanizations of other languages.
70
+ Changes output only when --chart option is specified. No change in output for
71
+ default 1-best output, which for ASCII characters is always the input string.
72
+ Changes in version 1.1 (major upgrade)
73
+ * Offers chart output (in JSON format) to represent alternative romanizations.
74
+ -- Location of first character is defined to be "line: 1, start:0, end:0".
75
+ * Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
76
+ * Improved web-interface at http://www.isi.edu/~ulf/uroman.html
77
+ -- Shows corresponding original and romanization text in red
78
+ when hovering over a text segment.
79
+ -- Shows alternative romanizations when hovering over romanized text
80
+ marked by dotted underline.
81
+ -- Added right-to-left script detection and improved display for right-to-left
82
+ script text (as determined line by line).
83
+ -- On-page support for some scripts that are often not pre-installed on users'
84
+ computers (Burmese, Egyptian, Klingon).
85
+ Changes in version 1.0 (major upgrade)
86
+ * Upgraded principal internal data structure from string to lattice.
87
+ * Improvements mostly in vowelization of South and Southeast Asian languages.
88
+ * Vocalic 'r' more consistently treated as vowel (no additional vowel added).
89
+ * Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
90
+ * Japanese Katakana middle dots now mapped to ASCII space.
91
+ * Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
92
+ * Some corrections regarding analysis of Chinese numbers.
93
+ * Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
94
+ * Zero-width characters dropped, except line/sentence-initial byte order marks.
95
+ * Spaces normalized to ASCII space.
96
+ * Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
97
+ * Tested against previous version of uroman with a new uroman visual diff tool.
98
+ * Almost an order of magnitude faster.
99
+ Changes in version 0.7 (minor upgrade)
100
+ * Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
101
+ Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
102
+ Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
103
+ or Chinese characters in Uyghur texts.
104
+ Changes in version 0.6 (minor upgrade)
105
+ * Added support for two letter characters used in Uzbek:
106
+ (1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
107
+ (2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
108
+ Both are now mapped to "'" (plain ASCII apostrophe).
109
+ * Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
110
+ even when they are not preceded by "ئ" (yeh with hamza above).
111
+ * Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
112
+ ("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
113
+ * Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
114
+ However, it is strongly recommended to normalize any presentation form Arabic letters
115
+ to their non-presentation form before calling uroman.
116
+ * Added force flush directive ($|=1;).
117
+ Changes in version 0.5 (minor upgrade)
118
+ * Improvements for Uyghur (make sure to use language option: -l uig)
119
+ Changes in version 0.4 (minor upgrade)
120
+ * Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
121
+ * Minor change for Arabic (added "alef+fathatan" = "an")
122
+ New features in version 0.3
123
+ * Covers Mandarin (Chinese)
124
+ * Improved romanization for numerous languages
125
+ * Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
126
+ * Maps from native digits to Western numbers
127
+ * Faster for South Asian languages
128
+
129
+ Other features
130
+ * Web interface: http://www.isi.edu/~ulf/uroman.html
131
+ * Vowelization is provided when locally computable, e.g. for many South Asian
132
+ languages and Tibetan.
133
+
134
+ Limitations
135
+ * This version of uroman assumes all CJK ideographs to be Mandarin (Chinese).
136
+ This means that Japanese kanji are incorrectly romanized; however, Japanese
137
+ hiragana and katakana are properly romanized.
138
+ * A romanizer is not a full transliterator. For example, this version of
139
+ uroman does not vowelize text that lacks explicit vowelization such as
140
+ normal text in Arabic and Hebrew (without diacritics/points).
141
+
uroman-1.2.8/bin/de-accent.pl ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/perl -w
2
+
3
sub print_version {
   # Emit the script's version banner (name, version, author, change date) on STDERR.
   foreach $version_line ("$0 version 1.1\n",
                          " Author: Ulf Hermjakob\n",
                          " Last changed: March 14, 2011\n") {
      print STDERR $version_line;
   }
}
8
+
9
sub print_usage {
   # Emit command-line usage information on STDERR.
   print STDERR join("",
                     "$0 [options] < with_accents.txt > without_accents.txt\n",
                     " -h or -help\n",
                     " -v or -version\n");
}
14
+
15
sub de_accent_string {
   # Strip accents/diacritics from a UTF-8 byte string, mapping accented
   # Latin, Greek, and Cyrillic characters to their unaccented base forms
   # (e.g. "é" -> "e", "Æ" -> "Ae", "ß" -> "ss", "ё" -> "е").
   # Each group of substitutions is guarded by a cheap byte-pattern test
   # (e.g. /\xC3[\x80-\xBF]/ for the Latin-1 supplement) so that whole
   # blocks are skipped when the string contains no such characters.
   # Returns the de-accented string; input is expected as UTF-8 bytes.
   local($s) = @_;

   # $s =~ tr/A-Z/a-z/;
   unless (0) {
      # Latin-1
      if ($s =~ /\xC3[\x80-\xBF]/) {
         $s =~ s/(À|Á|Â|Ã|Ä|Å)/A/g;
         $s =~ s/Æ/Ae/g;
         $s =~ s/Ç/C/g;
         $s =~ s/Ð/D/g;
         $s =~ s/(È|É|Ê|Ë)/E/g;
         $s =~ s/(Ì|Í|Î|Ï)/I/g;
         $s =~ s/Ñ/N/g;
         $s =~ s/(Ò|Ó|Ô|Õ|Ö|Ø)/O/g;
         $s =~ s/(Ù|Ú|Û|Ü)/U/g;
         $s =~ s/Þ/Th/g;
         $s =~ s/Ý/Y/g;
         $s =~ s/(à|á|â|ã|ä|å)/a/g;
         $s =~ s/æ/ae/g;
         $s =~ s/ç/c/g;
         $s =~ s/(è|é|ê|ë)/e/g;
         $s =~ s/(ì|í|î|ï)/i/g;
         $s =~ s/ð/d/g;
         $s =~ s/ñ/n/g;
         $s =~ s/(ò|ó|ô|õ|ö|ø)/o/g;  # fix: added ø (uppercase Ø was already handled)
         $s =~ s/ß/ss/g;
         $s =~ s/þ/th/g;
         $s =~ s/(ù|ú|û|ü)/u/g;
         $s =~ s/(ý|ÿ)/y/g;
      }
      # Latin Extended-A
      if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
         $s =~ s/(Ā|Ă|Ą)/A/g;
         $s =~ s/(ā|ă|ą)/a/g;
         $s =~ s/(Ć|Ĉ|Ċ|Č)/C/g;
         $s =~ s/(ć|ĉ|ċ|č)/c/g;
         $s =~ s/(Ď|Đ)/D/g;
         $s =~ s/(ď|đ)/d/g;
         $s =~ s/(Ē|Ĕ|Ė|Ę|Ě)/E/g;
         $s =~ s/(ē|ĕ|ė|ę|ě)/e/g;
         $s =~ s/(Ĝ|Ğ|Ġ|Ģ)/G/g;
         $s =~ s/(ĝ|ğ|ġ|ģ)/g/g;
         $s =~ s/(Ĥ|Ħ)/H/g;
         $s =~ s/(ĥ|ħ)/h/g;
         $s =~ s/(Ĩ|Ī|Ĭ|Į|İ)/I/g;
         $s =~ s/(ĩ|ī|ĭ|į|ı)/i/g;
         $s =~ s/IJ/Ij/g;
         $s =~ s/ij/ij/g;
         $s =~ s/Ĵ/J/g;
         $s =~ s/ĵ/j/g;
         $s =~ s/Ķ/K/g;
         $s =~ s/(ķ|ĸ)/k/g;
         $s =~ s/(Ĺ|Ļ|Ľ|Ŀ|Ł)/L/g;
         $s =~ s/(ĺ|ļ|ľ|ŀ|ł)/l/g;  # fix: added ĺ (uppercase Ĺ was already handled)
         $s =~ s/(Ń|Ņ|Ň|Ŋ)/N/g;
         $s =~ s/(ń|ņ|ň|ʼn|ŋ)/n/g;
         $s =~ s/(Ō|Ŏ|Ő)/O/g;
         $s =~ s/(ō|ŏ|ő)/o/g;
         $s =~ s/Œ/Oe/g;
         $s =~ s/œ/oe/g;
         $s =~ s/(Ŕ|Ŗ|Ř)/R/g;
         $s =~ s/(ŕ|ŗ|ř)/r/g;
         $s =~ s/(Ś|Ŝ|Ş|Š)/S/g;
         $s =~ s/(ś|ŝ|ş|š|ſ)/s/g;
         $s =~ s/(Ţ|Ť|Ŧ)/T/g;
         $s =~ s/(ţ|ť|ŧ)/t/g;
         $s =~ s/(Ũ|Ū|Ŭ|Ů|Ű|Ų)/U/g;
         $s =~ s/(ũ|ū|ŭ|ů|ű|ų)/u/g;
         $s =~ s/Ŵ/W/g;
         $s =~ s/ŵ/w/g;
         $s =~ s/(Ŷ|Ÿ)/Y/g;
         $s =~ s/ŷ/y/g;
         $s =~ s/(Ź|Ż|Ž)/Z/g;
         $s =~ s/(ź|ż|ž)/z/g;
      }
      # Latin Extended Additional (incl. Vietnamese vowels with tone marks)
      if ($s =~ /\xE1[\xB8-\xBF][\x80-\xBF]/) {
         $s =~ s/(ḁ|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẚ)/a/g;
         $s =~ s/(ḃ|ḅ|ḇ)/b/g;
         $s =~ s/(ḉ)/c/g;
         $s =~ s/(ḋ|ḍ|ḏ|ḑ|ḓ)/d/g;
         $s =~ s/(ḕ|ḗ|ḙ|ḛ|ḝ|ẹ|ẻ|ẽ|ế|ề|ể|ễ|ệ)/e/g;
         $s =~ s/(ḟ)/f/g;
         $s =~ s/(ḡ)/g/g;
         $s =~ s/(ḣ|ḥ|ḧ|ḩ|ḫ)/h/g;
         $s =~ s/(ḭ|ḯ|ỉ|ị)/i/g;
         $s =~ s/(ḱ|ḳ|ḵ)/k/g;
         $s =~ s/(ḷ|ḹ|ḻ|ḽ)/l/g;
         $s =~ s/(ḿ|ṁ|ṃ)/m/g;
         $s =~ s/(ṅ|ṇ|ṉ|ṋ)/n/g;  # fix: was mapped to "m" (copy-paste bug; uppercase twins map to "N")
         $s =~ s/(ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ṍ|ṏ|ṑ|ṓ)/o/g;
         $s =~ s/(ṕ|ṗ)/p/g;
         $s =~ s/(ṙ|ṛ|ṝ|ṟ)/r/g;
         $s =~ s/(ṡ|ṣ|ṥ|ṧ|ṩ|ẛ)/s/g;
         $s =~ s/(ṫ|ṭ|ṯ|ṱ)/t/g;
         $s =~ s/(ṳ|ṵ|ṷ|ṹ|ṻ|ụ|ủ|ứ|ừ|ử|ữ|ự)/u/g;
         $s =~ s/(ṽ|ṿ)/v/g;
         $s =~ s/(ẁ|ẃ|ẅ|ẇ|ẉ|ẘ)/w/g;
         $s =~ s/(ẋ|ẍ)/x/g;
         $s =~ s/(ẏ|ỳ|ỵ|ỷ|ỹ|ẙ)/y/g;
         $s =~ s/(ẑ|ẓ|ẕ)/z/g;
         $s =~ s/(Ḁ|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ)/A/g;
         $s =~ s/(Ḃ|Ḅ|Ḇ)/B/g;
         $s =~ s/(Ḉ)/C/g;
         $s =~ s/(Ḋ|Ḍ|Ḏ|Ḑ|Ḓ)/D/g;
         $s =~ s/(Ḕ|Ḗ|Ḙ|Ḛ|Ḝ|Ẹ|Ẻ|Ẽ|Ế|Ề|Ể|Ễ|Ệ)/E/g;
         $s =~ s/(Ḟ)/F/g;
         $s =~ s/(Ḡ)/G/g;
         $s =~ s/(Ḣ|Ḥ|Ḧ|Ḩ|Ḫ)/H/g;
         $s =~ s/(Ḭ|Ḯ|Ỉ|Ị)/I/g;
         $s =~ s/(Ḱ|Ḳ|Ḵ)/K/g;
         $s =~ s/(Ḷ|Ḹ|Ḻ|Ḽ)/L/g;
         $s =~ s/(Ḿ|Ṁ|Ṃ)/M/g;
         $s =~ s/(Ṅ|Ṇ|Ṉ|Ṋ)/N/g;
         $s =~ s/(Ṍ|Ṏ|Ṑ|Ṓ|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ)/O/g;
         $s =~ s/(Ṕ|Ṗ)/P/g;
         $s =~ s/(Ṙ|Ṛ|Ṝ|Ṟ)/R/g;
         $s =~ s/(Ṡ|Ṣ|Ṥ|Ṧ|Ṩ)/S/g;
         $s =~ s/(Ṫ|Ṭ|Ṯ|Ṱ)/T/g;
         $s =~ s/(Ṳ|Ṵ|Ṷ|Ṹ|Ṻ|Ụ|Ủ|Ứ|Ừ|Ử|Ữ|Ự)/U/g;
         $s =~ s/(Ṽ|Ṿ)/V/g;
         $s =~ s/(Ẁ|Ẃ|Ẅ|Ẇ|Ẉ)/W/g;
         $s =~ s/(Ẍ)/X/g;
         $s =~ s/(Ẏ|Ỳ|Ỵ|Ỷ|Ỹ)/Y/g;
         $s =~ s/(Ẑ|Ẓ|Ẕ)/Z/g;
      }
      # Greek letters (tonos/dialytika removed)
      if ($s =~ /\xCE[\x86-\xAB]/) {
         $s =~ s/ά/α/g;
         $s =~ s/έ/ε/g;
         $s =~ s/ί/ι/g;
         $s =~ s/ϊ/ι/g;
         $s =~ s/ΐ/ι/g;
         $s =~ s/ό/ο/g;
         $s =~ s/ύ/υ/g;
         $s =~ s/ϋ/υ/g;
         $s =~ s/ΰ/υ/g;
         $s =~ s/ώ/ω/g;
         $s =~ s/Ά/Α/g;
         $s =~ s/Έ/Ε/g;
         $s =~ s/Ή/Η/g;
         $s =~ s/Ί/Ι/g;
         $s =~ s/Ϊ/Ι/g;
         $s =~ s/Ύ/Υ/g;
         $s =~ s/Ϋ/Υ/g;
         $s =~ s/Ώ/Ω/g;
      }
      # Cyrillic letters (grave/breve/diaeresis removed)
      if ($s =~ /\xD0[\x80-\xAF]/) {
         $s =~ s/Ѐ/Е/g;
         $s =~ s/Ё/Е/g;
         $s =~ s/Ѓ/Г/g;
         $s =~ s/Ќ/К/g;
         $s =~ s/Ѝ/И/g;
         $s =~ s/Й/И/g;
         $s =~ s/ѐ/е/g;
         $s =~ s/ё/е/g;
         $s =~ s/ѓ/г/g;
         $s =~ s/ќ/к/g;
         $s =~ s/ѝ/и/g;
         $s =~ s/й/и/g;
      }
   }
   return $s;
}
181
+
182
# Command-line handling: -h/-help prints usage, -v/-version prints the version
# banner (both exit with status 1); any other argument is reported on STDERR
# and skipped. Afterwards, copy input to output, de-accenting every line.
while (defined($opt = shift @ARGV)) {
   if ($opt =~ /^-*(v|version)$/i) {
      &print_version;
      exit 1;
   } elsif ($opt =~ /^-*(h|help)$/i) {
      &print_usage;
      exit 1;
   } else {
      print STDERR "Ignoring unrecognized argument $opt\n";
   }
}

# Main filter loop: read from files named on the command line or STDIN.
$line_number = 0;
while (<>) {
   $line_number++;
   print &de_accent_string($_);
}
exit 0;
201
+
uroman-1.2.8/bin/string-distance.pl ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/perl -w

# Author: Ulf Hermjakob
# Release date: October 13, 2019

# Usage: string-distance.pl {-lc1 <language-code>} {-lc2 <language-code>} < STDIN > STDOUT
# Example: string-distance.pl -lc1 rus -lc2 ukr < STDIN > STDOUT
# Example: string-distance.pl < ../test/string-similarity-test-input.txt
# Input format: two strings per line (tab-separated, in Latin script)
# Strings in non-Latin scripts should first be romanized. (Recommended script: uroman.pl)
# Output format: repetition of the two input strings, plus the string distance between them (tab-separated).
# Additional output meta info lines at the top are marked with an initial #.
#
# The script uses data from a string-distance-cost-rules file that lists costs,
# where the default cost is "1" with lower costs for differences in vowels,
# duplicate consonants, "f" vs. "ph" etc.
# Language cost rules can be language-specific and context-sensitive.

$|=1;  # autoflush output so progress is visible when piped

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec);
99
+
uroman-1.2.8/bin/uroman-quick.pl ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/perl -w

# uroman  Nov. 12, 2015 - July 25, 2016
# version v0.7
# Author: Ulf Hermjakob

# Usage: uroman-quick.pl {-l [tur|uig|ukr|yid]} < STDIN
#   currently only for Arabic script languages, incl. Uyghur

$|=1;  # autoflush output so progress is visible when piped

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# Resolve the package layout relative to this script: bin/, data/, lib/.
my $bin_dir = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");
my $lib_dir = File::Spec->catfile($root_dir, "lib");

use lib "$FindBin::Bin/../lib";
use NLP::Romanizer;
use NLP::UTF8;
$romanizer = NLP::Romanizer;
%ht = ();        # shared hash for romanization tables (passed by typeglob)
$lang_code = ""; # optional ISO 639-3 language code from -l

# Only -l/-lc/-lang-code is recognized; anything else is reported and skipped.
while (@ARGV) {
   $arg = shift @ARGV;
   if ($arg =~ /^-+(l|lc|lang-code)$/) {
      $lang_code = lc (shift @ARGV || "")
   } else {
      print STDERR "Ignoring unrecognized arg $arg\n";
   }
}

# Arabic-block table is loaded first so its entries take precedence for the
# quick (non-chart) romanization path.
$romanization_table_arabic_block_filename = File::Spec->catfile($data_dir, "romanization-table-arabic-block.txt");
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");

$romanizer->load_romanization_table(*ht, $romanization_table_arabic_block_filename);
$romanizer->load_romanization_table(*ht, $romanization_table_filename);

# Romanize line by line; print progress markers on STDERR
# (a dot every 1000 lines, the line number every 10000).
$line_number = 0;
while (<>) {
   $line_number++;
   my $line = $_;
   print $romanizer->quick_romanize($line, $lang_code, *ht) . "\n";
   if ($line_number =~ /0000$/) {
      print STDERR $line_number;
   } elsif ($line_number =~ /000$/) {
      print STDERR ".";
   }
}
print STDERR "\n";

exit 0;
58
+
uroman-1.2.8/bin/uroman-tsv.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Created by Thamme Gowda on June 17, 2019
#
# Romanize column 2 of a two-column TSV file with uroman.pl, preserving
# column 1, and write the result to stdout or to an optional output file.
# Usage: uroman-tsv.sh <input.tsv> [<output.tsv>]

DIR=$(dirname "${BASH_SOURCE[0]}")  # get the directory name
# DIR=$(realpath "${DIR}")          # resolve its full path if need be

if [[ $# -lt 1 || $# -gt 2 ]]; then
    >&2 echo "ERROR: invalid args"
    >&2 echo "Usage: <input.tsv> [<output.tsv>]"
    exit 2
fi

INP=$1
OUT=$2

CMD=$DIR/uroman.pl

# Paste the untouched first column next to the romanized second column.
# Fix: quote $INP and $CMD so paths containing spaces work.
function romanize(){
    paste <(cut -f1 "$INP") <(cut -f2 "$INP" | "$CMD")
}

if [[ -n $OUT ]]; then
    romanize > "$OUT"   # quoted: output path may contain spaces
else
    romanize
fi
27
+
28
+
uroman-1.2.8/bin/uroman.pl ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/perl -w

# uroman  Nov. 12, 2015 - Apr. 23, 2021
$version = "v1.2.8";
# Author: Ulf Hermjakob

# Usage: uroman.pl {-l [ara|bel|bul|deu|ell|eng|fas|grc|heb|kaz|kir|lav|lit|mkd|mkd2|oss|pnt|rus|srp|srp2|tur|uig|ukr|yid]} {--chart|--offset-mapping} {--no-cache} {--workset} < STDIN
# Example: cat workset.txt | uroman.pl --offset-mapping --workset

$|=1;  # force flush so output interleaves correctly when piped

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# Resolve the package layout relative to this script: bin/, data/, lib/.
my $bin_dir = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");
my $lib_dir = File::Spec->catfile($root_dir, "lib");

use lib "$FindBin::Bin/../lib";
use NLP::Chinese;
use NLP::Romanizer;
use NLP::UTF8;
use NLP::utilities;
use JSON;
$chinesePM = NLP::Chinese;
$romanizer = NLP::Romanizer;
$util = NLP::utilities;
%ht = ();         # shared hash for script/Unicode/romanization data (passed by typeglob)
%pinyin_ht = ();  # Chinese-to-pinyin table, loaded lazily on first CJK input
$lang_code = "";              # optional ISO 639-3 language code (-l)
$return_chart_p = 0;          # --chart: emit full romanization chart as JSON
$return_offset_mappings_p = 0; # --offset-mapping: emit ::orig/::rom/::align records
$workset_p = 0;               # --workset: input lines are "<snt-id> <text>"
$cache_rom_tokens_p = 1;      # token-level caching on unless --no-cache

$script_data_filename = File::Spec->catfile($data_dir, "Scripts.txt");
$unicode_data_overwrite_filename = File::Spec->catfile($data_dir, "UnicodeDataOverwrite.txt");
$unicode_data_filename = File::Spec->catfile($data_dir, "UnicodeData.txt");
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");
$chinese_tonal_pinyin_filename = File::Spec->catfile($data_dir, "Chinese_to_Pinyin.txt");

# Command-line parsing; unknown options are reported and skipped.
while (@ARGV) {
   $arg = shift @ARGV;
   if ($arg =~ /^-+(l|lc|lang-code)$/) {
      $lang_code = lc (shift @ARGV || "")
   } elsif ($arg =~ /^-+chart$/i) {
      $return_chart_p = 1;
   } elsif ($arg =~ /^-+workset$/i) {
      $workset_p = 1;
   } elsif ($arg =~ /^-+offset[-_]*map/i) {
      $return_offset_mappings_p = 1;
   } elsif ($arg =~ /^-+unicode[-_]?data/i) {
      # Optional override for the UnicodeData.txt location; must be readable.
      $filename = shift @ARGV;
      if (-r $filename) {
         $unicode_data_filename = $filename;
      } else {
         print STDERR "Ignoring invalid UnicodeData filename $filename\n";
      }
   } elsif ($arg =~ /^-+(no-tok-cach|no-cach)/i) {
      $cache_rom_tokens_p = 0;
   } else {
      print STDERR "Ignoring unrecognized arg $arg\n";
   }
}

# Load all static resources up front; overwrite entries are applied after
# the base UnicodeData so they take precedence.
$romanizer->load_script_data(*ht, $script_data_filename);
$romanizer->load_unicode_data(*ht, $unicode_data_filename);
$romanizer->load_unicode_overwrite_romanization(*ht, $unicode_data_overwrite_filename);
$romanizer->load_romanization_table(*ht, $romanization_table_filename);
$chinese_to_pinyin_not_yet_loaded_p = 1;
$current_date = $util->datetime("dateTtime");
$lang_code_clause = ($lang_code) ? " \"lang-code\":\"$lang_code\",\n" : "";

# In chart mode, open the JSON wrapper before streaming per-line elements.
print "{\n \"romanizer\":\"uroman $version (Ulf Hermjakob, USC/ISI)\",\n \"date\":\"$current_date\",\n$lang_code_clause \"romanization\": [\n" if $return_chart_p;
my $line_number = 0;
my $chart_result = "";
while (<>) {
   $line_number++;
   my $line = $_;
   my $snt_id = "";
   if ($workset_p) {
      # Workset format: skip comments, split "<id>.<n> <text>" lines;
      # anything else is silently dropped.
      next if $line =~ /^#/;
      if (($i_value, $s_value) = ($line =~ /^(\S+\.\d+)\s(.*)$/)) {
         $snt_id = $i_value;
         $line = "$s_value\n";
      } else {
         next;
      }
   }
   # Lazily load the large Chinese-to-pinyin table on first CJK ideograph.
   if ($chinese_to_pinyin_not_yet_loaded_p && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($line)) {
      $chinesePM->read_chinese_tonal_pinyin_files(*pinyin_ht, $chinese_tonal_pinyin_filename);
      $chinese_to_pinyin_not_yet_loaded_p = 0;
   }
   if ($return_chart_p) {
      # Chart output is delayed by one line so the final element's trailing
      # comma can be stripped after the loop.
      print $chart_result;
      *chart_ht = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return chart", $line_number);
      $chart_result = $romanizer->chart_to_json_romanization_elements(0, $chart_ht{N_CHARS}, *chart_ht, $line_number);
   } elsif ($return_offset_mappings_p) {
      ($best_romanization, $offset_mappings) = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return offset mappings", $line_number, 0);
      print "::snt-id $snt_id\n" if $workset_p;
      print "::orig $line";
      print "::rom $best_romanization\n";
      print "::align $offset_mappings\n\n";
   } elsif ($cache_rom_tokens_p) {
      # Default path: token-level caching speeds up repetitive input.
      print $romanizer->romanize_by_token_with_caching($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
   } else {
      print $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
   }
}
# Flush the buffered last chart element (minus its trailing comma) and close
# the JSON wrapper.
$chart_result =~ s/,(\s*)$/$1/;
print $chart_result;
print " ]\n}\n" if $return_chart_p;

# Developer-only diagnostics: report code points whose romanization looks
# suspiciously long. Disabled by default.
$dev_test_p = 0;
if ($dev_test_p) {
   $n_suspicious_code_points = 0;
   $n_instances = 0;
   foreach $char_name (sort { hex($ht{UTF_NAME_TO_UNICODE}->{$a}) <=> hex($ht{UTF_NAME_TO_UNICODE}->{$b}) }
                       keys %{$ht{SUSPICIOUS_ROMANIZATION}}) {
      $unicode_value = $ht{UTF_NAME_TO_UNICODE}->{$char_name};
      $utf8_string = $ht{UTF_NAME_TO_CODE}->{$char_name};
      foreach $romanization (sort keys %{$ht{SUSPICIOUS_ROMANIZATION}->{$char_name}}) {
         $count = $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization};
         $s = ($count == 1) ? "" : "s";
         print STDERR "*** Suspiciously lengthy romanization:\n" unless $n_suspicious_code_points;
         print STDERR "::s $utf8_string ::t $romanization ::comment $char_name (U+$unicode_value)\n";
         $n_suspicious_code_points++;
         $n_instances += $count;
      }
   }
   print STDERR " *** Total of $n_suspicious_code_points suspicious code points ($n_instances instance$s)\n" if $n_suspicious_code_points;
}

exit 0;
138
+
uroman-1.2.8/data/Chinese_to_Pinyin.txt ADDED
The diff for this file is too large to render. See raw diff
 
uroman-1.2.8/data/Scripts.txt ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::script-name Aegean
2
+ ::script-name Ahom
3
+ ::script-name Anatolian Hieroglyph
4
+ ::script-name Arabic ::direction right-to-left
5
+ ::script-name Armenian
6
+ ::script-name Avestan
7
+ ::script-name Balinese
8
+ ::script-name Bamum
9
+ ::script-name Bassa Vah
10
+ ::script-name Batak
11
+ ::script-name Bengali ::abugida-default-vowel a
12
+ ::script-name Bhaiksuki
13
+ ::script-name Bopomofo ::language Chinese
14
+ ::script-name Brahmi ::abugida-default-vowel a
15
+ ::script-name Braille
16
+ ::script-name Buginese
17
+ ::script-name Buhid
18
+ ::script-name Canadian Syllabics
19
+ ::script-name Carian
20
+ ::script-name Caucasian Albanian
21
+ ::script-name Chakma
22
+ ::script-name Cham
23
+ ::script-name Cherokee
24
+ ::script-name Coptic
25
+ ::script-name Cuneiform
26
+ ::script-name Cypriot
27
+ ::script-name Cyrillic
28
+ ::script-name CJK ::alt-script-name Chinese, Kanji ::language Chinese, Japanese, Korean, Mandarin
29
+ ::script-name Deseret
30
+ ::script-name Devanagari ::abugida-default-vowel a
31
+ ::script-name Duployan
32
+ ::script-name Egyptian Hieroglyph
33
+ ::script-name Elbasan
34
+ ::script-name Ethiopic
35
+ ::script-name Georgian
36
+ ::script-name Glagolitic
37
+ ::script-name Gothic
38
+ ::script-name Grantha
39
+ ::script-name Greek
40
+ ::script-name Gujarati ::abugida-default-vowel a
41
+ ::script-name Gurmukhi ::abugida-default-vowel a
42
+ ::script-name Hangul ::language Korean
43
+ ::script-name Hanunoo
44
+ ::script-name Hatran
45
+ ::script-name Hebrew ::direction right-to-left
46
+ ::script-name Hiragana ::language Japanese
47
+ ::script-name Imperial Aramaic
48
+ ::script-name Inscriptional Pahlavi
49
+ ::script-name Inscriptional Parthian
50
+ ::script-name Javanese
51
+ ::script-name Kaithi
52
+ ::script-name Kannada ::abugida-default-vowel a
53
+ ::script-name Katakana ::language Japanese
54
+ ::script-name Kayah Li
55
+ ::script-name Kharoshthi
56
+ ::script-name Khmer ::abugida-default-vowel a, o
57
+ ::script-name Khojki
58
+ ::script-name Khudawadi
59
+ ::script-name Klingon
60
+ ::script-name Lao
61
+ ::script-name Lepcha
62
+ ::script-name Latin
63
+ ::script-name Limbu
64
+ ::script-name Linear A
65
+ ::script-name Linear B
66
+ ::script-name Lycian
67
+ ::script-name Lydian
68
+ ::script-name Mahajani
69
+ ::script-name Malayalam ::abugida-default-vowel a
70
+ ::script-name Mandaic
71
+ ::script-name Manichaean
72
+ ::script-name Marchen
73
+ ::script-name Meetei Mayek
74
+ ::script-name Meroitic Cursive
75
+ ::script-name Meroitic Hieroglyphic
76
+ ::script-name Miao
77
+ ::script-name Modi ::abugida-default-vowel a
78
+ ::script-name Mongolian
79
+ ::script-name Mro
80
+ ::script-name Multani
81
+ ::script-name Myanmar ::alt-script-name Burmese ::abugida-default-vowel a
82
+ ::script-name Nabataean
83
+ ::script-name New Tai Lue
84
+ ::script-name Newa
85
+ ::script-name Nko ::direction right-to-left
86
+ ::script-name Ogham
87
+ ::script-name Ol Chiki
88
+ ::script-name Old Hungarian
89
+ ::script-name Old Italic
90
+ ::script-name Old Permic
91
+ ::script-name Old Persian
92
+ ::script-name Old North Arabian
93
+ ::script-name Old South Arabian
94
+ ::script-name Old Turkic
95
+ ::script-name Oriya ::alt-script-name Odia ::abugida-default-vowel a
96
+ ::script-name Osage
97
+ ::script-name Osmanya
98
+ ::script-name Pahawh Hmong
99
+ ::script-name Palmyrene
100
+ ::script-name Pau Cin Hau
101
+ ::script-name Phags-pa
102
+ ::script-name Phaistos Disc
103
+ ::script-name Phoenician
104
+ ::script-name Psalter Pahlavi
105
+ ::script-name Rejang
106
+ ::script-name Runic
107
+ ::script-name Samaritan
108
+ ::script-name Saurashtra
109
+ ::script-name Sharada
110
+ ::script-name Shavian
111
+ ::script-name Siddham
112
+ ::script-name Sinhala ::abugida-default-vowel a
113
+ ::script-name Sora Sompeng
114
+ ::script-name Sundanese ::abugida-default-vowel a
115
+ ::script-name Syloti Nagri
116
+ ::script-name Syriac
117
+ ::script-name Tagalog
118
+ ::script-name Tagbanwa
119
+ ::script-name Tai Le
120
+ ::script-name Tai Tham
121
+ ::script-name Tai Viet
122
+ ::script-name Takri
123
+ ::script-name Tamil ::abugida-default-vowel a
124
+ ::script-name Tangut
125
+ ::script-name Telugu ::abugida-default-vowel a
126
+ ::script-name Thaana ::direction right-to-left
127
+ ::script-name Thai
128
+ ::script-name Tibetan ::abugida-default-vowel a
129
+ ::script-name Tifinagh
130
+ ::script-name Tirhuta
131
+ ::script-name Ugaritic
132
+ ::script-name Vai
133
+ ::script-name Vedic
134
+ ::script-name Warang Citi
135
+ ::script-name Yi
uroman-1.2.8/data/UnicodeData.txt ADDED
The diff for this file is too large to render. See raw diff
 
uroman-1.2.8/data/UnicodeDataOverwrite.txt ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## UnicodeDataOverwrite.txt
2
+ ::u 00A0 ::r " " ::comment no-break space
3
+ ::u 01BF ::r w ::comment ƿ Latin Character Wynn (Old English)
4
+ ::u 0294 ::r ' ::comment glottal stop
5
+ ::u 0295 ::r ' ::comment ʕ voiced pharyngeal fricative
6
+ ::u 0305 ::r "" ::comment ̅ Combining overline
7
+ ::u 0306 ::r "" ::comment ̆ Combining breve
8
+ ::u 0307 ::r "" ::comment ̇ Combining dot above
9
+ ::u 030A ::r "" ::comment ̊ Combining ring above
10
+ ::u 030C ::r "" ::comment ̌ Combining caron
11
+ ::u 0311 ::r "" ::comment ̑ Combining inverted breve
12
+ ::u 031D ::r "" ::comment ̝ Combining down up below
13
+ ::u 031E ::r "" ::comment ̞ Combining down tack below
14
+ ::u 031F ::r "" ::comment ̟ Combining plus sign below
15
+ ::u 0323 ::r "" ::comment ̣ Combining dot below
16
+ ::u 0325 ::r "" ::comment ̥ Combining ring below
17
+ ::u 0329 ::r "" ::comment ̩ Combining vertical line below
18
+ ::u 032A ::r "" ::comment ̪ Combining bridge below
19
+ ::u 032F ::r "" ::comment ̯ Combining inverted breve below
20
+ ::u 0342 ::r "" ::comment ͂ Combining Greek perispomeni (circumflex accent)
21
+ ::u 0343 ::r "" ::comment ̓ Combining Greek koronis
22
+ ::u 0361 ::r "" ::comment Combining double inverted breve
23
+ ::u 0384 ::r "" ::comment ΄ Greek tonos
24
+ ::u 0482 ::r 1000· ::comment ҂ Cyrillic thousands sign
25
+ ::u 0483 ::r "" ::comment ҃ Combining Cyrillic Titlo ::annotation titlo
26
+ ::u 0484 ::r "" ::comment ҄ Combining Cyrillic Palatalization ::annotation palatalization
27
+ ::u 055B ::r "" ::comment ՛ Armenian emphasis mark
28
+ ::u 055F ::r "" ::comment ՟ Armenian abbreviation mark ::annotation abbreviation
29
+
30
+ ::u 0901 ::r +m ::comment Devanagari sign candrabindu
31
+ ::u 0902 ::r +m ::comment Devanagari sign anusvara
32
+ ::u 0903 ::r +h ::comment Devanagari sign visarga
33
+ ::u 093D ::r ' ::comment Devanagari sign avagraha
34
+ ::u 0950 ::r om ::comment ॐ Devanagari om symbol
35
+ ::u 0951 ::r "" ::comment ॑ Devanagari stress sign "udatta"
36
+ ::u 0952 ::r "" ::comment ॒ Devanagari stress sign "anudatta"
37
+ ::u 0981 ::r +n ::comment Bengali sign candrabindu ("chôndrôbindu")
38
+ ::u 0982 ::r +ng ::comment Bengali sign anusvara ("ônushar")
39
+ ::u 0983 ::r +h ::comment Bengali sign visarga ("bishôrgô")
40
+ ::u 099A ::r ch ::comment instead of Bengali C(A)
41
+ ::u 099B ::r chh ::comment instead of Bengali CC(A)
42
+ ::u 0A02 ::r +m ::comment Gurmukhi sign bindi
43
+ ::u 0A70 ::r +m ::comment Gurmukhi tippi
44
+ # ::u 0A72 ::r "" ::comment Gurmukhi addak
45
+ ::u 0A72 ::r "" ::comment Gurmukhi iri
46
+ ::u 0A73 ::r "" ::comment Gurmukhi ura
47
+ ::u 0B01 ::r +m ::comment Oriya sign candrabindu
48
+ ::u 0B03 ::r +h ::comment Oriya sign visarga
49
+ ::u 0B5F ::r ya ::comment ୟ Oriya letter yya
50
+ ::u 0B82 ::r +m ::comment Tamil sign anusvara (not to be used?)
51
+ ::u 0B83 ::r +h ::comment Tamil sign visarga ("āytam")
52
+ ::u 0B9F ::r t ::comment instead of Tamil TT(A)
53
+ ::u 0BA3 ::r n ::comment instead of Tamil NN(A)
54
+ ::u 0BA9 ::r n ::comment instead of Tamil NNN(A)
55
+ ::u 0BB1 ::r r ::comment instead of Tamil RR(A)
56
+ ::u 0BB3 ::r l ::comment instead of Tamil LL(A)
57
+ ::u 0BB4 ::r l ::comment instead of Tamil LLL(A)
58
+ ::u 0C03 ::r +h ::comment ః Telugu sign visarga
59
+ ::u 0C83 ::r +h ::comment Kannada sign visarga
60
+ ::u 0D02 ::r +m ::comment Malayalam sign anusvara
61
+ ::u 0D03 ::r +h ::comment Malayalam sign visarga
62
+ ::u 0D82 ::r +n ::comment Sinhala sign anusvaraya
63
+ ::u 0DA4 ::r ny ::comment Sinhala ඤ
64
+ ::u 0DA5 ::r gn ::comment Sinhala ඥ
65
+ ::u 0DCA ::r "" ::comment Sinhala sign al-lakuna (virama = no vowel)
66
+ ::u 0DCF ::r aa ::comment Sinhala ා
67
+ ::u 0DD0 ::r ae ::comment Sinhala ැ
68
+ ::u 0DD1 ::r ae ::comment Sinhala ෑ
69
+ ::u 0DD2 ::r i ::comment Sinhala ි
70
+ ::u 0DD3 ::r ii ::comment Sinhala ී
71
+ ::u 0DD4 ::r u ::comment Sinhala ු
72
+ ::u 0DD6 ::r uu ::comment Sinhala ූ
73
+ ::u 0DD8 ::r r ::comment Sinhala ෘ
74
+ ::u 0DD9 ::r e ::comment Sinhala ෙ
75
+ ::u 0DDA ::r ee ::comment Sinhala ේ
76
+ ::u 0DDB ::r ai ::comment Sinhala ෛ
77
+ ::u 0DDC ::r o ::comment Sinhala ො
78
+ ::u 0DDD ::r oo ::comment Sinhala ෝ
79
+ ::u 0DDE ::r au ::comment Sinhala ෞ
80
+ ::u 0DDF ::r aa ::comment Sinhala ා
81
+ ::u 0DF2 ::r rr ::comment Sinhala ෲ
82
+
83
+ ::u 0E02 ::r k ::comment Thai character KHO KHAI
84
+ ::u 0E03 ::r k ::comment Thai character KHO KHUAT
85
+ ::u 0E04 ::r k ::comment Thai character KHO KHWAI
86
+ ::u 0E05 ::r k ::comment Thai character KHO KHON
87
+ ::u 0E06 ::r k ::comment Thai character KHO RAKHANG
88
+ ::u 0E10 ::r t ::comment Thai character THO THAN
89
+ ::u 0E11 ::r t ::comment Thai character THO NANGMONTHO
90
+ ::u 0E12 ::r t ::comment Thai character THO PHUTHAO
91
+ ::u 0E16 ::r t ::comment Thai character THO THUNG
92
+ ::u 0E17 ::r t ::comment Thai character THO THAHAN
93
+ ::u 0E18 ::r t ::comment Thai character THO THONG
94
+ ::u 0E1C ::r p ::comment Thai character PHO PHUNG
95
+ ::u 0E1E ::r p ::comment Thai character PHO PHAN
96
+ ::u 0E20 ::r p ::comment Thai character PHO SAMPHAO
97
+ ::u 0E2D ::r o ::comment Thai character O ANG
98
+ ::u 0E2F ::r ... ::comment ฯ Thai character PAIYANNOI (ellipsis, abbreviation)
99
+ ::u 0E31 ::r a ::comment Thai character MAI HAN-AKAT
100
+ ::u 0E3A ::r "" ::comment Thai character PHINTHU (Pali virama)
101
+ ::u 0E40 ::r e ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA E
102
+ ::u 0E41 ::r ae ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AE
103
+ ::u 0E42 ::r o ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA O
104
+ ::u 0E43 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMUAN
105
+ ::u 0E44 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMALAI
106
+ ::u 0E45 ::r "" ::comment Thai character LAKKHANGYAO vowel lengthener
107
+ ::u 0E47 ::r o ::comment Thai character MAITAIKHU vowel shortener
108
+ ::u 0E48 ::r "" ::tone-mark non-standard ::comment Thai tone mark MAI EK
109
+ ::u 0E49 ::r "" ::tone-mark standard ::comment Thai tone mark MAI THO
110
+ ::u 0E4A ::r "" ::tone-mark high ::comment Thai tone mark MAI TRI
111
+ ::u 0E4B ::r "" ::tone-mark rising ::comment Thai tone mark MAI CHATTAWA
112
+ ::u 0E4C ::r "" ::comment Thai character THANTHAKHAT cancellation mark (cf. virama)
113
+ ::u 0E4D ::r +m ::comment ํ Thai character NIKHAHIT final nasal (cf. anusvara)
114
+ ::u 0ECC ::r "" ::comment ໌ Lao cancellation mark ::annotation cancellation
115
+ ::u 0F0B ::r · ::comment ་ Tibetan mark intersyllabic tsheg
116
+ ::u 0F0C ::r "" ::comment ༌ Tibetan mark delimiter tsheg bstar
117
+ ::u 0F84 ::r "" ::comment ྄ Tibetan halanta
118
+ ::u 1036 ::r +n ::comment Myanmar sign anusvara ("auk myit")
119
+ ::u 1037 ::r "" ::tone-mark creaky ::comment Myanmar sign dot below
120
+ ::u 1038 ::r "" ::tone-mark high ::comment Myanmar sign visarga
121
+
122
+ ::u 16A0 ::r f ::comment ᚠ RUNIC LETTER FEHU FEOH FE F
123
+ ::u 16A1 ::r v ::comment ᚡ RUNIC LETTER V
124
+ ::u 16A2 ::r u ::comment ᚢ RUNIC LETTER URUZ UR U
125
+ ::u 16A3 ::r y ::comment ᚣ RUNIC LETTER YR
126
+ ::u 16A4 ::r y ::comment ᚤ RUNIC LETTER Y
127
+ ::u 16A5 ::r w ::comment ᚥ RUNIC LETTER W
128
+ ::u 16A6 ::r th ::comment ᚦ RUNIC LETTER THURISAZ THURS THORN
129
+ ::u 16A7 ::r th ::comment ᚧ RUNIC LETTER ETH
130
+ ::u 16A8 ::r a ::comment ᚨ RUNIC LETTER ANSUZ A
131
+ ::u 16A9 ::r o ::comment ᚩ RUNIC LETTER OS O
132
+ ::u 16AA ::r a ::comment ᚪ RUNIC LETTER AC A
133
+ ::u 16AB ::r ae ::comment ᚫ RUNIC LETTER AESC
134
+ ::u 16AC ::r o ::comment ᚬ RUNIC LETTER LONG-BRANCH-OSS O
135
+ ::u 16AD ::r o ::comment ᚭ RUNIC LETTER SHORT-TWIG-OSS O
136
+ ::u 16AE ::r o ::comment ᚮ RUNIC LETTER O
137
+ ::u 16AF ::r oe ::comment ᚯ RUNIC LETTER OE
138
+ ::u 16B0 ::r on ::comment ᚰ RUNIC LETTER ON
139
+ ::u 16B1 ::r r ::comment ᚱ RUNIC LETTER RAIDO RAD REID R
140
+ ::u 16B2 ::r k ::comment ᚲ RUNIC LETTER KAUNA
141
+ ::u 16B3 ::r c ::comment ᚳ RUNIC LETTER CEN
142
+ ::u 16B4 ::r k ::comment ᚴ RUNIC LETTER KAUN K
143
+ ::u 16B5 ::r g ::comment ᚵ RUNIC LETTER G
144
+ ::u 16B6 ::r ng ::comment ᚶ RUNIC LETTER ENG
145
+ ::u 16B7 ::r g ::comment ᚷ RUNIC LETTER GEBO GYFU G
146
+ ::u 16B8 ::r g ::comment ᚸ RUNIC LETTER GAR
147
+ ::u 16B9 ::r w ::comment ᚹ RUNIC LETTER WUNJO WYNN W
148
+ ::u 16BA ::r h ::comment ᚺ RUNIC LETTER HAGLAZ H
149
+ ::u 16BB ::r h ::comment ᚻ RUNIC LETTER HAEGL H
150
+ ::u 16BC ::r h ::comment ᚼ RUNIC LETTER LONG-BRANCH-HAGALL H
151
+ ::u 16BD ::r h ::comment ᚽ RUNIC LETTER SHORT-TWIG-HAGALL H
152
+ ::u 16BE ::r n ::comment ᚾ RUNIC LETTER NAUDIZ NYD NAUD N
153
+ ::u 16BF ::r n ::comment ᚿ RUNIC LETTER SHORT-TWIG-NAUD N
154
+ ::u 16C0 ::r n ::comment ᛀ RUNIC LETTER DOTTED-N
155
+ ::u 16C1 ::r i ::comment ᛁ RUNIC LETTER ISAZ IS ISS I
156
+ ::u 16C2 ::r e ::comment ᛂ RUNIC LETTER E
157
+ ::u 16C3 ::r j ::comment ᛃ RUNIC LETTER JERAN J
158
+ ::u 16C4 ::r j ::comment ᛄ RUNIC LETTER GER
159
+ ::u 16C5 ::r ae ::comment ᛅ RUNIC LETTER LONG-BRANCH-AR AE
160
+ ::u 16C6 ::r a ::comment ᛆ RUNIC LETTER SHORT-TWIG-AR A
161
+ ::u 16C7 ::r i ::comment ᛇ RUNIC LETTER IWAZ EOH
162
+ ::u 16C8 ::r p ::comment ᛈ RUNIC LETTER PERTHO PEORTH P
163
+ ::u 16C9 ::r z ::comment ᛉ RUNIC LETTER ALGIZ EOLHX
164
+ ::u 16CA ::r s ::comment ᛊ RUNIC LETTER SOWILO S
165
+ ::u 16CB ::r s ::comment ᛋ RUNIC LETTER SIGEL LONG-BRANCH-SOL S
166
+ ::u 16CC ::r s ::comment ᛌ RUNIC LETTER SHORT-TWIG-SOL S
167
+ ::u 16CD ::r c ::comment ᛍ RUNIC LETTER C
168
+ ::u 16CE ::r z ::comment ᛎ RUNIC LETTER Z
169
+ ::u 16CF ::r t ::comment ᛏ RUNIC LETTER TIWAZ TIR TYR T
170
+ ::u 16D0 ::r t ::comment ᛐ RUNIC LETTER SHORT-TWIG-TYR T
171
+ ::u 16D1 ::r d ::comment ᛑ RUNIC LETTER D
172
+ ::u 16D2 ::r b ::comment ᛒ RUNIC LETTER BERKANAN BEORC BJARKAN B
173
+ ::u 16D3 ::r b ::comment ᛓ RUNIC LETTER SHORT-TWIG-BJARKAN B
174
+ ::u 16D4 ::r p ::comment ᛔ RUNIC LETTER DOTTED-P
175
+ ::u 16D5 ::r p ::comment ᛕ RUNIC LETTER OPEN-P
176
+ ::u 16D6 ::r e ::comment ᛖ RUNIC LETTER EHWAZ EH E
177
+ ::u 16D7 ::r m ::comment ᛗ RUNIC LETTER MANNAZ MAN M
178
+ ::u 16D8 ::r m ::comment ᛘ RUNIC LETTER LONG-BRANCH-MADR M
179
+ ::u 16D9 ::r m ::comment ᛙ RUNIC LETTER SHORT-TWIG-MADR M
180
+ ::u 16DA ::r l ::comment ᛚ RUNIC LETTER LAUKAZ LAGU LOGR L
181
+ ::u 16DB ::r l ::comment ᛛ RUNIC LETTER DOTTED-L
182
+ ::u 16DC ::r ng ::comment ᛜ RUNIC LETTER INGWAZ
183
+ ::u 16DD ::r ng ::comment ᛝ RUNIC LETTER ING
184
+ ::u 16DE ::r d ::comment ᛞ RUNIC LETTER DAGAZ DAEG D
185
+ ::u 16DF ::r o ::comment ᛟ RUNIC LETTER OTHALAN ETHEL O
186
+ ::u 16E0 ::r ea ::comment ᛠ RUNIC LETTER EAR
187
+ ::u 16E1 ::r io ::comment ᛡ RUNIC LETTER IOR
188
+ ::u 16E2 ::r q ::comment ᛢ RUNIC LETTER CWEORTH
189
+ ::u 16E3 ::r k ::comment ᛣ RUNIC LETTER CALC
190
+ ::u 16E4 ::r k ::comment ᛤ RUNIC LETTER CEALC
191
+ ::u 16E5 ::r st ::comment ᛥ RUNIC LETTER STAN
192
+ ::u 16E6 ::r r ::comment ᛦ RUNIC LETTER LONG-BRANCH-YR
193
+ ::u 16E7 ::r r ::comment ᛧ RUNIC LETTER SHORT-TWIG-YR
194
+ ::u 16E8 ::r r ::comment ᛨ RUNIC LETTER ICELANDIC-YR
195
+ ::u 16E9 ::r q ::comment ᛩ RUNIC LETTER Q
196
+ ::u 16EA ::r x ::comment ᛪ RUNIC LETTER X
197
+
198
+ ::u 17B9 ::r oe ::comment Khmer vowel sign y (short)
199
+ ::u 17BA ::r oe ::comment Khmer vowel sign yy (long)
200
+ ::u 17C6 ::r +m ::comment Khmer sign nikahit (cf. anusvara)
201
+ ::u 17C7 ::r +h ::comment Khmer sign reahmuk (cf. visarga)
202
+ ::u 17C8 ::r ' ::comment Khmer sign yuukaleapintu (short vowel and glottal stop)
203
+ ::u 17C9 ::r "" ::comment Khmer sign muusikatoan: changes the second register to the first
204
+ ::u 17CA ::r "" ::comment Khmer sign triisap: changes the first register to the second
205
+ ::u 17CB ::r "" ::comment Khmer sign bantoc (vowel shortener)
206
+ ::u 17D2 ::r "" ::comment Khmer sign coeng (foot/subscript, cf. virama = no vowel)
207
+ ::u 17D5 ::r . ::comment Khmer sign bariyoosan; period ending entire text or chapter
208
+
209
+ ::u 180E ::r ' ::comment ᠎ Mongolian vowel separator
210
+
211
+ ::u 1B80 ::r +ng ::comment ᮀ Sundanese sign panyecek
212
+ ::u 1B81 ::r +r ::comment ᮁ Sundanese sign panglayar
213
+ ::u 1B82 ::r +h ::comment ᮂ Sundanese sign pangwisad
214
+ ::u 1BA1 ::r ya ::comment ᮡ Sundanese consonant sign pamingkal
215
+ ::u 1BA2 ::r ra ::comment ᮢ Sundanese consonant sign panyakr
216
+ ::u 1BA3 ::r la ::comment ᮣ Sundanese consonant sign panyiku
217
+ ::u 1BA4 ::r i ::comment ᮤ Sundanese consonant sign panghulu
218
+ ::u 1BA5 ::r u ::comment ᮥ Sundanese consonant sign panyuku
219
+ ::u 1BA6 ::r e ::comment ᮦ Sundanese vowel sign panaelaeng
220
+ ::u 1BA7 ::r o ::comment ᮧ Sundanese vowel sign panolong
221
+ ::u 1BA8 ::r e ::comment ᮨ Sundanese vowel sign pamepet
222
+ ::u 1BA9 ::r eu ::comment ᮩ Sundanese vowel sign paneuleung
223
+ ::u 1BAA ::r "" ::comment ᮪ Sundanese sign pamaaeh or patén (no vowel/virama)
224
+
225
+ ::u 1FBD ::r "" ::comment ᾽ Greek koronis
226
+ ::u 1FFE ::r "" ::comment Greek dasia (rough breathing)
227
+
228
+ ::u 2002 ::r " " ::comment en space
229
+ ::u 2003 ::r " " ::comment em space
230
+ ::u 2004 ::r " " ::comment three-per-em space
231
+ ::u 2005 ::r " " ::comment four-per-em space
232
+ ::u 2006 ::r " " ::comment six-per-em space
233
+ ::u 2007 ::r " " ::comment figure space
234
+ ::u 2008 ::r " " ::comment punctuation space
235
+ ::u 2009 ::r " " ::comment thin space
236
+ ::u 200A ::r " " ::comment hair space
237
+ ::u 202F ::r " " ::comment narrow no-break space
238
+
239
+ ::u 2D30 ::r a ::comment TIFINAGH LETTER YA ⴰ
240
+ ::u 2D31 ::r b ::comment TIFINAGH LETTER YAB ⴱ
241
+ ::u 2D32 ::r bh ::comment TIFINAGH LETTER YABH ⴲ
242
+ ::u 2D33 ::r g ::comment TIFINAGH LETTER YAG ⴳ
243
+ ::u 2D34 ::r ghh ::comment TIFINAGH LETTER YAGHH ⴴ
244
+ ::u 2D35 ::r j ::comment TIFINAGH LETTER BERBER ACADEMY YAJ ⴵ
245
+ ::u 2D36 ::r j ::comment TIFINAGH LETTER YAJ ⴶ
246
+ ::u 2D37 ::r d ::comment TIFINAGH LETTER YAD ⴷ
247
+ ::u 2D38 ::r dh ::comment TIFINAGH LETTER YADH ⴸ
248
+ ::u 2D39 ::r dd ::comment TIFINAGH LETTER YADD ⴹ
249
+ ::u 2D3A ::r ddh ::comment TIFINAGH LETTER YADDH ⴺ
250
+ ::u 2D3B ::r e ::comment TIFINAGH LETTER YEY ⴻ
251
+ ::u 2D3C ::r f ::comment TIFINAGH LETTER YAF ⴼ
252
+ ::u 2D3D ::r k ::comment TIFINAGH LETTER YAK ⴽ
253
+ ::u 2D3E ::r k ::comment TIFINAGH LETTER TUAREG YAK ⴾ
254
+ ::u 2D3F ::r khh ::comment TIFINAGH LETTER YAKHH ⴿ
255
+ ::u 2D40 ::r h ::comment TIFINAGH LETTER YAH ⵀ
256
+ ::u 2D41 ::r h ::comment TIFINAGH LETTER BERBER ACADEMY YAH ⵁ
257
+ ::u 2D42 ::r h ::comment TIFINAGH LETTER TUAREG YAH ⵂ
258
+ ::u 2D43 ::r hh ::comment TIFINAGH LETTER YAHH ⵃ
259
+ ::u 2D44 ::r ' ::comment TIFINAGH LETTER YAA ⵄ
260
+ ::u 2D45 ::r kh ::comment TIFINAGH LETTER YAKH ⵅ
261
+ ::u 2D46 ::r kh ::comment TIFINAGH LETTER TUAREG YAKH ⵆ
262
+ ::u 2D47 ::r q ::comment TIFINAGH LETTER YAQ ⵇ
263
+ ::u 2D48 ::r q ::comment TIFINAGH LETTER TUAREG YAQ ⵈ
264
+ ::u 2D49 ::r i ::comment TIFINAGH LETTER YI ⵉ
265
+ ::u 2D4A ::r zh ::comment TIFINAGH LETTER YAZH ⵊ
266
+ ::u 2D4B ::r zh ::comment TIFINAGH LETTER AHAGGAR YAZH ⵋ
267
+ ::u 2D4C ::r zh ::comment TIFINAGH LETTER TUAREG YAZH ⵌ
268
+ ::u 2D4D ::r l ::comment TIFINAGH LETTER YAL ⵍ
269
+ ::u 2D4E ::r m ::comment TIFINAGH LETTER YAM ⵎ
270
+ ::u 2D4F ::r n ::comment TIFINAGH LETTER YAN ⵏ
271
+ ::u 2D50 ::r gn ::comment TIFINAGH LETTER TUAREG YAGN ⵐ
272
+ ::u 2D51 ::r ng ::comment TIFINAGH LETTER TUAREG YANG ⵑ
273
+ ::u 2D52 ::r p ::comment TIFINAGH LETTER YAP ⵒ
274
+ ::u 2D53 ::r u ::comment TIFINAGH LETTER YU ⵓ
275
+ ::u 2D54 ::r r ::comment TIFINAGH LETTER YAR ⵔ
276
+ ::u 2D55 ::r rr ::comment TIFINAGH LETTER YARR ⵕ
277
+ ::u 2D56 ::r gh ::comment TIFINAGH LETTER YAGH ⵖ
278
+ ::u 2D57 ::r gh ::comment TIFINAGH LETTER TUAREG YAGH ⵗ
279
+ ::u 2D58 ::r gh ::comment TIFINAGH LETTER AYER YAGH ⵘ
280
+ ::u 2D59 ::r s ::comment TIFINAGH LETTER YAS ⵙ
281
+ ::u 2D5A ::r ss ::comment TIFINAGH LETTER YASS ⵚ
282
+ ::u 2D5B ::r sh ::comment TIFINAGH LETTER YASH ⵛ
283
+ ::u 2D5C ::r t ::comment TIFINAGH LETTER YAT ⵜ
284
+ ::u 2D5D ::r th ::comment TIFINAGH LETTER YATH ⵝ
285
+ ::u 2D5E ::r ch ::comment TIFINAGH LETTER YACH ⵞ
286
+ ::u 2D5F ::r tt ::comment TIFINAGH LETTER YATT ⵟ
287
+ ::u 2D60 ::r v ::comment TIFINAGH LETTER YAV ⵠ
288
+ ::u 2D61 ::r w ::comment TIFINAGH LETTER YAW ⵡ
289
+ ::u 2D62 ::r y ::comment TIFINAGH LETTER YAY ⵢ
290
+ ::u 2D63 ::r z ::comment TIFINAGH LETTER YAZ ⵣ
291
+ ::u 2D64 ::r z ::comment TIFINAGH LETTER TAWELLEMET YAZ ⵤ
292
+ ::u 2D65 ::r zz ::comment TIFINAGH LETTER YAZZ ⵥ
293
+ ::u 2D66 ::r ye ::comment TIFINAGH LETTER YE ⵦ
294
+ ::u 2D67 ::r yo ::comment TIFINAGH LETTER YO ⵧ
295
+ ::u 2D6F ::r "" ::comment TIFINAGH MODIFIER LETTER LABIALIZATION MARK ⵯ
296
+ ::u 2D70 ::r "" ::comment TIFINAGH SEPARATOR MARK ⵰
297
+ ::u 2D7F ::r "" ::comment TIFINAGH CONSONANT JOINER ⵿
298
+
299
+ ::u 3063 ::r tsu ::comment Hiragana letter small tsu
300
+ ::u 30C3 ::r tsu ::comment Katakana letter small tsu
301
+
302
+ ::u ABE3 ::r o ::comment ꯣ Meetei Mayek vowel sign onap
303
+ ::u ABE7 ::r ou ::comment ꯧ Meetei Mayek vowel sign sounap
304
+
305
+ ::u F008 ::r "" ::comment Yoruba diacritic in private use area
306
+ ::u F00F ::r "" ::comment Yoruba diacritic in private use area
307
+ ::u F023 ::r "" ::comment Yoruba diacritic in private use area
308
+ ::u F025 ::r "" ::comment Yoruba diacritic in private use area
309
+
310
+ ::u F8D0 ::r a ::name KLINGON LETTER A
311
+ ::u F8D1 ::r b ::name KLINGON LETTER B
312
+ ::u F8D2 ::r ch ::name KLINGON LETTER CH
313
+ ::u F8D3 ::r D ::name KLINGON LETTER D
314
+ ::u F8D4 ::r e ::name KLINGON LETTER E
315
+ ::u F8D5 ::r gh ::name KLINGON LETTER GH
316
+ ::u F8D6 ::r H ::name KLINGON LETTER H
317
+ ::u F8D7 ::r I ::name KLINGON LETTER I
318
+ ::u F8D8 ::r j ::name KLINGON LETTER J
319
+ ::u F8D9 ::r l ::name KLINGON LETTER L
320
+ ::u F8DA ::r m ::name KLINGON LETTER M
321
+ ::u F8DB ::r n ::name KLINGON LETTER N
322
+ ::u F8DC ::r ng ::name KLINGON LETTER NG
323
+ ::u F8DD ::r o ::name KLINGON LETTER O
324
+ ::u F8DE ::r p ::name KLINGON LETTER P
325
+ ::u F8DF ::r q ::name KLINGON LETTER Q
326
+ ::u F8E0 ::r Q ::name KLINGON LETTER Q
327
+ ::u F8E1 ::r r ::name KLINGON LETTER R
328
+ ::u F8E2 ::r S ::name KLINGON LETTER S
329
+ ::u F8E3 ::r t ::name KLINGON LETTER T
330
+ ::u F8E4 ::r tlh ::name KLINGON LETTER TLH
331
+ ::u F8E5 ::r u ::name KLINGON LETTER U
332
+ ::u F8E6 ::r v ::name KLINGON LETTER V
333
+ ::u F8E7 ::r w ::name KLINGON LETTER W
334
+ ::u F8E8 ::r y ::name KLINGON LETTER Y
335
+ ::u F8E9 ::r ' ::name KLINGON LETTER GLOTTAL STOP
336
+ ::u F8F0 ::num 0 ::name KLINGON DIGIT ZERO
337
+ ::u F8F1 ::num 1 ::name KLINGON DIGIT ONE
338
+ ::u F8F2 ::num 2 ::name KLINGON DIGIT TWO
339
+ ::u F8F3 ::num 3 ::name KLINGON DIGIT THREE
340
+ ::u F8F4 ::num 4 ::name KLINGON DIGIT FOUR
341
+ ::u F8F5 ::num 5 ::name KLINGON DIGIT FIVE
342
+ ::u F8F6 ::num 6 ::name KLINGON DIGIT SIX
343
+ ::u F8F7 ::num 7 ::name KLINGON DIGIT SEVEN
344
+ ::u F8F8 ::num 8 ::name KLINGON DIGIT EIGHT
345
+ ::u F8F9 ::num 9 ::name KLINGON DIGIT NINE
346
+ ::u F8FD ::r , ::name KLINGON COMMA
347
+ ::u F8FE ::r . ::name KLINGON FULL STOP
348
+ ::u F8FF ::name KLINGON MUMMIFICATION GLYPH
349
+
350
+ ::u 1163D ::r +m ::comment Modi sign anusvara
351
+ ::u 1163E ::r +h ::comment Modi sign visarga
352
+
353
+ ::u 13068 ::num 1000000 ::comment Egyptian Hieroglyph
354
+ ::u 1308B ::r r ::comment Egyptian Hieroglyph ::pic mouth
355
+ ::u 1309D ::r ' ::comment Egyptian Hieroglyph (ayn) ::pic forearm
356
+ ::u 130A7 ::r d ::comment Egyptian Hieroglyph ::pic hand
357
+ ::u 130AD ::num 10000 ::comment Egyptian Hieroglyph
358
+ ::u 130AE ::num 20000 ::comment Egyptian Hieroglyph
359
+ ::u 130AF ::num 30000 ::comment Egyptian Hieroglyph
360
+ ::u 130B0 ::num 40000 ::comment Egyptian Hieroglyph
361
+ ::u 130B1 ::num 50000 ::comment Egyptian Hieroglyph
362
+ ::u 130B2 ::num 60000 ::comment Egyptian Hieroglyph
363
+ ::u 130B3 ::num 70000 ::comment Egyptian Hieroglyph
364
+ ::u 130B4 ::num 80000 ::comment Egyptian Hieroglyph
365
+ ::u 130B5 ::num 90000 ::comment Egyptian Hieroglyph
366
+ ::u 130B6 ::num 50000 ::comment Egyptian Hieroglyph
367
+ ::u 130C0 ::r b ::comment Egyptian Hieroglyph ::pic foot
368
+ ::u 130ED ::r l ::comment Egyptian Hieroglyph [also rw] ::pic lion recumbent
369
+ ::u 13121 ::r h ::comment Egyptian Hieroglyph (f-underscore) ::pic animal's belly and udder
370
+ ::u 1313F ::r a ::comment Egyptian Hieroglyph (alef) ::pic vulture
371
+ ::u 13153 ::r m ::comment Egyptian Hieroglyph ::pic owl
372
+ ::u 13171 ::r w ::comment Egyptian Hieroglyph ::pic quail chick
373
+ ::u 13187 ::r ::comment Egyptian Hieroglyph (determinative/son) H8 ::pic egg
374
+ ::u 13190 ::num 100000 ::comment Egyptian Hieroglyph
375
+ ::u 13191 ::r f ::comment Egyptian Hieroglyph ::pic horned viper
376
+ ::u 13193 ::r d ::comment Egyptian Hieroglyph (J) ::pic cobra
377
+ ::u 131BC ::num 1000 ::comment Egyptian Hieroglyph
378
+ ::u 131BD ::num 2000 ::comment Egyptian Hieroglyph
379
+ ::u 131BE ::num 3000 ::comment Egyptian Hieroglyph
380
+ ::u 131BF ::num 4000 ::comment Egyptian Hieroglyph
381
+ ::u 131C0 ::num 5000 ::comment Egyptian Hieroglyph
382
+ ::u 131C1 ::num 6000 ::comment Egyptian Hieroglyph
383
+ ::u 131C2 ::num 7000 ::comment Egyptian Hieroglyph
384
+ ::u 131C3 ::num 8000 ::comment Egyptian Hieroglyph
385
+ ::u 131C4 ::num 9000 ::comment Egyptian Hieroglyph
386
+ ::u 131CB ::r i ::comment Egyptian Hieroglyph (yod) ::pic single reed
387
+ ::u 131CC ::r y ::comment Egyptian Hieroglyph ::pic double reed
388
+ ::u 1320E ::r q ::comment Egyptian Hieroglyph (qaf) ::pic sandy slope
389
+ ::u 13209 ::comment Egyptian Hieroglyph ::pic desert hills
390
+ ::u 13216 ::r n ::comment Egyptian Hieroglyph ::pic ripple of water
391
+ ::u 13219 ::r sh ::comment Egyptian Hieroglyph (š) ::pic basin
392
+ ::u 13254 ::r h ::comment Egyptian Hieroglyph ::pic reed shelter
393
+ ::u 13283 ::r z ::comment Egyptian Hieroglyph [also S?] ::pic door bolt
394
+ ::u 132AA ::r p ::comment Egyptian Hieroglyph ::pic stool
395
+ ::u 132D4 ::r n ::comment Egyptian Hieroglyph ::pic red crown
396
+ ::u 132F4 ::r s ::comment Egyptian Hieroglyph [also Z?] ::pic folded cloth
397
+ ::u 13319 ::comment Egyptian Hieroglyph ::pic throw stick
398
+ ::u 13362 ::num 100 ::comment Egyptian Hieroglyph
399
+ ::u 13363 ::num 200 ::comment Egyptian Hieroglyph
400
+ ::u 13364 ::num 300 ::comment Egyptian Hieroglyph
401
+ ::u 13365 ::num 400 ::comment Egyptian Hieroglyph
402
+ ::u 13366 ::num 500 ::comment Egyptian Hieroglyph
403
+ ::u 13367 ::num 600 ::comment Egyptian Hieroglyph
404
+ ::u 13368 ::num 700 ::comment Egyptian Hieroglyph
405
+ ::u 13369 ::num 800 ::comment Egyptian Hieroglyph
406
+ ::u 1336A ::num 900 ::comment Egyptian Hieroglyph
407
+ ::u 1336B ::num 500 ::comment Egyptian Hieroglyph
408
+ ::u 1336F ::r o ::comment Egyptian Hieroglyph ::pic lasso
409
+ ::u 1337F ::r t ::comment Egyptian Hieroglyph (ṯ) ::pic hobble
410
+ ::u 13386 ::num 10 ::comment Egyptian Hieroglyph
411
+ ::u 13387 ::num 20 ::comment Egyptian Hieroglyph
412
+ ::u 13388 ::num 30 ::comment Egyptian Hieroglyph
413
+ ::u 13389 ::num 40 ::comment Egyptian Hieroglyph
414
+ ::u 1338A ::num 50 ::comment Egyptian Hieroglyph
415
+ ::u 1338B ::num 60 ::comment Egyptian Hieroglyph
416
+ ::u 1338C ::num 70 ::comment Egyptian Hieroglyph
417
+ ::u 1338D ::num 80 ::comment Egyptian Hieroglyph
418
+ ::u 1338E ::num 90 ::comment Egyptian Hieroglyph
419
+ ::u 1338F ::num 20 ::comment Egyptian Hieroglyph
420
+ ::u 13390 ::num 30 ::comment Egyptian Hieroglyph
421
+ ::u 13391 ::num 40 ::comment Egyptian Hieroglyph
422
+ ::u 13392 ::num 50 ::comment Egyptian Hieroglyph
423
+ ::u 1339B ::r h ::comment Egyptian Hieroglyph ::pic twisted flax
424
+ ::u 133A1 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle
425
+ ::u 133A2 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle, variant
426
+ ::u 133A4 ::r g ::comment Egyptian Hieroglyph ::pic bag
427
+ ::u 133BC ::r g ::comment Egyptian Hieroglyph ::pic stand
428
+ ::u 133CF ::r t ::comment Egyptian Hieroglyph ::pic loaf
429
+ ::u 133ED ::r y ::comment Egyptian Hieroglyph ::pic two strokes
430
+ ::u 133F2 ::r w ::comment Egyptian Hieroglyph ::pic quail chick, hieratic variant
431
+ ::u 133FA ::num 1 ::comment Egyptian Hieroglyph
432
+ ::u 133FB ::num 2 ::comment Egyptian Hieroglyph
433
+ ::u 133FC ::num 3 ::comment Egyptian Hieroglyph
434
+ ::u 133FD ::num 4 ::comment Egyptian Hieroglyph
435
+ ::u 133FE ::num 5 ::comment Egyptian Hieroglyph
436
+ ::u 133FF ::num 6 ::comment Egyptian Hieroglyph
437
+ ::u 13400 ::num 7 ::comment Egyptian Hieroglyph
438
+ ::u 13401 ::num 8 ::comment Egyptian Hieroglyph
439
+ ::u 13402 ::num 9 ::comment Egyptian Hieroglyph
440
+ ::u 13403 ::num 5 ::comment Egyptian Hieroglyph
441
+ ::u 1340D ::r kh ::comment Egyptian Hieroglyph (ḫ, khah) ::pic placenta?
442
+ ::u 1341D ::r m ::comment Egyptian Hieroglyph (also jm)