Koster committed on
Commit
b5f1359
·
verified ·
1 Parent(s): 2a9a49d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +2 -0
  2. LanguageTool.py +14 -0
  3. README.md +3 -9
  4. ScriptureReference.py +386 -0
  5. TrainingData.py +84 -0
  6. TranslationNoteFinder.py +252 -0
  7. TranslationNoteFinderLLMOnly.py +212 -0
  8. __pycache__/LanguageTool.cpython-312.pyc +0 -0
  9. __pycache__/ScriptureReference.cpython-312.pyc +0 -0
  10. __pycache__/ScriptureReference.cpython-39.pyc +0 -0
  11. __pycache__/TrainingData.cpython-312.pyc +0 -0
  12. __pycache__/TrainingData.cpython-39.pyc +0 -0
  13. __pycache__/TranslationNoteFinder.cpython-312.pyc +0 -0
  14. __pycache__/TranslationNoteFinder.cpython-39.pyc +0 -0
  15. __pycache__/TranslationNoteFinderLLMOnly.cpython-312.pyc +0 -0
  16. __pycache__/nltk.cpython-312.pyc +0 -0
  17. __pycache__/romanize.cpython-312.pyc +0 -0
  18. __pycache__/romanize.cpython-39.pyc +0 -0
  19. __pycache__/tfidf.cpython-312.pyc +0 -0
  20. __pycache__/tfidf.cpython-39.pyc +0 -0
  21. flagged/log.csv +2 -0
  22. highlightNote.css +7 -0
  23. highlightNote.js +10 -0
  24. main.py +22 -0
  25. main_gradio.py +72 -0
  26. main_gradio_js.py +107 -0
  27. romanize.py +43 -0
  28. tests/english_note_to_hindi.py +35 -0
  29. tests/find_greek_in_hindi.py +69 -0
  30. tests/guidance-ai-readme.md +731 -0
  31. tests/nltk-test.py +41 -0
  32. tests/test.py +36 -0
  33. tests/test2.py +3 -0
  34. tests/tfidf.py +152 -0
  35. tests/tsv_parse +27 -0
  36. translation_notes.json +32 -0
  37. translation_notes/tn_ROM.tsv +0 -0
  38. uroman-1.2.8/.gitignore +35 -0
  39. uroman-1.2.8/LICENSE.txt +11 -0
  40. uroman-1.2.8/README.md +163 -0
  41. uroman-1.2.8/README.txt +141 -0
  42. uroman-1.2.8/bin/de-accent.pl +201 -0
  43. uroman-1.2.8/bin/string-distance.pl +99 -0
  44. uroman-1.2.8/bin/uroman-quick.pl +58 -0
  45. uroman-1.2.8/bin/uroman-tsv.sh +28 -0
  46. uroman-1.2.8/bin/uroman.pl +138 -0
  47. uroman-1.2.8/data/Chinese_to_Pinyin.txt +0 -0
  48. uroman-1.2.8/data/Scripts.txt +135 -0
  49. uroman-1.2.8/data/UnicodeData.txt +0 -0
  50. uroman-1.2.8/data/UnicodeDataOverwrite.txt +442 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.gguf
2
+ bibles/*
LanguageTool.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langdetect import detect
2
+ import pycountry
3
+ import langid
4
+
5
class Lang:
    """Detect the language of a text sample.

    When `options` (a list of ISO 639-1 codes) is supplied, classification is
    restricted to those candidates via langid; otherwise langdetect is run on
    the first 1000 characters. Exposes:
        lang_code: the detected ISO 639-1 code.
        lang_name: the English language name, or the raw code if unknown.
    """

    def __init__(self, text, options=None):
        if options:
            langid.set_languages(options)  # ISO 639-1 codes
            self.lang_code, _ = langid.classify(text)
        else:
            # A 1000-char sample keeps langdetect fast on large inputs.
            self.lang_code = detect(text[:1000])

        # langdetect can return region-tagged codes (e.g. 'zh-cn') that
        # pycountry does not know as alpha_2; the original unconditional
        # `.name` raised AttributeError on the resulting None. Fall back to
        # the raw code instead of crashing.
        language = pycountry.languages.get(alpha_2=self.lang_code)
        self.lang_name = language.name if language else self.lang_code
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Translation Note Alignment
3
- emoji: 💻
4
- colorFrom: indigo
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.20.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Translation_Note_Alignment
3
+ app_file: main_gradio_js.py
 
 
4
  sdk: gradio
5
+ sdk_version: 4.19.2
 
 
6
  ---
 
 
ScriptureReference.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from functools import cache, cached_property
3
+
4
+ class ScriptureReference:
5
+ verse_ones = [
6
+ 1, 1534, 2747, 3606, 4895, 5854, 6512, 7130, 7215, 8026, 8721, 9538, 10257, 11200, 12022, 12302, 12707, 12874, 13944,
7
+ 16471, 17386, 17608, 17725, 19016, 20380, 20534, 21807, 22164, 22361, 22434, 22580, 22601, 22649, 22754, 22801, 22857,
8
+ 22910, 22948, 23159, 23214, 24285, 24963, 26114, 26993, 27999, 28432, 28869, 29125, 29274, 29429, 29533, 29628, 29717,
9
+ 29764, 29877, 29960, 30006, 30031, 30334, 30442, 30547, 30608, 30713, 30726, 30741, 30766, 31171
10
+ ]
11
+
12
+ book_codes = {
13
+ 'GEN': {
14
+ 'codes': ['Gen', 'Gn', '1M'],
15
+ 'verses': [31, 25, 24, 26, 32, 22, 24, 22, 29, 32, 32, 20, 18, 24, 21, 16, 27, 33, 38, 18, 34, 24, 20, 67, 34,
16
+ 35, 46, 22, 35, 43, 55, 32, 20, 31, 29, 43, 36, 30, 23, 23, 57, 38, 34, 34, 28, 34, 31, 22, 33, 26]
17
+ },
18
+ 'EXO': {
19
+ 'codes': ['Ex', '2M'],
20
+ 'verses': [22, 25, 22, 31, 23, 30, 25, 32, 35, 29, 10, 51, 22, 31, 27, 36, 16, 27, 25, 26, 36, 31, 33, 18, 40,
21
+ 37, 21, 43, 46, 38, 18, 35, 23, 35, 35, 38, 29, 31, 43, 38]
22
+ },
23
+ 'LEV': {
24
+ 'codes': ['Lev', 'Lv', '3M'],
25
+ 'verses': [17, 16, 17, 35, 19, 30, 38, 36, 24, 20, 47, 8, 59, 57, 33, 34, 16, 30, 37, 27, 24, 33, 44, 23, 55,
26
+ 46, 34]
27
+ },
28
+ 'NUM': {
29
+ 'codes': ['Nm', 'Nu', '4M'],
30
+ 'verses': [54, 34, 51, 49, 31, 27, 89, 26, 23, 36, 35, 16, 33, 45, 41, 50, 13, 32, 22, 29, 35, 41, 30, 25, 18,
31
+ 65, 23, 31, 40, 16, 54, 42, 56, 29, 34, 13]
32
+ },
33
+ 'DEU': {
34
+ 'codes': ['Deut', 'Dt', '5M'],
35
+ 'verses': [46, 37, 29, 49, 33, 25, 26, 20, 29, 22, 32, 32, 18, 29, 23, 22, 20, 22, 21, 20, 23, 30, 25, 22, 19,
36
+ 19, 26, 68, 29, 20, 30, 52, 29, 12]
37
+ },
38
+ 'JOS': {
39
+ 'codes': ['Josh', 'Jos'],
40
+ 'verses': [18, 24, 17, 24, 15, 27, 26, 35, 27, 43, 23, 24, 33, 15, 63, 10, 18, 28, 51, 9, 45, 34, 16, 33]
41
+ },
42
+ 'JDG': {
43
+ 'codes': ['Jdg', 'Judg'],
44
+ 'verses': [36, 23, 31, 24, 31, 40, 25, 35, 57, 18, 40, 15, 25, 20, 20, 31, 13, 31, 30, 48, 25]
45
+ },
46
+ 'RUT': {
47
+ 'codes': ['Ru', 'Rth'],
48
+ 'verses': [22, 23, 18, 22]
49
+ },
50
+ '1SA': {
51
+ 'codes': ['1Sam', '1Sm'],
52
+ 'verses': [28, 36, 21, 22, 12, 21, 17, 22, 27, 27, 15, 25, 23, 52, 35, 23, 58, 30, 24, 42, 15, 23, 29, 22, 44,
53
+ 25, 12, 25, 11, 31, 13]
54
+ },
55
+ '2SA': {
56
+ 'codes': ['2Sam', '2Sm'],
57
+ 'verses': [27, 32, 39, 12, 25, 23, 29, 18, 13, 19, 27, 31, 39, 33, 37, 23, 29, 33, 43, 26, 22, 51, 39, 25]
58
+ },
59
+ '1KI': {
60
+ 'codes': ['1Kg', '1K'],
61
+ 'verses': [53, 46, 28, 34, 18, 38, 51, 66, 28, 29, 43, 33, 34, 31, 34, 34, 24, 46, 21, 43, 29, 53]
62
+ },
63
+ '2KI': {
64
+ 'codes': ['2Kg', '2K'],
65
+ 'verses': [18, 25, 27, 44, 27, 33, 20, 29, 37, 36, 21, 21, 25, 29, 38, 20, 41, 37, 37, 21, 26, 20, 37, 20, 30]
66
+ },
67
+ '1CH': {
68
+ 'codes': ['1Ch'],
69
+ 'verses': [54, 55, 24, 43, 26, 81, 40, 40, 44, 14, 47, 40, 14, 17, 29, 43, 27, 17, 19, 8, 30, 19, 32, 31, 31,
70
+ 32, 34, 21, 30]
71
+ },
72
+ '2CH': {
73
+ 'codes': ['2Ch'],
74
+ 'verses': [17, 18, 17, 22, 14, 42, 22, 18, 31, 19, 23, 16, 22, 15, 19, 14, 19, 34, 11, 37, 20, 12, 21, 27, 28,
75
+ 23, 9, 27, 36, 27, 21, 33, 25, 33, 27, 23]
76
+ },
77
+ 'EZR': {
78
+ 'codes': ['Ezr'],
79
+ 'verses': [11, 70, 13, 24, 17, 22, 28, 36, 15, 44]
80
+ },
81
+ 'NEH': {
82
+ 'codes': ['Neh'],
83
+ 'verses': [11, 20, 32, 23, 19, 19, 73, 18, 38, 39, 36, 47, 31]
84
+ },
85
+ 'EST': {
86
+ 'codes': ['Est'],
87
+ 'verses': [22, 23, 15, 17, 14, 14, 10, 17, 32, 3]
88
+ },
89
+ 'JOB': {
90
+ 'codes': ['Jb', 'Job'],
91
+ 'verses': [22, 13, 26, 21, 27, 30, 21, 22, 35, 22, 20, 25, 28, 22, 35, 22, 16, 21, 29, 29, 34, 30, 17, 25, 6,
92
+ 14, 23, 28, 25, 31, 40, 22, 33, 37, 16, 33, 24, 41, 30, 24, 34, 17]
93
+ },
94
+ 'PSA': {
95
+ 'codes': ['Ps'],
96
+ 'verses': [6, 12, 8, 8, 12, 10, 17, 9, 20, 18, 7, 8, 6, 7, 5, 11, 15, 50, 14, 9, 13, 31, 6, 10, 22, 12, 14, 9,
97
+ 11, 12, 24, 11, 22, 22, 28, 12, 40, 22, 13, 17, 13, 11, 5, 26, 17, 11, 9, 14, 20, 23, 19, 9, 6, 7,
98
+ 23, 13, 11, 11, 17, 12, 8, 12, 11, 10, 13, 20, 7, 35, 36, 5, 24, 20, 28, 23, 10, 12, 20, 72, 13, 19,
99
+ 16, 8, 18, 12, 13, 17, 7, 18, 52, 17, 16, 15, 5, 23, 11, 13, 12, 9, 9, 5, 8, 28, 22, 35, 45, 48, 43,
100
+ 13, 31, 7, 10, 10, 9, 8, 18, 19, 2, 29, 176, 7, 8, 9, 4, 8, 5, 6, 5, 6, 8, 8, 3, 18, 3, 3, 21, 26,
101
+ 9, 8, 24, 13, 10, 7, 12, 15, 21, 10, 20, 14, 9, 6]
102
+ },
103
+ 'PRO': {
104
+ 'codes': ['Pr'],
105
+ 'verses': [33, 22, 35, 27, 23, 35, 27, 36, 18, 32, 31, 28, 25, 35, 33, 33, 28, 24, 29, 30, 31, 29, 35, 34, 28,
106
+ 28, 27, 28, 27, 33, 31]
107
+ },
108
+ 'ECC': {
109
+ 'codes': ['Ec', 'Qoh'],
110
+ 'verses': [18, 26, 22, 16, 20, 12, 29, 17, 18, 20, 10, 14]
111
+ },
112
+ 'SNG': {
113
+ 'codes': ['Sos', 'Song'],
114
+ 'verses': [17, 17, 11, 16, 16, 13, 13, 14]
115
+ },
116
+ 'ISA': {
117
+ 'codes': ['Isa'],
118
+ 'verses': [31, 22, 26, 6, 30, 13, 25, 22, 21, 34, 16, 6, 22, 32, 9, 14, 14, 7, 25, 6, 17, 25, 18, 23, 12, 21,
119
+ 13, 29, 24, 33, 9, 20, 24, 17, 10, 22, 38, 22, 8, 31, 29, 25, 28, 28, 25, 13, 15, 22, 26, 11, 23,
120
+ 15, 12, 17, 13, 12, 21, 14, 21, 22, 11, 12, 19, 12, 25, 24]
121
+ },
122
+ 'JER': {
123
+ 'codes': ['Jer', 'Jr'],
124
+ 'verses': [19, 37, 25, 31, 31, 30, 34, 22, 26, 25, 23, 17, 27, 22, 21, 21, 27, 23, 15, 18, 14, 30, 40, 10, 38,
125
+ 24, 22, 17, 32, 24, 40, 44, 26, 22, 19, 32, 21, 28, 18, 16, 18, 22, 13, 30, 5, 28, 7, 47, 39, 46, 64,
126
+ 34]
127
+ },
128
+ 'LAM': {
129
+ 'codes': ['Lam', 'Lm'],
130
+ 'verses': [22, 22, 66, 22, 22]
131
+ },
132
+ 'EZK': {
133
+ 'codes': ['Ezek', 'Ezk'],
134
+ 'verses': [28, 10, 27, 17, 17, 14, 27, 18, 11, 22, 25, 28, 23, 23, 8, 63, 24, 32, 14, 49, 32, 31, 49, 27, 17, 21,
135
+ 36, 26, 21, 26, 18, 32, 33, 31, 15, 38, 28, 23, 29, 49, 26, 20, 27, 31, 25, 24, 23, 35]
136
+ },
137
+ 'DAN': {
138
+ 'codes': ['Dn', 'Dan'],
139
+ 'verses': [21, 49, 30, 37, 31, 28, 28, 27, 27, 21, 45, 13]
140
+ },
141
+ 'HOS': {
142
+ 'codes': ['Hos', 'Hs'],
143
+ 'verses': [11, 23, 5, 19, 15, 11, 16, 14, 17, 15, 12, 10, 14, 9]
144
+ },
145
+ 'JOL': {
146
+ 'codes': ['Joel', 'Jl'],
147
+ 'verses': [20, 32, 21]
148
+ },
149
+ 'AMO': {
150
+ 'codes': ['Am'],
151
+ 'verses': [15, 16, 15, 13, 27, 14, 17, 14, 15]
152
+ },
153
+ 'OBA': {
154
+ 'codes': ['Ob'],
155
+ 'verses': [21]
156
+ },
157
+ 'JON': {
158
+ 'codes': ['Jon'],
159
+ 'verses': [17, 10, 10, 11]
160
+ },
161
+ 'MIC': {
162
+ 'codes': ['Mi', 'Mc'],
163
+ 'verses': [16, 13, 12, 13, 15, 16, 20]
164
+ },
165
+ 'NAM': {
166
+ 'codes': ['Na'],
167
+ 'verses': [15, 13, 19]
168
+ },
169
+ 'HAB': {
170
+ 'codes': ['Hab'],
171
+ 'verses': [17, 20, 19]
172
+ },
173
+ 'ZEP': {
174
+ 'codes': ['Zep', 'Zp'],
175
+ 'verses': [18, 15, 20]
176
+ },
177
+ 'HAG': {
178
+ 'codes': ['Hag', 'Hg'],
179
+ 'verses': [15, 23]
180
+ },
181
+ 'ZEC': {
182
+ 'codes': ['Zc', 'Zec'],
183
+ 'verses': [21, 13, 10, 14, 11, 15, 14, 20, 12, 21, 17, 14, 20, 9, 15, 21]
184
+ },
185
+ 'MAL': {
186
+ 'codes': ['Mal', 'Ml'],
187
+ 'verses': [14, 17, 18, 6]
188
+ },
189
+ 'MAT': {
190
+ 'codes': ['Mt', 'Mat'],
191
+ 'verses': [25, 23, 17, 25, 48, 34, 29, 34, 38, 42, 30, 50, 58, 36, 39, 28, 30, 34, 34, 46, 30, 46, 39, 28, 34,
192
+ 31, 46, 46, 38, 71, 66, 20]
193
+ },
194
+ 'MRK': {
195
+ 'codes': ['Mk', 'Mar'],
196
+ 'verses': [45, 28, 35, 41, 43, 56, 29, 38, 50, 52, 33, 44, 37, 72, 47, 20]
197
+ },
198
+ 'LUK': {
199
+ 'codes': ['Lk', 'Lu'],
200
+ 'verses': [80, 52, 38, 44, 39, 49, 50, 56, 62, 42, 54, 59, 35, 35, 32, 31, 37, 43, 48, 47, 38, 71, 56, 39, 49,
201
+ 57, 80, 55, 28, 35, 32, 31, 37, 50, 26, 46, 51, 66, 53, 59, 37, 35, 50, 40, 46, 51, 69, 53, 56, 20]
202
+ },
203
+ 'JHN': {
204
+ 'codes': ['Jn', 'Joh', 'Jhn'],
205
+ 'verses': [51, 25, 36, 54, 47, 71, 53, 59, 41, 42, 57, 50, 38, 31, 27, 33, 26, 40, 42, 31, 25]
206
+ },
207
+ 'ACT': {
208
+ 'codes': ['Ac'],
209
+ 'verses': [26, 47, 26, 37, 42, 15, 60, 40, 43, 48, 30, 25, 52, 28, 41, 40, 34, 28, 40, 38, 40, 30, 35, 27, 27,
210
+ 32, 44, 31]
211
+ },
212
+ 'ROM': {
213
+ 'codes': ['Ro', 'Rm'],
214
+ 'verses': [32, 29, 31, 25, 21, 23, 25, 39, 33, 21, 36, 21, 14, 23, 33, 27]
215
+ },
216
+ '1CO': {
217
+ 'codes': ['1Co'],
218
+ 'verses': [31, 16, 23, 21, 13, 20, 40, 13, 27, 33, 34, 31, 13, 40, 58, 24]
219
+ },
220
+ '2CO': {
221
+ 'codes': ['2Co'],
222
+ 'verses': [24, 17, 18, 18, 21, 18, 16, 24, 15, 18, 33, 21, 14]
223
+ },
224
+ 'GAL': {
225
+ 'codes': ['Gal', 'Gl'],
226
+ 'verses': [24, 21, 29, 31, 26, 18]
227
+ },
228
+ 'EPH': {
229
+ 'codes': ['Ep'],
230
+ 'verses': [23, 22, 21, 32, 33, 24]
231
+ },
232
+ 'PHP': {
233
+ 'codes': ['Php', 'Philip'],
234
+ 'verses': [30, 30, 21, 23]
235
+ },
236
+ 'COL': {
237
+ 'codes': ['Col'],
238
+ 'verses': [29, 23, 25, 18]
239
+ },
240
+ '1TH': {
241
+ 'codes': ['1Th'],
242
+ 'verses': [10, 20, 13, 18, 28]
243
+ },
244
+ '2TH': {
245
+ 'codes': ['2Th'],
246
+ 'verses': [12, 17, 18]
247
+ },
248
+ '1TI': {
249
+ 'codes': ['1Ti', '1Tm'],
250
+ 'verses': [20, 15, 16, 16, 25, 21, 25]
251
+ },
252
+ '2TI': {
253
+ 'codes': ['2Ti', '2Tm'],
254
+ 'verses': [18, 26, 17, 22]
255
+ },
256
+ 'TIT': {
257
+ 'codes': ['Tit'],
258
+ 'verses': [16, 15, 15]
259
+ },
260
+ 'PHM': {
261
+ 'codes': ['Phile', 'Phm'],
262
+ 'verses': [25]
263
+ },
264
+ 'HEB': {
265
+ 'codes': ['Hb', 'Heb'],
266
+ 'verses': [14, 18, 19, 16, 14, 20, 28, 13, 28, 39, 40, 29, 25]
267
+ },
268
+ 'JAS': {
269
+ 'codes': ['Ja', 'Jm'],
270
+ 'verses': [27, 26, 18, 17, 20]
271
+ },
272
+ '1PE': {
273
+ 'codes': ['1Pe', '2Pt'],
274
+ 'verses': [25, 25, 22, 19, 14]
275
+ },
276
+ '2PE': {
277
+ 'codes': ['2Pe', '2Pt'],
278
+ 'verses': [21, 22, 18]
279
+ },
280
+ '1JN': {
281
+ 'codes': ['1Jn', '1Jo', '1Jh'],
282
+ 'verses': [10, 29, 24, 21, 21]
283
+ },
284
+ '2JN': {
285
+ 'codes': ['2Jn', '2Jo', '2Jh'],
286
+ 'verses': [13]
287
+ },
288
+ '3JN': {
289
+ 'codes': ['3Jn', '3Jo', '3Jh'],
290
+ 'verses': [14]
291
+ },
292
+ 'JUD': {
293
+ 'codes': ['Ju', 'Jd'],
294
+ 'verses': [25]
295
+ },
296
+ 'REV': {
297
+ 'codes': ['Rev', 'Rv'],
298
+ 'verses': [20, 29, 22, 18, 14, 20, 17, 18, 20, 15, 23, 19, 21, 18, 18, 24, 22, 21, 21, 15, 27, 21]
299
+ }
300
+ }
301
+
302
+
303
    def __init__(self, reference):
        """Store the raw reference string and eagerly parse it.

        Args:
            reference: free-form scripture reference, e.g. "1Jn 5:7-8".
        """
        self.reference = reference
        # Parsed once up front; exposed read-only via the structured_ref property.
        self._structured_ref = self.parse_scripture_reference(reference)
306
+
307
+ @classmethod
308
+ def parse_scripture_reference(cls, input_ref):
309
+ normalized_input = re.sub(r"\s+", "", input_ref).upper()
310
+ regex = re.compile(r"^(\d)?(\D+)(\d+)?(?::(\d+))?(?:-(\d+)?(?::(\d+))?)?$")
311
+ match = regex.match(normalized_input)
312
+
313
+ if not match:
314
+ return {'bookCode': '', 'startChapter': 0, 'endChapter': 0, 'startVerse': 0, 'endVerse': 0}
315
+
316
+ bookPrefix, bookName, startChapter, startVerse, endChapterOrVerse, endVerse = match.groups()
317
+
318
+ fullBookName = f"{bookPrefix or ''}{bookName}".upper()
319
+ bookCode = ''
320
+ for code, book in cls.book_codes.items():
321
+ if any(fullBookName.startswith(name.upper()) for name in book['codes']):
322
+ bookCode = code
323
+ break
324
+
325
+ startChap = int(startChapter) if startChapter else 0
326
+ endChap = int(endChapterOrVerse) if endChapterOrVerse and endVerse else startChap
327
+ startVer = int(startVerse) if startVerse else 0
328
+ endVer = int(endVerse) if endVerse else int(endChapterOrVerse) if endChapterOrVerse and not endVerse else startVer
329
+
330
+ if startVer != 0 and endVer == 0:
331
+ endVer = startVer
332
+
333
+ return {
334
+ 'bookCode': bookCode,
335
+ 'startChapter': startChap,
336
+ 'endChapter': endChap,
337
+ 'startVerse': startVer,
338
+ 'endVerse': endVer,
339
+ }
340
+
341
+ @property
342
+ @cache
343
+ def structured_ref(self):
344
+ return self._structured_ref
345
+
346
+
347
+ @property
348
+ @cache
349
+ def line_number(self):
350
+ book_code = self.structured_ref['bookCode']
351
+ start_chapter = self.structured_ref['startChapter']
352
+ start_verse = self.structured_ref['startVerse']
353
+
354
+ # Find the index of the book to get the starting line number
355
+ book_index = list(self.book_codes.keys()).index(book_code)
356
+ start_line_of_book = self.verse_ones[book_index]
357
+
358
+ # Calculate the number of verses before the specified chapter
359
+ verses_before = sum(self.book_codes[book_code]['verses'][:start_chapter - 1])
360
+
361
+ # Calculate the line number of the verse
362
+ line_number = start_line_of_book + verses_before + start_verse - 1
363
+
364
+ return line_number
365
+
366
+ # Override eq method to allow comparison of ScriptureReference objects based on line number
367
+ # def __eq__(self, other):
368
+ # return self.line_number == other.line_number
369
+
370
+ # def __hash__(self):
371
+ # book_code = self.structured_ref['bookCode']
372
+ # start_chapter = self.structured_ref['startChapter']
373
+ # start_verse = self.structured_ref['startVerse']
374
+ # book_index = list(self.book_codes.keys()).index(book_code)
375
+ # start_line_of_book = self.verse_ones[book_index]
376
+ # verses_before = sum(self.book_codes[book_code]['verses'][:start_chapter - 1])
377
+ # line_number = start_line_of_book + verses_before + start_verse - 1
378
+ # return hash(line_number)
379
+
380
+
381
+
382
+ # # Example usage:
383
+ # reference = "1Jn 5:7-8"
384
+ # scripture_ref = ScriptureReference(reference)
385
+ # print("Structured Reference:", scripture_ref.get_structured_ref())
386
+ # print("Line Number:", scripture_ref.line_number)
TrainingData.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ greek_to_lang = {
2
+ 'Hindi':'''
3
+ What is a good translation of πίστις into Hindi? विश्वास
4
+ What is a good translation of χάρις into Hindi? अनुग्रह
5
+ What is a good translation of σωτηρία into Hindi? उद्धार
6
+ What is a good translation of εὐαγγέλιον into Hindi? सुसमाचार
7
+ What is a good translation of ἀπόστολος into Hindi? प्रेरित
8
+ What is a good translation of ἀγάπη into Hindi? प्रेम
9
+ What is a good translation of ἐκκλησία into Hindi? कलीसिया
10
+ What is a good translation of ἁμαρτία into Hindi? पाप
11
+ What is a good translation of μετάνοια into Hindi? पश्चाताप
12
+ What is a good translation of κύριος into Hindi? प्रभु
13
+ What is a good translation of ἅγιον πνεῦμα into Hindi? पवित्र आत्मा
14
+ What is a good translation of ἀνάστασις into Hindi? पुनरुत्थान
15
+ What is a good translation of ζωὴ αἰώνιος into Hindi? अनन्त जीवन
16
+ What is a good translation of βασιλεία τοῦ Θεοῦ into Hindi? परमेश्वर का राज्य
17
+ What is a good translation of μαθητής into Hindi? शिष्य
18
+ What is a good translation of δύναμις into Hindi? चमत्कार
19
+ What is a good translation of ἐντολή into Hindi? आज्ञा
20
+ What is a good translation of δικαιοσύνη into Hindi? धार्मिकता
21
+ What is a good translation of εἰρήνη into Hindi? शांति
22
+ What is a good translation of ὁ ἔσχατος δεῖπνον into Hindi? अंतिम भोज
23
+ What is a good translation of υἱὸς τοῦ Θεοῦ into Hindi? परमेश्वर का पुत्र
24
+ What is a good translation of βασιλεὺς τῶν βασιλευόντων into Hindi? राजाओं का राजा
25
+ What is a good translation of ἀμνὸς τοῦ Θεοῦ into Hindi? परमेश्वर का मेमना
26
+ What is a good translation of καρπὸς τοῦ Πνεύματος into Hindi? आत्मा का फल
27
+ What is a good translation of δωρεὰ τοῦ Ἁγίου Πνεύματος into Hindi? पवित्र आत्मा की देन
28
+ What is a good translation of ὁ καλὸς ποιμήν into Hindi? भला चरवाहा
29
+ What is a good translation of ἡ μεγάλη ἐντολή into Hindi? महान आज्ञा
30
+ What is a good translation of γεννηθῆναι ἄνωθεν into Hindi? पुनर्जन्म
31
+ What is a good translation of ἡ προσευχὴ τοῦ Κυρίου into Hindi? प्रभु की प्रार्थना
32
+ What is a good translation of Ὄρος τῶν Ἐλαιῶν into Hindi? जैतून का पहाड़
33
+ What is a good translation of ὕδωρ ζῶν into Hindi? जीवित जल
34
+ What is a good translation of ἄρτος τῆς ζωῆς into Hindi? जीवन की रोटी
35
+ What is a good translation of φῶς τοῦ κόσμου into Hindi? संसार का प्रकाश
36
+ What is a good translation of ποτήριον τῆς ὀργῆς into Hindi? क्रोध का प्याला
37
+ What is a good translation of σφραγὶς τοῦ Θεοῦ into Hindi? परमेश्वर की मुहर
38
+ What is a good translation of καινὴ διαθήκη into Hindi? नया नियम
39
+ What is a good translation of πανοπλία τοῦ Θεοῦ into Hindi? परमेश्वर का कवच
40
+ What is a good translation of θρόνος τῆς χάριτος into Hindi? अनुग्रह का सिंहासन
41
+ ''',
42
+ 'Spanish':'''
43
+ What is a good translation of Χριστός into Spanish? Cristo
44
+ What is a good translation of πίστις into Spanish? fe
45
+ What is a good translation of χάρις into Spanish? Gracia
46
+ What is a good translation of σωτηρία into Spanish? Salvación
47
+ What is a good translation of εὐαγγέλιον into Spanish? Evangelio
48
+ What is a good translation of ἀπόστολος into Spanish? Apóstol
49
+ What is a good translation of ἀγάπη into Spanish? Amor
50
+ What is a good translation of ἐκκλησία into Spanish? Iglesia
51
+ What is a good translation of ἁμαρτία into Spanish? Pecado
52
+ What is a good translation of μετάνοια into Spanish? Arrepentimiento
53
+ What is a good translation of κύριος into Spanish? Señor
54
+ What is a good translation of ἅγιον πνεῦμα into Spanish? Espíritu Santo
55
+ What is a good translation of ἀνάστασις into Spanish? Resurrección
56
+ What is a good translation of ζωὴ αἰώνιος into Spanish? Vida Eterna
57
+ What is a good translation of βασιλεία τοῦ Θεοῦ into Spanish? Reino de Dios
58
+ What is a good translation of μαθητής into Spanish? Discípulo
59
+ What is a good translation of δύναμις into Spanish? Poder
60
+ What is a good translation of ἐντολή into Spanish? Mandamiento
61
+ What is a good translation of δικαιοσύνη into Spanish? Justicia
62
+ What is a good translation of εἰρήνη into Spanish? Paz
63
+ What is a good translation of ὁ ἔσχατος δεῖπνον into Spanish? Última Cena
64
+ What is a good translation of υἱὸς τοῦ Θεοῦ into Spanish? Hijo de Dios
65
+ What is a good translation of βασιλεὺς τῶν βασιλευόντων into Spanish? Rey de reyes
66
+ What is a good translation of ἀμνὸς τοῦ Θεοῦ into Spanish? Cordero de Dios
67
+ What is a good translation of καρπὸς τοῦ Πνεύματος into Spanish? Fruto del Espíritu
68
+ What is a good translation of δωρεὰ τοῦ Ἁγίου Πνεύματος into Spanish? Don del Espíritu Santo
69
+ What is a good translation of ὁ καλὸς ποιμήν into Spanish? Buen Pastor
70
+ What is a good translation of ἡ μεγάλη ἐντολή into Spanish? Gran Mandamiento
71
+ What is a good translation of γεννηθῆναι ἄνωθεν into Spanish? Nacer de nuevo
72
+ What is a good translation of ἡ προσευχὴ τοῦ Κυρίου into Spanish? Oración del Señor
73
+ What is a good translation of Ὄρος τῶν Ἐλαιῶν into Spanish? Monte de los Olivos
74
+ What is a good translation of ὕδωρ ζῶν into Spanish? Agua Viva
75
+ What is a good translation of ἄρτος τῆς ζωῆς into Spanish? Pan de Vida
76
+ What is a good translation of φῶς τοῦ κόσμου into Spanish? Luz del Mundo
77
+ What is a good translation of ποτήριον τῆς ὀργῆς into Spanish? Copa de la Ira
78
+ What is a good translation of σφραγὶς τοῦ Θεοῦ into Spanish? Sello de Dios
79
+ What is a good translation of καινὴ διαθήκη into Spanish? Nuevo Pacto
80
+ What is a good translation of πανοπλία τοῦ Θεοῦ into Spanish? Armadura de Dios
81
+ What is a good translation of θρόνος τῆς χάριτος into Spanish? Trono de Gracia
82
+ What is a good translation of βιβλίον τῆς ζωῆς into Spanish? Libro de la Vida
83
+ '''
84
+ }
TranslationNoteFinder.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import csv
3
+ import re
4
+ from langdetect import detect
5
+ import pycountry
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from guidance import models, gen, select, instruction, system, user, assistant # use llama-cpp-python==0.2.26
8
+ import openai
9
+ from romanize import uroman
10
+ from ScriptureReference import ScriptureReference as SR
11
+ import stanza
12
+ import difflib
13
+ import requests
14
+ from TrainingData import greek_to_lang
15
+
16
+
17
+ class TranslationNoteFinder:
18
+ verses = SR.verse_ones
19
+
20
+ greek_bible_path = 'bibles/grc-grctcgnt.txt'
21
+
22
+ # Bibles in various languages can be downloaded from https://github.com/BibleNLP/ebible/tree/main/corpus
23
+ # lang_code follows ISO 639-1 standard
24
+ def __init__(self, bible_text_path, api_key, model_path=None, lang_code=None):
25
+
26
+ # Load Bibles
27
+ self.verses = TranslationNoteFinder.verses
28
+ self.greek_bible_text = self.load_bible(self.greek_bible_path)
29
+ self.target_bible_text = self.load_bible(bible_text_path)
30
+ first_line_nt = self.target_bible_text.splitlines()[23213]
31
+
32
+ # Auto-detect language of target Bible text (occassionally incorrect, so lang_code can be passed in)
33
+ if lang_code:
34
+ self.language = lang_code
35
+ self.lang_name = pycountry.languages.get(alpha_2=self.language).name
36
+ print(f'Language of target Bible text: {self.lang_name}')
37
+ else:
38
+ self.language = detect(first_line_nt)
39
+ self.lang_name = pycountry.languages.get(alpha_2=self.language).name
40
+ print(f'Detected language of target Bible text: {self.lang_name}')
41
+
42
+ # Local model currently not in use
43
+ if model_path:
44
+ self.model_path = model_path
45
+
46
+ # Download target language data for use in tokenizer
47
+ stanza.download(self.language)
48
+ self.nlp = stanza.Pipeline(lang=self.language, processors='tokenize')
49
+
50
+ # Assign instance variables
51
+ self.target_bible_text = self.load_bible(bible_text_path)
52
+ self.api_key = api_key
53
+
54
+ # Get tf-idf vectorizer, matrix for target Bible text
55
+ self.tfidf_vectorizer, self.tfidf_matrix = self.create_tfidf_vectorizer_matrix()
56
+
57
+
58
+ def parse_tsv_to_json(self, file_content, book_abbrev):
59
+ result = [] # Initialize an empty list to store the dictionaries.
60
+
61
+ # Turn tsv content into reader
62
+ tsv_reader = csv.reader(file_content.splitlines(), delimiter='\t')
63
+
64
+ for row in tsv_reader:
65
+ # Check if the row contains a Greek term (non-empty) in the expected position.
66
+ if row and len(row) > 3 and row[4].strip():
67
+ # Construct a dictionary for the current row.
68
+ entry = {
69
+ "source_term": row[4].strip(),
70
+ "translation_note": row[6].strip(),
71
+ "verse": book_abbrev + row[0].strip()
72
+ }
73
+ # Append the dictionary to the result list.
74
+ result.append(entry)
75
+
76
+ return result
77
+
78
+
79
+ def load_translation_notes(self, book_abbrev):
80
+ # If filepath ends with json
81
+ translation_notes_path = f'https://git.door43.org/unfoldingWord/en_tn/raw/branch/master/tn_{book_abbrev}.tsv'
82
+ response = requests.get(translation_notes_path)
83
+ if response.status_code == 200:
84
+ translation_notes_raw = response.text
85
+ else:
86
+ translation_notes_raw = ''
87
+
88
+ translation_notes = self.parse_tsv_to_json(translation_notes_raw, book_abbrev)
89
+
90
+ return translation_notes
91
+
92
+
93
+ def load_bible(self, bible_path):
94
+ # Check if the path starts with "http://" or "https://"
95
+ if bible_path.startswith('http'):
96
+ # Use requests to fetch the Bible text from the URL
97
+ response = requests.get(bible_path)
98
+ # Check if the request was successful
99
+ if response.status_code == 200:
100
+ bible_text = response.text
101
+ else:
102
+ bible_text = '' # Or handle errors as needed
103
+ else:
104
+ # Load the Bible text from a local file
105
+ with open(bible_path, 'r', encoding='utf-8') as file:
106
+ bible_text = file.read()
107
+ return bible_text
108
+
109
+
110
+ # Transforms loaded Bible text from file into a list of documents/books (prep for tf-idf)
111
+ # i.e., documents = [Genesis content, Exodus content, ...]
112
+ def segment_corpus(self, bible_text):
113
+ documents = []
114
+ current_document = []
115
+ verse_lines = bible_text.splitlines()
116
+ for i, line in enumerate(verse_lines, start=1):
117
+ if i in self.verses:
118
+ if current_document:
119
+ joined_doc_string = " ".join(current_document)
120
+ documents.append(joined_doc_string)
121
+ current_document = []
122
+ current_document.append(line.strip())
123
+ # Add the last document
124
+ if current_document:
125
+ joined_doc_string = " ".join(current_document)
126
+ documents.append(joined_doc_string)
127
+ return documents
128
+
129
+
130
+ # A method created for the tokenizer arg of the TfidfVectorizer class constructor
131
+ # See create_tfidf_vectorizer_matrix method
132
+ def stanza_tokenizer(self, text):
133
+ # Use the Stanza pipeline to process the text
134
+ doc = self.nlp(text)
135
+ # Extract tokens from the Stanza Document object
136
+ tokens = [word.text for sent in doc.sentences for word in sent.words]
137
+ return tokens
138
+
139
+
140
+ # Create a tf-idf vectorizer and matrix for the target Bible text
141
+ def create_tfidf_vectorizer_matrix(self):
142
+ tfidf_vectorizer = TfidfVectorizer(tokenizer=self.stanza_tokenizer, ngram_range=(1, 10))
143
+ segmented_corpus = self.segment_corpus(self.target_bible_text)
144
+ tfidf_matrix = tfidf_vectorizer.fit_transform(segmented_corpus)
145
+ return tfidf_vectorizer, tfidf_matrix
146
+
147
+
148
+ # Use the tf-idf matrix to get the tf-idf scores for the features (n-grams) of a specific book
149
+ def get_tfidf_book_features(self, book_code):
150
+ book_index = list(SR.book_codes.keys()).index(book_code)
151
+ feature_names = self.tfidf_vectorizer.get_feature_names_out()
152
+ dense = self.tfidf_matrix[book_index].todense()
153
+ document_tfidf_scores = dense.tolist()[0]
154
+ feature_scores = dict(zip(feature_names, document_tfidf_scores))
155
+
156
+ # Filter out zero scores
157
+ filtered_feature_scores = {feature: score for feature, score in feature_scores.items() if score > 0}
158
+ # Sort by score in descending order (just because...)
159
+ sorted_feature_scores = dict(sorted(filtered_feature_scores.items(), key=lambda item: item[1], reverse=True))
160
+ return sorted_feature_scores
161
+
162
+
163
+ # For each translation note in verse, use difflib to select the verse ngram which best matches the AI-translated Greek term
164
+ def best_ngram_for_note(self, note, verse_ngrams, language):
165
+ # local_llm = models.LlamaCpp(self.model_path, n_gpu_layers=1) # n_ctx=4096 to increase prompt size from 512 tokens
166
+
167
+ openai_llm = models.OpenAI("gpt-4", api_key=self.api_key) # To use OPENAI_API_KEY environment variable, omit api_key argument
168
+ openai_lm = openai_llm
169
+
170
+ print(f'All ngrams in verse guidance is selecting from: {[key for key in verse_ngrams.keys()]}')
171
+ # print(f'All ngrams in verse guidance is selecting from: {[uroman(key) for key in verse_ngrams.keys()]}')
172
+ source_term = note['source_term'].strip()
173
+ # source_term = uroman(note['source_term']).strip()
174
+
175
+ with system():
176
+ openai_lm += f'You are an expert at translating from Greek into {language}.'
177
+ openai_lm += 'When asked to translate, provide only the translation of the term. Nothing else. Do not provide any additional information or context.'
178
+ openai_lm += 'Be extrememly succinct in your translations.'
179
+ openai_lm += 'You must choose only from the list of translation options you are given. Choose the single best option.'
180
+ # with instruction():
181
+ with user():
182
+ openai_lm += f'What is a good translation of {source_term} from Greek into {language} and is found here: {verse_ngrams.keys()}?'
183
+ with assistant():
184
+ openai_lm += gen('openai_translation', stop='.')
185
+ print(f'OpenAI translation: {openai_lm["openai_translation"]}')
186
+
187
+ try:
188
+ ngram = difflib.get_close_matches(openai_lm["openai_translation"].strip(), verse_ngrams.keys(), n=1, cutoff=0.3)[0]
189
+ except IndexError:
190
+ ngram = "No close match found"
191
+
192
+
193
+ print(f'Best ngram found for note: {ngram}')
194
+ return ngram
195
+
196
+
197
    def verse_notes(self, verse_ref):
        """Collect translation notes applicable to one verse, aligned to target-language n-grams.

        Args:
            verse_ref: free-form scripture reference string (parsed by ScriptureReference).

        Returns:
            dict with 'target_verse_text', 'verse_ref' (structured dict),
            'line_number', and 'ngrams' — one entry per matched note, holding
            the chosen n-gram, its start/end character positions in the target
            verse, the Greek source term, and the note text.
        """
        # Get the Greek form of the verse
        v_ref = SR(verse_ref)
        gk_verse_text = self.greek_bible_text.splitlines()[v_ref.line_number - 1]

        # Get all relevant translation notes for the verse (based on Greek terms found in Greek verse)
        # with open('translation_notes.json', 'r', encoding='utf-8') as file:
        #     translation_notes = json.load(file)
        translation_notes_in_verse = []
        print(f'Let\'s see if there are any translation notes for this verse: \n\t {gk_verse_text}')
        translation_notes = self.load_translation_notes(v_ref.structured_ref['bookCode'])
        for note in translation_notes:
            # Keep only notes whose reference resolves to the same verse line.
            note_v_ref = SR(note['verse'])
            if note_v_ref.line_number != v_ref.line_number:
                continue
            print('Note verse:', note_v_ref.structured_ref)
            print(f'Checking for existence of: {note["source_term"]}')
            # Substring check: the note's Greek term must actually occur in
            # this Greek verse text (case-insensitive).
            if note['source_term'].lower() in gk_verse_text.lower():
                translation_notes_in_verse.append(note)
        print(f'Greek terms for all translation notes in verse: {[note["source_term"] for note in translation_notes_in_verse]}')

        # Get the target language form of the verse
        target_verse_text = self.target_bible_text.splitlines()[v_ref.line_number - 1]

        # Find n-grams from the book of the verse which exist in the verse
        bookCode = v_ref.structured_ref['bookCode']
        book_ngrams = self.get_tfidf_book_features(bookCode)
        print(f'First 30 n-grams of the book: {list(book_ngrams.keys())[:30]}')
        verse_ngrams = {feature: score for feature, score in book_ngrams.items() if feature.lower() in target_verse_text.lower()}
        print(f'First five n-grams of the verse along with their scores: {list(verse_ngrams.items())[:5]}')

        # Align each note's Greek term to the best-matching target n-gram.
        ngrams = []
        for note in translation_notes_in_verse:
            ngram = self.best_ngram_for_note(note, verse_ngrams, self.lang_name)
            # NOTE(review): best_ngram_for_note may return the literal
            # "No close match found", in which case find() yields -1 here —
            # callers should treat start_pos == -1 as "no highlight".
            start_pos = target_verse_text.lower().find(ngram.lower())
            end_pos = start_pos + len(ngram)
            source_term = note['source_term']
            trans_note = note['translation_note']
            ngrams.append(
                {
                    'ngram': ngram,
                    'start_pos': start_pos,
                    'end_pos': end_pos,
                    'source_term': source_term,
                    'trans_note': trans_note
                })

        print(f'Verse notes to be returned: {ngrams}')
        return {
            'target_verse_text': target_verse_text,
            'verse_ref': v_ref.structured_ref,
            'line_number': v_ref.line_number,
            'ngrams': ngrams
        }
251
+
252
+
TranslationNoteFinderLLMOnly.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import csv
3
+ import re
4
+ from langdetect import detect
5
+ import pycountry
6
+ from LanguageTool import Lang
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from guidance import models, gen, select, instruction, system, user, assistant # use llama-cpp-python==0.2.26
9
+ import openai
10
+ from romanize import uroman
11
+ from ScriptureReference import ScriptureReference as SR
12
+ import stanza
13
+ import difflib
14
+ import requests
15
+ # from TrainingData import greek_to_lang
16
+
17
+
18
class TranslationNoteFinder:
    """Find unfoldingWord translation notes for a verse and, via an LLM,
    locate the n-gram in a target-language Bible verse that best renders
    each note's source (Greek/Hebrew/English) term.

    Bibles in various languages can be downloaded from
    https://github.com/BibleNLP/ebible/tree/main/corpus
    """

    # 1-based corpus line numbers at which each Bible book begins
    # (used by segment_corpus to split the corpus into books).
    verses = SR.verse_ones

    # lang_code follows the ISO 639-1 standard
    def __init__(self, bible_text_path, api_key, lang_code=None):
        """Load the target Bible text and determine its language.

        bible_text_path -- local path or http(s) URL of an eBible corpus file
        api_key         -- OpenAI API key used by best_ngram_for_note
        lang_code       -- optional ISO 639-1 code; when given, skips
                           auto-detection (which is occasionally incorrect)
        """
        self.verses = TranslationNoteFinder.verses
        # BUG FIX: the Bible text was previously loaded twice (once here and
        # once again after language detection); load it exactly once.
        self.target_bible_text = self.load_bible(bible_text_path)

        if lang_code:
            self.language = lang_code
            self.lang_name = pycountry.languages.get(alpha_2=self.language).name
            print(f'Language of target Bible text: {self.lang_name}')
        else:
            # Line 23214 is assumed to be the first New Testament line in the
            # eBible corpus layout -- TODO confirm against the corpus used.
            first_line_nt = self.target_bible_text.splitlines()[23213]
            self.language = detect(first_line_nt)
            self.lang_name = pycountry.languages.get(alpha_2=self.language).name
            print(f'Detected language of target Bible text: {self.lang_name}')

        self.api_key = api_key

    def parse_tsv_to_json(self, file_content, book_abbrev):
        """Parse a tn_<BOOK>.tsv translation-notes file into a list of dicts.

        Each entry has keys 'source_term' (column 4), 'translation_note'
        (column 6) and 'verse' (book_abbrev + column 0 reference).
        """
        result = []
        tsv_reader = csv.reader(file_content.splitlines(), delimiter='\t')

        for row in tsv_reader:
            # BUG FIX: the row must have at least 7 columns because both
            # row[4] (source term) and row[6] (note) are read below; the
            # previous check `len(row) > 3` could raise IndexError on short
            # rows that still had a non-empty column 4.
            if row and len(row) > 6 and row[4].strip():
                entry = {
                    "source_term": row[4].strip(),
                    "translation_note": row[6].strip(),
                    "verse": book_abbrev + row[0].strip()
                }
                result.append(entry)

        return result

    def load_translation_notes(self, book_abbrev):
        """Fetch the unfoldingWord translation notes TSV for one book and
        return it parsed via parse_tsv_to_json (empty list on HTTP failure)."""
        translation_notes_path = f'https://git.door43.org/unfoldingWord/en_tn/raw/branch/master/tn_{book_abbrev}.tsv'
        response = requests.get(translation_notes_path)
        if response.status_code == 200:
            translation_notes_raw = response.text
        else:
            translation_notes_raw = ''

        return self.parse_tsv_to_json(translation_notes_raw, book_abbrev)

    def load_bible(self, bible_path):
        """Return the full Bible text, fetched from a URL when bible_path
        starts with http(s), otherwise read from a local UTF-8 file."""
        if bible_path.startswith('http'):
            response = requests.get(bible_path)
            if response.status_code == 200:
                bible_text = response.text
            else:
                bible_text = ''  # Or handle errors as needed
        else:
            with open(bible_path, 'r', encoding='utf-8') as file:
                bible_text = file.read()
        return bible_text

    # Transforms loaded Bible text into a list of documents/books (prep for
    # tf-idf), i.e. documents = [Genesis content, Exodus content, ...]
    def segment_corpus(self, bible_text):
        documents = []
        current_document = []
        verse_lines = bible_text.splitlines()
        for i, line in enumerate(verse_lines, start=1):
            # self.verses holds the 1-based line numbers where each book starts
            if i in self.verses:
                if current_document:
                    documents.append(" ".join(current_document))
                    current_document = []
            current_document.append(line.strip())
        # Add the last document
        if current_document:
            documents.append(" ".join(current_document))
        return documents

    # Ask the LLM for a target-language rendering of the note's source term
    # and accept it only if it occurs verbatim in the target verse.
    def best_ngram_for_note(self, note, target_verse_text, language):
        """Return the substring of target_verse_text that translates
        note['source_term'], or '' when the LLM suggestion is not found.

        NOTE(review): a fresh OpenAI session is created on every call;
        caching the model object would avoid repeated setup.
        """
        # To use the OPENAI_API_KEY environment variable, omit api_key argument
        openai_lm = models.OpenAI("gpt-4", api_key=self.api_key)

        source_term = note['source_term'].strip()
        # Can only choose between English, Hebrew, and Greek
        source_lang = Lang(source_term, options=['en', 'he', 'el']).lang_name
        print(f'Source term: {source_term}, \nSource language: {source_lang}')

        with system():
            openai_lm += f'You are an expert at translating between {source_lang} and {language}.'
            openai_lm += f'When asked to translate, provide only the {language} translation of the {source_lang} term found in the {language} verse.'
            # BUG FIX: corrected the misspelling "extrememly" in the prompt.
            openai_lm += 'Nothing else. Do not provide any additional information or context. Be extremely succinct in your translations.'
            openai_lm += f'You must choose only an N-gram which already exists in the {language} verse.'

        with user():
            openai_lm += f'What is a good translation of {source_term} from {source_lang} into {language} and is also found within this verse: {target_verse_text}?'

        with assistant():
            openai_lm += gen('openai_translation', stop='.')
        print(f'OpenAI translation: {openai_lm["openai_translation"]}')

        llm_output = openai_lm["openai_translation"].strip()
        print(f'LLM output: {llm_output}')
        if llm_output in target_verse_text:
            print(f'LLM output found in verse: {llm_output}')
            return llm_output
        else:
            print(f'LLM output not found in verse: {llm_output}')
            return ''

    def verse_notes(self, verse_ref):
        """Return the target verse text plus, for each translation note on the
        verse, the best-matching target-language n-gram and its position.

        Result keys: 'target_verse_text', 'verse_ref', 'line_number',
        'ngrams' (list of dicts with 'ngram', 'start_pos', 'end_pos',
        'source_term', 'trans_note').
        """
        v_ref = SR(verse_ref)

        # Collect the notes whose verse reference resolves to the same
        # corpus line as the requested verse.  Matching by line number alone
        # assumes every note carries a resolvable verse reference.
        translation_notes_in_verse = []
        translation_notes = self.load_translation_notes(v_ref.structured_ref['bookCode'])
        for note in translation_notes:
            note_v_ref = SR(note['verse'])
            if note_v_ref.line_number == v_ref.line_number:
                translation_notes_in_verse.append(note)
        print(f'Source terms for all translation notes in verse: {[note["source_term"] for note in translation_notes_in_verse]}')

        # Get the target language form of the verse
        target_verse_text = self.target_bible_text.splitlines()[v_ref.line_number - 1]

        ngrams = []
        for note in translation_notes_in_verse:
            source_term = note['source_term']
            trans_note = note['translation_note']
            ngram = self.best_ngram_for_note(note, target_verse_text, self.lang_name)
            # find() yields -1 when the LLM produced no usable n-gram
            start_pos = target_verse_text.lower().find(ngram.lower())
            end_pos = start_pos + len(ngram)
            ngrams.append(
                {
                    'ngram': ngram,
                    'start_pos': start_pos,
                    'end_pos': end_pos,
                    'source_term': source_term,
                    'trans_note': trans_note
                })

        print('Verse notes to be returned:')
        print(json.dumps(ngrams, indent=4))
        return {
            'target_verse_text': target_verse_text,
            'verse_ref': v_ref.structured_ref,
            'line_number': v_ref.line_number,
            'ngrams': ngrams
        }
211
+
212
+
__pycache__/LanguageTool.cpython-312.pyc ADDED
Binary file (1.01 kB). View file
 
__pycache__/ScriptureReference.cpython-312.pyc ADDED
Binary file (13.9 kB). View file
 
__pycache__/ScriptureReference.cpython-39.pyc ADDED
Binary file (11.1 kB). View file
 
__pycache__/TrainingData.cpython-312.pyc ADDED
Binary file (7.95 kB). View file
 
__pycache__/TrainingData.cpython-39.pyc ADDED
Binary file (4.54 kB). View file
 
__pycache__/TranslationNoteFinder.cpython-312.pyc ADDED
Binary file (12.8 kB). View file
 
__pycache__/TranslationNoteFinder.cpython-39.pyc ADDED
Binary file (7.28 kB). View file
 
__pycache__/TranslationNoteFinderLLMOnly.cpython-312.pyc ADDED
Binary file (8.84 kB). View file
 
__pycache__/nltk.cpython-312.pyc ADDED
Binary file (607 Bytes). View file
 
__pycache__/romanize.cpython-312.pyc ADDED
Binary file (1.89 kB). View file
 
__pycache__/romanize.cpython-39.pyc ADDED
Binary file (1.11 kB). View file
 
__pycache__/tfidf.cpython-312.pyc ADDED
Binary file (4.14 kB). View file
 
__pycache__/tfidf.cpython-39.pyc ADDED
Binary file (2.87 kB). View file
 
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Language Code,Verse References (comma-separated),Results,flag,username,timestamp
2
+ hi,eph1:3,,,,2024-03-02 19:58:40.869428
highlightNote.css ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
/* Translation-note text: rendered to look clickable (pointer + underline). */
.note {
    cursor: pointer;
    text-decoration: underline;
}
/* Applied to the matched n-gram span inside the verse text on hover. */
.highlight {
    background-color: yellow;
}
highlightNote.js ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Highlight the [startPos, endPos) slice of verseText inside the
 * '.verse-text' element, provided the note element with id noteId exists.
 * Rebuilds the element's innerHTML with the slice wrapped in a
 * <span class="highlight">.
 */
function highlightNote(noteId, verseText, startPos, endPos) {
    if (!document.getElementById(noteId)) {
        return;
    }
    const verseTextElement = document.querySelector('.verse-text');
    if (!verseTextElement) {
        return;
    }
    const highlighted = `<span class="highlight">${verseText.substring(startPos, endPos)}</span>`;
    verseTextElement.innerHTML =
        verseText.substring(0, startPos) + highlighted + verseText.substring(endPos);
}
main.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from TranslationNoteFinder import TranslationNoteFinder


def main():
    """Smoke-test TranslationNoteFinder against a Hindi Bible corpus."""
    api_key = os.getenv('OPENAI_API_KEY')
    tnf = TranslationNoteFinder('translation_notes.json', 'bibles/hin-hin2017.txt', api_key=api_key, lang_code='hi')

    # verse that includes dikaioo
    # BUG FIX: this verse was previously looked up and printed twice.
    print(tnf.verse_notes('rom3:22'))

    # Other interesting test verses:
    # verse that includes en Christo
    # print(tnf.verse_notes('eph1:1'))
    # verse with no translation note matches
    # print(tnf.verse_notes('jn1:8'))
    # verse that includes logos
    # print(tnf.verse_notes('jn1:1'))
    # verse that includes agape
    # print(tnf.verse_notes('1cor13:13'))
    # verse that includes koinonia
    # print(tnf.verse_notes('1jn1:3'))


# Expect ~1 min startup runtime, ~5 sec per verse
if __name__ == '__main__':
    main()
main_gradio.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio import HighlightedText
3
+ from TranslationNoteFinder import TranslationNoteFinder
4
+
5
+ # Updated dictionary mapping language codes to URLs of Bible text files
6
+ bible_urls = {
7
+ 'en': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/eng-kjvcpb.txt',
8
+ 'hi': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/hin-hin2017.txt',
9
+ 'es': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/spa-spabes.txt',
10
+ 'ru': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/rus-russyn.txt'
11
+ }
12
+
13
+ tnf = None
14
+
15
+ def load_resources(api_key, lang_code):
16
+ global tnf
17
+ bible_text_url = bible_urls.get(lang_code)
18
+ # 'translation_notes.json'
19
+ # 'translation_notes/tn_ROM.tsv'
20
+ tnf = TranslationNoteFinder('translation_notes/tn_ROM.tsv', bible_text_url, api_key, lang_code=lang_code)
21
+ return "Language resources loaded successfully.", "", "", ""
22
+
23
def find_notes(verse_ref):
    """Look up translation notes for verse_ref and return display strings:
    (formatted reference, HTML-highlighted verse text, line number,
    HTML-formatted notes).  Requires load_resources() to have run first."""
    global tnf
    if tnf is None:
        return "Please load language resources first.", "", "", ""

    results = tnf.verse_notes(verse_ref)
    verse_ref_formatted = f"{results['verse_ref']['bookCode']} {results['verse_ref']['startChapter']}:{results['verse_ref']['startVerse']}"

    target_text = results['target_verse_text']
    colors = ["yellow", "lightgreen", "lightblue", "pink", "lightgrey", "orange", "purple", "cyan", "magenta", "lime", "teal",
              "maroon", "navy", "olive", "silver", "gold", "coral", "turquoise", "indigo", "violet"]
    ngrams_highlights = {}
    for i, ngram in enumerate(reversed(results['ngrams'])):  # Reverse to not mess up the indices
        start, end = ngram['start_pos'], ngram['end_pos']
        # BUG FIX: wrap the color index so more than len(colors) notes cannot
        # raise IndexError.
        color = colors[i % len(colors)]
        highlight = f"<mark style='background-color:{color};'>{target_text[start:end]}</mark>"
        target_text = target_text[:start] + highlight + target_text[end:]
        # BUG FIX: verse_notes() produces the key 'source_term'; the previous
        # 'greek_term' key does not exist and raised KeyError.
        ngrams_highlights[ngram['source_term']] = color

    line_number = str(results['line_number'])
    # Apply the same highlight colors to the source terms in the notes list
    ngrams_formatted = ""
    for ngram in results['ngrams']:
        term_highlight = f"<span style='background-color:{ngrams_highlights[ngram['source_term']]}'>{ngram['source_term']}</span>"
        ngrams_formatted += f"{term_highlight}: {ngram['trans_note']}<br>"

    # Since HTML component is used, all outputs must be strings
    return verse_ref_formatted, target_text, line_number, ngrams_formatted
51
+
52
+
53
# Adjusting Gradio interface for HTML output
with gr.Blocks() as app:
    # OpenAI key; masked in the UI
    api_key_input = gr.Textbox(label="API Key", type='password')
    with gr.Row():
        lang_dropdown = gr.Dropdown(choices=list(bible_urls.keys()), label="Language Code")
        load_btn = gr.Button("Load Language")
    verse_input = gr.Textbox(label="Verse Reference")
    translate_btn = gr.Button("Translate")

    verse_ref_output = gr.Textbox(label="Verse Reference")
    target_text_output = gr.HTML(label="Target Verse Text")  # Changed to HTML component
    # target_text_output = gr.HighlightedText(label="Target Verse Text")
    line_number_output = gr.Textbox(label="Line Number")
    ngrams_output = gr.HTML(label="N-grams")  # Changed to HTML for formatted output

    # Both buttons reuse the same four output widgets (status or results).
    load_btn.click(fn=load_resources, inputs=[api_key_input, lang_dropdown], outputs=[verse_ref_output, target_text_output, line_number_output, ngrams_output])
    translate_btn.click(fn=find_notes, inputs=verse_input, outputs=[verse_ref_output, target_text_output, line_number_output, ngrams_output])


app.launch()
main_gradio_js.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ # from TranslationNoteFinder import TranslationNoteFinder
3
+ from TranslationNoteFinderLLMOnly import TranslationNoteFinder
4
+
5
# Dictionary mapping ISO 639-1 language codes to raw URLs of eBible corpus
# text files (one verse per line).
bible_urls = {
    'en': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/eng-engkjvcpb.txt',
    'hi': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/hin-hin2017.txt',
    'es': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/spa-spabes.txt',
    'ru': 'https://raw.githubusercontent.com/BibleNLP/ebible/main/corpus/rus-russyn.txt'
}

# Shared TranslationNoteFinder instance; populated by load_resources().
tnf = None
14
+
15
def load_resources(api_key, lang_code):
    """Build the shared TranslationNoteFinder for lang_code and report status."""
    global tnf
    # Note sources tried previously:
    # 'translation_notes.json'
    # 'translation_notes/tn_ROM.tsv'
    corpus_url = bible_urls.get(lang_code)
    tnf = TranslationNoteFinder(corpus_url, api_key, lang_code=lang_code)
    return "Language resources loaded successfully.", "", "", ""
22
+
23
+
24
with gr.Blocks(css="highlightNote.css") as app:

    def find_notes(verse_ref):
        """Return (formatted reference, verse text, line number, notes HTML)
        for verse_ref using the globally loaded TranslationNoteFinder."""
        global tnf
        if tnf is None:
            return "Please load language resources first.", "", "", ""

        # Clear the output fields by returning empty strings
        # yield "", "", "", ""

        results = tnf.verse_notes(verse_ref)
        verse_ref_formatted = f"{results['verse_ref']['bookCode']} {results['verse_ref']['startChapter']}:{results['verse_ref']['startVerse']}"

        target_text = results['target_verse_text']
        ngrams_formatted = ""

        line_number = str(results['line_number'])
        # Build one hoverable <span> per note: the inline JS highlights the
        # matching n-gram inside the '.verse-text' element on mouseover and
        # restores the plain verse on mouseout.
        # NOTE(review): target_text is interpolated into HTML/JS without
        # escaping -- quotes or backticks in a verse would break the markup.
        for i, ngram in enumerate(results['ngrams']):
            note_id = f"note_{i}"
            ngram_text = f"""<span id='{note_id}'
                class='note'
                data-verse-text='{target_text}'
                data-start-pos='{ngram['start_pos']}'
                data-end-pos='{ngram['end_pos']}'
                onmouseover="
                    const noteElement = document.getElementById('{note_id}');
                    if (noteElement) {{
                        const highlightedText = `<span class='highlight'>{target_text[ngram['start_pos']:ngram['end_pos']]}</span>`;
                        const verseTextElement = document.querySelector('.verse-text');
                        if (verseTextElement) {{
                            verseTextElement.innerHTML = `{target_text[:ngram['start_pos']]}`
                                + highlightedText
                                + `{target_text[ngram['end_pos']:]}`;
                        }}
                    }}
                "
                onmouseout="document.querySelector('.verse-text').innerHTML = '{target_text}'"
                >
                {ngram['source_term']}: {ngram['trans_note']}
                </span><br><br>"""
            ngrams_formatted += ngram_text

        # Since HTML component is used, all outputs must be strings
        return verse_ref_formatted, target_text, line_number, ngrams_formatted


    # OpenAI key; masked in the UI
    api_key_input = gr.Textbox(label="API Key", type='password')
    with gr.Row():
        lang_dropdown = gr.Dropdown(choices=list(bible_urls.keys()), label="Language Code")
        load_btn = gr.Button("Load Language")
    verse_input = gr.Textbox(label="Verse Reference")
    translate_btn = gr.Button("Translate")

    verse_ref_output = gr.Textbox(label="Verse Reference")
    # elem_classes lets highlightNote.css and the inline JS find this element
    target_text_output = gr.HTML(label="Target Verse Text", elem_classes=["verse-text"])
    line_number_output = gr.Textbox(label="Line Number")
    notes_output = gr.HTML(label="N-grams")  # needs elem_classes?

    load_btn.click(
        fn=load_resources,
        inputs=[
            api_key_input,
            lang_dropdown],
        outputs=[
            verse_ref_output,
            target_text_output,
            line_number_output,
            notes_output
        ]
    )
    translate_btn.click(
        fn=find_notes,
        inputs=verse_input,
        outputs=[
            verse_ref_output,
            target_text_output,
            line_number_output,
            notes_output
        ]
    )


app.launch(share=True)
romanize.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ import requests
4
+ import subprocess
5
+
6
# Download and unpack the uroman release once at import time; skip the
# network round-trip when the release directory already exists from a
# previous run (the original code re-downloaded on every import).
url = 'https://github.com/isi-nlp/uroman/archive/refs/tags/v1.2.8.zip'
zip_filename = 'uroman.zip'
if not os.path.isdir('uroman-1.2.8'):
    response = requests.get(url)
    response.raise_for_status()  # fail loudly instead of unzipping an error page
    with open(zip_filename, 'wb') as zip_file:
        zip_file.write(response.content)

    # Unzipping the downloaded file
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall()
16
+
17
# Function to call the unzipped uroman code
def uroman(input_string, language=None, chart=False):
    """Romanize input_string with the uroman Perl script.

    language -- optional language code passed to uroman's -l flag
    chart    -- when True, pass --chart to emit uroman's romanization chart
    Returns the romanized text, or None when the subprocess fails (the error
    is printed to stdout).
    """
    # BUG FIX: resolve the script relative to the extracted release directory
    # instead of a hard-coded absolute path on one developer's machine.
    script_path = os.path.join('uroman-1.2.8', 'bin', 'uroman.pl')
    command = ["perl", script_path]

    # Add language flag if specified
    if language:
        command.extend(["-l", language])

    # Add chart flag if specified
    if chart:
        command.append("--chart")

    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate(input=input_string.encode())

    if process.returncode != 0:
        # There was an error
        print(f"Error code {process.returncode}: {stderr.decode()}")
        return None

    # Return the output as a string
    return stdout.decode()

# Example usage
# print(uroman("わたしはにほんじんです"))
43
+
tests/english_note_to_hindi.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import guidance
from guidance import models, select, gen, system, user, assistant, one_or_more
from openai import OpenAI


# Local Neural Chat model; one layer offloaded to GPU.
model_path = 'models/neural-chat-7b-v3-3.Q2_K.gguf'
llm = models.LlamaCpp(model_path, n_gpu_layers=1)
# llm = models.OpenAI("gpt-4")


# Sample data: Eph 1:3 in English/Hindi plus one Greek term and its note.
verse = 'Blessed be the God and Father of our Lord Jesus Christ, who has blessed us in Christ with every spiritual blessing in the heavenly places,'
hin_verse = 'हमारे प्रभु यीशु मसीह का पिता और परमेश्वर धन्य हो। उसने हमें मसीह के रूप में स्वर्ग के क्षेत्र में हर तरह के आशीर्वाद दिये हैं।'
greek_term = 'ἐν Χριστῷ'
translation_note = 'illustrates the intimate union between believers and Christ. The preposition ἐν (in) goes beyond physical location, indicating a profound spiritual reality. Translators need to convey the concept of being "in Christ" as being part of a new creation, identity, and living within the sphere of Christ\'s influence and lordship.'
note = 'hey'

# OpenAI chat-style implementation (kept for reference):

# with system():
#     lm = llm + "You are an expert at translating into Hindi."

# with user():
#     lm += "Translate the following translation note into Hindi: \n" + translation_note

# with assistant():
#     lm += gen(max_tokens=1000)

# print(lm)

# Neural Chat implementation: single prompt, capture output as 'hin_note'.

prompt = f"Translate the following into Hindi:\n {translation_note}"
lm = llm + prompt + gen('hin_note', max_tokens=400)
print(lm)
print(f"Translation note: {lm['hin_note']}")
tests/find_greek_in_hindi.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import guidance
from guidance import models, gen, system, user, assistant, instruction
from openai import OpenAI
import string
from itertools import islice
from romanize import uroman
from tfidf import analyze_verse_in_corpus



llm = models.OpenAI("gpt-3.5-turbo-instruct")

# Sample data: Eph 1:3 in English/Hindi, a Greek term, and its note.
eng_verse = 'Blessed be the God and Father of our Lord Jesus Christ, who has blessed us in Christ with every spiritual blessing in the heavenly places,'
hin_verse = 'हमारे प्रभु यीशु मसीह का पिता और परमेश्वर धन्य हो। उसने हमें मसीह के रूप में स्वर्ग के क्षेत्र में हर तरह के आशीर्वाद दिये हैं।'
greek_term = 'ἐν Χριστῷ'
translation_note = 'illustrates the intimate union between believers and Christ. The preposition ἐν (in) goes beyond physical location, indicating a profound spiritual reality. Translators need to convey the concept of being "in Christ" as being part of a new creation, identity, and living within the sphere of Christ\'s influence and lordship.'

from guidance import models, select

model_path = 'models/neural-chat-7b-v3-3.Q2_K.gguf'
# llm = models.LlamaCpp(model_path, n_gpu_layers=1)

# Sanity check that guidance's instruction/select machinery works at all.
lm = llm
with instruction():
    lm += "What is a popular flavor?"
    lm += select(['chocolate', 'vanilla', 'strawberry'], name='flavor')
print(lm['flavor'])
# print(uroman(greek_term))

language = 'Greek'
romanize = False

# LLM-based translation attempt (kept for reference):
# lm = llm
# with instruction():
#     lm += f'The best translation of {uroman(greek_term)} from Romanized Greek into {language} is '
#     # lm += select(['fat albert', 'in heavenly places', 'not found'], name='translation')
#     # Generate only english letters from lm
#     lm += gen('translation', stop='.')
# translation = lm['translation']

# Use the raw Greek term directly instead of an LLM translation.
translation = greek_term

if romanize:
    translation = uroman(translation)

# Remove punctuation
translation = translation.translate(str.maketrans('', '', string.punctuation)).lower()
print(translation)

# Pick the corpus file matching the chosen language.
if language == 'English':
    file_path = 'bibles/eng-engkjvcpb.txt'
if language == 'Hindi':
    file_path = 'bibles/hin-hin2017.txt'
if language == 'Greek':
    file_path = 'bibles/grc-grctcgnt.txt'

interested_line = 29276  # Example line (verse) number
verse_scores = analyze_verse_in_corpus(file_path, interested_line, romanize=romanize)

# verse_scores is a dictionary with n-grams as keys and their respective
# TF-IDF scores as values in descending order.
# Print n-grams and respective scores in the verse in descending score order
for ngram, score in verse_scores.items():
    print(f"{ngram}: {score:.4f}")

# If any of the n-grams contains 'translation', print the first (highest-
# scoring, since verse_scores is descending) matching n-gram and its score.
for ngram, score in verse_scores.items():
    if translation in ngram:
        print(f"The n-gram '{ngram}' has the highest score of {score:.4f} in the verse.")
        break
tests/guidance-ai-readme.md ADDED
@@ -0,0 +1,731 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="right"><a href="https://guidance.readthedocs.org"><img src="https://readthedocs.org/projects/guidance/badge/?version=latest&style=flat" /></a></div>
2
+ <div align="center"><picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset="docs/figures/guidance_logo_blue_dark.svg">
4
+ <img alt="guidance" src="docs/figures/guidance_logo_blue.svg" width=300">
5
+ </picture></div>
6
+ <br/>
7
+
8
+
9
+ > *Note that v0.1 is a dramatically new version developed while releases had to be paused over the summer. If you are looking for the old version based on handlebars, you can use v0.0.64, but you should instead try porting over to the much better new version :)*
10
+
11
+ **`guidance`** is a programming paradigm that offers superior control and efficiency compared to conventional prompting and chaining. It allows users to constrain generation (e.g. with regex and CFGs) as well as to interleave control (conditional, loops) and generation seamlessly. Here are some important features:
12
+
13
+ 1. **Pure, beautiful python** with additional LM functionality. E.g. here is [basic generation](#basic-generation):
14
+ ```python
15
+ from guidance import models, gen
16
+
17
+ # load a model (could be Transformers, LlamaCpp, VertexAI, OpenAI...)
18
+ llama2 = models.LlamaCpp(path)
19
+
20
+ # append text or generations to the model
21
+ llama2 + f'Do you want a joke or a poem? ' + gen(stop='.')
22
+ ```
23
+ <img alt="Do you want a joke or a poem? I'll give you a poem" src="docs/figures/simple_gen_llama2_7b.png" width="354">
24
+
25
+ 2. [**Constrained generation**](#constrained-generation) with [selects](#select-basic), [regular expressions](#regular-expressions), and [context-free grammars](#context-free-grammars).
26
+ ```python
27
+ from guidance import select
28
+
29
+ # a simple select between two options
30
+ llama2 + f'Do you want a joke or a poem? A ' + select(['joke', 'poem'])
31
+ ```
32
+ <img alt="Do you want a joke or a poem? A poem" src="docs/figures/simple_select_llama2_7b.png" width="277">
33
+
34
+ 3. **Rich templates with f-strings**:
35
+ ```python
36
+ llama2 + f'''\
37
+ Do you want a joke or a poem? A {select(['joke', 'poem'])}.
38
+ Okay, here is a one-liner: "{gen(stop='"')}"
39
+ '''
40
+ ```
41
+ <img width="358" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/486ca968-89b1-4c02-b914-3b9714fe5890"><br>
42
+
43
+ 4. [**Stateful control + generation**](#stateful-control--generation) makes it easy to interleave prompting / logic / generation, no need for intermediate parsers:
44
+ ```python
45
+ # capture our selection under the name 'answer'
46
+ lm = llama2 + f"Do you want a joke or a poem? A {select(['joke', 'poem'], name='answer')}.\n"
47
+
48
+ # make a choice based on the model's previous selection
49
+ if lm["answer"] == "joke":
50
+ lm += f"Here is a one-line joke about cats: " + gen('output', stop='\n')
51
+ else:
52
+ lm += f"Here is a one-line poem about dogs: " + gen('output', stop='\n')
53
+ ```
54
+ <img width="393" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/66d47ce7-1d5a-4dbd-b676-66b9c1094184"><br>
55
+
56
+
57
+ 5. **Abstract chat interface** that uses the correct special tokens for any chat model:
58
+ ```python
59
+ from guidance import user, assistant
60
+
61
+ # load a chat model
62
+ chat_lm = models.LlamaCppChat(path)
63
+
64
+ # wrap with chat block contexts
65
+ with user():
66
+ lm = chat_lm + 'Do you want a joke or a poem?'
67
+
68
+ with assistant():
69
+ lm += f"A {select(['joke', 'poem'])}."
70
+ ```
71
+ <img width="331" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/89c3e0e2-ed0a-4715-8366-2efca74b7b71"><br>
72
+
73
+ 6. **Easy to write reusable components**
74
+ ```python
75
+ import guidance
76
+
77
+ @guidance
78
+ def one_line_thing(lm, thing, topic):
79
+ lm += f'Here is a one-line {thing} about {topic}: ' + gen(stop='\n')
80
+ return lm # return our updated model
81
+
82
+ # pick either a joke or a poem
83
+ lm = llama2 + f"Do you want a joke or a poem? A {select(['joke', 'poem'], name='thing')}.\n"
84
+
85
+ # call our guidance function
86
+ lm += one_line_thing(lm['thing'], 'cats')
87
+ ```
88
+ <img width="386" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/60071680-8bbb-4fa5-a298-613d4fd55fa7"><br>
89
+
90
+ 7. **A library of pre-built components**, e.g. substring:
91
+ ```python
92
+ from guidance import substring
93
+
94
+ # define a set of possible statements
95
+ text = 'guidance is awesome. guidance is so great. guidance is the best thing since sliced bread.'
96
+
97
+ # force the model to make an exact quote
98
+ llama2 + f'Here is a true statement about the guidance library: "{substring(text)}"'
99
+ ```
100
+ <img width="589" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/9a7178ad-ed73-4e6b-b418-f9d2a3a76b88"><br>
101
+
102
+ 8. [**Easy tool use**](#automatic-interleaving-of-control-and-generation-tool-use), where the model stops generation when a tool is called, calls the tool, then resumes generation. For example, here is a simple version of a calculator, via four separate 'tools':
103
+ ```python
104
+ @guidance
105
+ def add(lm, input1, input2):
106
+ lm += f' = {int(input1) + int(input2)}'
107
+ return lm
108
+ @guidance
109
+ def subtract(lm, input1, input2):
110
+ lm += f' = {int(input1) - int(input2)}'
111
+ return lm
112
+ @guidance
113
+ def multiply(lm, input1, input2):
114
+ lm += f' = {float(input1) * float(input2)}'
115
+ return lm
116
+ @guidance
117
+ def divide(lm, input1, input2):
118
+ lm += f' = {float(input1) / float(input2)}'
119
+ return lm
120
+ ```
121
+ Now we call `gen` with these tools as options. Notice how generation is stopped and restarted automatically:
122
+ ```python
123
+ lm = llama2 + '''\
124
+ 1 + 1 = add(1, 1) = 2
125
+ 2 - 3 = subtract(2, 3) = -1
126
+ '''
127
+ lm + gen(max_tokens=15, tools=[add, subtract, multiply, divide])
128
+ ```
129
+ <img width="201" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/646e1a7d-0206-419b-8206-1d835c3a0e0a"><br>
130
+
131
+ 9. **Speed**: In contrast to chaining, `guidance` programs are the equivalent of a single LLM call. More so, whatever non-generated text that gets appended is batched, so that `guidance` programs are **faster** than having the LM generate intermediate text when you have a set structure.
132
+
133
+ 10. **Token healing**: Users deal with text (or bytes) rather than tokens, and thus don't have to worry about [perverse token boundaries issues](https://towardsdatascience.com/the-art-of-prompt-design-prompt-boundaries-and-token-healing-3b2448b0be38) such as 'prompt ending in whitespace'.
134
+
135
+ 11. **Streaming support**, also integrated with jupyter notebooks:
136
+ ```python
137
+ lm = llama2 + 'Here is a cute 5-line poem about cats and dogs:\n'
138
+ for i in range(5):
139
+ lm += f"LINE {i+1}: " + gen(temperature=0.8, suffix="\n")
140
+ ```
141
+ <img src="docs/figures/simple_streaming_example.gif" width="337">
142
+
143
+ 13. **High compatibility:** works with Transformers, llama.cpp, VertexAI, OpenAI. Users can write one guidance program and execute it on many backends. (note that the most powerful control features require endpoint integration, and for now work best with Transformers and llama.cpp).
144
+ ```python
145
+ gpt = models.OpenAI("gpt-3.5-turbo")
146
+
147
+ with user():
148
+ lm = gpt + "What is the capital of France?"
149
+
150
+ with assistant():
151
+ lm += gen("capital")
152
+
153
+ with user():
154
+ lm += "What is one short surprising fact about it?"
155
+
156
+ with assistant():
157
+ lm += gen("fact")
158
+ ```
159
+ <img width="645" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/f31ed7b8-1868-44d2-b14c-4842b0a40e5c"><br>
160
+
161
+ 14. **Multi-modal support.**
162
+ ```python
163
+ from guidance import image
164
+
165
+ gemini = models.VertexAI("gemini-pro-vision")
166
+
167
+ with user():
168
+ lm = gemini + "What is this a picture of?" + image("longs_peak.jpg")
169
+
170
+ with assistant():
171
+ lm += gen("answer")
172
+ ```
173
+ <img width="673" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/6450d05d-52e9-4ef5-b280-8b57e733d46d">
174
+
175
+
176
+
177
+ ## Table of Contents
178
+ * [Install](#install)
179
+ * [Loading models](#loading-models)
180
+ * [llama.cpp](#llamacpp)
181
+ * [transformers](#transformers)
182
+ * [Vertex](#vertex-ai)
183
+ * [OpenAI](#openai)
184
+ * [Example notebooks](#example-notebooks)
185
+ * [Basic generation](#basic-generation)
186
+ * [Constrained Generation](#constrained-generation)
187
+ * [Select (basic)](#select-basic)
188
+ * [Regular expressions](#regular-expressions)
189
+ * [Regex to constrain generation](#regex-to-constrain-generation)
190
+ * [Regex as stopping criterion](#regex-as-stopping-criterion)
191
+ * [Context-free grammars](#context-free-grammars)
192
+ * [Stateful control + generation](#stateful-control--generation)
193
+ * [State in immutable objects](#state-in-immutable-objects)
194
+ * [Stateful guidance functions](#stateful-guidance-functions)
195
+ * [Example: ReAct](#example-react)
196
+ * [Example: Changing intermediate step of a Chat session](#example-changing-intermediate-step-of-a-chat-session)
197
+ * [Automatic interleaving of control and generation: tool use](#automatic-interleaving-of-control-and-generation-tool-use)
198
+ * [Gsm8k example](#gsm8k-example)
199
+ * [Automatic call grammar for @guidance functions](#automatic-call-grammar-for-guidance-functions)
200
+ * [Text, not tokens](#text-not-tokens)
201
+ * [Fast](#fast)
202
+ * [Integrated stateful control is faster](#integrated-stateful-control-is-faster)
203
+ * [Guidance acceleration](#guidance-acceleration)
204
+
205
+ ## Install
206
+ ```bash
207
+ pip install guidance
208
+ ```
209
+ ## Loading models
210
+ ### llama.cpp
211
+ Install the python bindings:
212
+ ```bash
213
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
214
+ ```
215
+ Loading the model:
216
+ ```python
217
+ from guidance import models
218
+ lm = models.LlamaCpp(path_to_model, n_gpu_layers=-1)
219
+ ```
220
+
221
+ ### Transformers
222
+ Install transformers:
223
+ ```python
224
+ from guidance import models
225
+ lm = models.Transformers(model_name_or_path)
226
+ ```
227
+
228
+ ### Vertex AI
229
+ Remote endpoints that don't have explicit guidance integration are run "optimistically". This means that all the text that can be forced is given to the model as a prompt (or chat context) and then the model is run in streaming mode without hard constraints (since the remote API doesn't support them). If the model ever violates the constraints then the model stream is stopped and we optionally try it again at that point. This means that all the API-supported controls work as expected, and more complex controls/parsing that is not supported by the API work if the model stays consistent with the program.
230
+ ```python
231
+ palm2 = models.VertexAI("text-bison@001")
232
+
233
+ with instruction():
234
+ lm = palm2 + "What is one funny fact about Seattle?"
235
+
236
+ lm + gen("fact", max_tokens=100)
237
+ ```
238
+ <img width="635" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/693ae08f-68f7-4368-bd25-19afc9bfc0a5"><br>
239
+
240
+ ### OpenAI
241
+ OpenAI endpoints don't have direct support for guidance grammars, but through optimistic running we can still control them in ways that match the model type:
242
+
243
+ *Legacy completion models:*
244
+ ```python
245
+ curie = models.OpenAI("text-curie-001")
246
+
247
+ curie + "The smallest cats are" + gen(stop=".")
248
+ ```
249
+ <img width="263" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/116a906c-ea77-4a13-a83a-682029d5e5c8"><br>
250
+
251
+ *Instruct tuned models:*
252
+ ```python
253
+ gpt_instruct = models.OpenAI("gpt-3.5-turbo-instruct")
254
+
255
+ with instruction():
256
+ lm = gpt_instruct + "What are the smallest cats?"
257
+
258
+ lm += gen(stop=".")
259
+ ```
260
+ <img width="574" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/56a53ce1-89f5-4e9d-bdb8-86fb3eebf309"><br>
261
+
262
+ *Chat models:*
263
+ ```python
264
+ gpt = models.OpenAI("gpt-3.5-turbo")
265
+
266
+ with system():
267
+ lm = gpt + "You are a cat expert."
268
+
269
+ with user():
270
+ lm += "What are the smallest cats?"
271
+
272
+ with assistant():
273
+ lm += gen("answer", stop=".")
274
+ ```
275
+ <img width="367" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/46102f0f-37dc-4bb1-99b7-e5895bdee772"><br>
276
+
277
+
278
+
279
+ ## Example notebooks
280
+ We are working on updating our example notebooks. The following ones have been updated:
281
+ - [Basic tutorial](notebooks/tutorials/intro_to_guidance.ipynb)
282
+ - [Chatbot with search](notebooks/chat_with_search.ipynb)
283
+
284
+ More coming soon
285
+
286
+ ## Basic generation
287
+ An `lm` object is immutable, so you change it by creating new copies of it. By default, when you append things to `lm`, it creates a copy, e.g.:
288
+ ```python
289
+ from guidance import models, gen, select
290
+ llama2 = models.LlamaCpp(model)
291
+
292
+ # llama2 is not modified, `lm` is a copy of `llama2` with 'This is a prompt' appended to its state
293
+ lm = llama2 + 'This is a prompt'
294
+ ```
295
+ <img width="124" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/c1e96b2b-8f4a-44ee-a8f4-a694a8d7784b"><br>
296
+
297
+ You can append _generation_ calls to model objects, e.g.
298
+ ```python
299
+ lm = llama2 + 'This is a prompt' + gen(max_tokens=10)
300
+ ```
301
+ <img width="267" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/d2e5ed34-ba9d-4bdd-872d-2b76f8e3cf85"><br>
302
+
303
+ You can also interleave generation calls with plain text, or control flows:
304
+ ```python
305
+ # Note how we set stop tokens
306
+ lm = llama2 + 'I like to play with my ' + gen(stop=' ') + ' in' + gen(stop=['\n', '.', '!'])
307
+ ```
308
+ <img width="279" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/2d47fd65-1982-4dd8-9ba9-a01e62fba455"><br>
309
+
310
+ ## Constrained Generation
311
+ ### Select (basic)
312
+ `select` constrains generation to a set of options:
313
+ ```python
314
+ lm = llama2 + 'I like the color ' + select(['red', 'blue', 'green'])
315
+ ```
316
+ <img width="137" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/f0b97629-78a9-439d-90b2-06af31fdc40e"><br>
317
+
318
+ ### Regular expressions
319
+ `gen` has optional arguments `regex` and `stop_regex`, which allow generation (and stopping, respectively) to be controlled by a regex.
320
+
321
+ #### Regex to constrain generation
322
+ Unconstrained:
323
+
324
+ ```python
325
+ lm = llama2 + 'Question: Luke has ten balls. He gives three to his brother.\n'
326
+ lm += 'How many balls does he have left?\n'
327
+ lm += 'Answer: ' + gen(stop='\n')
328
+ ```
329
+ <img width="405" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/55fb66ea-a717-417a-8a70-14c46eba4c66"><br>
330
+
331
+ Constrained by regex:
332
+
333
+ ```python
334
+ lm = llama2 + 'Question: Luke has ten balls. He gives three to his brother.\n'
335
+ lm += 'How many balls does he have left?\n'
336
+ lm += 'Answer: ' + gen(regex='\d+')
337
+ ```
338
+ <img width="404" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/b45a5a79-55e0-4c15-884a-fba830c0a153"><br>
339
+
340
+
341
+ #### Regex as stopping criterion
342
+ Unconstrained:
343
+ ```python
344
+ lm = llama2 + '19, 18,' + gen(max_tokens=50)
345
+ ```
346
+ <img width="359" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/5dd13454-cc42-4e27-a52c-19a31237891c"><br>
347
+
348
+ Stop with traditional stop text, whenever the model generates the number 7:
349
+ ```python
350
+ lm = llama2 + '19, 18,' + gen(max_tokens=50, stop='7')
351
+ ```
352
+ <img width="73" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/fc96d7c3-381d-4766-8bee-c930669f518a"><br>
353
+
354
+
355
+ Stop whenever the model generates the character `7` without any numbers around it:
356
+ ```python
357
+ lm = llama2 + '19, 18,' + gen(max_tokens=50, stop_regex='[^\d]7[^\d]')
358
+ ```
359
+ <img width="293" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/a657e566-b1a4-447a-82a5-b88977b5fedf"><br>
360
+
361
+
362
+ ### Context-free grammars
363
+ We expose a variety of operators that make it easy to define CFGs, which in turn can be used to constrain generation.
364
+ For example, we can use the `select` operator (it accepts CFGs as options), `zero_or_more` and `one_or_more` to define a grammar for mathematical expressions:
365
+ ```python
366
+ import guidance
367
+ from guidance import one_or_more, select, zero_or_more
368
+ # stateless=True indicates this function does not depend on LLM generations
369
+ @guidance(stateless=True)
370
+ def number(lm):
371
+ n = one_or_more(select(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']))
372
+ # Allow for negative or positive numbers
373
+ return lm + select(['-' + n, n])
374
+
375
+ @guidance(stateless=True)
376
+ def operator(lm):
377
+ return lm + select(['+' , '*', '**', '/', '-'])
378
+
379
+ @guidance(stateless=True)
380
+ def expression(lm):
381
+ # Either
382
+ # 1. A number (terminal)
383
+ # 2. two expressions with an operator and optional whitespace
384
+ # 3. An expression with parentheses around it
385
+ return lm + select([
386
+ number(),
387
+ expression() + zero_or_more(' ') + operator() + zero_or_more(' ') + expression(),
388
+ '(' + expression() + ')'
389
+ ])
390
+ ```
391
+
392
+ The `@guidance(stateless=True)` decorator makes it such that a function (e.g. `expression`) lives as a stateless grammar that does not get 'executed' until we call `lm + expression()` or `lm += expression()`. For example, here is an example of _unconstrained_ generation:
393
+ ```python
394
+ # Without constraints
395
+ lm = llama2 + 'Problem: Luke has a hundred and six balls. He then loses thirty six.\n'
396
+ lm += 'Equivalent arithmetic expression: ' + gen(stop='\n') + '\n'
397
+ ```
398
+ <img width="462" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/54af1909-cad4-4fb1-8987-dfdfc02f8f42"><br>
399
+
400
+ Notice how the model wrote the right equation but solved it (incorrectly). If we wanted to constrain the model such that it only writes valid expressions (without trying to solve them), we can just append our grammar to it:
401
+ ```python
402
+ grammar = expression()
403
+ lm = llama2 + 'Problem: Luke has a hundred and six balls. He then loses thirty six.\n'
404
+ lm += 'Equivalent arithmetic expression: ' + grammar + '\n'
405
+ ```
406
+ <img width="460" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/dbda0ff8-8edd-4384-b63d-fc98792e0689"><br>
407
+
408
+ Grammars are very easy to compose. For example, let's say we want a grammar that generates either a mathematical expression or an expression followed by a solution followed by another expression. Creating this grammar is easy:
409
+
410
+ ```python
411
+ from guidance import regex
412
+ grammar = select([expression(), expression() + regex(' = \d+; ') + expression()])
413
+ ```
414
+ We can generate according to it:
415
+ ```python
416
+ llama2 + 'Here is a math expression for two plus two: ' + grammar
417
+ ```
418
+ <img width="346" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/283e6973-0b8d-4153-a82b-9f5db1460da9"><br>
419
+
420
+ ```python
421
+ llama2 + '2 + 2 = 4; 3+3\n' + grammar
422
+ ```
423
+ <img width="109" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/d584a93c-bf24-43d5-8f8d-501e7eb88422"><br>
424
+
425
+ Even if you don't like thinking in terms of recursive grammars, this formalism makes it easy to constrain generation. For example, let's say we have the following one-shot prompt:
426
+ ```python
427
+ @guidance(stateless=True)
428
+ def ner_instruction(lm, input):
429
+ lm += f'''\
430
+ Please tag each word in the input with PER, ORG, LOC, or nothing
431
+ ---
432
+ Input: John worked at Apple.
433
+ Output:
434
+ John: PER
435
+ worked:
436
+ at:
437
+ Apple: ORG
438
+ .:
439
+ ---
440
+ Input: {input}
441
+ Output:
442
+ '''
443
+ return lm
444
+ input = 'Julia never went to Morocco in her life!!'
445
+ llama2 + ner_instruction(input) + gen(stop='---')
446
+ ```
447
+ <img width="465" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/8ecf5ad4-68b8-4e7a-b107-b1a5613e4c68"><br>
448
+
449
+ Notice that the model did not spell the word 'Morocco' correctly. Sometimes the model might also hallucinate a tag that doesn't exist. We can improve this by adding more few-shot examples, etc, but we can also constrain generation to the exact format we want:
450
+ ```python
451
+ import re
452
+
453
+ @guidance(stateless=True)
454
+ def constrained_ner(lm, input):
455
+ # Split into words
456
+ words = [x for x in re.split('([^a-zA-Z0-9])', input) if x and not re.match('\s', x)]
457
+ ret = ''
458
+ for x in words:
459
+ ret += x + ': ' + select(['PER', 'ORG', 'LOC', '']) + '\n'
460
+ return lm + ret
461
+ llama2 + ner_instruction(input) + constrained_ner(input)
462
+ ```
463
+ <img width="462" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/72545093-ef16-479a-b666-bd97c54a5dc7">
464
+
465
+ While `constrained_ner(input)` **is** a grammar that constrains the model generation, it _feels_ like you're just writing normal imperative python code with `+=` and `selects`.
466
+
467
+
468
+ ## Stateful control + generation
469
+ ### State in immutable objects
470
+ Whenever you do `lm + grammar` or `lm + gen`, `lm + select`, etc, you return a new lm object with additional state. For example:
471
+
472
+ ```python
473
+ lm = llama2 + 'This is a prompt' + gen(name='test', max_tokens=10)
474
+ lm += select(['this', 'that'], name='test2')
475
+ lm['test'], lm['test2']
476
+ ```
477
+ <img width="296" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/f0f9d180-6209-40df-9401-40da35d46e1a"><br>
478
+
479
+ ### Stateful `guidance` functions
480
+ The guidance decorator is `@guidance(stateless=False)` by default, meaning that a function with this decorator depends on the lm state to execute (either prior state or state generated within the function). For example:
481
+ ```python
482
+ @guidance(stateless=False)
483
+ def test(lm):
484
+ lm += 'Should I say "Scott"?\n' + select(['yes', 'no'], name='answer') + '\n'
485
+ if lm['answer'] == 'yes':
486
+ lm += 'Scott'
487
+ else:
488
+ lm += 'Not Scott'
489
+ return lm
490
+ llama2 + test()
491
+ ```
492
+ <img width="159" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/5a55496b-aea0-46e9-8de6-b63655027653"><br>
493
+
494
+
495
+ ### Example: ReAct
496
+ A big advantage of stateful control is that you don't have to write any intermediate parsers, and adding follow-up 'prompting' is easy, even if the follow up depends on what the model generates.
497
+ For example, let's say we want to implement the first example of ReAct prompt in [this](https://www.promptingguide.ai/techniques/react), and let's say the valid acts are only 'Search' or 'Finish'. We might write it like this:
498
+ ```python
499
+ @guidance
500
+ def react_prompt_example(lm, question, max_rounds=10):
501
+ lm += f'Question: {question}\n'
502
+ i = 1
503
+ while True:
504
+ lm += f'Thought {i}: ' + gen(suffix='\n')
505
+ lm += f'Act {i}: ' + select(['Search', 'Finish'], name='act')
506
+ lm += '[' + gen(name='arg', suffix=']') + '\n'
507
+ if lm['act'] == 'Finish' or i == max_rounds:
508
+ break
509
+ else:
510
+ lm += f'Observation {i}: ' + search(lm['arg']) + '\n'
511
+ i += 1
512
+ return lm
513
+ ```
514
+ Notice how we don't have to write a parser for Act and argument and hope that the model generates something valid: we enforce it. Notice also that the loop only stops once the model chooses to act with 'Finish' (or once we hit a maximum number of rounds).
515
+
516
+ ### Example: Changing intermediate step of a Chat session
517
+ We can also hide or change some of what the model generates. For example, below we get a Chat model (notice we use special `role` blocks) to name some experts to answer a question, but we always remove 'Ferriss' from the list if he is mentioned:
518
+ ```python
519
+ from guidance import user, system, assistant
520
+ lm = llama2
521
+ query = 'How can I be more productive?'
522
+ with system():
523
+ lm += 'You are a helpful and terse assistant.'
524
+ with user():
525
+ lm += f'I want a response to the following question:\n{query}\n'
526
+ lm += 'Name 3 world-class experts (past or present) who would be great at answering this.'
527
+ with assistant():
528
+ temp_lm = lm
529
+ for i in range(1, 4):
530
+ # This regex only allows strings that look like names (where every word is capitalized)
531
+ # list_append appends the result to a list
532
+ temp_lm += f'{i}. ' + gen(regex='([A-Z][a-z]*\s*)+', suffix='\n',
533
+ name='experts', list_append=True)
534
+ experts = [x for x in temp_lm['experts'] if 'Ferriss' not in x]
535
+ # Notice that even if the model generates 'Ferriss' above,
536
+ # it doesn't get added to `lm`, only to `temp_lm`
537
+ lm += ', '.join(experts)
538
+ with user():
539
+ lm += 'Please answer the question as if these experts had collaborated in writing an anonymous answer.'
540
+ with assistant():
541
+ lm += gen(max_tokens=100)
542
+ ```
543
+ <img width="688" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/d274f8b8-52e7-41a5-9635-b34f70ed50e0"><br>
544
+
545
+ ### Automatic interleaving of control and generation: tool use
546
+ Tool use is a common case of stateful control. To make it easy to do so, `gen` calls take `tools` as an optional argument, where each tool is defined by (1) a grammar that triggers its call and captures the arguments (if any), and (2) the actual tool call. Then, as generation unrolls, whenever the model generates something that matches the grammar of a tool call, it (1) stops generation, (2) calls the tool (which can append whatever it wants to the LM session), and (3) continues generation.
547
+
548
+ For example, here is how we might implement a calculator tool, leveraging our `expression` grammar above:
549
+ ```python
550
+ from guidance import capture, Tool
551
+ @guidance(stateless=True)
552
+ def calculator_call(lm):
553
+ # capture just 'names' the expression, to be saved in the LM state
554
+ return lm + 'calculator(' + capture(expression(), 'tool_args') + ')'
555
+
556
+ @guidance
557
+ def calculator(lm):
558
+ expression = lm['tool_args']
559
+ # You typically don't want to run eval directly for safety reasons
560
+ # Here we are guaranteed to only have mathematical expressions
561
+ lm += f' = {eval(expression)}'
562
+ return lm
563
+ calculator_tool = Tool(calculator_call(), calculator)
564
+ lm = llama2 + 'Here are five expressions:\ncalculator(3 *3) = 33\ncalculator(2 + 1 * 3) = 5\n'
565
+ lm += gen(max_tokens=30, tools=[calculator_tool], stop='\n\n')
566
+ ```
567
+ <img width="201" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/2d9b840a-4fad-4dab-b3e7-20887539b447"><br>
568
+
569
+
570
+ ### Gsm8k example
571
+ Notice that the calculator is just called seamlessly during generation. Here is a more realistic example of the model solving a gsm8k question:
572
+
573
+ ```python
574
+ @guidance
575
+ def math_with_calc(lm, question):
576
+ # Two-shot example
577
+ lm += '''\
578
+ Question: John starts with 2 balls. He then quintupled his number of balls. Then he lost half of them. He then gave 3 to his brother. How many does he have left?
579
+ Reasoning:
580
+ 1. He quintupled his balls. So he has calculator(2 * 5) = 10 balls.
581
+ 1. He lost half. So he has calculator(10 / 2) = 5 balls.
582
+ 3. He gave 3 to his brother. So he has calculator(5 - 3) = 2 balls.
583
+ Answer: 2
584
+
585
+ Question: Jill get 7 dollars a day in allowance. She uses 1 each day to by a bus pass, then gives half away. How much does she have left each day?
586
+ Reasoning:
587
+ 1. She gets 7 dollars a day.
588
+ 1. She spends 1 on a bus pass. So she has calculator(5 - 1) = 6.
589
+ 3. She gives half away. So that makes calculator(6 / 2) = 3.
590
+ Answer: 3
591
+
592
+ '''
593
+ lm += f'Question: {question}\n'
594
+ lm += 'Reasoning:\n' + gen(max_tokens=200, tools=[calculator_tool], stop='Answer')
595
+ # Only numbers or commas
596
+ lm += 'Answer: ' + gen(regex='[-\d,]+')
597
+ return lm
598
+
599
+ question = '''Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?'''
600
+ llama2 + math_with_calc(question)
601
+ ```
602
+ <img width="685" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/0c7b8da0-b295-46cd-a312-604ecfba7b33"><br>
603
+
604
+ ### Automatic call grammar for @guidance functions
605
+ You can also initialize a `Tool` with any `@guidance`-decorated function, and the default call grammar will be like a python call. Here is an example of using multiple such tools in the same `gen` call:
606
+ ```python
607
+ @guidance
608
+ def say_scott(lm, n):
609
+ lm += '\n'
610
+ for _ in range(int(n)):
611
+ lm += 'Scott\n'
612
+ return lm
613
+
614
+ @guidance
615
+ def say_marco(lm, n):
616
+ lm += '\n'
617
+ for _ in range(int(n)):
618
+ lm += 'marco\n'
619
+ return lm
620
+
621
+ tools = [Tool(callable=say_scott), Tool(callable=say_marco)]
622
+ llama2 + '''\
623
+ I am going to call say_scott and say_marco a few times:
624
+ say_scott(1)
625
+ Scott
626
+ ''' + gen(max_tokens=20, tools=tools)
627
+ ```
628
+ <img width="395" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/8025699b-59a1-4a3f-8b1e-a895a54924e2"><br>
629
+
630
+
631
+ ## Text, not tokens
632
+ The standard greedy tokenizations used by most language models introduce a variety of subtle and powerful biases, which can have all kinds of unintended consequences for your prompts.
633
+ For example, take the following prompt, given to gpt-2 (standard greedy tokenization):
634
+
635
+ hf_gen(prompt, max_tokens=10)
636
+ ```python
637
+ from transformers import pipeline
638
+ pipe = pipeline("text-generation", model="gpt2")
639
+ def hf_gen(prompt, max_tokens=100):
640
+ return pipe(prompt, do_sample=False, max_length=max_tokens, return_full_text=False)[0]['generated_text']
641
+
642
+ prompt = 'http:'
643
+ hf_gen(prompt, max_tokens=10)
644
+ ```
645
+ <img width="198" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/a0fe3e81-89e0-4b4a-8981-edf8b1a8a723"><br>
646
+
647
+
648
+ Notice how the output generated by the LLM does not complete the URL with the obvious next characters (two forward slashes). It instead creates an invalid URL string with a space in the middle. Why? Because the string `://` is its own token, and so once the model sees a colon by itself, it assumes that the next characters cannot be `//`; otherwise, the tokenizer would not have used `:`, and instead would have used `://`. This is why there are warnings about ending prompts in whitespace, but the problem is way more pervasive than that: any boundary that may span multiple tokens will cause problems, e.g. notice how a partial word causes incorrect completion:
649
+
650
+ ```python
651
+ prompt = 'John is a'
652
+ hf_gen(prompt, max_tokens=5)
653
+ ```
654
+ <img width="133" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/44906e57-c4ca-4dc3-a1c3-2fdba040259b"><br>
655
+
656
+
657
+ ```python
658
+ prompt = 'John is a fo'
659
+ hf_gen(prompt, max_tokens=5)
660
+ ```
661
+ <img width="52" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/df649320-ec8e-468a-bb2f-e1994f16c9b6"><br>
662
+
663
+ While problematic enough for normal prompts, these problems would be a disaster in the kinds of prompts we wrote in this readme, where there is interleaving of prompting and generation happening multiple times (and thus multiple opportunities for problems). This is why `guidance` implements [token healing](https://towardsdatascience.com/the-art-of-prompt-design-prompt-boundaries-and-token-healing-3b2448b0be38), a feature that deals with prompt boundaries automatically, allowing users to just think in terms of **text** rather than tokens. For example:
664
+
665
+ ```python
666
+ from guidance import models
667
+ gpt = models.Transformers('gpt2')
668
+ prompt = 'http:'
669
+ gpt + prompt + gen(max_tokens=10)
670
+ ```
671
+ <img width="244" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/c9f26a58-52f2-457c-958a-e048f68eb388"><br>
672
+
673
+
674
+
675
+ ```python
676
+ prompt = 'John is a fo'
677
+ gpt + prompt + gen(max_tokens=2)
678
+ ```
679
+ <img width="186" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/bc5e4cd4-9b82-4c09-9db2-9e890dad1d69"><br>
680
+
681
+ ## Fast
682
+ ### Integrated stateful control is faster
683
+ We have full control of the decoding loop in our integration with `transformers` and `llamacpp`, allowing us to add control and additional prompt without any extra cost.
684
+ If instead we're calling a server, we pay the extra cost of making additional requests, which might be ok if the server has caching, but quickly becomes impractical if the server does not have fine-grained caching. For example, note again the output from the [gsm8k example with calculator](#gsm8k-example) above:
685
+
686
+ <img width="624" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/2c75b0f2-6997-43d9-b10e-cb9f6f2e2de5">
687
+
688
+ Every time we call `calculator`, we have to stop generation, append the result to the prompt, and resume generation. To avoid slowing down after the first call, a server would need to keep the KV cache up to '3 for breakfast. So she has calculator(16 - 3)', then roll forward generation from that point on. Even servers that _do_ have caching often don't have a way to guarantee state is preserved at each stop and start, and so users pay a significant overhead at each interruption. The normal approach of considering everything as a new prompt would cause significant slowdowns every time `calculator` is called.
689
+
690
+ ### Guidance acceleration
691
+ In addition to the benefit above, `guidance` calls are often **faster** than running equivalent prompts the traditional way, because we can batch any additional text that is added by the user as execution unrolls (rather than generating it). Take the example below, where we generate a json with a GGUF compressed `llama2` 7B executed using llama.cpp:
692
+ ```python
693
+ @guidance
694
+ def character_maker(lm, id, description, valid_weapons):
695
+ lm += f"""\
696
+ The following is a character profile for an RPG game in JSON format.
697
+ ```json
698
+ {{
699
+ "id": "{id}",
700
+ "description": "{description}",
701
+ "name": "{gen('name', stop='"')}",
702
+ "age": {gen('age', regex='[0-9]+', stop=',')},
703
+ "armor": "{select(options=['leather', 'chainmail', 'plate'], name='armor')}",
704
+ "weapon": "{select(options=valid_weapons, name='weapon')}",
705
+ "class": "{gen('class', stop='"')}",
706
+ "mantra": "{gen('mantra', stop='"')}",
707
+ "strength": {gen('strength', regex='[0-9]+', stop=',')},
708
+ "items": ["{gen('item', list_append=True, stop='"')}", "{gen('item', list_append=True, stop='"')}", "{gen('item', list_append=True, stop='"')}"]
709
+ }}```"""
710
+ return lm
711
+ a = time.time()
712
+ lm = llama2 + character_maker(1, 'A nimble fighter', ['axe', 'sword', 'bow'])
713
+ time.time() - a
714
+ ```
715
+ <img width="480" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/85b5a181-6e6a-4582-9203-730f49353aeb"><br>
716
+
717
+ Everything that is not green is not actually generated by the model, and is thus batched (much faster). This prompt takes about 1.2 seconds on an A100 GPU. Now, if we let the model generate everything (as in the roughly equivalent prompt below), it takes roughly `2.6` seconds (not only is it slower, we also have less control over generation).
718
+ ```python
719
+ @guidance
720
+ def character_maker2(lm, id, description):
721
+ lm += f"""\
722
+ The following is a character profile for an RPG game in JSON format. It has fields 'id', 'description', 'name', 'age', 'armor', weapon', 'class', 'mantra', 'strength', and 'items (just the names of 3 items)'
723
+ please set description to '{description}'
724
+ ```json""" + gen(stop='```')
725
+ return lm
726
+ a = time.time()
727
+ lm = llama2 + character_maker2(1, 'A nimble fighter')
728
+ time.time() - a
729
+ ```
730
+ <img width="586" alt="image" src="https://github.com/guidance-ai/guidance/assets/3740613/9c55500d-4c90-4f42-9343-43aa2a25efa4"><br>
731
+
tests/nltk-test.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import nltk
from nltk import FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Step 0: Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Load your Bible text
with open('bibles/eng-engkjvcpb.txt', 'r', encoding='utf-8') as file:
    bible_text = file.read()

# Step 2: Preprocess the text
tokens = word_tokenize(bible_text.lower())  # Tokenize and normalize case
# Remove punctuation only.  NOTE: stop words are deliberately NOT removed --
# the analysis below looks up the bigram ('in', 'christ'), and 'in' is an
# English stop word, so filtering would destroy the very n-gram of interest.
stop_words = set(stopwords.words('english'))  # kept available for other experiments
tokens = [token for token in tokens if token not in string.punctuation]

# Step 3: Generate 2-, 3- and 4-grams over the token stream
all_ngrams = []
for n in range(2, 5):
    all_ngrams.extend(ngrams(tokens, n))

# Step 4: Analyze frequency of n-grams
freq_dist = FreqDist(all_ngrams)
most_common_ngrams = freq_dist.most_common(30)  # Adjust the number to get more or fewer common n-grams

# Display the most common n-grams
for ngram, occurrence in most_common_ngrams:
    print("{}: {}".format(' '.join(ngram), occurrence))

# Report occurrences and rank of the bigram 'in christ'.
# BUG FIX: the rank lookup previously ran unconditionally and raised
# ValueError when ('in', 'christ') was absent from the corpus; it now
# runs only inside the membership guard.
target = ('in', 'christ')
if target in freq_dist:
    ngram_rank = sorted(freq_dist, key=freq_dist.get, reverse=True).index(target) + 1
    print("The n-gram 'in christ' occurs {} times and is ranked {} among the most common n-grams.".format(freq_dist[target], ngram_rank))
41
+
tests/test.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from guidance import models, select

import nltk
from nltk import FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')

from romanize import uroman

llm = models.LlamaCpp('models/neural-chat-7b-v3-3.Q2_K.gguf', n_gpu_layers=1)

hin_str = 'हमारे परमेश्‍वर और प्रभु यीशु मसीह के पिता का धन्यवाद हो कि उसने हमें मसीह में स्वर्गीय स्थानों में सब प्रकार की आत्मिक आशीष* दी है।'
hin_str = uroman(hin_str)  # romanize so the n-grams are Latin-script
greek_term = "ἐν Χριστῷ"
greek_term = uroman(greek_term)

# Build candidate 2- and 3-gram phrases from the romanized verse.
tokens = word_tokenize(hin_str.lower())  # Tokenize and normalize case
tokens = [token for token in tokens if token not in string.punctuation]
all_ngrams = []
for n in range(2, 4):
    all_ngrams.extend(ngrams(tokens, n))

# BUG FIX 1: join *all* tokens of each n-gram with spaces.  The old
# `x[0] + x[1]` concatenated only the first two tokens with no separator,
# silently truncating every trigram.
all_ngrams = [' '.join(x) for x in all_ngrams]
# BUG FIX 2: removed a stray `all_ngrams = []` that wiped the candidate
# list immediately after it was built, leaving select() with no options.
print(all_ngrams)

lm = llm
lm += f'The best translation of {greek_term} from Greek into Hindi is '
lm += select(all_ngrams, name='ngram')

print(lm['ngram'])
tests/test2.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Quick sanity check: Python's `in` operator finds a Devanagari substring
# (including one spanning a word boundary) inside a Devanagari verse.
verse = "हमारे परमेश्‍वर और प्रभु यीशु मसीह के पिता का धन्यवाद हो कि उसने हमें मसीह में स्वर्गीय स्थानों में सब प्रकार की आत्मिक आशीष* दी है।"
n_gram = 'मसीह में'
is_present = n_gram in verse
print(is_present)  # This would print True
tests/tfidf.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ import pandas as pd
3
+ import numpy as np
4
+ from itertools import islice
5
+ from romanize import uroman
6
+
7
+
8
# Cumulative 1-based line numbers in the Bible corpus file at which each
# successive document (presumably each biblical book) begins.
# NOTE(review): looks like book-start boundaries for segment_corpus() below —
# confirm against the corpus file actually used.
verses = [
    1,
    1534,
    2747,
    3606,
    4895,
    5854,
    6512,
    7130,
    7215,
    8026,
    8721,
    9538,
    10257,
    11200,
    12022,
    12302,
    12707,
    12874,
    13944,
    16471,
    17608,
    17725,
    19016,
    20380,
    20534,
    21807,
    22164,
    22361,
    22434,
    22580,
    22601,
    22649,
    22754,
    22857,
    22910,
    22948,
    23159,
    23214,
    24285,
    24963,
    26114,
    26993,
    27999,
    28432,
    28869,
    29125,
    29274,
    29429,
    29533,
    29628,
    29717,
    29764,
    29877,
    29960,
    30006,
    30031,
    30334,
    30442,
    30547,
    30608,
    30713,
    30726,
    30741,
    30766,
    31171
]

# Adjust verses to be zero-indexed for Python
# NOTE(review): segment_corpus() enumerates file lines starting at 1 while
# these values are shifted to 0-based here — possible off-by-one; confirm
# the intended alignment between the two.
verses = [x-1 for x in verses]
78
+
79
# Function to extract the verse of interest from the corpus
def extract_interested_verse(file_path, line_number, romanize=False):
    """Return the stripped text of the 0-based `line_number`-th line of
    `file_path` (optionally romanized via uroman), or None if the file has
    fewer lines than that."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            if index != line_number:
                continue
            text = raw_line.strip()
            return uroman(text) if romanize else text
    return None
89
+
90
+
91
# Function to segment the corpus into documents based on the verses list
def segment_corpus(file_path, romanize=False, boundaries=None):
    """Split the corpus file into documents at the given boundary line numbers.

    File lines are enumerated from 1; whenever a line number appears in
    `boundaries`, the lines accumulated so far are flushed as one document
    and a new document starts with that line.  The trailing lines form the
    last document.

    file_path -- path to the corpus text file (one verse per line).
    romanize  -- if True, each joined document is passed through uroman.
    boundaries -- iterable of boundary line numbers; defaults to the
                  module-level `verses` list (backward compatible).

    NOTE(review): `verses` is shifted to 0-based at module level while
    enumeration here starts at 1 — confirm the intended alignment.
    """
    if boundaries is None:
        boundaries = verses
    # Set gives O(1) membership per line instead of an O(n) list scan.
    boundary_set = set(boundaries)
    documents = []
    current_document = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file, start=1):
            if i in boundary_set and current_document:
                joined_doc_string = " ".join(current_document)
                if romanize:
                    joined_doc_string = uroman(joined_doc_string)
                documents.append(joined_doc_string)
                current_document = []
            current_document.append(line.strip())
    # Don't forget to add the last document
    if current_document:
        joined_doc_string = " ".join(current_document)
        if romanize:
            joined_doc_string = uroman(joined_doc_string)
        documents.append(joined_doc_string)
    return documents
112
+
113
# Function to perform TF-IDF on the corpus and extract scores for a specific verse
def analyze_verse_in_corpus(file_path, interested_line, romanize=False):
    """Score the 2-4-gram phrases of one verse by TF-IDF over the whole corpus.

    file_path       -- corpus text file, one verse per line.
    interested_line -- line number of the verse of interest.
    romanize        -- if True, documents and verse are romanized via uroman.

    Returns a dict of {ngram: tfidf_score} for the verse's n-grams, sorted
    by descending score, or the string "Verse not found." when the verse
    line cannot be read.
    """
    documents = segment_corpus(file_path, romanize=romanize)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 4))
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Identify the document index for the interested line.
    # NOTE(review): raises StopIteration when interested_line is at or past
    # the last boundary in `verses`, and mixes the 0-based `verses` values
    # with a line number whose base is unclear — confirm the indexing
    # convention before relying on this.
    document_index = next(i for i, v in enumerate(verses) if v > interested_line) - 1

    # Extract TF-IDF scores for the document containing the interested line
    scores = np.array(tfidf_matrix[document_index].todense()).flatten()
    scores_dict = dict(zip(feature_names, scores))

    # Extract the interested verse text (extract_interested_verse counts
    # lines from 0, hence the -1 here).
    interested_verse = extract_interested_verse(file_path, interested_line - 1, romanize=romanize)

    # Map n-grams in verse to their TF-IDF scores
    if interested_verse:
        # A throwaway vectorizer fit on just the verse enumerates its n-grams.
        tfidf_vectorizer_verse = TfidfVectorizer(ngram_range=(2, 4))
        tfidf_vectorizer_verse.fit([interested_verse])
        verse_ngrams = tfidf_vectorizer_verse.get_feature_names_out()
        verse_scores = {ngram: scores_dict.get(ngram, 0) for ngram in verse_ngrams}
        # Get ngrams and respective scores in the verse in descending score order
        sorted_verse_scores = dict(sorted(verse_scores.items(), key=lambda item: item[1], reverse=True))
        return sorted_verse_scores
    else:
        return "Verse not found."
141
+
142
+
143
+ # file_path = 'bibles/eng-engkjvcpb.txt'
144
+ # interested_line = 29276 # Example line number
145
+ # verse_scores = analyze_verse_in_corpus(file_path, kjv_verses, interested_line)
146
+
147
+ # Print or return the results
148
+ # print(verse_scores)
149
+
150
+ # Print ngrams and respective scores in the verse in descending score order
151
+ # for ngram, score in islice(sorted_verse_scores.items(), 30):
152
+ # print(f"{ngram}: {score:.4f}")
tests/tsv_parse ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+
3
def parse_tsv_to_json(filepath, book_abbrev):
    """Parse a translation-notes TSV file into a list of note dicts.

    filepath    -- path to the tab-separated notes file.
    book_abbrev -- book abbreviation prefixed to each row's reference
                   (column 0) to form the "verse" value, e.g. "rom".

    Each row with a non-empty Greek-term column (index 4) yields
    {"greek_term", "translation_note", "verse"}; other rows are skipped.
    """
    result = []  # Initialize an empty list to store the dictionaries.

    with open(filepath, mode='r', encoding='utf-8') as file:
        tsv_reader = csv.reader(file, delimiter='\t')

        for row in tsv_reader:
            # Check if the row contains a Greek term (non-empty) in the expected position.
            # BUG FIX: the guard was `len(row) > 3`, but the body indexes
            # row[4] and row[6], so rows with 4-6 columns raised IndexError.
            if row and len(row) > 6 and row[4].strip():
                # Construct a dictionary for the current row.
                entry = {
                    "greek_term": row[4].strip(),
                    "translation_note": row[6].strip(),
                    "verse": book_abbrev + row[0].strip()
                }
                # Append the dictionary to the result list.
                result.append(entry)

    return result
22
+
23
# Example usage: parse the Romans translation notes shipped with this repo
# (path is relative to the repository root) and show a sample of the output.
result = parse_tsv_to_json('./translation_notes/tn_ROM.tsv', 'rom')

# Print first 5 entries
print(result[:5])
translation_notes.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "greek_term": "λόγος",
4
+ "translation_note": "often translated as 'word,' 'λόγος' in the Johannine prologue conveys a complex concept that includes divine revelation, reason, and creative power. In the context of John 1:1, it refers to Jesus as the pre-existent divine Word through whom all things were made and who reveals God to humanity. Translators should capture the multi-faceted nature of 'λόγος' as both communication and the personification of divine wisdom and presence.",
5
+ "verse": ""
6
+ },
7
+ {
8
+ "greek_term": "ἀγάπη",
9
+ "translation_note": "represents a form of love that is selfless, sacrificial, and unconditional, often distinguishing divine love from other types of love. In the New Testament, 'ἀγάπη' describes God's love for humanity and the love believers are called to have for one another. Translators should convey the depth and sacrificial nature of this love, differentiating it from feelings or affection.",
10
+ "verse": ""
11
+ },
12
+ {
13
+ "greek_term": "κοινωνία",
14
+ "translation_note": "translates to 'fellowship' or 'communion,' but encompasses much more than mere social interaction. 'κοινωνία' in the New Testament implies a deep, spiritual connection among believers, rooted in their shared participation in Christ and the Holy Spirit. It involves mutual support, sharing, and a common commitment to Christ's mission. Translators need to convey the richness of this fellowship as an expression of shared life and unity in the Spirit.",
15
+ "verse": ""
16
+ },
17
+ {
18
+ "greek_term": "δικαιοσύνη",
19
+ "translation_note": "often rendered as 'righteousness,' this term reflects a status of being in right relationship with God, conforming to His standards and will. In the New Testament, 'δικαιοσύνη' is closely related to justification through faith in Christ, signifying not only ethical behavior but also a legal standing of acquittal and acceptance by God. Translators should highlight the forensic aspect of righteousness as well as its ethical implications, emphasizing its source in God's grace through faith.",
20
+ "verse": ""
21
+ },
22
+ {
23
+ "greek_term": "ἐν Χριστῷ",
24
+ "translation_note": "illustrates the intimate union between believers and Christ. The preposition ἐν (in) goes beyond physical location, indicating a profound spiritual reality. Translators need to convey the concept of being 'in Christ' as being part of a new creation, identity, and living within the sphere of Christ's influence and lordship.",
25
+ "verse": ""
26
+ },
27
+ {
28
+ "greek_term": "διὰ πίστεως Ἰησοῦ Χριστοῦ",
29
+ "translation_note": "Here, faith in Jesus Christ is a possessive form that indicates faith that is associated with Jesus Christ. This could refer to: (1) trust in Jesus Christ. Alternate translation: “by trusting in Jesus Christ” or “by believing in Jesus Christ” (2) the faithfulness of Jesus Christ. Alternate translation: “through the faithfulness that Jesus Christ possesses” or “through how faithful Jesus Christ is”",
30
+ "verse": ""
31
+ }
32
+ ]
translation_notes/tn_ROM.tsv ADDED
The diff for this file is too large to render. See raw diff
 
uroman-1.2.8/.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !Build/
2
+ .last_cover_stats
3
+ /META.yml
4
+ /META.json
5
+ /MYMETA.*
6
+ *.o
7
+ *.pm.tdy
8
+ *.bs
9
+
10
+ # Devel::Cover
11
+ cover_db/
12
+
13
+ # Devel::NYTProf
14
+ nytprof.out
15
+
16
+ # Dizt::Zilla
17
+ /.build/
18
+
19
+ # Module::Build
20
+ _build/
21
+ Build
22
+ Build.bat
23
+
24
+ # Module::Install
25
+ inc/
26
+
27
+ # ExtUtils::MakeMaker
28
+ /blib/
29
+ /_eumm/
30
+ /*.gz
31
+ /Makefile
32
+ /Makefile.old
33
+ /MANIFEST.bak
34
+ /pm_to_blib
35
+ /*.zip
uroman-1.2.8/LICENSE.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (C) 2015-2020 Ulf Hermjakob, USC Information Sciences Institute
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ Any publication of projects using uroman shall acknowledge its use: "This project uses the universal romanizer software 'uroman' written by Ulf Hermjakob, USC Information Sciences Institute (2015-2020)".
8
+ Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track.
9
+
10
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11
+
uroman-1.2.8/README.md ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # URoman
2
+
3
+ *uroman* is a *universal romanizer*. It converts text in any script to the Latin alphabet.
4
+
5
+ Version: 1.2.8
6
+ Release date: April 23, 2021
7
+ Author: Ulf Hermjakob, USC Information Sciences Institute
8
+
9
+
10
+ ### Usage
11
+ ```bash
12
+ $ uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
13
+ where the optional <lang-code> is a 3-letter languages code, e.g. ara, bel, bul, deu, ell, eng, fas,
14
+ grc, ell, eng, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
15
+ --chart specifies chart output (in JSON format) to represent alternative romanizations.
16
+ --no-cache disables caching.
17
+ ```
18
+ ### Examples
19
+ ```bash
20
+ $ bin/uroman.pl < text/zho.txt
21
+ $ bin/uroman.pl -l tur < text/tur.txt
22
+ $ bin/uroman.pl -l heb --chart < text/heb.txt
23
+ $ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
24
+ ```
25
+
26
+ Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
27
+ Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
28
+ Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or
29
+ Yiddish will improve romanization for those languages as some letters in those
30
+ languages have different sound values from other languages using the same script
31
+ (French, Russian, Hebrew respectively).
32
+ No effect for other languages in this version.
33
+
34
+ ### Bibliography
35
+ Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. ACL-2018 Best Demo Paper Award. [Paper in ACL Anthology](https://www.aclweb.org/anthology/P18-4003) | [Poster](https://www.isi.edu/~ulf/papers/poster-uroman-acl2018.pdf) | [BibTex](https://www.aclweb.org/anthology/P18-4003.bib)
36
+
37
+ ### Change History
38
+ Changes in version 1.2.8
39
+ * Updated to Unicode 13.0 (2021), which supports several new scripts (10% larger UnicodeData.txt).
40
+ * Improved support for Georgian.
41
+ * Preserve various symbols (as opposed to mapping to the symbols' names).
42
+ * Various small improvements.
43
+
44
+ Changes in version 1.2.7
45
+ * Improved support for Pashto.
46
+
47
+ Changes in version 1.2.6
48
+ * Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
49
+ * Added support for English Braille.
50
+ * Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
51
+ reflecting a casual style that many native speakers of those languages use
52
+ when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
53
+ rather than phonetically motivated combinations of letters (e.g. "sh").
54
+ * When a line starts with "::lcode xyz ", the new uroman version will switch to
55
+ that language for that line. This is used for the new reference test file.
56
+ * Various small improvements.
57
+
58
+ Changes in version 1.2.5
59
+ * Improved support for Armenian and eight languages using Cyrillic scripts.
60
+ -- For Serbian and Macedonian, which are often written in both Cyrillic
61
+ and Latin scripts, uroman will map both official versions to the same
62
+ romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
63
+ properly reflects the pronunciation of the city's name).
64
+ For both Serbian and Macedonian, casual writers often use a simplified
65
+ Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
66
+ and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
67
+ other such pairs. The casual romanization can be simulated by using
68
+ alternative uroman language codes "srp2" and "mkd2", which romanize
69
+ both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
70
+ * Various small improvements.
71
+
72
+ Changes in version 1.2.4
73
+ * Bug-fix that generated two empty lines for each empty line in cache mode.
74
+
75
+ Changes in version 1.2
76
+ * Run-time improvement based on (1) token-based caching and (2) shortcut
77
+ romanization (identity) of ASCII strings for default 1-best (non-chart)
78
+ output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
79
+ large size texts.
80
+ * Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
81
+ languages.
82
+ * Richer lattice structure (more alternatives) for "Romanization" of English
83
+ to support better matching to romanizations of other languages.
84
+ Changes output only when --chart option is specified. No change in output for
85
+ default 1-best output, which for ASCII characters is always the input string.
86
+
87
+ Changes in version 1.1 (major upgrade)
88
+ * Offers chart output (in JSON format) to represent alternative romanizations.
89
+ -- Location of first character is defined to be "line: 1, start:0, end:0".
90
+ * Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
91
+ * Improved web-interface at http://www.isi.edu/~ulf/uroman.html
92
+ -- Shows corresponding original and romanization text in red
93
+ when hovering over a text segment.
94
+ -- Shows alternative romanizations when hovering over romanized text
95
+ marked by dotted underline.
96
+ -- Added right-to-left script detection and improved display for right-to-left
97
+ script text (as determined line by line).
98
+ -- On-page support for some scripts that are often not pre-installed on users'
99
+ computers (Burmese, Egyptian, Klingon).
100
+
101
+ Changes in version 1.0 (major upgrade)
102
+ * Upgraded principal internal data structure from string to lattice.
103
+ * Improvements mostly in vowelization of South and Southeast Asian languages.
104
+ * Vocalic 'r' more consistently treated as vowel (no additional vowel added).
105
+ * Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
106
+ * Japanese Katakana middle dots now mapped to ASCII space.
107
+ * Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
108
+ * Some corrections regarding analysis of Chinese numbers.
109
+ * Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
110
+ * Zero-width characters dropped, except line/sentence-initial byte order marks.
111
+ * Spaces normalized to ASCII space.
112
+ * Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
113
+ * Tested against previous version of uroman with a new uroman visual diff tool.
114
+ * Almost an order of magnitude faster.
115
+
116
+ Changes in version 0.7 (minor upgrade)
117
+ * Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
118
+ Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
119
+ Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
120
+ or Chinese characters in Uyghur texts.
121
+
122
+ Changes in version 0.6 (minor upgrade)
123
+ * Added support for two letter characters used in Uzbek:
124
+ (1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
125
+ (2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
126
+ Both are now mapped to "'" (plain ASCII apostrophe).
127
+ * Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
128
+ even when they are not preceded by "ئ" (yeh with hamza above).
129
+ * Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
130
+ ("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
131
+ * Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
132
+ However, it is strongly recommended to normalize any presentation form Arabic letters
133
+ to their non-presentation form before calling uroman.
134
+ * Added force flush directive ($|=1;).
135
+
136
+ Changes in version 0.5 (minor upgrade)
137
+ * Improvements for Uyghur (make sure to use language option: -l uig)
138
+
139
+ Changes in version 0.4 (minor upgrade)
140
+ * Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
141
+ * Minor change for Arabic (added "alef+fathatan" = "an")
142
+
143
+ New features in version 0.3
144
+ * Covers Mandarin (Chinese)
145
+ * Improved romanization for numerous languages
146
+ * Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
147
+ * Maps from native digits to Western numbers
148
+ * Faster for South Asian languages
149
+
150
+ ### Other features
151
+ * Web interface: http://www.isi.edu/~ulf/uroman.html
152
+ * Vowelization is provided when locally computable, e.g. for many South Asian languages and Tibetan.
153
+
154
+ ### Limitations
155
+ * The current version of uroman has a few limitations, some of which we plan to address in future versions.
156
+ For Japanese, *uroman* currently romanizes hiragana and katakana as expected, but kanji are interpreted as Chinese characters and romanized as such.
157
+ For Egyptian hieroglyphs, only single-sound phonetic characters and numbers are currently romanized.
158
+ For Linear B, only phonetic syllabic characters are romanized.
159
+ For some other extinct scripts such as cuneiform, no romanization is provided.
160
+ * A romanizer is not a full transliterator. For example, this version of
161
+ uroman does not vowelize text that lacks explicit vowelization such as
162
+ normal text in Arabic and Hebrew (without diacritics/points).
163
+
uroman-1.2.8/README.txt ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ uroman version 1.2.8
2
+ Release date: April 23, 2021
3
+ Author: Ulf Hermjakob, USC Information Sciences Institute
4
+
5
+ uroman is a universal romanizer. It converts text in any script to the Latin alphabet.
6
+
7
+ Usage: uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
8
+ where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
9
+ grc, ell, eng, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
10
+ --chart specifies chart output (in JSON format) to represent alternative romanizations.
11
+ --no-cache disables caching.
12
+ Examples: bin/uroman.pl < text/zho.txt
13
+ bin/uroman.pl -l tur < text/tur.txt
14
+ bin/uroman.pl -l heb --chart < text/heb.txt
15
+ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
16
+
17
+ Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
18
+ Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
19
+ Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or Yiddish
20
+ will improve romanization for those languages as some letters in those languages
21
+ have different sound values from other languages using the same script.
22
+ No effect for other languages in this version.
23
+
24
+ Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. [Best Demo Paper Award]
25
+
26
+ Changes in version 1.2.8
27
+ * Improved support for Georgian.
28
+ * Updated UnicodeData.txt to version 13 (2021) with several new scripts (10% larger).
29
+ * Preserve various symbols (as opposed to mapping to the symbols' names).
30
+ * Various small improvements.
31
+ Changes in version 1.2.7
32
+ * Improved support for Pashto.
33
+ Changes in version 1.2.6
34
+ * Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
35
+ * Added support for English Braille.
36
+ * Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
37
+ reflecting a casual style that many native speakers of those languages use
38
+ when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
39
+ rather than phonetically motivated combinations of letters (e.g. "sh").
40
+ * When a line starts with "::lcode xyz ", the new uroman version will switch to
41
+ that language for that line. This is used for the new reference test file.
42
+ * Various small improvements.
43
+ Changes in version 1.2.5
44
+ * Improved support for Armenian and eight languages using Cyrillic scripts.
45
+ -- For Serbian and Macedonian, which are often written in both Cyrillic
46
+ and Latin scripts, uroman will map both official versions to the same
47
+ romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
48
+ properly reflects the pronunciation of the city's name).
49
+ For both Serbian and Macedonian, casual writers often use a simplified
50
+ Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
51
+ and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
52
+ other such pairs. The casual romanization can be simulated by using
53
+ alternative uroman language codes "srp2" and "mkd2", which romanize
54
+ both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
55
+ * Various small improvements.
56
+ Changes in version 1.2.4
57
+ * Added support for Tifinagh (a script used for Berber languages).
58
+ * Bug-fix that generated two empty lines for each empty line in cache mode.
59
+ Changes in version 1.2.3
60
+ * Exclude emojis, dingbats, many other pictographs from being romanized (e.g. to "face")
61
+ Changes in version 1.2
62
+ * Run-time improvement based on (1) token-based caching and (2) shortcut
63
+ romanization (identity) of ASCII strings for default 1-best (non-chart)
64
+ output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
65
+ large size texts.
66
+ * Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
67
+ languages.
68
+ * Richer lattice structure (more alternatives) for "Romanization" of English
69
+ to support better matching to romanizations of other languages.
70
+ Changes output only when --chart option is specified. No change in output for
71
+ default 1-best output, which for ASCII characters is always the input string.
72
+ Changes in version 1.1 (major upgrade)
73
+ * Offers chart output (in JSON format) to represent alternative romanizations.
74
+ -- Location of first character is defined to be "line: 1, start:0, end:0".
75
+ * Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
76
+ * Improved web-interface at http://www.isi.edu/~ulf/uroman.html
77
+ -- Shows corresponding original and romanization text in red
78
+ when hovering over a text segment.
79
+ -- Shows alternative romanizations when hovering over romanized text
80
+ marked by dotted underline.
81
+ -- Added right-to-left script detection and improved display for right-to-left
82
+ script text (as determined line by line).
83
+ -- On-page support for some scripts that are often not pre-installed on users'
84
+ computers (Burmese, Egyptian, Klingon).
85
+ Changes in version 1.0 (major upgrade)
86
+ * Upgraded principal internal data structure from string to lattice.
87
+ * Improvements mostly in vowelization of South and Southeast Asian languages.
88
+ * Vocalic 'r' more consistently treated as vowel (no additional vowel added).
89
+ * Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
90
+ * Japanese Katakana middle dots now mapped to ASCII space.
91
+ * Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
92
+ * Some corrections regarding analysis of Chinese numbers.
93
+ * Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
94
+ * Zero-width characters dropped, except line/sentence-initial byte order marks.
95
+ * Spaces normalized to ASCII space.
96
+ * Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
97
+ * Tested against previous version of uroman with a new uroman visual diff tool.
98
+ * Almost an order of magnitude faster.
99
+ Changes in version 0.7 (minor upgrade)
100
+ * Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
101
+ Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
102
+ Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
103
+ or Chinese characters in Uyghur texts.
104
+ Changes in version 0.6 (minor upgrade)
105
+ * Added support for two letter characters used in Uzbek:
106
+ (1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
107
+ (2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
108
+ Both are now mapped to "'" (plain ASCII apostrophe).
109
+ * Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
110
+ even when they are not preceded by "ئ" (yeh with hamza above).
111
+ * Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
112
+ ("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
113
+ * Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
114
+ However, it is strongly recommended to normalize any presentation form Arabic letters
115
+ to their non-presentation form before calling uroman.
116
+ * Added force flush directive ($|=1;).
117
+ Changes in version 0.5 (minor upgrade)
118
+ * Improvements for Uyghur (make sure to use language option: -l uig)
119
+ Changes in version 0.4 (minor upgrade)
120
+ * Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
121
+ * Minor change for Arabic (added "alef+fathatan" = "an")
122
+ New features in version 0.3
123
+ * Covers Mandarin (Chinese)
124
+ * Improved romanization for numerous languages
125
+ * Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
126
+ * Maps from native digits to Western numbers
127
+ * Faster for South Asian languages
128
+
129
+ Other features
130
+ * Web interface: http://www.isi.edu/~ulf/uroman.html
131
+ * Vowelization is provided when locally computable, e.g. for many South Asian
132
+ languages and Tibetan.
133
+
134
+ Limitations
135
+ * This version of uroman assumes all CJK ideographs to be Mandarin (Chinese).
136
+ This means that Japanese kanji are incorrectly romanized; however, Japanese
137
+ hiragana and katakana are properly romanized.
138
+ * A romanizer is not a full transliterator. For example, this version of
139
+ uroman does not vowelize text that lacks explicit vowelization such as
140
+ normal text in Arabic and Hebrew (without diacritics/points).
141
+
uroman-1.2.8/bin/de-accent.pl ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/perl -w
2
+
3
sub print_version {
   # Emit the script's version banner (name, version, author, change date) on STDERR.
   foreach $version_line ("$0 version 1.1\n",
                          " Author: Ulf Hermjakob\n",
                          " Last changed: March 14, 2011\n") {
      print STDERR $version_line;
   }
}
8
+
9
sub print_usage {
   # Emit command-line usage information on STDERR.
   print STDERR join("",
                     "$0 [options] < with_accents.txt > without_accents.txt\n",
                     " -h or -help\n",
                     " -v or -version\n");
}
14
+
15
sub de_accent_string {
   # Strip accents/diacritics from a UTF-8 byte string, mapping accented
   # Latin, Greek, and Cyrillic characters to their unaccented base forms
   # (e.g. "é" -> "e", "Æ" -> "Ae", "ß" -> "ss", "ё" -> "е").
   # Each group of substitutions is guarded by a cheap byte-pattern test
   # (e.g. /\xC3[\x80-\xBF]/ for the Latin-1 supplement) so that whole
   # blocks are skipped when the string contains no such characters.
   # Returns the de-accented string; input is expected as UTF-8 bytes.
   local($s) = @_;

   # $s =~ tr/A-Z/a-z/;
   unless (0) {
      # Latin-1
      if ($s =~ /\xC3[\x80-\xBF]/) {
         $s =~ s/(À|Á|Â|Ã|Ä|Å)/A/g;
         $s =~ s/Æ/Ae/g;
         $s =~ s/Ç/C/g;
         $s =~ s/Ð/D/g;
         $s =~ s/(È|É|Ê|Ë)/E/g;
         $s =~ s/(Ì|Í|Î|Ï)/I/g;
         $s =~ s/Ñ/N/g;
         $s =~ s/(Ò|Ó|Ô|Õ|Ö|Ø)/O/g;
         $s =~ s/(Ù|Ú|Û|Ü)/U/g;
         $s =~ s/Þ/Th/g;
         $s =~ s/Ý/Y/g;
         $s =~ s/(à|á|â|ã|ä|å)/a/g;
         $s =~ s/æ/ae/g;
         $s =~ s/ç/c/g;
         $s =~ s/(è|é|ê|ë)/e/g;
         $s =~ s/(ì|í|î|ï)/i/g;
         $s =~ s/ð/d/g;
         $s =~ s/ñ/n/g;
         $s =~ s/(ò|ó|ô|õ|ö|ø)/o/g;  # fix: added ø (uppercase Ø was already handled)
         $s =~ s/ß/ss/g;
         $s =~ s/þ/th/g;
         $s =~ s/(ù|ú|û|ü)/u/g;
         $s =~ s/(ý|ÿ)/y/g;
      }
      # Latin Extended-A
      if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
         $s =~ s/(Ā|Ă|Ą)/A/g;
         $s =~ s/(ā|ă|ą)/a/g;
         $s =~ s/(Ć|Ĉ|Ċ|Č)/C/g;
         $s =~ s/(ć|ĉ|ċ|č)/c/g;
         $s =~ s/(Ď|Đ)/D/g;
         $s =~ s/(ď|đ)/d/g;
         $s =~ s/(Ē|Ĕ|Ė|Ę|Ě)/E/g;
         $s =~ s/(ē|ĕ|ė|ę|ě)/e/g;
         $s =~ s/(Ĝ|Ğ|Ġ|Ģ)/G/g;
         $s =~ s/(ĝ|ğ|ġ|ģ)/g/g;
         $s =~ s/(Ĥ|Ħ)/H/g;
         $s =~ s/(ĥ|ħ)/h/g;
         $s =~ s/(Ĩ|Ī|Ĭ|Į|İ)/I/g;
         $s =~ s/(ĩ|ī|ĭ|į|ı)/i/g;
         $s =~ s/IJ/Ij/g;
         $s =~ s/ij/ij/g;
         $s =~ s/Ĵ/J/g;
         $s =~ s/ĵ/j/g;
         $s =~ s/Ķ/K/g;
         $s =~ s/(ķ|ĸ)/k/g;
         $s =~ s/(Ĺ|Ļ|Ľ|Ŀ|Ł)/L/g;
         $s =~ s/(ĺ|ļ|ľ|ŀ|ł)/l/g;  # fix: added ĺ (uppercase Ĺ was already handled)
         $s =~ s/(Ń|Ņ|Ň|Ŋ)/N/g;
         $s =~ s/(ń|ņ|ň|ʼn|ŋ)/n/g;
         $s =~ s/(Ō|Ŏ|Ő)/O/g;
         $s =~ s/(ō|ŏ|ő)/o/g;
         $s =~ s/Œ/Oe/g;
         $s =~ s/œ/oe/g;
         $s =~ s/(Ŕ|Ŗ|Ř)/R/g;
         $s =~ s/(ŕ|ŗ|ř)/r/g;
         $s =~ s/(Ś|Ŝ|Ş|Š)/S/g;
         $s =~ s/(ś|ŝ|ş|š|ſ)/s/g;
         $s =~ s/(Ţ|Ť|Ŧ)/T/g;
         $s =~ s/(ţ|ť|ŧ)/t/g;
         $s =~ s/(Ũ|Ū|Ŭ|Ů|Ű|Ų)/U/g;
         $s =~ s/(ũ|ū|ŭ|ů|ű|ų)/u/g;
         $s =~ s/Ŵ/W/g;
         $s =~ s/ŵ/w/g;
         $s =~ s/(Ŷ|Ÿ)/Y/g;
         $s =~ s/ŷ/y/g;
         $s =~ s/(Ź|Ż|Ž)/Z/g;
         $s =~ s/(ź|ż|ž)/z/g;
      }
      # Latin Extended Additional (incl. Vietnamese vowels with tone marks)
      if ($s =~ /\xE1[\xB8-\xBF][\x80-\xBF]/) {
         $s =~ s/(ḁ|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẚ)/a/g;
         $s =~ s/(ḃ|ḅ|ḇ)/b/g;
         $s =~ s/(ḉ)/c/g;
         $s =~ s/(ḋ|ḍ|ḏ|ḑ|ḓ)/d/g;
         $s =~ s/(ḕ|ḗ|ḙ|ḛ|ḝ|ẹ|ẻ|ẽ|ế|ề|ể|ễ|ệ)/e/g;
         $s =~ s/(ḟ)/f/g;
         $s =~ s/(ḡ)/g/g;
         $s =~ s/(ḣ|ḥ|ḧ|ḩ|ḫ)/h/g;
         $s =~ s/(ḭ|ḯ|ỉ|ị)/i/g;
         $s =~ s/(ḱ|ḳ|ḵ)/k/g;
         $s =~ s/(ḷ|ḹ|ḻ|ḽ)/l/g;
         $s =~ s/(ḿ|ṁ|ṃ)/m/g;
         $s =~ s/(ṅ|ṇ|ṉ|ṋ)/n/g;  # fix: was mapped to "m" (copy-paste bug; uppercase twins map to "N")
         $s =~ s/(ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ṍ|ṏ|ṑ|ṓ)/o/g;
         $s =~ s/(ṕ|ṗ)/p/g;
         $s =~ s/(ṙ|ṛ|ṝ|ṟ)/r/g;
         $s =~ s/(ṡ|ṣ|ṥ|ṧ|ṩ|ẛ)/s/g;
         $s =~ s/(ṫ|ṭ|ṯ|ṱ)/t/g;
         $s =~ s/(ṳ|ṵ|ṷ|ṹ|ṻ|ụ|ủ|ứ|ừ|ử|ữ|ự)/u/g;
         $s =~ s/(ṽ|ṿ)/v/g;
         $s =~ s/(ẁ|ẃ|ẅ|ẇ|ẉ|ẘ)/w/g;
         $s =~ s/(ẋ|ẍ)/x/g;
         $s =~ s/(ẏ|ỳ|ỵ|ỷ|ỹ|ẙ)/y/g;
         $s =~ s/(ẑ|ẓ|ẕ)/z/g;
         $s =~ s/(Ḁ|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ)/A/g;
         $s =~ s/(Ḃ|Ḅ|Ḇ)/B/g;
         $s =~ s/(Ḉ)/C/g;
         $s =~ s/(Ḋ|Ḍ|Ḏ|Ḑ|Ḓ)/D/g;
         $s =~ s/(Ḕ|Ḗ|Ḙ|Ḛ|Ḝ|Ẹ|Ẻ|Ẽ|Ế|Ề|Ể|Ễ|Ệ)/E/g;
         $s =~ s/(Ḟ)/F/g;
         $s =~ s/(Ḡ)/G/g;
         $s =~ s/(Ḣ|Ḥ|Ḧ|Ḩ|Ḫ)/H/g;
         $s =~ s/(Ḭ|Ḯ|Ỉ|Ị)/I/g;
         $s =~ s/(Ḱ|Ḳ|Ḵ)/K/g;
         $s =~ s/(Ḷ|Ḹ|Ḻ|Ḽ)/L/g;
         $s =~ s/(Ḿ|Ṁ|Ṃ)/M/g;
         $s =~ s/(Ṅ|Ṇ|Ṉ|Ṋ)/N/g;
         $s =~ s/(Ṍ|Ṏ|Ṑ|Ṓ|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ)/O/g;
         $s =~ s/(Ṕ|Ṗ)/P/g;
         $s =~ s/(Ṙ|Ṛ|Ṝ|Ṟ)/R/g;
         $s =~ s/(Ṡ|Ṣ|Ṥ|Ṧ|Ṩ)/S/g;
         $s =~ s/(Ṫ|Ṭ|Ṯ|Ṱ)/T/g;
         $s =~ s/(Ṳ|Ṵ|Ṷ|Ṹ|Ṻ|Ụ|Ủ|Ứ|Ừ|Ử|Ữ|Ự)/U/g;
         $s =~ s/(Ṽ|Ṿ)/V/g;
         $s =~ s/(Ẁ|Ẃ|Ẅ|Ẇ|Ẉ)/W/g;
         $s =~ s/(Ẍ)/X/g;
         $s =~ s/(Ẏ|Ỳ|Ỵ|Ỷ|Ỹ)/Y/g;
         $s =~ s/(Ẑ|Ẓ|Ẕ)/Z/g;
      }
      # Greek letters (tonos/dialytika removed)
      if ($s =~ /\xCE[\x86-\xAB]/) {
         $s =~ s/ά/α/g;
         $s =~ s/έ/ε/g;
         $s =~ s/ί/ι/g;
         $s =~ s/ϊ/ι/g;
         $s =~ s/ΐ/ι/g;
         $s =~ s/ό/ο/g;
         $s =~ s/ύ/υ/g;
         $s =~ s/ϋ/υ/g;
         $s =~ s/ΰ/υ/g;
         $s =~ s/ώ/ω/g;
         $s =~ s/Ά/Α/g;
         $s =~ s/Έ/Ε/g;
         $s =~ s/Ή/Η/g;
         $s =~ s/Ί/Ι/g;
         $s =~ s/Ϊ/Ι/g;
         $s =~ s/Ύ/Υ/g;
         $s =~ s/Ϋ/Υ/g;
         $s =~ s/Ώ/Ω/g;
      }
      # Cyrillic letters (grave/breve/diaeresis removed)
      if ($s =~ /\xD0[\x80-\xAF]/) {
         $s =~ s/Ѐ/Е/g;
         $s =~ s/Ё/Е/g;
         $s =~ s/Ѓ/Г/g;
         $s =~ s/Ќ/К/g;
         $s =~ s/Ѝ/И/g;
         $s =~ s/Й/И/g;
         $s =~ s/ѐ/е/g;
         $s =~ s/ё/е/g;
         $s =~ s/ѓ/г/g;
         $s =~ s/ќ/к/g;
         $s =~ s/ѝ/и/g;
         $s =~ s/й/и/g;
      }
   }
   return $s;
}
181
+
182
# Command-line handling: -h/-help prints usage, -v/-version prints the version
# banner (both exit with status 1); any other argument is reported on STDERR
# and skipped. Afterwards, copy input to output, de-accenting every line.
while (defined($opt = shift @ARGV)) {
   if ($opt =~ /^-*(v|version)$/i) {
      &print_version;
      exit 1;
   } elsif ($opt =~ /^-*(h|help)$/i) {
      &print_usage;
      exit 1;
   } else {
      print STDERR "Ignoring unrecognized argument $opt\n";
   }
}

# Main filter loop: read from files named on the command line or STDIN.
$line_number = 0;
while (<>) {
   $line_number++;
   print &de_accent_string($_);
}
exit 0;
201
+
uroman-1.2.8/bin/string-distance.pl ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/perl -w

# Author: Ulf Hermjakob
# Release date: October 13, 2019

# Usage: string-distance.pl {-lc1 <language-code>} {-lc2 <language-code>} < STDIN > STDOUT
# Example: string-distance.pl -lc1 rus -lc2 ukr < STDIN > STDOUT
# Example: string-distance.pl < ../test/string-similarity-test-input.txt
# Input format: two strings per line (tab-separated, in Latin script)
# Strings in non-Latin scripts should first be romanized. (Recommended script: uroman.pl)
# Output format: repetition of the two input strings, plus the string distance between them (tab-separated).
# Additional output meta info lines at the top are marked with an initial #.
#
# The script uses data from a string-distance-cost-rules file that lists costs,
# where the default cost is "1" with lower costs for differences in vowels,
# duplicate consonants, "f" vs. "ph" etc.
# Language cost rules can be language-specific and context-sensitive.

$|=1;  # autoflush output so progress is visible when piped

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec);
99
+
uroman-1.2.8/bin/uroman-quick.pl ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/perl -w

# uroman  Nov. 12, 2015 - July 25, 2016
# version v0.7
# Author: Ulf Hermjakob

# Usage: uroman-quick.pl {-l [tur|uig|ukr|yid]} < STDIN
#   currently only for Arabic script languages, incl. Uyghur

$|=1;  # autoflush output so progress is visible when piped

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# Resolve the package layout relative to this script: bin/, data/, lib/.
my $bin_dir = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");
my $lib_dir = File::Spec->catfile($root_dir, "lib");

use lib "$FindBin::Bin/../lib";
use NLP::Romanizer;
use NLP::UTF8;
$romanizer = NLP::Romanizer;
%ht = ();        # shared hash for romanization tables (passed by typeglob)
$lang_code = ""; # optional ISO 639-3 language code from -l

# Only -l/-lc/-lang-code is recognized; anything else is reported and skipped.
while (@ARGV) {
   $arg = shift @ARGV;
   if ($arg =~ /^-+(l|lc|lang-code)$/) {
      $lang_code = lc (shift @ARGV || "")
   } else {
      print STDERR "Ignoring unrecognized arg $arg\n";
   }
}

# Arabic-block table is loaded first so its entries take precedence for the
# quick (non-chart) romanization path.
$romanization_table_arabic_block_filename = File::Spec->catfile($data_dir, "romanization-table-arabic-block.txt");
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");

$romanizer->load_romanization_table(*ht, $romanization_table_arabic_block_filename);
$romanizer->load_romanization_table(*ht, $romanization_table_filename);

# Romanize line by line; print progress markers on STDERR
# (a dot every 1000 lines, the line number every 10000).
$line_number = 0;
while (<>) {
   $line_number++;
   my $line = $_;
   print $romanizer->quick_romanize($line, $lang_code, *ht) . "\n";
   if ($line_number =~ /0000$/) {
      print STDERR $line_number;
   } elsif ($line_number =~ /000$/) {
      print STDERR ".";
   }
}
print STDERR "\n";

exit 0;
58
+
uroman-1.2.8/bin/uroman-tsv.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Created by Thamme Gowda on June 17, 2019
#
# Romanize column 2 of a two-column TSV file with uroman.pl, preserving
# column 1, and write the result to stdout or to an optional output file.
# Usage: uroman-tsv.sh <input.tsv> [<output.tsv>]

DIR=$(dirname "${BASH_SOURCE[0]}")  # get the directory name
# DIR=$(realpath "${DIR}")          # resolve its full path if need be

if [[ $# -lt 1 || $# -gt 2 ]]; then
    >&2 echo "ERROR: invalid args"
    >&2 echo "Usage: <input.tsv> [<output.tsv>]"
    exit 2
fi

INP=$1
OUT=$2

CMD=$DIR/uroman.pl

# Paste the untouched first column next to the romanized second column.
# Fix: quote $INP and $CMD so paths containing spaces work.
function romanize(){
    paste <(cut -f1 "$INP") <(cut -f2 "$INP" | "$CMD")
}

if [[ -n $OUT ]]; then
    romanize > "$OUT"   # quoted: output path may contain spaces
else
    romanize
fi
27
+
28
+
uroman-1.2.8/bin/uroman.pl ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/perl -w

# uroman  Nov. 12, 2015 - Apr. 23, 2021
$version = "v1.2.8";
# Author: Ulf Hermjakob

# Usage: uroman.pl {-l [ara|bel|bul|deu|ell|eng|fas|grc|heb|kaz|kir|lav|lit|mkd|mkd2|oss|pnt|rus|srp|srp2|tur|uig|ukr|yid]} {--chart|--offset-mapping} {--no-cache} {--workset} < STDIN
# Example: cat workset.txt | uroman.pl --offset-mapping --workset

$|=1;  # force flush so output interleaves correctly when piped

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# Resolve the package layout relative to this script: bin/, data/, lib/.
my $bin_dir = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");
my $lib_dir = File::Spec->catfile($root_dir, "lib");

use lib "$FindBin::Bin/../lib";
use NLP::Chinese;
use NLP::Romanizer;
use NLP::UTF8;
use NLP::utilities;
use JSON;
$chinesePM = NLP::Chinese;
$romanizer = NLP::Romanizer;
$util = NLP::utilities;
%ht = ();         # shared hash for script/Unicode/romanization data (passed by typeglob)
%pinyin_ht = ();  # Chinese-to-pinyin table, loaded lazily on first CJK input
$lang_code = "";              # optional ISO 639-3 language code (-l)
$return_chart_p = 0;          # --chart: emit full romanization chart as JSON
$return_offset_mappings_p = 0; # --offset-mapping: emit ::orig/::rom/::align records
$workset_p = 0;               # --workset: input lines are "<snt-id> <text>"
$cache_rom_tokens_p = 1;      # token-level caching on unless --no-cache

$script_data_filename = File::Spec->catfile($data_dir, "Scripts.txt");
$unicode_data_overwrite_filename = File::Spec->catfile($data_dir, "UnicodeDataOverwrite.txt");
$unicode_data_filename = File::Spec->catfile($data_dir, "UnicodeData.txt");
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");
$chinese_tonal_pinyin_filename = File::Spec->catfile($data_dir, "Chinese_to_Pinyin.txt");

# Command-line parsing; unknown options are reported and skipped.
while (@ARGV) {
   $arg = shift @ARGV;
   if ($arg =~ /^-+(l|lc|lang-code)$/) {
      $lang_code = lc (shift @ARGV || "")
   } elsif ($arg =~ /^-+chart$/i) {
      $return_chart_p = 1;
   } elsif ($arg =~ /^-+workset$/i) {
      $workset_p = 1;
   } elsif ($arg =~ /^-+offset[-_]*map/i) {
      $return_offset_mappings_p = 1;
   } elsif ($arg =~ /^-+unicode[-_]?data/i) {
      # Optional override for the UnicodeData.txt location; must be readable.
      $filename = shift @ARGV;
      if (-r $filename) {
         $unicode_data_filename = $filename;
      } else {
         print STDERR "Ignoring invalid UnicodeData filename $filename\n";
      }
   } elsif ($arg =~ /^-+(no-tok-cach|no-cach)/i) {
      $cache_rom_tokens_p = 0;
   } else {
      print STDERR "Ignoring unrecognized arg $arg\n";
   }
}

# Load all static resources up front; overwrite entries are applied after
# the base UnicodeData so they take precedence.
$romanizer->load_script_data(*ht, $script_data_filename);
$romanizer->load_unicode_data(*ht, $unicode_data_filename);
$romanizer->load_unicode_overwrite_romanization(*ht, $unicode_data_overwrite_filename);
$romanizer->load_romanization_table(*ht, $romanization_table_filename);
$chinese_to_pinyin_not_yet_loaded_p = 1;
$current_date = $util->datetime("dateTtime");
$lang_code_clause = ($lang_code) ? " \"lang-code\":\"$lang_code\",\n" : "";

# In chart mode, open the JSON wrapper before streaming per-line elements.
print "{\n \"romanizer\":\"uroman $version (Ulf Hermjakob, USC/ISI)\",\n \"date\":\"$current_date\",\n$lang_code_clause \"romanization\": [\n" if $return_chart_p;
my $line_number = 0;
my $chart_result = "";
while (<>) {
   $line_number++;
   my $line = $_;
   my $snt_id = "";
   if ($workset_p) {
      # Workset format: skip comments, split "<id>.<n> <text>" lines;
      # anything else is silently dropped.
      next if $line =~ /^#/;
      if (($i_value, $s_value) = ($line =~ /^(\S+\.\d+)\s(.*)$/)) {
         $snt_id = $i_value;
         $line = "$s_value\n";
      } else {
         next;
      }
   }
   # Lazily load the large Chinese-to-pinyin table on first CJK ideograph.
   if ($chinese_to_pinyin_not_yet_loaded_p && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($line)) {
      $chinesePM->read_chinese_tonal_pinyin_files(*pinyin_ht, $chinese_tonal_pinyin_filename);
      $chinese_to_pinyin_not_yet_loaded_p = 0;
   }
   if ($return_chart_p) {
      # Chart output is delayed by one line so the final element's trailing
      # comma can be stripped after the loop.
      print $chart_result;
      *chart_ht = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return chart", $line_number);
      $chart_result = $romanizer->chart_to_json_romanization_elements(0, $chart_ht{N_CHARS}, *chart_ht, $line_number);
   } elsif ($return_offset_mappings_p) {
      ($best_romanization, $offset_mappings) = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return offset mappings", $line_number, 0);
      print "::snt-id $snt_id\n" if $workset_p;
      print "::orig $line";
      print "::rom $best_romanization\n";
      print "::align $offset_mappings\n\n";
   } elsif ($cache_rom_tokens_p) {
      # Default path: token-level caching speeds up repetitive input.
      print $romanizer->romanize_by_token_with_caching($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
   } else {
      print $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
   }
}
# Flush the buffered last chart element (minus its trailing comma) and close
# the JSON wrapper.
$chart_result =~ s/,(\s*)$/$1/;
print $chart_result;
print " ]\n}\n" if $return_chart_p;

# Developer-only diagnostics: report code points whose romanization looks
# suspiciously long. Disabled by default.
$dev_test_p = 0;
if ($dev_test_p) {
   $n_suspicious_code_points = 0;
   $n_instances = 0;
   foreach $char_name (sort { hex($ht{UTF_NAME_TO_UNICODE}->{$a}) <=> hex($ht{UTF_NAME_TO_UNICODE}->{$b}) }
                       keys %{$ht{SUSPICIOUS_ROMANIZATION}}) {
      $unicode_value = $ht{UTF_NAME_TO_UNICODE}->{$char_name};
      $utf8_string = $ht{UTF_NAME_TO_CODE}->{$char_name};
      foreach $romanization (sort keys %{$ht{SUSPICIOUS_ROMANIZATION}->{$char_name}}) {
         $count = $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization};
         $s = ($count == 1) ? "" : "s";
         print STDERR "*** Suspiciously lengthy romanization:\n" unless $n_suspicious_code_points;
         print STDERR "::s $utf8_string ::t $romanization ::comment $char_name (U+$unicode_value)\n";
         $n_suspicious_code_points++;
         $n_instances += $count;
      }
   }
   print STDERR " *** Total of $n_suspicious_code_points suspicious code points ($n_instances instance$s)\n" if $n_suspicious_code_points;
}

exit 0;
138
+
uroman-1.2.8/data/Chinese_to_Pinyin.txt ADDED
The diff for this file is too large to render. See raw diff
 
uroman-1.2.8/data/Scripts.txt ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::script-name Aegean
2
+ ::script-name Ahom
3
+ ::script-name Anatolian Hieroglyph
4
+ ::script-name Arabic ::direction right-to-left
5
+ ::script-name Armenian
6
+ ::script-name Avestan
7
+ ::script-name Balinese
8
+ ::script-name Bamum
9
+ ::script-name Bassa Vah
10
+ ::script-name Batak
11
+ ::script-name Bengali ::abugida-default-vowel a
12
+ ::script-name Bhaiksuki
13
+ ::script-name Bopomofo ::language Chinese
14
+ ::script-name Brahmi ::abugida-default-vowel a
15
+ ::script-name Braille
16
+ ::script-name Buginese
17
+ ::script-name Buhid
18
+ ::script-name Canadian Syllabics
19
+ ::script-name Carian
20
+ ::script-name Caucasian Albanian
21
+ ::script-name Chakma
22
+ ::script-name Cham
23
+ ::script-name Cherokee
24
+ ::script-name Coptic
25
+ ::script-name Cuneiform
26
+ ::script-name Cypriot
27
+ ::script-name Cyrillic
28
+ ::script-name CJK ::alt-script-name Chinese, Kanji ::language Chinese, Japanese, Korean, Mandarin
29
+ ::script-name Deseret
30
+ ::script-name Devanagari ::abugida-default-vowel a
31
+ ::script-name Duployan
32
+ ::script-name Egyptian Hieroglyph
33
+ ::script-name Elbasan
34
+ ::script-name Ethiopic
35
+ ::script-name Georgian
36
+ ::script-name Glagolitic
37
+ ::script-name Gothic
38
+ ::script-name Grantha
39
+ ::script-name Greek
40
+ ::script-name Gujarati ::abugida-default-vowel a
41
+ ::script-name Gurmukhi ::abugida-default-vowel a
42
+ ::script-name Hangul ::language Korean
43
+ ::script-name Hanunoo
44
+ ::script-name Hatran
45
+ ::script-name Hebrew ::direction right-to-left
46
+ ::script-name Hiragana ::language Japanese
47
+ ::script-name Imperial Aramaic
48
+ ::script-name Inscriptional Pahlavi
49
+ ::script-name Inscriptional Parthian
50
+ ::script-name Javanese
51
+ ::script-name Kaithi
52
+ ::script-name Kannada ::abugida-default-vowel a
53
+ ::script-name Katakana ::language Japanese
54
+ ::script-name Kayah Li
55
+ ::script-name Kharoshthi
56
+ ::script-name Khmer ::abugida-default-vowel a, o
57
+ ::script-name Khojki
58
+ ::script-name Khudawadi
59
+ ::script-name Klingon
60
+ ::script-name Lao
61
+ ::script-name Lepcha
62
+ ::script-name Latin
63
+ ::script-name Limbu
64
+ ::script-name Linear A
65
+ ::script-name Linear B
66
+ ::script-name Lycian
67
+ ::script-name Lydian
68
+ ::script-name Mahajani
69
+ ::script-name Malayalam ::abugida-default-vowel a
70
+ ::script-name Mandaic
71
+ ::script-name Manichaean
72
+ ::script-name Marchen
73
+ ::script-name Meetei Mayek
74
+ ::script-name Meroitic Cursive
75
+ ::script-name Meroitic Hieroglyphic
76
+ ::script-name Miao
77
+ ::script-name Modi ::abugida-default-vowel a
78
+ ::script-name Mongolian
79
+ ::script-name Mro
80
+ ::script-name Multani
81
+ ::script-name Myanmar ::alt-script-name Burmese ::abugida-default-vowel a
82
+ ::script-name Nabataean
83
+ ::script-name New Tai Lue
84
+ ::script-name Newa
85
+ ::script-name Nko ::direction right-to-left
86
+ ::script-name Ogham
87
+ ::script-name Ol Chiki
88
+ ::script-name Old Hungarian
89
+ ::script-name Old Italic
90
+ ::script-name Old Permic
91
+ ::script-name Old Persian
92
+ ::script-name Old North Arabian
93
+ ::script-name Old South Arabian
94
+ ::script-name Old Turkic
95
+ ::script-name Oriya ::alt-script-name Odia ::abugida-default-vowel a
96
+ ::script-name Osage
97
+ ::script-name Osmanya
98
+ ::script-name Pahawh Hmong
99
+ ::script-name Palmyrene
100
+ ::script-name Pau Cin Hau
101
+ ::script-name Phags-pa
102
+ ::script-name Phaistos Disc
103
+ ::script-name Phoenician
104
+ ::script-name Psalter Pahlavi
105
+ ::script-name Rejang
106
+ ::script-name Runic
107
+ ::script-name Samaritan
108
+ ::script-name Saurashtra
109
+ ::script-name Sharada
110
+ ::script-name Shavian
111
+ ::script-name Siddham
112
+ ::script-name Sinhala ::abugida-default-vowel a
113
+ ::script-name Sora Sompeng
114
+ ::script-name Sundanese ::abugida-default-vowel a
115
+ ::script-name Syloti Nagri
116
+ ::script-name Syriac
117
+ ::script-name Tagalog
118
+ ::script-name Tagbanwa
119
+ ::script-name Tai Le
120
+ ::script-name Tai Tham
121
+ ::script-name Tai Viet
122
+ ::script-name Takri
123
+ ::script-name Tamil ::abugida-default-vowel a
124
+ ::script-name Tangut
125
+ ::script-name Telugu ::abugida-default-vowel a
126
+ ::script-name Thaana ::direction right-to-left
127
+ ::script-name Thai
128
+ ::script-name Tibetan ::abugida-default-vowel a
129
+ ::script-name Tifinagh
130
+ ::script-name Tirhuta
131
+ ::script-name Ugaritic
132
+ ::script-name Vai
133
+ ::script-name Vedic
134
+ ::script-name Warang Citi
135
+ ::script-name Yi
uroman-1.2.8/data/UnicodeData.txt ADDED
The diff for this file is too large to render. See raw diff
 
uroman-1.2.8/data/UnicodeDataOverwrite.txt ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## UnicodeDataOverwrite.txt
2
+ ::u 00A0 ::r " " ::comment no-break space
3
+ ::u 01BF ::r w ::comment ƿ Latin Character Wynn (Old English)
4
+ ::u 0294 ::r ' ::comment glottal stop
5
+ ::u 0295 ::r ' ::comment ʕ voiced pharyngeal fricative
6
+ ::u 0305 ::r "" ::comment ̅ Combining overline
7
+ ::u 0306 ::r "" ::comment ̆ Combining breve
8
+ ::u 0307 ::r "" ::comment ̇ Combining dot above
9
+ ::u 030A ::r "" ::comment ̊ Combining ring above
10
+ ::u 030C ::r "" ::comment ̌ Combining caron
11
+ ::u 0311 ::r "" ::comment ̑ Combining inverted breve
12
+ ::u 031D ::r "" ::comment ̝ Combining down up below
13
+ ::u 031E ::r "" ::comment ̞ Combining down tack below
14
+ ::u 031F ::r "" ::comment ̟ Combining plus sign below
15
+ ::u 0323 ::r "" ::comment ̣ Combining dot below
16
+ ::u 0325 ::r "" ::comment ̥ Combining ring below
17
+ ::u 0329 ::r "" ::comment ̩ Combining vertical line below
18
+ ::u 032A ::r "" ::comment ̪ Combining bridge below
19
+ ::u 032F ::r "" ::comment ̯ Combining inverted breve below
20
+ ::u 0342 ::r "" ::comment ͂ Combining Greek perispomeni (circumflex accent)
21
+ ::u 0343 ::r "" ::comment ̓ Combining Greek koronis
22
+ ::u 0361 ::r "" ::comment Combining double inverted breve
23
+ ::u 0384 ::r "" ::comment ΄ Greek tonos
24
+ ::u 0482 ::r 1000· ::comment ҂ Cyrillic thousands sign
25
+ ::u 0483 ::r "" ::comment ҃ Combining Cyrillic Titlo ::annotation titlo
26
+ ::u 0484 ::r "" ::comment ҄ Combining Cyrillic Palatalization ::annotation palatalization
27
+ ::u 055B ::r "" ::comment ՛ Armenian emphasis mark
28
+ ::u 055F ::r "" ::comment ՟ Armenian abbreviation mark ::annotation abbreviation
29
+
30
+ ::u 0901 ::r +m ::comment Devanagari sign candrabindu
31
+ ::u 0902 ::r +m ::comment Devanagari sign anusvara
32
+ ::u 0903 ::r +h ::comment Devanagari sign visarga
33
+ ::u 093D ::r ' ::comment Devanagari sign avagraha
34
+ ::u 0950 ::r om ::comment ॐ Devanagari om symbol
35
+ ::u 0951 ::r "" ::comment ॑ Devanagari stress sign "udatta"
36
+ ::u 0952 ::r "" ::comment ॒ Devanagari stress sign "anudatta"
37
+ ::u 0981 ::r +n ::comment Bengali sign candrabindu ("chôndrôbindu")
38
+ ::u 0982 ::r +ng ::comment Bengali sign anusvara ("ônushar")
39
+ ::u 0983 ::r +h ::comment Bengali sign visarga ("bishôrgô")
40
+ ::u 099A ::r ch ::comment instead of Bengali C(A)
41
+ ::u 099B ::r chh ::comment instead of Bengali CC(A)
42
+ ::u 0A02 ::r +m ::comment Gurmukhi sign bindi
43
+ ::u 0A70 ::r +m ::comment Gurmukhi tippi
44
+ # ::u 0A72 ::r "" ::comment Gurmukhi addak
45
+ ::u 0A72 ::r "" ::comment Gurmukhi iri
46
+ ::u 0A73 ::r "" ::comment Gurmukhi ura
47
+ ::u 0B01 ::r +m ::comment Oriya sign candrabindu
48
+ ::u 0B03 ::r +h ::comment Oriya sign visarga
49
+ ::u 0B5F ::r ya ::comment ୟ Oriya letter yya
50
+ ::u 0B82 ::r +m ::comment Tamil sign anusvara (not to be used?)
51
+ ::u 0B83 ::r +h ::comment Tamil sign visarga ("āytam")
52
+ ::u 0B9F ::r t ::comment instead of Tamil TT(A)
53
+ ::u 0BA3 ::r n ::comment instead of Tamil NN(A)
54
+ ::u 0BA9 ::r n ::comment instead of Tamil NNN(A)
55
+ ::u 0BB1 ::r r ::comment instead of Tamil RR(A)
56
+ ::u 0BB3 ::r l ::comment instead of Tamil LL(A)
57
+ ::u 0BB4 ::r l ::comment instead of Tamil LLL(A)
58
+ ::u 0C03 ::r +h ::comment ః Telugu sign visarga
59
+ ::u 0C83 ::r +h ::comment Kannada sign visarga
60
+ ::u 0D02 ::r +m ::comment Malayalam sign anusvara
61
+ ::u 0D03 ::r +h ::comment Malayalam sign visarga
62
+ ::u 0D82 ::r +n ::comment Sinhala sign anusvaraya
63
+ ::u 0DA4 ::r ny ::comment Sinhala ඤ
64
+ ::u 0DA5 ::r gn ::comment Sinhala ඥ
65
+ ::u 0DCA ::r "" ::comment Sinhala sign al-lakuna (virama = no vowel)
66
+ ::u 0DCF ::r aa ::comment Sinhala ා
67
+ ::u 0DD0 ::r ae ::comment Sinhala ැ
68
+ ::u 0DD1 ::r ae ::comment Sinhala ෑ
69
+ ::u 0DD2 ::r i ::comment Sinhala ි
70
+ ::u 0DD3 ::r ii ::comment Sinhala ී
71
+ ::u 0DD4 ::r u ::comment Sinhala ු
72
+ ::u 0DD6 ::r uu ::comment Sinhala ූ
73
+ ::u 0DD8 ::r r ::comment Sinhala ෘ
74
+ ::u 0DD9 ::r e ::comment Sinhala ෙ
75
+ ::u 0DDA ::r ee ::comment Sinhala ේ
76
+ ::u 0DDB ::r ai ::comment Sinhala ෛ
77
+ ::u 0DDC ::r o ::comment Sinhala ො
78
+ ::u 0DDD ::r oo ::comment Sinhala ෝ
79
+ ::u 0DDE ::r au ::comment Sinhala ෞ
80
+ ::u 0DDF ::r aa ::comment Sinhala ා
81
+ ::u 0DF2 ::r rr ::comment Sinhala ෲ
82
+
83
+ ::u 0E02 ::r k ::comment Thai character KHO KHAI
84
+ ::u 0E03 ::r k ::comment Thai character KHO KHUAT
85
+ ::u 0E04 ::r k ::comment Thai character KHO KHWAI
86
+ ::u 0E05 ::r k ::comment Thai character KHO KHON
87
+ ::u 0E06 ::r k ::comment Thai character KHO RAKHANG
88
+ ::u 0E10 ::r t ::comment Thai character THO THAN
89
+ ::u 0E11 ::r t ::comment Thai character THO NANGMONTHO
90
+ ::u 0E12 ::r t ::comment Thai character THO PHUTHAO
91
+ ::u 0E16 ::r t ::comment Thai character THO THUNG
92
+ ::u 0E17 ::r t ::comment Thai character THO THAHAN
93
+ ::u 0E18 ::r t ::comment Thai character THO THONG
94
+ ::u 0E1C ::r p ::comment Thai character PHO PHUNG
95
+ ::u 0E1E ::r p ::comment Thai character PHO PHAN
96
+ ::u 0E20 ::r p ::comment Thai character PHO SAMPHAO
97
+ ::u 0E2D ::r o ::comment Thai character O ANG
98
+ ::u 0E2F ::r ... ::comment ฯ Thai character PAIYANNOI (ellipsis, abbreviation)
99
+ ::u 0E31 ::r a ::comment Thai character MAI HAN-AKAT
100
+ ::u 0E3A ::r "" ::comment Thai character PHINTHU (Pali virama)
101
+ ::u 0E40 ::r e ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA E
102
+ ::u 0E41 ::r ae ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AE
103
+ ::u 0E42 ::r o ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA O
104
+ ::u 0E43 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMUAN
105
+ ::u 0E44 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMALAI
106
+ ::u 0E45 ::r "" ::comment Thai character LAKKHANGYAO vowel lengthener
107
+ ::u 0E47 ::r o ::comment Thai character MAITAIKHU vowel shortener
108
+ ::u 0E48 ::r "" ::tone-mark non-standard ::comment Thai tone mark MAI EK
109
+ ::u 0E49 ::r "" ::tone-mark standard ::comment Thai tone mark MAI THO
110
+ ::u 0E4A ::r "" ::tone-mark high ::comment Thai tone mark MAI TRI
111
+ ::u 0E4B ::r "" ::tone-mark rising ::comment Thai tone mark MAI CHATTAWA
112
+ ::u 0E4C ::r "" ::comment Thai character THANTHAKHAT cancellation mark (cf. virama)
113
+ ::u 0E4D ::r +m ::comment ํ Thai character NIKHAHIT final nasal (cf. anusvara)
114
+ ::u 0ECC ::r "" ::comment ໌ Lao cancellation mark ::annotation cancellation
115
+ ::u 0F0B ::r · ::comment ་ Tibetan mark intersyllabic tsheg
116
+ ::u 0F0C ::r "" ::comment ༌ Tibetan mark delimiter tsheg bstar
117
+ ::u 0F84 ::r "" ::comment ྄ Tibetan halanta
118
+ ::u 1036 ::r +n ::comment Myanmar sign anusvara ("auk myit")
119
+ ::u 1037 ::r "" ::tone-mark creaky ::comment Myanmar sign dot below
120
+ ::u 1038 ::r "" ::tone-mark high ::comment Myanmar sign visarga
121
+
122
+ ::u 16A0 ::r f ::comment ᚠ RUNIC LETTER FEHU FEOH FE F
123
+ ::u 16A1 ::r v ::comment ᚡ RUNIC LETTER V
124
+ ::u 16A2 ::r u ::comment ᚢ RUNIC LETTER URUZ UR U
125
+ ::u 16A3 ::r y ::comment ᚣ RUNIC LETTER YR
126
+ ::u 16A4 ::r y ::comment ᚤ RUNIC LETTER Y
127
+ ::u 16A5 ::r w ::comment ᚥ RUNIC LETTER W
128
+ ::u 16A6 ::r th ::comment ᚦ RUNIC LETTER THURISAZ THURS THORN
129
+ ::u 16A7 ::r th ::comment ᚧ RUNIC LETTER ETH
130
+ ::u 16A8 ::r a ::comment ᚨ RUNIC LETTER ANSUZ A
131
+ ::u 16A9 ::r o ::comment ᚩ RUNIC LETTER OS O
132
+ ::u 16AA ::r a ::comment ᚪ RUNIC LETTER AC A
133
+ ::u 16AB ::r ae ::comment ᚫ RUNIC LETTER AESC
134
+ ::u 16AC ::r o ::comment ᚬ RUNIC LETTER LONG-BRANCH-OSS O
135
+ ::u 16AD ::r o ::comment ᚭ RUNIC LETTER SHORT-TWIG-OSS O
136
+ ::u 16AE ::r o ::comment ᚮ RUNIC LETTER O
137
+ ::u 16AF ::r oe ::comment ᚯ RUNIC LETTER OE
138
+ ::u 16B0 ::r on ::comment ᚰ RUNIC LETTER ON
139
+ ::u 16B1 ::r r ::comment ᚱ RUNIC LETTER RAIDO RAD REID R
140
+ ::u 16B2 ::r k ::comment ᚲ RUNIC LETTER KAUNA
141
+ ::u 16B3 ::r c ::comment ᚳ RUNIC LETTER CEN
142
+ ::u 16B4 ::r k ::comment ᚴ RUNIC LETTER KAUN K
143
+ ::u 16B5 ::r g ::comment ᚵ RUNIC LETTER G
144
+ ::u 16B6 ::r ng ::comment ᚶ RUNIC LETTER ENG
145
+ ::u 16B7 ::r g ::comment ᚷ RUNIC LETTER GEBO GYFU G
146
+ ::u 16B8 ::r g ::comment ᚸ RUNIC LETTER GAR
147
+ ::u 16B9 ::r w ::comment ᚹ RUNIC LETTER WUNJO WYNN W
148
+ ::u 16BA ::r h ::comment ᚺ RUNIC LETTER HAGLAZ H
149
+ ::u 16BB ::r h ::comment ᚻ RUNIC LETTER HAEGL H
150
+ ::u 16BC ::r h ::comment ᚼ RUNIC LETTER LONG-BRANCH-HAGALL H
151
+ ::u 16BD ::r h ::comment ᚽ RUNIC LETTER SHORT-TWIG-HAGALL H
152
+ ::u 16BE ::r n ::comment ᚾ RUNIC LETTER NAUDIZ NYD NAUD N
153
+ ::u 16BF ::r n ::comment ᚿ RUNIC LETTER SHORT-TWIG-NAUD N
154
+ ::u 16C0 ::r n ::comment ᛀ RUNIC LETTER DOTTED-N
155
+ ::u 16C1 ::r i ::comment ᛁ RUNIC LETTER ISAZ IS ISS I
156
+ ::u 16C2 ::r e ::comment ᛂ RUNIC LETTER E
157
+ ::u 16C3 ::r j ::comment ᛃ RUNIC LETTER JERAN J
158
+ ::u 16C4 ::r j ::comment ᛄ RUNIC LETTER GER
159
+ ::u 16C5 ::r ae ::comment ᛅ RUNIC LETTER LONG-BRANCH-AR AE
160
+ ::u 16C6 ::r a ::comment ᛆ RUNIC LETTER SHORT-TWIG-AR A
161
+ ::u 16C7 ::r i ::comment ᛇ RUNIC LETTER IWAZ EOH
162
+ ::u 16C8 ::r p ::comment ᛈ RUNIC LETTER PERTHO PEORTH P
163
+ ::u 16C9 ::r z ::comment ᛉ RUNIC LETTER ALGIZ EOLHX
164
+ ::u 16CA ::r s ::comment ᛊ RUNIC LETTER SOWILO S
165
+ ::u 16CB ::r s ::comment ᛋ RUNIC LETTER SIGEL LONG-BRANCH-SOL S
166
+ ::u 16CC ::r s ::comment ᛌ RUNIC LETTER SHORT-TWIG-SOL S
167
+ ::u 16CD ::r c ::comment ᛍ RUNIC LETTER C
168
+ ::u 16CE ::r z ::comment ᛎ RUNIC LETTER Z
169
+ ::u 16CF ::r t ::comment ᛏ RUNIC LETTER TIWAZ TIR TYR T
170
+ ::u 16D0 ::r t ::comment ᛐ RUNIC LETTER SHORT-TWIG-TYR T
171
+ ::u 16D1 ::r d ::comment ᛑ RUNIC LETTER D
172
+ ::u 16D2 ::r b ::comment ᛒ RUNIC LETTER BERKANAN BEORC BJARKAN B
173
+ ::u 16D3 ::r b ::comment ᛓ RUNIC LETTER SHORT-TWIG-BJARKAN B
174
+ ::u 16D4 ::r p ::comment ᛔ RUNIC LETTER DOTTED-P
175
+ ::u 16D5 ::r p ::comment ᛕ RUNIC LETTER OPEN-P
176
+ ::u 16D6 ::r e ::comment ᛖ RUNIC LETTER EHWAZ EH E
177
+ ::u 16D7 ::r m ::comment ᛗ RUNIC LETTER MANNAZ MAN M
178
+ ::u 16D8 ::r m ::comment ᛘ RUNIC LETTER LONG-BRANCH-MADR M
179
+ ::u 16D9 ::r m ::comment ᛙ RUNIC LETTER SHORT-TWIG-MADR M
180
+ ::u 16DA ::r l ::comment ᛚ RUNIC LETTER LAUKAZ LAGU LOGR L
181
+ ::u 16DB ::r l ::comment ᛛ RUNIC LETTER DOTTED-L
182
+ ::u 16DC ::r ng ::comment ᛜ RUNIC LETTER INGWAZ
183
+ ::u 16DD ::r ng ::comment ᛝ RUNIC LETTER ING
184
+ ::u 16DE ::r d ::comment ᛞ RUNIC LETTER DAGAZ DAEG D
185
+ ::u 16DF ::r o ::comment ᛟ RUNIC LETTER OTHALAN ETHEL O
186
+ ::u 16E0 ::r ea ::comment ᛠ RUNIC LETTER EAR
187
+ ::u 16E1 ::r io ::comment ᛡ RUNIC LETTER IOR
188
+ ::u 16E2 ::r q ::comment ᛢ RUNIC LETTER CWEORTH
189
+ ::u 16E3 ::r k ::comment ᛣ RUNIC LETTER CALC
190
+ ::u 16E4 ::r k ::comment ᛤ RUNIC LETTER CEALC
191
+ ::u 16E5 ::r st ::comment ᛥ RUNIC LETTER STAN
192
+ ::u 16E6 ::r r ::comment ᛦ RUNIC LETTER LONG-BRANCH-YR
193
+ ::u 16E7 ::r r ::comment ᛧ RUNIC LETTER SHORT-TWIG-YR
194
+ ::u 16E8 ::r r ::comment ᛨ RUNIC LETTER ICELANDIC-YR
195
+ ::u 16E9 ::r q ::comment ᛩ RUNIC LETTER Q
196
+ ::u 16EA ::r x ::comment ᛪ RUNIC LETTER X
197
+
198
+ ::u 17B9 ::r oe ::comment Khmer vowel sign y (short)
199
+ ::u 17BA ::r oe ::comment Khmer vowel sign yy (long)
200
+ ::u 17C6 ::r +m ::comment Khmer sign nikahit (cf. anusvara)
201
+ ::u 17C7 ::r +h ::comment Khmer sign reahmuk (cf. visarga)
202
+ ::u 17C8 ::r ' ::comment Khmer sign yuukaleapintu (short vowel and glottal stop)
203
+ ::u 17C9 ::r "" ::comment Khmer sign muusikatoan: changes the second register to the first
204
+ ::u 17CA ::r "" ::comment Khmer sign triisap: changes the first register to the second
205
+ ::u 17CB ::r "" ::comment Khmer sign bantoc (vowel shortener)
206
+ ::u 17D2 ::r "" ::comment Khmer sign coeng (foot/subscript, cf. virama = no vowel)
207
+ ::u 17D5 ::r . ::comment Khmer sign bariyoosan; period ending entire text or chapter
208
+
209
+ ::u 180E ::r ' ::comment ᠎ Mongolian vowel separator
210
+
211
+ ::u 1B80 ::r +ng ::comment ᮀ Sundanese sign panyecek
212
+ ::u 1B81 ::r +r ::comment ᮁ Sundanese sign panglayar
213
+ ::u 1B82 ::r +h ::comment ᮂ Sundanese sign pangwisad
214
+ ::u 1BA1 ::r ya ::comment ᮡ Sundanese consonant sign pamingkal
215
+ ::u 1BA2 ::r ra ::comment ᮢ Sundanese consonant sign panyakr
216
+ ::u 1BA3 ::r la ::comment ᮣ Sundanese consonant sign panyiku
217
+ ::u 1BA4 ::r i ::comment ᮤ Sundanese consonant sign panghulu
218
+ ::u 1BA5 ::r u ::comment ᮥ Sundanese consonant sign panyuku
219
+ ::u 1BA6 ::r e ::comment ᮦ Sundanese vowel sign panaelaeng
220
+ ::u 1BA7 ::r o ::comment ᮧ Sundanese vowel sign panolong
221
+ ::u 1BA8 ::r e ::comment ᮨ Sundanese vowel sign pamepet
222
+ ::u 1BA9 ::r eu ::comment ᮩ Sundanese vowel sign paneuleung
223
+ ::u 1BAA ::r "" ::comment ᮪ Sundanese sign pamaaeh or patén (no vowel/virama)
224
+
225
+ ::u 1FBD ::r "" ::comment ᾽ Greek koronis
226
+ ::u 1FFE ::r "" ::comment Greek dasia (rough breathing)
227
+
228
+ ::u 2002 ::r " " ::comment en space
229
+ ::u 2003 ::r " " ::comment em space
230
+ ::u 2004 ::r " " ::comment three-per-em space
231
+ ::u 2005 ::r " " ::comment four-per-em space
232
+ ::u 2006 ::r " " ::comment six-per-em space
233
+ ::u 2007 ::r " " ::comment figure space
234
+ ::u 2008 ::r " " ::comment punctuation space
235
+ ::u 2009 ::r " " ::comment thin space
236
+ ::u 200A ::r " " ::comment hair space
237
+ ::u 202F ::r " " ::comment narrow no-break space
238
+
239
+ ::u 2D30 ::r a ::comment TIFINAGH LETTER YA ⴰ
240
+ ::u 2D31 ::r b ::comment TIFINAGH LETTER YAB ⴱ
241
+ ::u 2D32 ::r bh ::comment TIFINAGH LETTER YABH ⴲ
242
+ ::u 2D33 ::r g ::comment TIFINAGH LETTER YAG ⴳ
243
+ ::u 2D34 ::r ghh ::comment TIFINAGH LETTER YAGHH ⴴ
244
+ ::u 2D35 ::r j ::comment TIFINAGH LETTER BERBER ACADEMY YAJ ⴵ
245
+ ::u 2D36 ::r j ::comment TIFINAGH LETTER YAJ ⴶ
246
+ ::u 2D37 ::r d ::comment TIFINAGH LETTER YAD ⴷ
247
+ ::u 2D38 ::r dh ::comment TIFINAGH LETTER YADH ⴸ
248
+ ::u 2D39 ::r dd ::comment TIFINAGH LETTER YADD ⴹ
249
+ ::u 2D3A ::r ddh ::comment TIFINAGH LETTER YADDH ⴺ
250
+ ::u 2D3B ::r e ::comment TIFINAGH LETTER YEY ⴻ
251
+ ::u 2D3C ::r f ::comment TIFINAGH LETTER YAF ⴼ
252
+ ::u 2D3D ::r k ::comment TIFINAGH LETTER YAK ⴽ
253
+ ::u 2D3E ::r k ::comment TIFINAGH LETTER TUAREG YAK ⴾ
254
+ ::u 2D3F ::r khh ::comment TIFINAGH LETTER YAKHH ⴿ
255
+ ::u 2D40 ::r h ::comment TIFINAGH LETTER YAH ⵀ
256
+ ::u 2D41 ::r h ::comment TIFINAGH LETTER BERBER ACADEMY YAH ⵁ
257
+ ::u 2D42 ::r h ::comment TIFINAGH LETTER TUAREG YAH ⵂ
258
+ ::u 2D43 ::r hh ::comment TIFINAGH LETTER YAHH ⵃ
259
+ ::u 2D44 ::r ' ::comment TIFINAGH LETTER YAA ⵄ
260
+ ::u 2D45 ::r kh ::comment TIFINAGH LETTER YAKH ⵅ
261
+ ::u 2D46 ::r kh ::comment TIFINAGH LETTER TUAREG YAKH ⵆ
262
+ ::u 2D47 ::r q ::comment TIFINAGH LETTER YAQ ⵇ
263
+ ::u 2D48 ::r q ::comment TIFINAGH LETTER TUAREG YAQ ⵈ
264
+ ::u 2D49 ::r i ::comment TIFINAGH LETTER YI ⵉ
265
+ ::u 2D4A ::r zh ::comment TIFINAGH LETTER YAZH ⵊ
266
+ ::u 2D4B ::r zh ::comment TIFINAGH LETTER AHAGGAR YAZH ⵋ
267
+ ::u 2D4C ::r zh ::comment TIFINAGH LETTER TUAREG YAZH ⵌ
268
+ ::u 2D4D ::r l ::comment TIFINAGH LETTER YAL ⵍ
269
+ ::u 2D4E ::r m ::comment TIFINAGH LETTER YAM ⵎ
270
+ ::u 2D4F ::r n ::comment TIFINAGH LETTER YAN ⵏ
271
+ ::u 2D50 ::r gn ::comment TIFINAGH LETTER TUAREG YAGN ⵐ
272
+ ::u 2D51 ::r ng ::comment TIFINAGH LETTER TUAREG YANG ⵑ
273
+ ::u 2D52 ::r p ::comment TIFINAGH LETTER YAP ⵒ
274
+ ::u 2D53 ::r u ::comment TIFINAGH LETTER YU ⵓ
275
+ ::u 2D54 ::r r ::comment TIFINAGH LETTER YAR ⵔ
276
+ ::u 2D55 ::r rr ::comment TIFINAGH LETTER YARR ⵕ
277
+ ::u 2D56 ::r gh ::comment TIFINAGH LETTER YAGH ⵖ
278
+ ::u 2D57 ::r gh ::comment TIFINAGH LETTER TUAREG YAGH ⵗ
279
+ ::u 2D58 ::r gh ::comment TIFINAGH LETTER AYER YAGH ⵘ
280
+ ::u 2D59 ::r s ::comment TIFINAGH LETTER YAS ⵙ
281
+ ::u 2D5A ::r ss ::comment TIFINAGH LETTER YASS ⵚ
282
+ ::u 2D5B ::r sh ::comment TIFINAGH LETTER YASH ⵛ
283
+ ::u 2D5C ::r t ::comment TIFINAGH LETTER YAT ⵜ
284
+ ::u 2D5D ::r th ::comment TIFINAGH LETTER YATH ⵝ
285
+ ::u 2D5E ::r ch ::comment TIFINAGH LETTER YACH ⵞ
286
+ ::u 2D5F ::r tt ::comment TIFINAGH LETTER YATT ⵟ
287
+ ::u 2D60 ::r v ::comment TIFINAGH LETTER YAV ⵠ
288
+ ::u 2D61 ::r w ::comment TIFINAGH LETTER YAW ⵡ
289
+ ::u 2D62 ::r y ::comment TIFINAGH LETTER YAY ⵢ
290
+ ::u 2D63 ::r z ::comment TIFINAGH LETTER YAZ ⵣ
291
+ ::u 2D64 ::r z ::comment TIFINAGH LETTER TAWELLEMET YAZ ⵤ
292
+ ::u 2D65 ::r zz ::comment TIFINAGH LETTER YAZZ ⵥ
293
+ ::u 2D66 ::r ye ::comment TIFINAGH LETTER YE ⵦ
294
+ ::u 2D67 ::r yo ::comment TIFINAGH LETTER YO ⵧ
295
+ ::u 2D6F ::r "" ::comment TIFINAGH MODIFIER LETTER LABIALIZATION MARK ⵯ
296
+ ::u 2D70 ::r "" ::comment TIFINAGH SEPARATOR MARK ⵰
297
+ ::u 2D7F ::r "" ::comment TIFINAGH CONSONANT JOINER ⵿
298
+
299
+ ::u 3063 ::r tsu ::comment Hiragana letter small tsu
300
+ ::u 30C3 ::r tsu ::comment Katakana letter small tsu
301
+
302
+ ::u ABE3 ::r o ::comment ꯣ Meetei Mayek vowel sign onap
303
+ ::u ABE7 ::r ou ::comment ꯧ Meetei Mayek vowel sign sounap
304
+
305
+ ::u F008 ::r "" ::comment Yoruba diacritic in private use area
306
+ ::u F00F ::r "" ::comment Yoruba diacritic in private use area
307
+ ::u F023 ::r "" ::comment Yoruba diacritic in private use area
308
+ ::u F025 ::r "" ::comment Yoruba diacritic in private use area
309
+
310
+ ::u F8D0 ::r a ::name KLINGON LETTER A
311
+ ::u F8D1 ::r b ::name KLINGON LETTER B
312
+ ::u F8D2 ::r ch ::name KLINGON LETTER CH
313
+ ::u F8D3 ::r D ::name KLINGON LETTER D
314
+ ::u F8D4 ::r e ::name KLINGON LETTER E
315
+ ::u F8D5 ::r gh ::name KLINGON LETTER GH
316
+ ::u F8D6 ::r H ::name KLINGON LETTER H
317
+ ::u F8D7 ::r I ::name KLINGON LETTER I
318
+ ::u F8D8 ::r j ::name KLINGON LETTER J
319
+ ::u F8D9 ::r l ::name KLINGON LETTER L
320
+ ::u F8DA ::r m ::name KLINGON LETTER M
321
+ ::u F8DB ::r n ::name KLINGON LETTER N
322
+ ::u F8DC ::r ng ::name KLINGON LETTER NG
323
+ ::u F8DD ::r o ::name KLINGON LETTER O
324
+ ::u F8DE ::r p ::name KLINGON LETTER P
325
+ ::u F8DF ::r q ::name KLINGON LETTER Q
326
+ ::u F8E0 ::r Q ::name KLINGON LETTER Q
327
+ ::u F8E1 ::r r ::name KLINGON LETTER R
328
+ ::u F8E2 ::r S ::name KLINGON LETTER S
329
+ ::u F8E3 ::r t ::name KLINGON LETTER T
330
+ ::u F8E4 ::r tlh ::name KLINGON LETTER TLH
331
+ ::u F8E5 ::r u ::name KLINGON LETTER U
332
+ ::u F8E6 ::r v ::name KLINGON LETTER V
333
+ ::u F8E7 ::r w ::name KLINGON LETTER W
334
+ ::u F8E8 ::r y ::name KLINGON LETTER Y
335
+ ::u F8E9 ::r ' ::name KLINGON LETTER GLOTTAL STOP
336
+ ::u F8F0 ::num 0 ::name KLINGON DIGIT ZERO
337
+ ::u F8F1 ::num 1 ::name KLINGON DIGIT ONE
338
+ ::u F8F2 ::num 2 ::name KLINGON DIGIT TWO
339
+ ::u F8F3 ::num 3 ::name KLINGON DIGIT THREE
340
+ ::u F8F4 ::num 4 ::name KLINGON DIGIT FOUR
341
+ ::u F8F5 ::num 5 ::name KLINGON DIGIT FIVE
342
+ ::u F8F6 ::num 6 ::name KLINGON DIGIT SIX
343
+ ::u F8F7 ::num 7 ::name KLINGON DIGIT SEVEN
344
+ ::u F8F8 ::num 8 ::name KLINGON DIGIT EIGHT
345
+ ::u F8F9 ::num 9 ::name KLINGON DIGIT NINE
346
+ ::u F8FD ::r , ::name KLINGON COMMA
347
+ ::u F8FE ::r . ::name KLINGON FULL STOP
348
+ ::u F8FF ::name KLINGON MUMMIFICATION GLYPH
349
+
350
+ ::u 1163D ::r +m ::comment Modi sign anusvara
351
+ ::u 1163E ::r +h ::comment Modi sign visarga
352
+
353
+ ::u 13068 ::num 1000000 ::comment Egyptian Hieroglyph
354
+ ::u 1308B ::r r ::comment Egyptian Hieroglyph ::pic mouth
355
+ ::u 1309D ::r ' ::comment Egyptian Hieroglyph (ayn) ::pic forearm
356
+ ::u 130A7 ::r d ::comment Egyptian Hieroglyph ::pic hand
357
+ ::u 130AD ::num 10000 ::comment Egyptian Hieroglyph
358
+ ::u 130AE ::num 20000 ::comment Egyptian Hieroglyph
359
+ ::u 130AF ::num 30000 ::comment Egyptian Hieroglyph
360
+ ::u 130B0 ::num 40000 ::comment Egyptian Hieroglyph
361
+ ::u 130B1 ::num 50000 ::comment Egyptian Hieroglyph
362
+ ::u 130B2 ::num 60000 ::comment Egyptian Hieroglyph
363
+ ::u 130B3 ::num 70000 ::comment Egyptian Hieroglyph
364
+ ::u 130B4 ::num 80000 ::comment Egyptian Hieroglyph
365
+ ::u 130B5 ::num 90000 ::comment Egyptian Hieroglyph
366
+ ::u 130B6 ::num 50000 ::comment Egyptian Hieroglyph
367
+ ::u 130C0 ::r b ::comment Egyptian Hieroglyph ::pic foot
368
+ ::u 130ED ::r l ::comment Egyptian Hieroglyph [also rw] ::pic lion recumbent
369
+ ::u 13121 ::r h ::comment Egyptian Hieroglyph (f-underscore) ::pic animal's belly and udder
370
+ ::u 1313F ::r a ::comment Egyptian Hieroglyph (alef) ::pic vulture
371
+ ::u 13153 ::r m ::comment Egyptian Hieroglyph ::pic owl
372
+ ::u 13171 ::r w ::comment Egyptian Hieroglyph ::pic quail chick
373
+ ::u 13187 ::r ::comment Egyptian Hieroglyph (determinative/son) H8 ::pic egg
374
+ ::u 13190 ::num 100000 ::comment Egyptian Hieroglyph
375
+ ::u 13191 ::r f ::comment Egyptian Hieroglyph ::pic horned viper
376
+ ::u 13193 ::r d ::comment Egyptian Hieroglyph (J) ::pic cobra
377
+ ::u 131BC ::num 1000 ::comment Egyptian Hieroglyph
378
+ ::u 131BD ::num 2000 ::comment Egyptian Hieroglyph
379
+ ::u 131BE ::num 3000 ::comment Egyptian Hieroglyph
380
+ ::u 131BF ::num 4000 ::comment Egyptian Hieroglyph
381
+ ::u 131C0 ::num 5000 ::comment Egyptian Hieroglyph
382
+ ::u 131C1 ::num 6000 ::comment Egyptian Hieroglyph
383
+ ::u 131C2 ::num 7000 ::comment Egyptian Hieroglyph
384
+ ::u 131C3 ::num 8000 ::comment Egyptian Hieroglyph
385
+ ::u 131C4 ::num 9000 ::comment Egyptian Hieroglyph
386
+ ::u 131CB ::r i ::comment Egyptian Hieroglyph (yod) ::pic single reed
387
+ ::u 131CC ::r y ::comment Egyptian Hieroglyph ::pic double reed
388
+ ::u 1320E ::r q ::comment Egyptian Hieroglyph (qaf) ::pic sandy slope
389
+ ::u 13209 ::comment Egyptian Hieroglyph ::pic desert hills
390
+ ::u 13216 ::r n ::comment Egyptian Hieroglyph ::pic ripple of water
391
+ ::u 13219 ::r sh ::comment Egyptian Hieroglyph (š) ::pic basin
392
+ ::u 13254 ::r h ::comment Egyptian Hieroglyph ::pic reed shelter
393
+ ::u 13283 ::r z ::comment Egyptian Hieroglyph [also S?] ::pic door bolt
394
+ ::u 132AA ::r p ::comment Egyptian Hieroglyph ::pic stool
395
+ ::u 132D4 ::r n ::comment Egyptian Hieroglyph ::pic red crown
396
+ ::u 132F4 ::r s ::comment Egyptian Hieroglyph [also Z?] ::pic folded cloth
397
+ ::u 13319 ::comment Egyptian Hieroglyph ::pic throw stick
398
+ ::u 13362 ::num 100 ::comment Egyptian Hieroglyph
399
+ ::u 13363 ::num 200 ::comment Egyptian Hieroglyph
400
+ ::u 13364 ::num 300 ::comment Egyptian Hieroglyph
401
+ ::u 13365 ::num 400 ::comment Egyptian Hieroglyph
402
+ ::u 13366 ::num 500 ::comment Egyptian Hieroglyph
403
+ ::u 13367 ::num 600 ::comment Egyptian Hieroglyph
404
+ ::u 13368 ::num 700 ::comment Egyptian Hieroglyph
405
+ ::u 13369 ::num 800 ::comment Egyptian Hieroglyph
406
+ ::u 1336A ::num 900 ::comment Egyptian Hieroglyph
407
+ ::u 1336B ::num 500 ::comment Egyptian Hieroglyph
408
+ ::u 1336F ::r o ::comment Egyptian Hieroglyph ::pic lasso
409
+ ::u 1337F ::r t ::comment Egyptian Hieroglyph (ṯ) ::pic hobble
410
+ ::u 13386 ::num 10 ::comment Egyptian Hieroglyph
411
+ ::u 13387 ::num 20 ::comment Egyptian Hieroglyph
412
+ ::u 13388 ::num 30 ::comment Egyptian Hieroglyph
413
+ ::u 13389 ::num 40 ::comment Egyptian Hieroglyph
414
+ ::u 1338A ::num 50 ::comment Egyptian Hieroglyph
415
+ ::u 1338B ::num 60 ::comment Egyptian Hieroglyph
416
+ ::u 1338C ::num 70 ::comment Egyptian Hieroglyph
417
+ ::u 1338D ::num 80 ::comment Egyptian Hieroglyph
418
+ ::u 1338E ::num 90 ::comment Egyptian Hieroglyph
419
+ ::u 1338F ::num 20 ::comment Egyptian Hieroglyph
420
+ ::u 13390 ::num 30 ::comment Egyptian Hieroglyph
421
+ ::u 13391 ::num 40 ::comment Egyptian Hieroglyph
422
+ ::u 13392 ::num 50 ::comment Egyptian Hieroglyph
423
+ ::u 1339B ::r h ::comment Egyptian Hieroglyph ::pic twisted flax
424
+ ::u 133A1 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle
425
+ ::u 133A2 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle, variant
426
+ ::u 133A4 ::r g ::comment Egyptian Hieroglyph ::pic bag
427
+ ::u 133BC ::r g ::comment Egyptian Hieroglyph ::pic stand
428
+ ::u 133CF ::r t ::comment Egyptian Hieroglyph ::pic loaf
429
+ ::u 133ED ::r y ::comment Egyptian Hieroglyph ::pic two strokes
430
+ ::u 133F2 ::r w ::comment Egyptian Hieroglyph ::pic quail chick, hieratic variant
431
+ ::u 133FA ::num 1 ::comment Egyptian Hieroglyph
432
+ ::u 133FB ::num 2 ::comment Egyptian Hieroglyph
433
+ ::u 133FC ::num 3 ::comment Egyptian Hieroglyph
434
+ ::u 133FD ::num 4 ::comment Egyptian Hieroglyph
435
+ ::u 133FE ::num 5 ::comment Egyptian Hieroglyph
436
+ ::u 133FF ::num 6 ::comment Egyptian Hieroglyph
437
+ ::u 13400 ::num 7 ::comment Egyptian Hieroglyph
438
+ ::u 13401 ::num 8 ::comment Egyptian Hieroglyph
439
+ ::u 13402 ::num 9 ::comment Egyptian Hieroglyph
440
+ ::u 13403 ::num 5 ::comment Egyptian Hieroglyph
441
+ ::u 1340D ::r kh ::comment Egyptian Hieroglyph (ḫ, khah) ::pic placenta?
442
+ ::u 1341D ::r m ::comment Egyptian Hieroglyph (also jm)