aloobun committed on
Commit
f7a222c
·
verified ·
1 Parent(s): 99ed867

Upload 3 files

Browse files
test_fo/hindi_english_tokenization_results.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Hindi": [
3
+ {
4
+ "original_text": "नमस्ते, मैं भारत से हूँ। दिल्ली बहुत बड़ा शहर है।",
5
+ "token_ids_count": 14,
6
+ "token_strings_count": 14,
7
+ "decoded_text": "नमस्ते, मैं भारत से हूँ। दिल्ली बहुत बड़ा शहर है।",
8
+ "text_match": true,
9
+ "token_id_stats": {
10
+ "min": 12,
11
+ "max": 22166,
12
+ "mean": 4712.857142857143
13
+ }
14
+ },
15
+ {
16
+ "original_text": "हिंदी भाषा बहुत सुंदर है।",
17
+ "token_ids_count": 7,
18
+ "token_strings_count": 7,
19
+ "decoded_text": "हिंदी भाषा बहुत सुंदर है।",
20
+ "text_match": true,
21
+ "token_id_stats": {
22
+ "min": 304,
23
+ "max": 46067,
24
+ "mean": 9137.285714285714
25
+ }
26
+ },
27
+ {
28
+ "original_text": "मुझे किताबें पढ़ना पसंद है।",
29
+ "token_ids_count": 7,
30
+ "token_strings_count": 7,
31
+ "decoded_text": "मुझे किताबें पढ़ना पसंद है।",
32
+ "text_match": true,
33
+ "token_id_stats": {
34
+ "min": 325,
35
+ "max": 50205,
36
+ "mean": 18701.571428571428
37
+ }
38
+ },
39
+ {
40
+ "original_text": "यह एक उदाहरण वाक्य है।",
41
+ "token_ids_count": 6,
42
+ "token_strings_count": 6,
43
+ "decoded_text": "यह एक उदाहरण वाक्य है।",
44
+ "text_match": true,
45
+ "token_id_stats": {
46
+ "min": 331,
47
+ "max": 64341,
48
+ "mean": 15892.833333333334
49
+ }
50
+ }
51
+ ],
52
+ "English": [
53
+ {
54
+ "original_text": "Hello, I am from India. Delhi is a big city.",
55
+ "token_ids_count": 13,
56
+ "token_strings_count": 13,
57
+ "decoded_text": "Hello, I am from India. Delhi is a big city.",
58
+ "text_match": true,
59
+ "token_id_stats": {
60
+ "min": 12,
61
+ "max": 22355,
62
+ "mean": 3848.3076923076924
63
+ }
64
+ },
65
+ {
66
+ "original_text": "The English language is widely spoken.",
67
+ "token_ids_count": 7,
68
+ "token_strings_count": 7,
69
+ "decoded_text": "The English language is widely spoken.",
70
+ "text_match": true,
71
+ "token_id_stats": {
72
+ "min": 14,
73
+ "max": 28525,
74
+ "mean": 8552.714285714286
75
+ }
76
+ },
77
+ {
78
+ "original_text": "I enjoy reading books.",
79
+ "token_ids_count": 5,
80
+ "token_strings_count": 5,
81
+ "decoded_text": "I enjoy reading books.",
82
+ "text_match": true,
83
+ "token_id_stats": {
84
+ "min": 14,
85
+ "max": 6621,
86
+ "mean": 3323.8
87
+ }
88
+ },
89
+ {
90
+ "original_text": "This is an example sentence.",
91
+ "token_ids_count": 6,
92
+ "token_strings_count": 6,
93
+ "decoded_text": "This is an example sentence.",
94
+ "text_match": true,
95
+ "token_id_stats": {
96
+ "min": 14,
97
+ "max": 12418,
98
+ "mean": 3230.8333333333335
99
+ }
100
+ }
101
+ ]
102
+ }
test_fo/result.json ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "coverage": {},
3
+ "complexity": {},
4
+ "language_analysis": {},
5
+ "edge_cases": {
6
+ "hindi": {
7
+ "script_test": {
8
+ "tokens": [
9
+ "नम",
10
+ "सà¥įतà¥ĩ",
11
+ ",",
12
+ "Ġमà¥Īà¤Ĥ",
13
+ "Ġà¤Ńारत",
14
+ "Ġसà¥ĩ",
15
+ "Ġहà¥Ĥà¤ģ",
16
+ "।",
17
+ "Ġदिलà¥įलà¥Ģ",
18
+ "Ġबहà¥ģत",
19
+ "Ġबड़ा",
20
+ "Ġशहर",
21
+ "Ġहà¥Ī",
22
+ "।"
23
+ ],
24
+ "token_count": 14,
25
+ "unique_tokens": 13
26
+ },
27
+ "unicode_test": {
28
+ "tokens": [
29
+ "हिनà¥įद",
30
+ "à¥Ģ",
31
+ "Ġ",
32
+ "१",
33
+ "२",
34
+ "३",
35
+ "४",
36
+ "५",
37
+ "६",
38
+ "à¥Ń",
39
+ "८",
40
+ "९",
41
+ "Ġvow",
42
+ "els",
43
+ ":",
44
+ "Ġà¤ħ",
45
+ "Ġà¤Ĩ",
46
+ "Ġà¤ĩ",
47
+ "Ġà¤Ī",
48
+ "Ġà¤ī",
49
+ "Ġà¤Ĭ"
50
+ ],
51
+ "token_count": 21,
52
+ "unique_tokens": 21
53
+ },
54
+ "special_chars": {
55
+ "tokens": [
56
+ "हिनà¥įद",
57
+ "à¥Ģ",
58
+ "!",
59
+ "Ġ@",
60
+ "Ġ#",
61
+ "Ġ$",
62
+ "Ġ%",
63
+ "Ġ^",
64
+ "Ġ&",
65
+ "Ġ*",
66
+ "Ġ(",
67
+ "Ġ)",
68
+ "Ġ_",
69
+ "Ġ+",
70
+ "Ġ=",
71
+ "Ġ[",
72
+ "Ġ]",
73
+ "Ġ{",
74
+ "Ġ}"
75
+ ],
76
+ "token_count": 19,
77
+ "unique_tokens": 19
78
+ }
79
+ },
80
+ "english": {
81
+ "script_test": {
82
+ "tokens": [
83
+ "Hello",
84
+ ",",
85
+ "ĠI",
86
+ "Ġam",
87
+ "Ġfrom",
88
+ "Ġthe",
89
+ "ĠUnited",
90
+ "ĠStates",
91
+ ".",
92
+ "ĠNew",
93
+ "ĠYork",
94
+ "Ġis",
95
+ "Ġa",
96
+ "Ġbeautiful",
97
+ "Ġcity",
98
+ "."
99
+ ],
100
+ "token_count": 16,
101
+ "unique_tokens": 15
102
+ },
103
+ "unicode_test": {
104
+ "tokens": [
105
+ "English",
106
+ "Ġ",
107
+ "012",
108
+ "345",
109
+ "678",
110
+ "9",
111
+ "Ġvow",
112
+ "els",
113
+ ":",
114
+ "Ġa",
115
+ "Ġe",
116
+ "Ġi",
117
+ "Ġo",
118
+ "Ġu"
119
+ ],
120
+ "token_count": 14,
121
+ "unique_tokens": 14
122
+ },
123
+ "special_chars": {
124
+ "tokens": [
125
+ "English",
126
+ "!",
127
+ "Ġ@",
128
+ "Ġ#",
129
+ "Ġ$",
130
+ "Ġ%",
131
+ "Ġ^",
132
+ "Ġ&",
133
+ "Ġ*",
134
+ "Ġ(",
135
+ "Ġ)",
136
+ "Ġ_",
137
+ "Ġ+",
138
+ "Ġ=",
139
+ "Ġ[",
140
+ "Ġ]",
141
+ "Ġ{",
142
+ "Ġ}"
143
+ ],
144
+ "token_count": 18,
145
+ "unique_tokens": 18
146
+ }
147
+ }
148
+ },
149
+ "unicode_coverage": {
150
+ "hindi": {
151
+ "original_text": "हिन्दी १२३४५६७८९ vowels: अ आ इ ई उ ऊ",
152
+ "tokens": [
153
+ "हिनà¥įद",
154
+ "à¥Ģ",
155
+ "Ġ",
156
+ "१",
157
+ "२",
158
+ "३",
159
+ "४",
160
+ "५",
161
+ "६",
162
+ "à¥Ń",
163
+ "८",
164
+ "९",
165
+ "Ġvow",
166
+ "els",
167
+ ":",
168
+ "Ġà¤ħ",
169
+ "Ġà¤Ĩ",
170
+ "Ġà¤ĩ",
171
+ "Ġà¤Ī",
172
+ "Ġà¤ī",
173
+ "Ġà¤Ĭ"
174
+ ],
175
+ "token_count": 21,
176
+ "unique_tokens": 21,
177
+ "coverage_ratio": 1.0
178
+ },
179
+ "english": {
180
+ "original_text": "English 0123456789 vowels: a e i o u",
181
+ "tokens": [
182
+ "English",
183
+ "Ġ",
184
+ "012",
185
+ "345",
186
+ "678",
187
+ "9",
188
+ "Ġvow",
189
+ "els",
190
+ ":",
191
+ "Ġa",
192
+ "Ġe",
193
+ "Ġi",
194
+ "Ġo",
195
+ "Ġu"
196
+ ],
197
+ "token_count": 14,
198
+ "unique_tokens": 14,
199
+ "coverage_ratio": 1.0
200
+ }
201
+ },
202
+ "script_complexity": {
203
+ "hindi": {
204
+ "original_text_length": 49,
205
+ "tokens": [
206
+ "नम",
207
+ "सà¥įतà¥ĩ",
208
+ ",",
209
+ "Ġमà¥Īà¤Ĥ",
210
+ "Ġà¤Ńारत",
211
+ "Ġसà¥ĩ",
212
+ "Ġहà¥Ĥà¤ģ",
213
+ "।",
214
+ "Ġदिलà¥įलà¥Ģ",
215
+ "Ġबहà¥ģत",
216
+ "Ġबड़ा",
217
+ "Ġशहर",
218
+ "Ġहà¥Ī",
219
+ "।"
220
+ ],
221
+ "token_count": 14,
222
+ "avg_token_length": 9.071428571428571,
223
+ "token_diversity": 0.9285714285714286
224
+ },
225
+ "english": {
226
+ "original_text_length": 65,
227
+ "tokens": [
228
+ "Hello",
229
+ ",",
230
+ "ĠI",
231
+ "Ġam",
232
+ "Ġfrom",
233
+ "Ġthe",
234
+ "ĠUnited",
235
+ "ĠStates",
236
+ ".",
237
+ "ĠNew",
238
+ "ĠYork",
239
+ "Ġis",
240
+ "Ġa",
241
+ "Ġbeautiful",
242
+ "Ġcity",
243
+ "."
244
+ ],
245
+ "token_count": 16,
246
+ "avg_token_length": 4.0625,
247
+ "token_diversity": 0.9375
248
+ }
249
+ }
250
+ }
test_fo/test2.txt ADDED
The diff for this file is too large to render. See raw diff