= committed on
Commit
27e2dfc
·
1 Parent(s): 7601bd6
Files changed (2) hide show
  1. all_langs.json +69 -0
  2. source_config.py +256 -0
all_langs.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "en": "eng",
3
+ "es": "spa",
4
+ "fr": "fra",
5
+ "de": "deu",
6
+ "it": "ita",
7
+ "pt": "por",
8
+ "nl": "nld",
9
+ "vi": "vie",
10
+ "tr": "tur",
11
+ "la": "lat",
12
+ "id": "ind",
13
+ "ms": "msa",
14
+ "af": "afr",
15
+ "sq": "sqi",
16
+ "is": "isl",
17
+ "no": "nor",
18
+ "sv": "swe",
19
+ "da": "dan",
20
+ "fi": "fin",
21
+ "hu": "hun",
22
+ "pl": "pol",
23
+ "cs": "ces",
24
+ "ro": "ron",
25
+ "ru": "rus",
26
+ "bg": "bul",
27
+ "uk": "ukr",
28
+ "sr": "srp",
29
+ "be": "bel",
30
+ "kk": "kaz",
31
+ "mk": "mkd",
32
+ "mn": "mon",
33
+ "zh": "zho",
34
+ "ja": "jpn",
35
+ "ko": "kor",
36
+ "hi": "hin",
37
+ "ur": "urd",
38
+ "bn": "ben",
39
+ "ta": "tam",
40
+ "te": "tel",
41
+ "mr": "mar",
42
+ "gu": "guj",
43
+ "kn": "kan",
44
+ "ml": "mal",
45
+ "pa": "pan",
46
+ "as": "asm",
47
+ "or": "ori",
48
+ "ar": "ara",
49
+ "fa": "fas",
50
+ "ps": "pus",
51
+ "sd": "snd",
52
+ "ug": "uig",
53
+ "el": "ell",
54
+ "he": "heb",
55
+ "hy": "hye",
56
+ "ka": "kat",
57
+ "am": "amh",
58
+ "km": "khm",
59
+ "lo": "lao",
60
+ "my": "mya",
61
+ "th": "tha",
62
+ "si": "sin",
63
+ "bo": "bod",
64
+ "dv": "div",
65
+ "ti": "tir",
66
+ "sw": "swa",
67
+ "eu": "eus",
68
+ "tl": "tgl"
69
+ }
source_config.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ LANGUAGE_BUCKETS = {
4
+ # ~41% of CC — intentionally capped to avoid crowding out other languages
5
+ "English": {
6
+ "langs": ["en"],
7
+ "weight": 2.5,
8
+ "min_chars": 2_000,
9
+ "latin": True,
10
+ },
11
+ # ~6.3% of CC — was badly underweighted relative to German/French
12
+ "Russian": {
13
+ "langs": ["ru"],
14
+ "weight": 1.8,
15
+ "min_chars": 2_000,
16
+ "latin": False,
17
+ },
18
+ # ~5.9% of CC
19
+ "German": {
20
+ "langs": ["de"],
21
+ "weight": 1.8,
22
+ "min_chars": 2_000,
23
+ "latin": True,
24
+ },
25
+ # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
26
+ "Japanese": {
27
+ "langs": ["ja"],
28
+ "weight": 1.8,
29
+ "min_chars": 1_200,
30
+ "latin": False,
31
+ },
32
+ # ~5.0% of CC — CC likely undercounts due to Great Firewall
33
+ "Chinese": {
34
+ "langs": ["zh"],
35
+ "weight": 1.8,
36
+ "min_chars": 1_200,
37
+ "latin": False,
38
+ },
39
+ # ~4.6% of CC
40
+ "French": {
41
+ "langs": ["fr"],
42
+ "weight": 1.8,
43
+ "min_chars": 2_000,
44
+ "latin": True,
45
+ },
46
+ # ~4.6% of CC
47
+ "Spanish": {
48
+ "langs": ["es"],
49
+ "weight": 1.8,
50
+ "min_chars": 2_000,
51
+ "latin": True,
52
+ },
53
+ # ~2.5% of CC
54
+ "Portuguese": {
55
+ "langs": ["pt"],
56
+ "weight": 1.6,
57
+ "min_chars": 2_000,
58
+ "latin": True,
59
+ },
60
+ # ~2.4% of CC
61
+ "Italian": {
62
+ "langs": ["it"],
63
+ "weight": 1.5,
64
+ "min_chars": 2_000,
65
+ "latin": True,
66
+ },
67
+ # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
68
+ "Polish": {
69
+ "langs": ["pl"],
70
+ "weight": 1.5,
71
+ "min_chars": 2_000,
72
+ "latin": True,
73
+ },
74
+ # ~1.8% of CC — was significantly underweighted at 1.15
75
+ "Dutch": {
76
+ "langs": ["nl"],
77
+ "weight": 1.5,
78
+ "min_chars": 2_000,
79
+ "latin": True,
80
+ },
81
+ # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
82
+ "Turkish": {
83
+ "langs": ["tr"],
84
+ "weight": 1.4,
85
+ "min_chars": 2_000,
86
+ "latin": True,
87
+ },
88
+ # ind ~1.1%, vie ~1.05% of CC; the bucket also holds sq (Albanian) and la (Latin), which are not Southeast Asian
89
+ "SoutheastAsianLatin": {
90
+ "langs": ["vi", "id", "ms", "sq", "la"],
91
+ "weight": 1.4,
92
+ "min_chars": 2_000,
93
+ "latin": True,
94
+ },
95
+ # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
96
+ "CentralEuropeanLatin": {
97
+ "langs": ["cs", "ro", "hu"],
98
+ "weight": 1.2,
99
+ "min_chars": 2_000,
100
+ "latin": True,
101
+ },
102
+ # ~0.81% of CC — was overweighted at 1.7
103
+ "Korean": {
104
+ "langs": ["ko"],
105
+ "weight": 1.3,
106
+ "min_chars": 1_200,
107
+ "latin": False,
108
+ },
109
+ # ukr ~0.70%, bel ~0.017% of CC
110
+ "EastSlavicCyrillic": {
111
+ "langs": ["uk", "be"],
112
+ "weight": 1.15,
113
+ "min_chars": 2_000,
114
+ "latin": False,
115
+ },
116
+ # ~0.65% of CC — upweighted relative to CC share given speaker population
117
+ "Arabic": {
118
+ "langs": ["ar"],
119
+ "weight": 1.35,
120
+ "min_chars": 2_000,
121
+ "latin": False,
122
+ },
123
+ # sv ~0.7%, dan ~0.51%, nor+nno ~0.33%, fin ~0.37%, isl ~0.04%, afr ~0.01%
124
+ # combined ~2.0% of CC — was drastically overweighted at 6.0
125
+ # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
126
+ "NordicCore": {
127
+ "langs": ["sv", "da", "no", "is", "af", "fi"],
128
+ "weight": 1.8,
129
+ "min_chars": 2_000,
130
+ "latin": True,
131
+ },
132
+ # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
133
+ "BalkanCyrillic": {
134
+ "langs": ["bg", "sr", "mk"],
135
+ "weight": 1.0,
136
+ "min_chars": 2_000,
137
+ "latin": False,
138
+ },
139
+ # fas ~0.20% of CC (ignore the one anomalous crawl spike)
140
+ "ArabicOther": {
141
+ "langs": ["fa", "ps", "sd", "ug"],
142
+ "weight": 0.9,
143
+ "min_chars": 2_000,
144
+ "latin": False,
145
+ },
146
+ # ~0.22% of CC — genuine web underrepresentation relative to speaker count,
147
+ # but corpus is thin; 1.0 avoids oversampling a small pool
148
+ "Hindi": {
149
+ "langs": ["hi"],
150
+ "weight": 1.0,
151
+ "min_chars": 2_000,
152
+ "latin": False,
153
+ },
154
+ # combined ~0.27% of CC β€” upweighted for script diversity
155
+ "IndicOther": {
156
+ "langs": ["ur", "bn", "ta", "te", "mr", "gu", "kn", "ml", "pa", "as", "or"],
157
+ "weight": 0.9,
158
+ "min_chars": 2_000,
159
+ "latin": False,
160
+ },
161
+ # kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
162
+ "CentralAsianCyrillic": {
163
+ "langs": ["kk", "mn"],
164
+ "weight": 0.9,
165
+ "min_chars": 2_000,
166
+ "latin": False,
167
+ },
168
+ "AfricanLatin": {
169
+ "langs": ["sw", "tl", "eu"],
170
+ "weight": 0.8,
171
+ "min_chars": 1_500,
172
+ "latin": True,
173
+ },
174
+ # el ~0.55%, he ~0.24%, th ~0.38%, hy ~0.033%, ka ~0.044% etc. — combined ~1%+
175
+ # nudged up slightly from 0.8 given Greek and Thai have meaningful CC presence
176
+ "OtherScripts": {
177
+ "langs": ["el", "he", "hy", "ka", "am", "km", "lo", "my", "th", "si", "bo", "ti", "dv"],
178
+ "weight": 0.9,
179
+ "min_chars": 2_000,
180
+ "latin": False,
181
+ },
182
+ }
183
+
184
+ POOL = {
185
+ "wiki": {
186
+ "reserve": 0.60,
187
+ "min": 4,
188
+ "max": 120_000,
189
+ },
190
+ "smol": {
191
+ "reserve": 0.95,
192
+ "min": 1,
193
+ "max": 1_000,
194
+ },
195
+ "ft": {
196
+ "reserve": 0.60,
197
+ "min": 1,
198
+ "max": 30_000,
199
+ },
200
+ }
201
+
202
+ DOC_MIX = {
203
+ "pure": {
204
+ "fraction": 0.60,
205
+ "pool": "reserve",
206
+ "min_sentences": 1,
207
+ "max_sentences": 4,
208
+ "strip_punct_prob": 0.10,
209
+ },
210
+ "homogeneous": {
211
+ "fraction": 0.30,
212
+ "pool": "main",
213
+ "min_sentences": 2,
214
+ "max_sentences": 6,
215
+ "strip_punct_prob": 0.15,
216
+ },
217
+ "mixed": {
218
+ "fraction": 0.10,
219
+ "pool": "main",
220
+ "min_segments": 2,
221
+ "max_segments": 4,
222
+ "strip_punct_prob": 0.25,
223
+ "swap_prob": 0.06,
224
+ "o_inject_prob": 0.06,
225
+ "allow_repeated_langs": True,
226
+ },
227
+ }
228
+
229
+ SMOL = {
230
+ "use": True,
231
+ "rebuild": False,
232
+ }
233
+
234
+ FT = {
235
+ "use": True,
236
+ "rebuild": False,
237
+ "max_lang": 50_000,
238
+ "overflow_lang": 75_000,
239
+ "max_row": 50_000,
240
+ "miss": 1_000,
241
+ "include_en": True,
242
+ "langs": {"en", "es", "fr", "pt", "it", "nl", "de", "sv", "da", "id", "ms"},
243
+ }
244
+ FT["every"] = len(FT["langs"])
245
+
246
+ RUN = {
247
+ "len": 512,
248
+ "target": 2_500_000, # synthetic mixed-language training examples to generate
249
+ "syn_cache": True,
250
+ "syn_rebuild": False,
251
+ "tok_cache": True,
252
+ "tok_rebuild": False,
253
+ "tok_skip_check": False,
254
+ "retry": 8,
255
+ "preview": 2_000,
256
+ }