from __future__ import annotations


# Language buckets, keyed by bucket name (dict insertion order is preserved).
#
# Every bucket carries the same four fields:
#   langs     -- language codes grouped into the bucket (they look like
#                ISO 639-1 two-letter codes -- confirm against the consumer)
#   weight    -- relative weight of the bucket (presumably a sampling weight;
#                the consumer is not visible here -- TODO confirm)
#   min_chars -- character threshold for the bucket: 1_200 for ja/zh/ko,
#                1_500 for AfricanLatin, 2_000 everywhere else
#   latin     -- boolean flag; presumably marks Latin-script buckets
#
# NOTE(review): assuming ISO 639-1 codes, a few bucket names are looser than
# their contents: "SoutheastAsianLatin" also holds sq and la, "AfricanLatin"
# also holds tl and eu, and "NordicCore" also holds af. The keys are left
# untouched because other modules may look buckets up by name.
LANGUAGE_BUCKETS = {
    "English": {
        "langs": ["en"],
        "weight": 2.5,
        "min_chars": 2_000,
        "latin": True,
    },
    "Russian": {
        "langs": ["ru"],
        "weight": 1.8,
        "min_chars": 2_000,
        "latin": False,
    },
    "German": {
        "langs": ["de"],
        "weight": 1.8,
        "min_chars": 2_000,
        "latin": True,
    },
    "Japanese": {
        "langs": ["ja"],
        "weight": 1.8,
        "min_chars": 1_200,
        "latin": False,
    },
    "Chinese": {
        "langs": ["zh"],
        "weight": 1.8,
        "min_chars": 1_200,
        "latin": False,
    },
    "French": {
        "langs": ["fr"],
        "weight": 1.8,
        "min_chars": 2_000,
        "latin": True,
    },
    "Spanish": {
        "langs": ["es"],
        "weight": 1.8,
        "min_chars": 2_000,
        "latin": True,
    },
    "Portuguese": {
        "langs": ["pt"],
        "weight": 1.6,
        "min_chars": 2_000,
        "latin": True,
    },
    "Italian": {
        "langs": ["it"],
        "weight": 1.5,
        "min_chars": 2_000,
        "latin": True,
    },
    "Polish": {
        "langs": ["pl"],
        "weight": 1.5,
        "min_chars": 2_000,
        "latin": True,
    },
    "Dutch": {
        "langs": ["nl"],
        "weight": 1.5,
        "min_chars": 2_000,
        "latin": True,
    },
    "Turkish": {
        "langs": ["tr"],
        "weight": 1.4,
        "min_chars": 2_000,
        "latin": True,
    },
    "SoutheastAsianLatin": {
        "langs": ["vi", "id", "ms", "sq", "la"],
        "weight": 1.4,
        "min_chars": 2_000,
        "latin": True,
    },
    "CentralEuropeanLatin": {
        "langs": ["cs", "ro", "hu"],
        "weight": 1.2,
        "min_chars": 2_000,
        "latin": True,
    },
    "Korean": {
        "langs": ["ko"],
        "weight": 1.3,
        "min_chars": 1_200,
        "latin": False,
    },
    "EastSlavicCyrillic": {
        "langs": ["uk", "be"],
        "weight": 1.15,
        "min_chars": 2_000,
        "latin": False,
    },
    "Arabic": {
        "langs": ["ar"],
        "weight": 1.35,
        "min_chars": 2_000,
        "latin": False,
    },
    "NordicCore": {
        "langs": ["sv", "da", "no", "is", "af", "fi"],
        "weight": 1.8,
        "min_chars": 2_000,
        "latin": True,
    },
    "BalkanCyrillic": {
        "langs": ["bg", "sr", "mk"],
        "weight": 1.0,
        "min_chars": 2_000,
        "latin": False,
    },
    "ArabicOther": {
        "langs": ["fa", "ps", "sd", "ug"],
        "weight": 0.9,
        "min_chars": 2_000,
        "latin": False,
    },
    "Hindi": {
        "langs": ["hi"],
        "weight": 1.0,
        "min_chars": 2_000,
        "latin": False,
    },
    "IndicOther": {
        "langs": [
            "ur", "bn", "ta", "te", "mr", "gu", "kn", "ml", "pa", "as", "or",
        ],
        "weight": 0.9,
        "min_chars": 2_000,
        "latin": False,
    },
    "CentralAsianCyrillic": {
        "langs": ["kk", "mn"],
        "weight": 0.9,
        "min_chars": 2_000,
        "latin": False,
    },
    "AfricanLatin": {
        "langs": ["sw", "tl", "eu"],
        "weight": 0.8,
        "min_chars": 1_500,
        "latin": True,
    },
    "OtherScripts": {
        "langs": [
            "el", "he", "hy", "ka", "am", "km", "lo",
            "my", "th", "si", "bo", "ti", "dv",
        ],
        "weight": 0.9,
        "min_chars": 2_000,
        "latin": False,
    },
}


# Per-source pool settings for the "wiki", "smol" and "ft" sources.
#
# Each entry carries:
#   reserve -- a fraction in [0, 1] (presumably the share routed to the
#              "reserve" pool that DOC_MIX refers to -- TODO confirm)
#   min     -- lower bound for the source
#   max     -- upper bound for the source
# The units of min/max are defined by the consumer, which is not visible here.
POOL = {
    "wiki": {
        "reserve": 0.60,
        "min": 4,
        "max": 120_000,
    },
    "smol": {
        "reserve": 0.95,
        "min": 1,
        "max": 1_000,
    },
    "ft": {
        "reserve": 0.60,
        "min": 1,
        "max": 30_000,
    },
}


# Document-type mix: three document kinds whose "fraction" values add up
# to 1.0 (0.60 + 0.30 + 0.10).
#
# Shared fields:
#   fraction         -- share of documents of this kind
#   pool             -- name of the pool the kind draws from
#                       ("reserve" for pure, "main" for the other two)
#   strip_punct_prob -- probability value in [0, 1] (presumably the chance
#                       of stripping punctuation -- TODO confirm)
# "pure" and "homogeneous" are bounded by min/max_sentences, while "mixed"
# is bounded by min/max_segments and additionally carries swap_prob,
# o_inject_prob and allow_repeated_langs.
DOC_MIX = {
    "pure": {
        "fraction": 0.60,
        "pool": "reserve",
        "min_sentences": 1,
        "max_sentences": 4,
        "strip_punct_prob": 0.10,
    },
    "homogeneous": {
        "fraction": 0.30,
        "pool": "main",
        "min_sentences": 2,
        "max_sentences": 6,
        "strip_punct_prob": 0.15,
    },
    "mixed": {
        "fraction": 0.10,
        "pool": "main",
        "min_segments": 2,
        "max_segments": 4,
        "strip_punct_prob": 0.25,
        "swap_prob": 0.06,
        "o_inject_prob": 0.06,
        "allow_repeated_langs": True,
    },
}


# Switches for the "smol" source.
#   use     -- enable flag for the source
#   rebuild -- rebuild flag (presumably forces regeneration of cached data;
#              the consumer is not visible here -- TODO confirm)
SMOL = {
    "use": True,
    "rebuild": False,
}


# Settings for the "ft" source.
#   use / rebuild -- enable and rebuild flags, same shape as SMOL
#   max_lang, overflow_lang, max_row, miss -- numeric limits whose exact
#       meaning is defined by the consumer (not visible here)
#   include_en -- boolean flag ("en" is also listed in "langs")
#   langs      -- set of language codes handled by this source; being a set,
#                 it has no defined iteration order
FT = {
    "use": True,
    "rebuild": False,
    "max_lang": 50_000,
    "overflow_lang": 75_000,
    "max_row": 50_000,
    "miss": 1_000,
    "include_en": True,
    "langs": {"en", "es", "fr", "pt", "it", "nl", "de", "sv", "da", "id", "ms"},
}
# Derived after the literal so it always tracks the size of FT["langs"].
FT["every"] = len(FT["langs"])


# Run-level settings.
#   len, target, retry, preview -- integer knobs; their units are defined by
#       the consumer (len is presumably a sequence length and target a
#       sample count -- TODO confirm)
#   syn_cache / syn_rebuild -- cache and rebuild flags for the "syn" stage
#   tok_cache / tok_rebuild / tok_skip_check -- cache, rebuild and
#       skip-check flags for the "tok" stage
RUN = {
    "len": 512,
    "target": 2_500_000,
    "syn_cache": True,
    "syn_rebuild": False,
    "tok_cache": True,
    "tok_rebuild": False,
    "tok_skip_check": False,
    "retry": 8,
    "preview": 2_000,
}

