from __future__ import annotations

# Language bucket configuration.
#
# LANGUAGE_BUCKETS maps a bucket name to a dict with four keys:
#   "langs"     - list of language-code strings belonging to the bucket
#   "weight"    - float weight for the bucket (presumably a relative sampling
#                 weight; confirm against the consumer of this table)
#   "min_chars" - integer threshold (presumably the minimum document length in
#                 characters; confirm against the consumer of this table)
#   "latin"     - bool flag (presumably whether the bucket's languages are
#                 written in Latin script; confirm against the consumer)
#
# Percentages in the comments below are quoted "of CC" (presumably Common
# Crawl share; confirm) and record the rationale for each weight.
# Every language code appears in exactly one bucket.
LANGUAGE_BUCKETS = {
    # ~41% of CC — intentionally capped to avoid crowding out other languages
    "English": {
        "langs": ["en"],
        "weight": 2.9,
        "min_chars": 2_000,
        "latin": True,
    },
    # ~6.3% of CC — was badly underweighted relative to German/French
    "Russian": {
        "langs": ["ru"],
        "weight": 1.95,
        "min_chars": 2_000,
        "latin": False,
    },
    # ~5.9% of CC
    "German": {
        "langs": ["de"],
        "weight": 1.9,
        "min_chars": 2_000,
        "latin": True,
    },
    # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
    "Japanese": {
        "langs": ["ja"],
        "weight": 1.9,
        "min_chars": 1_200,
        "latin": False,
    },
    # ~5.0% of CC — CC likely undercounts due to Great Firewall
    "Chinese": {
        "langs": ["zh"],
        "weight": 1.9,
        "min_chars": 1_200,
        "latin": False,
    },
    # ~4.6% of CC
    "French": {
        "langs": ["fr"],
        "weight": 1.9,
        "min_chars": 2_000,
        "latin": True,
    },
    # ~4.6% of CC
    "Spanish": {
        "langs": ["es"],
        "weight": 1.9,
        "min_chars": 2_000,
        "latin": True,
    },
    # ~2.5% of CC
    "Portuguese": {
        "langs": ["pt"],
        "weight": 1.7,
        "min_chars": 2_000,
        "latin": True,
    },
    # ~2.4% of CC
    "Italian": {
        "langs": ["it"],
        "weight": 1.6,
        "min_chars": 2_000,
        "latin": True,
    },
    # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
    "Polish": {
        "langs": ["pl"],
        "weight": 1.55,
        "min_chars": 2_000,
        "latin": True,
    },
    # ~1.8% of CC — was significantly underweighted at 1.15
    "Dutch": {
        "langs": ["nl"],
        "weight": 1.55,
        "min_chars": 2_000,
        "latin": True,
    },
    # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
    "Turkish": {
        "langs": ["tr"],
        "weight": 1.45,
        "min_chars": 2_000,
        "latin": True,
    },
    # ind ~1.1%, vie ~1.05% of CC
    # NOTE(review): "sq" (Albanian) and "la" (Latin) are not Southeast Asian
    # languages — confirm they are grouped here intentionally.
    "SoutheastAsianLatin": {
        "langs": ["vi", "id", "ms", "sq", "la"],
        "weight": 1.55,
        "min_chars": 2_000,
        "latin": True,
    },
    "WesternLatin": {
        "langs": ["ca", "gl", "oc"],
        "weight": 1.2,
        "min_chars": 1_500,
        "latin": True,
    },
    "CelticLatin": {
        "langs": ["br", "ga", "gd", "cy"],
        "weight": 1.3,
        "min_chars": 1_500,
        "latin": True,
    },
    # NOTE(review): "sk" (Slovak) is not an Adriatic-region language — confirm
    # it belongs here rather than in CentralEuropeanLatin.
    "AdriaticLatin": {
        "langs": ["bs", "hr", "sl", "sk"],
        "weight": 1.4,
        "min_chars": 1_500,
        "latin": True,
    },
    "BalticLatin": {
        "langs": ["et", "lv", "lt"],
        "weight": 1.2,
        "min_chars": 1_500,
        "latin": True,
    },
    # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
    "CentralEuropeanLatin": {
        "langs": ["cs", "ro", "hu"],
        "weight": 1.3,
        "min_chars": 2_000,
        "latin": True,
    },
    # ~0.81% of CC — was overweighted at 1.7
    "Korean": {
        "langs": ["ko"],
        "weight": 1.35,
        "min_chars": 1_200,
        "latin": False,
    },
    # ukr ~0.70%, bel ~0.017% of CC
    "EastSlavicCyrillic": {
        "langs": ["uk", "be"],
        "weight": 1.7,
        "min_chars": 2_000,
        "latin": False,
    },
    # ~0.65% of CC — upweighted relative to CC share given speaker population
    "Arabic": {
        "langs": ["ar"],
        "weight": 1.4,
        "min_chars": 2_000,
        "latin": False,
    },
    "Norwegian": {
        "langs": ["no"],
        "weight": 1.0,
        "min_chars": 2_000,
        "latin": True,
    },
    # sv ~0.7%, dan ~0.51%, fin ~0.37%, isl ~0.04%, afr ~0.01%
    # combined ~2.0% of CC — was drastically overweighted at 6.0
    # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
    "NordicCore": {
        "langs": ["sv", "da", "is", "af", "fi"],
        "weight": 2.1,
        "min_chars": 2_000,
        "latin": True,
    },
    # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
    "BalkanCyrillic": {
        "langs": ["bg", "sr", "mk"],
        "weight": 1.05,
        "min_chars": 2_000,
        "latin": False,
    },
    # fas ~0.20% of CC (ignore the one anomalous crawl spike)
    "ArabicOther": {
        "langs": ["fa", "ps", "sd", "ug"],
        "weight": 0.95,
        "min_chars": 2_000,
        "latin": False,
    },
    # ~0.22% of CC — genuine web underrepresentation relative to speaker count,
    # but corpus is thin; 1.0 avoids oversampling a small pool
    "Hindi": {
        "langs": ["hi"],
        "weight": 1.0,
        "min_chars": 2_000,
        "latin": False,
    },
    # combined ~0.27% of CC — upweighted for script diversity
    "IndicOther": {
        "langs": [
            "ur",
            "bn",
            "ta",
            "te",
            "mr",
            "gu",
            "kn",
            "ml",
            "pa",
            "as",
            "or",
            "ne",
        ],
        "weight": 0.95,
        "min_chars": 2_000,
        "latin": False,
    },
    # kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
    # NOTE(review): "Caucus" in the key is presumably a misspelling of
    # "Caucasus"; the key is left unchanged because consumers may look it up
    # by name.
    "CentralAsianCaucusCyrillic": {
        "langs": ["kk", "mn", "tt", "ky", "tg", "ba", "ce"],
        "weight": 1.1,
        "min_chars": 2_000,
        "latin": False,
    },
    # Kurdish is split by script/source:
    # - ku: Wikipedia / Latin-script Kurdish
    # - ckb: FineTranslations / Arabic-script Kurdish
    "KurdishLatin": {
        "langs": ["ku"],
        "weight": 0.45,
        "min_chars": 1_500,
        "latin": True,
    },
    "KurdishArabic": {
        "langs": ["ckb"],
        "weight": 0.45,
        "min_chars": 2_000,
        "latin": False,
    },
    # NOTE(review): "tl" (Tagalog) and "eu" (Basque) are not African
    # languages — confirm they are grouped here intentionally.
    "AfricanLatin": {
        "langs": ["sw", "tl", "eu", "yo", "zu", "ny"],
        "weight": 1.0,
        "min_chars": 1_500,
        "latin": True,
    },
    "PeripheralLatin": {
        "langs": ["eo", "jv", "lb", "mg", "mt", "om", "rm", "so", "su", "uz"],
        "weight": 1.0,
        "min_chars": 1_500,
        "latin": True,
    },
    # Split the remaining non-Latin scripts into two buckets to keep
    # Greco-Semitic/Caucasus-style scripts separate from Brahmic/Tibetan ones.
    "OtherScriptsWest": {
        "langs": ["el", "he", "hy", "ka", "am", "ti", "dv", "hbo", "grc"],
        "weight": 1.0,
        "min_chars": 2_000,
        "latin": False,
    },
    "OtherScriptsEast": {
        "langs": ["km", "lo", "my", "th", "si", "bo"],
        "weight": 1.0,
        "min_chars": 2_000,
        "latin": False,
    },
}