Spaces:

DerivedFunction
/

language-extractor-demo

Running

App Files Files Community

= committed 2 days ago

Commit

27e2dfc

1 Parent(s): 7601bd6

add

Browse files

Files changed (2) hide show

all_langs.json +69 -0
source_config.py +256 -0

all_langs.json ADDED Viewed

	@@ -0,0 +1,69 @@

+{
+  "en": "eng",
+  "es": "spa",
+  "fr": "fra",
+  "de": "deu",
+  "it": "ita",
+  "pt": "por",
+  "nl": "nld",
+  "vi": "vie",
+  "tr": "tur",
+  "la": "lat",
+  "id": "ind",
+  "ms": "msa",
+  "af": "afr",
+  "sq": "sqi",
+  "is": "isl",
+  "no": "nor",
+  "sv": "swe",
+  "da": "dan",
+  "fi": "fin",
+  "hu": "hun",
+  "pl": "pol",
+  "cs": "ces",
+  "ro": "ron",
+  "ru": "rus",
+  "bg": "bul",
+  "uk": "ukr",
+  "sr": "srp",
+  "be": "bel",
+  "kk": "kaz",
+  "mk": "mkd",
+  "mn": "mon",
+  "zh": "zho",
+  "ja": "jpn",
+  "ko": "kor",
+  "hi": "hin",
+  "ur": "urd",
+  "bn": "ben",
+  "ta": "tam",
+  "te": "tel",
+  "mr": "mar",
+  "gu": "guj",
+  "kn": "kan",
+  "ml": "mal",
+  "pa": "pan",
+  "as": "asm",
+  "or": "ori",
+  "ar": "ara",
+  "fa": "fas",
+  "ps": "pus",
+  "sd": "snd",
+  "ug": "uig",
+  "el": "ell",
+  "he": "heb",
+  "hy": "hye",
+  "ka": "kat",
+  "am": "amh",
+  "km": "khm",
+  "lo": "lao",
+  "my": "mya",
+  "th": "tha",
+  "si": "sin",
+  "bo": "bod",
+  "dv": "div",
+  "ti": "tir",
+  "sw": "swa",
+  "eu": "eus",
+  "tl": "tgl"
+}

source_config.py ADDED Viewed

	@@ -0,0 +1,256 @@

+from __future__ import annotations
+LANGUAGE_BUCKETS = {
+    # ~41% of CC — intentionally capped to avoid crowding out other languages
+    "English": {
+        "langs": ["en"],
+        "weight": 2.5,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ~6.3% of CC — was badly underweighted relative to German/French
+    "Russian": {
+        "langs": ["ru"],
+        "weight": 1.8,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    # ~5.9% of CC
+    "German": {
+        "langs": ["de"],
+        "weight": 1.8,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
+    "Japanese": {
+        "langs": ["ja"],
+        "weight": 1.8,
+        "min_chars": 1_200,
+        "latin": False,
+    },
+    # ~5.0% of CC — CC likely undercounts due to Great Firewall
+    "Chinese": {
+        "langs": ["zh"],
+        "weight": 1.8,
+        "min_chars": 1_200,
+        "latin": False,
+    },
+    # ~4.6% of CC
+    "French": {
+        "langs": ["fr"],
+        "weight": 1.8,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ~4.6% of CC
+    "Spanish": {
+        "langs": ["es"],
+        "weight": 1.8,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ~2.5% of CC
+    "Portuguese": {
+        "langs": ["pt"],
+        "weight": 1.6,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ~2.4% of CC
+    "Italian": {
+        "langs": ["it"],
+        "weight": 1.5,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
+    "Polish": {
+        "langs": ["pl"],
+        "weight": 1.5,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ~1.8% of CC — was significantly underweighted at 1.15
+    "Dutch": {
+        "langs": ["nl"],
+        "weight": 1.5,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
+    "Turkish": {
+        "langs": ["tr"],
+        "weight": 1.4,
+        "min_chars": 2_000,
+        "latin": True,
+    },
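+    # ind ~1.1%, vie ~1.05% of CC; bucket also holds ms, plus sq (Albanian) and la (Latin), which are Latin-script but not Southeast Asian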
+    "SoutheastAsianLatin": {
+        "langs": ["vi", "id", "ms", "sq", "la"],
+        "weight": 1.4,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
+    "CentralEuropeanLatin": {
+        "langs": ["cs", "ro", "hu"],
+        "weight": 1.2,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # ~0.81% of CC — was overweighted at 1.7
+    "Korean": {
+        "langs": ["ko"],
+        "weight": 1.3,
+        "min_chars": 1_200,
+        "latin": False,
+    },
+    # ukr ~0.70%, bel ~0.017% of CC
+    "EastSlavicCyrillic": {
+        "langs": ["uk", "be"],
+        "weight": 1.15,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    # ~0.65% of CC — upweighted relative to CC share given speaker population
+    "Arabic": {
+        "langs": ["ar"],
+        "weight": 1.35,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    # swe ~0.7%, dan ~0.51%, nor+nno ~0.33%, fin ~0.37%, isl ~0.04%, afr ~0.01% (afr is grouped here although it is not Nordic)
+    # combined ~2.0% of CC — was drastically overweighted at 6.0
+    # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
+    "NordicCore": {
+        "langs": ["sv", "da", "no", "is", "af", "fi"],
+        "weight": 1.8,
+        "min_chars": 2_000,
+        "latin": True,
+    },
+    # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
+    "BalkanCyrillic": {
+        "langs": ["bg", "sr", "mk"],
+        "weight": 1.0,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    # fas ~0.20% of CC (ignore the one anomalous crawl spike)
+    "ArabicOther": {
+        "langs": ["fa", "ps", "sd", "ug"],
+        "weight": 0.9,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    # ~0.22% of CC — genuine web underrepresentation relative to speaker count,
+    # but corpus is thin; 1.0 avoids oversampling a small pool
+    "Hindi": {
+        "langs": ["hi"],
+        "weight": 1.0,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    # combined ~0.27% of CC — upweighted for script diversity
+    "IndicOther": {
+        "langs": ["ur", "bn", "ta", "te", "mr", "gu", "kn", "ml", "pa", "as", "or"],
+        "weight": 0.9,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    # kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
+    "CentralAsianCyrillic": {
+        "langs": ["kk", "mn"],
+        "weight": 0.9,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+    "AfricanLatin": {
+        "langs": ["sw", "tl", "eu"],
+        "weight": 0.8,
+        "min_chars": 1_500,
+        "latin": True,
+    },
+    # el ~0.55%, he ~0.24%, th ~0.38%, hy ~0.033%, ka ~0.044% etc. — combined ~1%+
+    # nudged up slightly from 0.8 given Greek and Thai have meaningful CC presence
+    "OtherScripts": {
+        "langs": ["el", "he", "hy", "ka", "am", "km", "lo", "my", "th", "si", "bo", "ti", "dv"],
+        "weight": 0.9,
+        "min_chars": 2_000,
+        "latin": False,
+    },
+}
+POOL = {
+    "wiki": {
+        "reserve": 0.60,
+        "min": 4,
+        "max": 120_000,
+    },
+    "smol": {
+        "reserve": 0.95,
+        "min": 1,
+        "max": 1_000,
+    },
+    "ft": {
+        "reserve": 0.60,
+        "min": 1,
+        "max": 30_000,
+    },
+}
+DOC_MIX = {
+    "pure": {
+        "fraction": 0.60,
+        "pool": "reserve",
+        "min_sentences": 1,
+        "max_sentences": 4,
+        "strip_punct_prob": 0.10,
+    },
+    "homogeneous": {
+        "fraction": 0.30,
+        "pool": "main",
+        "min_sentences": 2,
+        "max_sentences": 6,
+        "strip_punct_prob": 0.15,
+    },
+    "mixed": {
+        "fraction": 0.10,
+        "pool": "main",
+        "min_segments": 2,
+        "max_segments": 4,
+        "strip_punct_prob": 0.25,
+        "swap_prob": 0.06,
+        "o_inject_prob": 0.06,
+        "allow_repeated_langs": True,
+    },
+}
+SMOL = {
+    "use": True,
+    "rebuild": False,
+}
+FT = {
+    "use": True,
+    "rebuild": False,
+    "max_lang": 50_000,
+    "overflow_lang": 75_000,
+    "max_row": 50_000,
+    "miss": 1_000,
+    "include_en": True,
+    "langs": {"en", "es", "fr", "pt", "it", "nl", "de", "sv", "da", "id", "ms"},
+}
+FT["every"] = len(FT["langs"])
+RUN = {
+    "len": 512,
+    "target": 2_500_000,  # synthetic mixed-language training examples to generate
+    "syn_cache": True,
+    "syn_rebuild": False,
+    "tok_cache": True,
+    "tok_rebuild": False,
+    "tok_skip_check": False,
+    "retry": 8,
+    "preview": 2_000,
+}