Spaces:

DerivedFunction
/

language-extractor-demo

Running

language-extractor-demo / source_config.py

add

27e2dfc 1 day ago

6.67 kB

	from __future__ import annotations

	# Language buckets, keyed by bucket name, in declaration order.
	#
	# Each row of the table below is
	#     (bucket name, two-letter language codes, weight, min_chars, latin)
	# and is expanded into a dict with the keys "langs", "weight", "min_chars"
	# and "latin". Every bucket gets its own fresh list of codes.
	#
	# NOTE(review): the consumers are not visible here. Judging by the notes
	# below, "weight" is a relative sampling weight and the "% of CC" figures are
	# corpus-share estimates (CC presumably = Common Crawl) — confirm.
	# "min_chars" is presumably a minimum text length and "latin" presumably
	# flags Latin-script buckets — confirm against the code that reads them.
	LANGUAGE_BUCKETS = {
	    bucket: {
	        "langs": codes,
	        "weight": weight,
	        "min_chars": min_chars,
	        "latin": latin,
	    }
	    for bucket, codes, weight, min_chars, latin in (
	        # ~41% of CC — intentionally capped to avoid crowding out other languages
	        ("English", ["en"], 2.5, 2_000, True),
	        # ~6.3% of CC — was badly underweighted relative to German/French
	        ("Russian", ["ru"], 1.8, 2_000, False),
	        # ~5.9% of CC
	        ("German", ["de"], 1.8, 2_000, True),
	        # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
	        ("Japanese", ["ja"], 1.8, 1_200, False),
	        # ~5.0% of CC — CC likely undercounts due to Great Firewall
	        ("Chinese", ["zh"], 1.8, 1_200, False),
	        # ~4.6% of CC
	        ("French", ["fr"], 1.8, 2_000, True),
	        # ~4.6% of CC
	        ("Spanish", ["es"], 1.8, 2_000, True),
	        # ~2.5% of CC
	        ("Portuguese", ["pt"], 1.6, 2_000, True),
	        # ~2.4% of CC
	        ("Italian", ["it"], 1.5, 2_000, True),
	        # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
	        ("Polish", ["pl"], 1.5, 2_000, True),
	        # ~1.8% of CC — was significantly underweighted at 1.15
	        ("Dutch", ["nl"], 1.5, 2_000, True),
	        # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
	        ("Turkish", ["tr"], 1.4, 2_000, True),
	        # ind ~1.1%, vie ~1.05% of CC
	        # NOTE(review): the bucket name is narrower than its contents —
	        # sq (Albanian) and la (Latin) are not Southeast Asian languages.
	        ("SoutheastAsianLatin", ["vi", "id", "ms", "sq", "la"], 1.4, 2_000, True),
	        # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
	        ("CentralEuropeanLatin", ["cs", "ro", "hu"], 1.2, 2_000, True),
	        # ~0.81% of CC — was overweighted at 1.7
	        ("Korean", ["ko"], 1.3, 1_200, False),
	        # ukr ~0.70%, bel ~0.017% of CC
	        ("EastSlavicCyrillic", ["uk", "be"], 1.15, 2_000, False),
	        # ~0.65% of CC — upweighted relative to CC share given speaker population
	        ("Arabic", ["ar"], 1.35, 2_000, False),
	        # sv ~0.7%, dan ~0.51%, nor+nno ~0.33%, fin ~0.37%, isl ~0.04%, afr ~0.01%
	        # combined ~2.0% of CC — was drastically overweighted at 6.0
	        # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
	        # NOTE(review): af (Afrikaans) is grouped here although it is not a Nordic language.
	        ("NordicCore", ["sv", "da", "no", "is", "af", "fi"], 1.8, 2_000, True),
	        # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
	        ("BalkanCyrillic", ["bg", "sr", "mk"], 1.0, 2_000, False),
	        # fas ~0.20% of CC (ignore the one anomalous crawl spike)
	        ("ArabicOther", ["fa", "ps", "sd", "ug"], 0.9, 2_000, False),
	        # ~0.22% of CC — genuine web underrepresentation relative to speaker count,
	        # but corpus is thin; 1.0 avoids oversampling a small pool
	        ("Hindi", ["hi"], 1.0, 2_000, False),
	        # combined ~0.27% of CC — upweighted for script diversity
	        (
	            "IndicOther",
	            ["ur", "bn", "ta", "te", "mr", "gu", "kn", "ml", "pa", "as", "or"],
	            0.9,
	            2_000,
	            False,
	        ),
	        # kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
	        ("CentralAsianCyrillic", ["kk", "mn"], 0.9, 2_000, False),
	        # NOTE(review): the bucket name is narrower than its contents —
	        # tl (Tagalog) and eu (Basque) are not African languages.
	        # This is the only bucket with min_chars of 1_500.
	        ("AfricanLatin", ["sw", "tl", "eu"], 0.8, 1_500, True),
	        # el ~0.55%, he ~0.24%, th ~0.38%, hy ~0.033%, ka ~0.044% etc. — combined ~1%+
	        # nudged up slightly from 0.8 given Greek and Thai have meaningful CC presence
	        (
	            "OtherScripts",
	            ["el", "he", "hy", "ka", "am", "km", "lo", "my", "th", "si", "bo", "ti", "dv"],
	            0.9,
	            2_000,
	            False,
	        ),
	    )
	}

	# Per-source pool settings, keyed by source name ("wiki", "smol", "ft").
	# Each row is (source, reserve, min, max) and becomes
	# {"reserve": ..., "min": ..., "max": ...}.
	# NOTE(review): "reserve" looks like the fraction of a source set aside for
	# the reserve pool, and "min"/"max" look like count bounds — the consumer is
	# not visible here, so confirm.
	POOL = {
	    source: {"reserve": reserve, "min": lower, "max": upper}
	    for source, reserve, lower, upper in (
	        ("wiki", 0.60, 4, 120_000),
	        ("smol", 0.95, 1, 1_000),
	        ("ft", 0.60, 1, 30_000),
	    )
	}

	# Document-type mix: three document kinds whose "fraction" values
	# (0.60 + 0.30 + 0.10) add up to 1.0.
	# "pure" draws from the "reserve" pool; the other two draw from "main".
	# "pure" and "homogeneous" are sized in sentences, while "mixed" is sized in
	# segments and carries extra knobs (swap_prob, o_inject_prob,
	# allow_repeated_langs).
	# NOTE(review): the exact meaning of each knob is defined by the generator
	# that reads this table, which is not visible here.
	DOC_MIX = dict(
	    pure=dict(
	        fraction=0.60,
	        pool="reserve",
	        min_sentences=1,
	        max_sentences=4,
	        strip_punct_prob=0.10,
	    ),
	    homogeneous=dict(
	        fraction=0.30,
	        pool="main",
	        min_sentences=2,
	        max_sentences=6,
	        strip_punct_prob=0.15,
	    ),
	    mixed=dict(
	        fraction=0.10,
	        pool="main",
	        min_segments=2,
	        max_segments=4,
	        strip_punct_prob=0.25,
	        swap_prob=0.06,
	        o_inject_prob=0.06,
	        allow_repeated_langs=True,
	    ),
	)

	# Switches for the "smol" source: "use" turns it on, "rebuild" is off.
	# NOTE(review): "rebuild" presumably forces regeneration of cached data — confirm.
	SMOL = dict(use=True, rebuild=False)

	# Settings for the "ft" source.
	# "langs" is a set of two-letter language codes; the remaining keys are
	# on/off switches and integer limits.
	# NOTE(review): the meaning of the limits (max_lang, overflow_lang, max_row,
	# miss) is defined by the loader that reads this dict, not visible here.
	FT = dict(
	    use=True,
	    rebuild=False,
	    max_lang=50_000,
	    overflow_lang=75_000,
	    max_row=50_000,
	    miss=1_000,
	    include_en=True,
	    langs={"en", "es", "fr", "pt", "it", "nl", "de", "sv", "da", "id", "ms"},
	)
	# "every" is derived from the language set, so it follows any edit to "langs".
	FT.update(every=len(FT["langs"]))

	# Run-level settings: sizes, cache switches and counters for a run.
	# NOTE(review): apart from "target", the meaning of each key is defined by
	# the code that reads this dict, which is not visible here.
	RUN = dict(
	    len=512,
	    target=2_500_000,  # synthetic mixed-language training examples to generate
	    syn_cache=True,
	    syn_rebuild=False,
	    tok_cache=True,
	    tok_rebuild=False,
	    tok_skip_check=False,
	    retry=8,
	    preview=2_000,
	)