# add 24aa1c5
from __future__ import annotations
import json
import os
from pathlib import Path
from source_config import LANGUAGE_BUCKETS
# Per-group views derived from LANGUAGE_BUCKETS, each keyed by group name:
# the group's "langs" entry, its "weight" coerced to float, and its
# "min_chars" coerced to int.
LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
# Names of the groups whose bucket config carries a truthy "latin" entry.
LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}

# all_langs.json is expected to sit next to this module.
LANGS_JSON = Path(__file__).with_name("all_langs.json")

# Load the key-value pairs from the JSON file; its key order is canonical.
# The encoding is pinned to UTF-8 (matching the other readers/writers of this
# file) so the result does not depend on the platform's default encoding.
with LANGS_JSON.open(encoding="utf-8") as f:
    LANG_ISO2_TO_ISO3 = json.load(f)
# Language codes in the order they appear as keys in the JSON file.
ALL_LANGS = list(LANG_ISO2_TO_ISO3.keys())

# Reverse lookup from language to the group that lists it. If a language is
# listed under several groups, the group iterated last wins.
LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
    """Dump ALL_LANGS as a JSON array to *path*, but only when nothing exists there.

    An existing file (or any other existing filesystem entry) at *path* is
    left untouched and the call is a no-op.

    Args:
        path: Destination of the JSON file; defaults to ``LANGS_JSON``.
    """
    target = Path(path)
    if not target.exists():
        # UTF-8 with ensure_ascii=False keeps non-ASCII text readable on disk.
        with target.open("w", encoding="utf-8") as out:
            json.dump(ALL_LANGS, out, ensure_ascii=False, indent=2)
def load_all_langs(path: str | os.PathLike[str] = LANGS_JSON) -> list[str]:
    """Load the language list from JSON, falling back to the in-module constant.

    The file's content is returned only when it parses as a JSON array whose
    items are all strings. In every other case (the file is missing, is not
    valid JSON, cannot be decoded as UTF-8, or holds some other JSON value
    such as an object) a copy of ``ALL_LANGS`` is returned instead. When the
    file is missing it is first created via ``write_all_langs_json``.

    Args:
        path: Location of the JSON file; defaults to ``LANGS_JSON``.

    Returns:
        The list read from *path*, or a fresh copy of ``ALL_LANGS``.
    """
    path = Path(path)
    if path.exists():
        try:
            with path.open(encoding="utf-8") as f:
                langs = json.load(f)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError
            # subclasses; a corrupt file takes the same fallback path as a
            # file with the wrong shape instead of raising.
            langs = None
        if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
            return langs
    # No-op when the file already exists; otherwise seeds it from ALL_LANGS.
    write_all_langs_json(path)
    # Return a copy so callers cannot mutate the module-level list.
    return ALL_LANGS[:]
# English stop words, all lowercase. Contractions appear without their
# apostrophe ("dont", "theyre"); "no-one" keeps its hyphen. Entries are
# grouped by initial letter; the order inside each group is kept as-is and is
# not strictly alphabetical.
# fmt: off
ENGLISH_STOP_WORDS = [
    # a
    "able", "about", "above", "abroad", "according", "accordingly", "across",
    "actually", "after", "afterwards", "again", "against", "ago", "ahead", "aint",
    "all", "allow", "almost", "alone", "along", "alongside", "already", "also",
    "although", "always", "am", "amid", "amidst", "among", "amongst", "an", "and",
    "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway",
    "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "app",
    "are", "arent", "aren", "around", "as", "aside", "ask", "asking", "associated",
    "at", "available", "away", "awfully",
    # b
    "back", "backward", "be", "became", "because", "become", "becoming", "been",
    "before", "beforehand", "begin", "behind", "being", "believe", "below",
    "beside", "best", "better", "between", "beyond", "both", "brief", "but", "by",
    # c
    "came", "can", "cannot", "cant", "caption", "cause", "certain", "certainly",
    "changes", "clearly", "cmon", "com", "come", "concerning", "consequently",
    "consider", "considering", "contain", "containing", "corresponding", "could",
    "couldnt", "course", "currently",
    # d
    "definitely", "described", "despite", "did", "didnt", "different", "directly",
    "do", "does", "doesnt", "doing", "done", "dont", "down", "downward", "download",
    "during",
    # e
    "each", "eight", "eighty", "either", "else", "elsewhere", "end", "ending",
    "enough", "entirely", "especially", "etc", "even", "ever", "evermore", "every",
    "everybody", "everyone", "everything", "everywhere", "exactly", "example",
    "except",
    # f
    "fairly", "far", "farther", "few", "fewer", "fifth", "first", "five",
    "followed", "following", "follows", "for", "forever", "former", "formerly",
    "forth", "forward", "found", "four", "from", "free", "further", "furthermore",
    # g
    "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone",
    "got", "gotten", "greetings",
    # h
    "had", "hadnt", "half", "happens", "hardly", "has", "hasnt", "have", "havent",
    "having", "he", "hed", "hell", "hello", "help", "hence", "her", "here",
    "hereafter", "hereby", "herein", "hereupon", "herself", "hi", "him", "himself",
    "his", "hither", "hopefully", "how", "howbeit", "however", "hundred",
    # i
    "id", "ie", "if", "ignored", "ill", "im", "immediate", "in", "inasmuch", "inc",
    "indeed", "indicate", "indicated", "inner", "inside", "insofar", "instead",
    "into", "inward", "is", "isnt", "it", "itd", "itll", "itself", "ive",
    # j
    "just",
    # k
    "keep", "keeps", "kept", "know", "known",
    # l
    "last", "lately", "later", "latter", "least", "less", "lest", "let", "like",
    "liked", "likely", "likewise", "little", "look", "looking", "low", "lower",
    "ltd",
    # m
    "made", "mainly", "make", "many", "may", "maybe", "maynt", "me", "mean",
    "meantime", "meanwhile", "merely", "might", "mightnt", "mine", "minus", "miss",
    "more", "moreover", "most", "mostly", "much", "must", "mustnt", "my", "myself",
    # n
    "name", "namely", "near", "nearly", "necessary", "need", "neednt", "neither",
    "never", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no",
    "nobody", "non", "none", "nonetheless", "noone", "no-one", "nor", "normally",
    "not", "nothing", "notwithstanding", "novel", "now", "nowhere",
    # o
    "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once",
    "one", "only", "onto", "opposite", "or", "other", "otherwise", "ought",
    "oughtnt", "our", "ourselves", "out", "outside", "over", "overall", "own",
    # p
    "particular", "particularly", "past", "per", "perhaps", "placed", "please",
    "plus", "possible", "presumably", "probably", "provided", "provide",
    # q
    "quite",
    # r
    "rather", "really", "reasonably", "recent", "recently", "regarding",
    "regardless", "regards", "relatively", "respectively", "right", "round",
    # s
    "said", "same", "saw", "say", "saying", "second", "secondly", "see", "seeing",
    "seem", "seemed", "seeming", "seems", "seen", "self", "sensible", "sent",
    "serious", "seriously", "seven", "several", "shall", "shant", "she", "shed",
    "shell", "should", "shouldnt", "since", "six", "so", "some", "somebody",
    "someday", "somehow", "someone", "something", "sometime", "somewhat",
    "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still",
    "such", "sure",
    # t
    "take", "taken", "taking", "tell", "tends", "ten", "than", "thank", "that",
    "thatll", "thatve", "the", "their", "them", "themselves", "then", "thence",
    "there", "thereafter", "thereby", "thered", "therefore", "therein", "therell",
    "therere", "thereupon", "thereve", "these", "they", "theyd", "theyll",
    "theyre", "theyve", "thing", "think", "third", "thirty", "this", "thorough",
    "thoroughly", "those", "though", "three", "through", "throughout", "thru",
    "thus", "till", "to", "together", "too", "took", "toward", "tried", "tries",
    "truly", "try", "trying", "twice", "two",
    # u
    "under", "underneath", "undoing", "unfortunately", "unless", "unlike",
    "unlikely", "until", "unto", "up", "upon", "upwards", "use", "used", "useful",
    "using", "usually",
    # v
    "value", "various", "versus", "very", "via", "viz",
    # w
    "want", "was", "wasnt", "way", "we", "wed", "welcome", "well", "went", "were",
    "werent", "weve", "what", "whatever", "whatll", "whatve", "when", "whence",
    "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
    "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst",
    "whither", "who", "whod", "whoever", "whole", "wholl", "whom", "whomever",
    "whose", "why", "will", "willing", "wish", "with", "within", "without",
    "wonder", "wont", "would", "wouldnt", "website",
    # y
    "yes", "yet", "you", "youd", "youll", "your", "youre", "yourself",
    "yourselves", "youve",
    # z
    "zero",
]
# fmt: on