| from __future__ import annotations |
|
|
| import json |
| import os |
| from pathlib import Path |
|
|
| from source_config import LANGUAGE_BUCKETS |
|
|
# Per-group lookup tables derived from LANGUAGE_BUCKETS; every table is keyed
# by the bucket (group) name.

# Group name -> the "langs" entry of that bucket.
LANGUAGE_GROUPS = {
    bucket_name: bucket["langs"]
    for bucket_name, bucket in LANGUAGE_BUCKETS.items()
}

# Group name -> the bucket's "weight", coerced to float.
LANGUAGE_GROUP_WEIGHTS = {
    bucket_name: float(bucket["weight"])
    for bucket_name, bucket in LANGUAGE_BUCKETS.items()
}

# Group name -> the bucket's "min_chars", coerced to int.
LANGUAGE_GROUP_MIN_CHARS = {
    bucket_name: int(bucket["min_chars"])
    for bucket_name, bucket in LANGUAGE_BUCKETS.items()
}

# Names of the groups whose bucket carries a truthy "latin" flag; the flag is
# optional, so buckets without it are simply left out.
LATIN_GROUPS = {
    bucket_name
    for bucket_name, bucket in LANGUAGE_BUCKETS.items()
    if bucket.get("latin")
}
|
|
# JSON data file that lives next to this module.
LANGS_JSON = Path(__file__).with_name("all_langs.json")

# The file is read as a mapping keyed by language code (presumably
# ISO 639-1 -> ISO 639-3, going by the name -- confirm against the data file).
# JSON text is UTF-8, so decode it explicitly rather than relying on the
# platform's locale encoding. A missing file still aborts the import, exactly
# as before.
with open(LANGS_JSON, encoding="utf-8") as f:
    LANG_ISO2_TO_ISO3 = json.load(f)

# Every language code that appears as a key in the mapping, in file order.
ALL_LANGS = list(LANG_ISO2_TO_ISO3.keys())

# Reverse index: language -> name of the group that lists it. If a language is
# listed under several groups, the group iterated last wins.
LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
|
|
|
|
def write_all_langs_json(path: str | os.PathLike[str] = LANGS_JSON) -> None:
    """Dump ``ALL_LANGS`` to *path* as a JSON array unless the path already exists.

    Whatever is already at *path* is left untouched; the file is only created
    when nothing is there yet. The output is UTF-8, indented by two spaces,
    with non-ASCII characters written verbatim.
    """
    target = Path(path)
    if not target.exists():
        with target.open("w", encoding="utf-8") as handle:
            json.dump(ALL_LANGS, handle, ensure_ascii=False, indent=2)
|
|
|
|
def load_all_langs(path: str | os.PathLike[str] = LANGS_JSON) -> list[str]:
    """Load the language list from JSON, falling back to ``ALL_LANGS``.

    Args:
        path: JSON file expected to contain an array of strings.

    Returns:
        The parsed list when *path* holds a JSON array made up only of
        strings; otherwise a fresh copy of ``ALL_LANGS``.

    The fallback is taken when the file is missing, when it cannot be decoded
    or parsed (for example a zero-byte file left behind by an interrupted
    write), or when it parses to anything other than a list of strings. On
    fallback ``write_all_langs_json`` is invoked, which creates the file only
    if nothing exists at *path*. Other I/O errors (permissions, *path* being a
    directory, ...) propagate to the caller.
    """
    path = Path(path)
    try:
        # Open directly instead of checking exists() first: avoids the window
        # in which the file can disappear between the check and the open.
        with path.open(encoding="utf-8") as f:
            langs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        # Missing or unreadable content: use the fallback promised above
        # rather than failing on every subsequent call.
        langs = None
    if isinstance(langs, list) and all(isinstance(lang, str) for lang in langs):
        return langs
    write_all_langs_json(path)
    return ALL_LANGS[:]
|
|
# English stop words, lower-case, in their original order (mostly alphabetical,
# with a few later additions such as "app", "download", "free" and "website"
# slotted in near their letter group). Contractions appear to be stored with
# the apostrophe removed ("dont", "cant", "youre").
ENGLISH_STOP_WORDS = [
    # a
    "able", "about", "above", "abroad", "according", "accordingly", "across",
    "actually", "after", "afterwards", "again", "against", "ago", "ahead", "aint",
    "all", "allow", "almost", "alone", "along", "alongside", "already", "also",
    "although", "always", "am", "amid", "amidst", "among", "amongst", "an", "and",
    "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway",
    "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "app",
    "are", "arent", "aren", "around", "as", "aside", "ask", "asking", "associated",
    "at", "available", "away", "awfully",
    # b
    "back", "backward", "be", "became", "because", "become", "becoming", "been",
    "before", "beforehand", "begin", "behind", "being", "believe", "below",
    "beside", "best", "better", "between", "beyond", "both", "brief", "but", "by",
    # c
    "came", "can", "cannot", "cant", "caption", "cause", "certain", "certainly",
    "changes", "clearly", "cmon", "com", "come", "concerning", "consequently",
    "consider", "considering", "contain", "containing", "corresponding", "could",
    "couldnt", "course", "currently",
    # d
    "definitely", "described", "despite", "did", "didnt", "different", "directly",
    "do", "does", "doesnt", "doing", "done", "dont", "down", "downward",
    "download", "during",
    # e
    "each", "eight", "eighty", "either", "else", "elsewhere", "end", "ending",
    "enough", "entirely", "especially", "etc", "even", "ever", "evermore", "every",
    "everybody", "everyone", "everything", "everywhere", "exactly", "example",
    "except",
    # f
    "fairly", "far", "farther", "few", "fewer", "fifth", "first", "five",
    "followed", "following", "follows", "for", "forever", "former", "formerly",
    "forth", "forward", "found", "four", "from", "free", "further", "furthermore",
    # g
    "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone",
    "got", "gotten", "greetings",
    # h
    "had", "hadnt", "half", "happens", "hardly", "has", "hasnt", "have", "havent",
    "having", "he", "hed", "hell", "hello", "help", "hence", "her", "here",
    "hereafter", "hereby", "herein", "hereupon", "herself", "hi", "him", "himself",
    "his", "hither", "hopefully", "how", "howbeit", "however", "hundred",
    # i
    "id", "ie", "if", "ignored", "ill", "im", "immediate", "in", "inasmuch", "inc",
    "indeed", "indicate", "indicated", "inner", "inside", "insofar", "instead",
    "into", "inward", "is", "isnt", "it", "itd", "itll", "itself", "ive",
    # j
    "just",
    # k
    "keep", "keeps", "kept", "know", "known",
    # l
    "last", "lately", "later", "latter", "least", "less", "lest", "let", "like",
    "liked", "likely", "likewise", "little", "look", "looking", "low", "lower",
    "ltd",
    # m
    "made", "mainly", "make", "many", "may", "maybe", "maynt", "me", "mean",
    "meantime", "meanwhile", "merely", "might", "mightnt", "mine", "minus", "miss",
    "more", "moreover", "most", "mostly", "much", "must", "mustnt", "my", "myself",
    # n
    "name", "namely", "near", "nearly", "necessary", "need", "neednt", "neither",
    "never", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no",
    "nobody", "non", "none", "nonetheless", "noone", "no-one", "nor", "normally",
    "not", "nothing", "notwithstanding", "novel", "now", "nowhere",
    # o
    "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once",
    "one", "only", "onto", "opposite", "or", "other", "otherwise", "ought",
    "oughtnt", "our", "ourselves", "out", "outside", "over", "overall", "own",
    # p
    "particular", "particularly", "past", "per", "perhaps", "placed", "please",
    "plus", "possible", "presumably", "probably", "provided", "provide",
    # q
    "quite",
    # r
    "rather", "really", "reasonably", "recent", "recently", "regarding",
    "regardless", "regards", "relatively", "respectively", "right", "round",
    # s
    "said", "same", "saw", "say", "saying", "second", "secondly", "see", "seeing",
    "seem", "seemed", "seeming", "seems", "seen", "self", "sensible", "sent",
    "serious", "seriously", "seven", "several", "shall", "shant", "she", "shed",
    "shell", "should", "shouldnt", "since", "six", "so", "some", "somebody",
    "someday", "somehow", "someone", "something", "sometime", "somewhat",
    "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still",
    "such", "sure",
    # t
    "take", "taken", "taking", "tell", "tends", "ten", "than", "thank", "that",
    "thatll", "thatve", "the", "their", "them", "themselves", "then", "thence",
    "there", "thereafter", "thereby", "thered", "therefore", "therein", "therell",
    "therere", "thereupon", "thereve", "these", "they", "theyd", "theyll",
    "theyre", "theyve", "thing", "think", "third", "thirty", "this", "thorough",
    "thoroughly", "those", "though", "three", "through", "throughout", "thru",
    "thus", "till", "to", "together", "too", "took", "toward", "tried", "tries",
    "truly", "try", "trying", "twice", "two",
    # u
    "under", "underneath", "undoing", "unfortunately", "unless", "unlike",
    "unlikely", "until", "unto", "up", "upon", "upwards", "use", "used", "useful",
    "using", "usually",
    # v
    "value", "various", "versus", "very", "via", "viz",
    # w
    "want", "was", "wasnt", "way", "we", "wed", "welcome", "well", "went", "were",
    "werent", "weve", "what", "whatever", "whatll", "whatve", "when", "whence",
    "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
    "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst",
    "whither", "who", "whod", "whoever", "whole", "wholl", "whom", "whomever",
    "whose", "why", "will", "willing", "wish", "with", "within", "without",
    "wonder", "wont", "would", "wouldnt", "website",
    # y
    "yes", "yet", "you", "youd", "youll", "your", "youre", "yourself",
    "yourselves", "youve",
    # z
    "zero",
]
|
|