DerivedFunction1 committed on
Commit
84e2dc1
·
1 Parent(s): 89f8b1b
Files changed (3) hide show
  1. README.md +9 -1
  2. app.py +68 -14
  3. sib200_cache.py +215 -0
README.md CHANGED
@@ -14,7 +14,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
14
 
15
  ## Offline caches
16
 
17
- The demo now uses local parquet caches for both FLEURS and Tatoeba.
18
 
19
  Build the FLEURS cache once with:
20
 
@@ -32,3 +32,11 @@ Build the Tatoeba cache once with:
32
  ```
33
 
34
  That converts `sentences.csv` into `data/tatoeba/tatoeba_text.parquet` and keeps only the lean inference columns.
 
 
 
 
 
 
 
 
 
14
 
15
  ## Offline caches
16
 
17
+ The demo now uses local parquet caches for FLEURS, Tatoeba, and SIB-200.
18
 
19
  Build the FLEURS cache once with:
20
 
 
32
  ```
33
 
34
  That converts `sentences.csv` into `data/tatoeba/tatoeba_text.parquet` and keeps only the lean inference columns.
35
+
36
+ Build the SIB-200 cache once with:
37
+
38
+ ```bash
39
+ ./.venv/bin/python sib200_cache.py
40
+ ```
41
+
42
+ That downloads the `Davlan/sib200` configs, keeps the text plus language/topic metadata, and writes a reusable lean parquet file at `data/sib200/sib200_text.parquet`.
app.py CHANGED
@@ -20,6 +20,7 @@ from transformers import AutoModelForTokenClassification, AutoTokenizer, pipelin
20
 
21
  from fleurs_cache import fetch_random_fleurs_sentence, fetch_random_fleurs_sentence_mix
22
  from language import ALL_LANGS, LANG_ALIASES, LANG_ISO2_TO_ISO3, canonical_lang, canonical_lang_family
 
23
  from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
24
 
25
 
@@ -34,10 +35,12 @@ ARTIFACT_SPAN_WEIGHT = 0.35
34
  RANDOM_SENTENCE_SAMPLERS = (
35
  fetch_random_fleurs_sentence,
36
  fetch_random_tatoeba_sentence,
 
37
  )
38
  RANDOM_MIX_SAMPLERS = (
39
  fetch_random_fleurs_sentence_mix,
40
  fetch_random_tatoeba_sentence_mix,
 
41
  )
42
 
43
 
@@ -367,6 +370,35 @@ def render_tatoeba_validation_html(validation: dict[str, Any]) -> str:
367
  return render_validation_html(validation, source_label="Tatoeba")
368
 
369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  def _language_name(lang_code: str) -> str:
371
  """Best-effort human readable language name for a code."""
372
  code = (lang_code or "").strip()
@@ -556,12 +588,32 @@ def fasttext_alias_hint_for_lang(fasttext_result: dict[str, Any] | None, lang: s
556
 
557
  def fetch_random_cached_sentence() -> dict[str, Any]:
558
  """Randomly sample a sentence from either cached source."""
559
- return random.choice(RANDOM_SENTENCE_SAMPLERS)()
 
 
 
 
 
 
 
 
 
 
560
 
561
 
562
  def fetch_random_cached_sentence_mix() -> dict[str, Any]:
563
  """Randomly sample a mixed-language example from either cached source."""
564
- return random.choice(RANDOM_MIX_SAMPLERS)()
 
 
 
 
 
 
 
 
 
 
565
 
566
 
567
  def render_prediction_summary(
@@ -864,6 +916,7 @@ def load_random_tatoeba_example(fasttext_mode: str = "full") -> tuple[str, str,
864
  text = sentence["text"]
865
  summary, spans, raw, ui_state, _, *chip_updates = predict(text, fasttext_mode=fasttext_mode)
866
  sentence_rows = sentence.get("sentences") or [sentence]
 
867
  sentence_langs = [item.get("lang_iso2", "") for item in sentence_rows]
868
  sentence_lang_iso3s = [item.get("lang_iso3", "") for item in sentence_rows]
869
  validation = build_example_validation(
@@ -874,8 +927,8 @@ def load_random_tatoeba_example(fasttext_mode: str = "full") -> tuple[str, str,
874
  raw = {
875
  **raw,
876
  "source": "tatoeba",
877
- "sentence_id": sentence.get("sentence_id", sentence.get("id")),
878
- "sentence_ids": [item.get("sentence_id", item.get("id")) for item in sentence_rows],
879
  "lang_count": sentence.get("lang_count", len(sentence_rows)),
880
  "sentence_langs": sentence_langs,
881
  "sentence_lang_iso3s": sentence_lang_iso3s,
@@ -883,9 +936,9 @@ def load_random_tatoeba_example(fasttext_mode: str = "full") -> tuple[str, str,
883
  "sentence_lang": sentence.get("source_lang", sentence.get("lang")),
884
  "sentence_lang_iso2": sentence.get("lang_iso2", sentence.get("source_lang")),
885
  "sentence_lang_iso3": sentence.get("lang_iso3", ""),
886
- "tatoeba_validation": validation,
887
  }
888
- validation_html = render_validation_html(validation, source_label="Tatoeba")
889
  summary = render_prediction_summary(
890
  text=text,
891
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
@@ -915,9 +968,9 @@ def load_random_tatoeba_mix_example(fasttext_mode: str = "full") -> tuple[str, s
915
  "sentence_langs": mix["langs"],
916
  "sentence_lang_iso3s": mix["lang_iso3s"],
917
  "sentences": mix["sentences"],
918
- "tatoeba_validation": validation,
919
  }
920
- validation_html = render_validation_html(validation, source_label="Tatoeba")
921
  summary = render_prediction_summary(
922
  text=text,
923
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
@@ -945,6 +998,7 @@ def load_random_fleurs_example(fasttext_mode: str = "full") -> tuple[str, str, p
945
  text = sentence["text"]
946
  summary, spans, raw, ui_state, _, *chip_updates = predict(text, fasttext_mode=fasttext_mode)
947
  sentence_rows = sentence.get("sentences") or [sentence]
 
948
  sentence_langs = [item.get("lang_iso2", "") for item in sentence_rows]
949
  sentence_lang_iso3s = [item.get("lang_iso3", "") for item in sentence_rows]
950
  validation = build_example_validation(
@@ -955,8 +1009,8 @@ def load_random_fleurs_example(fasttext_mode: str = "full") -> tuple[str, str, p
955
  raw = {
956
  **raw,
957
  "source": sentence.get("source", "fleurs"),
958
- "cached_sentence_id": sentence.get("fleurs_id", sentence.get("sentence_id")),
959
- "cached_sentence_ids": [item.get("fleurs_id", item.get("sentence_id")) for item in sentence_rows],
960
  "lang_count": sentence.get("lang_count", len(sentence_rows)),
961
  "cached_split": sentence.get("split"),
962
  "cached_source_lang": sentence.get("source_lang"),
@@ -965,9 +1019,9 @@ def load_random_fleurs_example(fasttext_mode: str = "full") -> tuple[str, str, p
965
  "sentence_langs": sentence_langs,
966
  "sentence_lang_iso3s": sentence_lang_iso3s,
967
  "sentences": sentence_rows,
968
- "fleurs_validation": validation if sentence.get("source") == "fleurs" else {},
969
  }
970
- source_label = "FLEURS" if sentence.get("source") == "fleurs" else "Tatoeba"
971
  validation_html = render_validation_html(validation, source_label=source_label)
972
  summary = render_prediction_summary(
973
  text=text,
@@ -1007,9 +1061,9 @@ def load_random_fleurs_mix_example(fasttext_mode: str = "full") -> tuple[str, st
1007
  "sentence_langs": mix["langs"],
1008
  "sentence_lang_iso3s": mix["lang_iso3s"],
1009
  "sentences": mix["sentences"],
1010
- "fleurs_validation": validation if mix.get("source") == "fleurs-mix" else {},
1011
  }
1012
- source_label = "FLEURS" if mix.get("source") == "fleurs-mix" else "Tatoeba"
1013
  validation_html = render_validation_html(validation, source_label=source_label)
1014
  summary = render_prediction_summary(
1015
  text=text,
 
20
 
21
  from fleurs_cache import fetch_random_fleurs_sentence, fetch_random_fleurs_sentence_mix
22
  from language import ALL_LANGS, LANG_ALIASES, LANG_ISO2_TO_ISO3, canonical_lang, canonical_lang_family
23
+ from sib200_cache import fetch_random_sib200_sentence, fetch_random_sib200_sentence_mix
24
  from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
25
 
26
 
 
35
  RANDOM_SENTENCE_SAMPLERS = (
36
  fetch_random_fleurs_sentence,
37
  fetch_random_tatoeba_sentence,
38
+ fetch_random_sib200_sentence,
39
  )
40
  RANDOM_MIX_SAMPLERS = (
41
  fetch_random_fleurs_sentence_mix,
42
  fetch_random_tatoeba_sentence_mix,
43
+ fetch_random_sib200_sentence_mix,
44
  )
45
 
46
 
 
370
  return render_validation_html(validation, source_label="Tatoeba")
371
 
372
 
373
+ def _source_key(source: str) -> str:
374
+ return (source or "").strip().split("-", 1)[0].lower()
375
+
376
+
377
def _source_label(source: str) -> str:
    """Return the display label for a source identifier.

    Known source families map to their branded spelling; any other non-empty
    family is upper-cased, and an empty family falls back to ``"Example"``.
    """
    known_labels = {
        "fleurs": "FLEURS",
        "tatoeba": "Tatoeba",
        "sib200": "SIB-200",
    }
    family = _source_key(source)
    if family in known_labels:
        return known_labels[family]
    return family.upper() if family else "Example"
386
+
387
+
388
def _validation_key(source: str) -> str:
    """Return the raw-output dict key that holds a source's validation payload.

    The key is ``"<family>_validation"``, with ``"example"`` standing in when
    the source has no recognisable family.
    """
    family = _source_key(source)
    if not family:
        family = "example"
    return family + "_validation"
391
+
392
+
393
+ def _sentence_id_keys(sentence: dict[str, Any]) -> list[str]:
394
+ keys = []
395
+ for candidate in ("fleurs_id", "sentence_id", "sib200_id", "id"):
396
+ value = sentence.get(candidate)
397
+ if value is not None:
398
+ keys.append(value)
399
+ return keys
400
+
401
+
402
  def _language_name(lang_code: str) -> str:
403
  """Best-effort human readable language name for a code."""
404
  code = (lang_code or "").strip()
 
588
 
589
  def fetch_random_cached_sentence() -> dict[str, Any]:
590
  """Randomly sample a sentence from either cached source."""
591
+ samplers = list(RANDOM_SENTENCE_SAMPLERS)
592
+ random.shuffle(samplers)
593
+ last_error: FileNotFoundError | None = None
594
+ for sampler in samplers:
595
+ try:
596
+ return sampler()
597
+ except FileNotFoundError as exc:
598
+ last_error = exc
599
+ if last_error is not None:
600
+ raise last_error
601
+ raise RuntimeError("No cached sentence samplers are registered.")
602
 
603
 
604
  def fetch_random_cached_sentence_mix() -> dict[str, Any]:
605
  """Randomly sample a mixed-language example from either cached source."""
606
+ samplers = list(RANDOM_MIX_SAMPLERS)
607
+ random.shuffle(samplers)
608
+ last_error: FileNotFoundError | None = None
609
+ for sampler in samplers:
610
+ try:
611
+ return sampler()
612
+ except FileNotFoundError as exc:
613
+ last_error = exc
614
+ if last_error is not None:
615
+ raise last_error
616
+ raise RuntimeError("No cached mix samplers are registered.")
617
 
618
 
619
  def render_prediction_summary(
 
916
  text = sentence["text"]
917
  summary, spans, raw, ui_state, _, *chip_updates = predict(text, fasttext_mode=fasttext_mode)
918
  sentence_rows = sentence.get("sentences") or [sentence]
919
+ sentence_ids = _sentence_id_keys(sentence)
920
  sentence_langs = [item.get("lang_iso2", "") for item in sentence_rows]
921
  sentence_lang_iso3s = [item.get("lang_iso3", "") for item in sentence_rows]
922
  validation = build_example_validation(
 
927
  raw = {
928
  **raw,
929
  "source": "tatoeba",
930
+ "sentence_id": sentence_ids[0] if sentence_ids else sentence.get("sentence_id", sentence.get("id")),
931
+ "sentence_ids": sentence_ids,
932
  "lang_count": sentence.get("lang_count", len(sentence_rows)),
933
  "sentence_langs": sentence_langs,
934
  "sentence_lang_iso3s": sentence_lang_iso3s,
 
936
  "sentence_lang": sentence.get("source_lang", sentence.get("lang")),
937
  "sentence_lang_iso2": sentence.get("lang_iso2", sentence.get("source_lang")),
938
  "sentence_lang_iso3": sentence.get("lang_iso3", ""),
939
+ _validation_key(sentence.get("source", "tatoeba")): validation,
940
  }
941
+ validation_html = render_validation_html(validation, source_label=_source_label(sentence.get("source", "tatoeba")))
942
  summary = render_prediction_summary(
943
  text=text,
944
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
 
968
  "sentence_langs": mix["langs"],
969
  "sentence_lang_iso3s": mix["lang_iso3s"],
970
  "sentences": mix["sentences"],
971
+ _validation_key(mix.get("source", "tatoeba-mix")): validation,
972
  }
973
+ validation_html = render_validation_html(validation, source_label=_source_label(mix.get("source", "tatoeba-mix")))
974
  summary = render_prediction_summary(
975
  text=text,
976
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
 
998
  text = sentence["text"]
999
  summary, spans, raw, ui_state, _, *chip_updates = predict(text, fasttext_mode=fasttext_mode)
1000
  sentence_rows = sentence.get("sentences") or [sentence]
1001
+ sentence_id_values = _sentence_id_keys(sentence)
1002
  sentence_langs = [item.get("lang_iso2", "") for item in sentence_rows]
1003
  sentence_lang_iso3s = [item.get("lang_iso3", "") for item in sentence_rows]
1004
  validation = build_example_validation(
 
1009
  raw = {
1010
  **raw,
1011
  "source": sentence.get("source", "fleurs"),
1012
+ "cached_sentence_id": sentence_id_values[0] if sentence_id_values else None,
1013
+ "cached_sentence_ids": [_sentence_id_keys(item)[0] if _sentence_id_keys(item) else None for item in sentence_rows],
1014
  "lang_count": sentence.get("lang_count", len(sentence_rows)),
1015
  "cached_split": sentence.get("split"),
1016
  "cached_source_lang": sentence.get("source_lang"),
 
1019
  "sentence_langs": sentence_langs,
1020
  "sentence_lang_iso3s": sentence_lang_iso3s,
1021
  "sentences": sentence_rows,
1022
+ _validation_key(sentence.get("source", "fleurs")): validation,
1023
  }
1024
+ source_label = _source_label(sentence.get("source", "fleurs"))
1025
  validation_html = render_validation_html(validation, source_label=source_label)
1026
  summary = render_prediction_summary(
1027
  text=text,
 
1061
  "sentence_langs": mix["langs"],
1062
  "sentence_lang_iso3s": mix["lang_iso3s"],
1063
  "sentences": mix["sentences"],
1064
+ _validation_key(mix.get("source", "fleurs-mix")): validation,
1065
  }
1066
+ source_label = _source_label(mix.get("source", "fleurs-mix"))
1067
  validation_html = render_validation_html(validation, source_label=source_label)
1068
  summary = render_prediction_summary(
1069
  text=text,
sib200_cache.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import unicodedata
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import pandas as pd
10
+ from datasets import get_dataset_config_names, load_dataset
11
+ import pycountry
12
+ from tqdm.auto import tqdm
13
+
14
+ from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
15
+ from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
16
+
17
+
18
+ SIB200_DATASET = "Davlan/sib200"
19
+ SIB200_CACHE_DIR = Path(__file__).with_name("data") / "sib200"
20
+ SIB200_PARQUET_PATH = SIB200_CACHE_DIR / "sib200_text.parquet"
21
+ SIB200_SPLIT_ORDER = {"train": 0, "validation": 1, "test": 2}
22
+
23
+
24
+ def _normalize_text_key(text: str) -> str:
25
+ normalized = unicodedata.normalize("NFKC", text)
26
+ normalized = " ".join(normalized.split())
27
+ return normalized.casefold().strip()
28
+
29
+
30
+ def _normalize_source_lang(config_name: str) -> str:
31
+ base = (config_name or "").strip().split("_", 1)[0].lower()
32
+ if not base:
33
+ return ""
34
+ if len(base) == 3:
35
+ language = pycountry.languages.get(alpha_3=base)
36
+ if language is not None:
37
+ alpha_2 = getattr(language, "alpha_2", None)
38
+ if alpha_2:
39
+ return canonical_lang(alpha_2.lower())
40
+ language = canonical_lang(base)
41
+ return language if language in ALL_LANGS else base
42
+
43
+
44
+ def _normalize_split_name(split_name: str) -> str:
45
+ split = (split_name or "").strip().lower()
46
+ if split == "dev":
47
+ return "validation"
48
+ return split
49
+
50
+
51
+ def _row_to_sentence(row: pd.Series) -> dict[str, Any]:
52
+ source_lang = str(row.get("source_lang", "")).strip()
53
+ lang_iso2 = str(row.get("lang_iso2", "")).strip()
54
+ lang_iso3 = str(row.get("lang_iso3", "")).strip()
55
+ label = row.get("label", -1)
56
+ topic = str(row.get("topic", "")).strip()
57
+ return {
58
+ "text": str(row.get("text", "")).strip(),
59
+ "raw_text": str(row.get("text", "")).strip(),
60
+ "source": "sib200",
61
+ "source_lang": source_lang,
62
+ "lang_iso2": lang_iso2,
63
+ "lang_iso3": lang_iso3 or LANG_ISO2_TO_ISO3.get(lang_iso2, ""),
64
+ "language": source_lang,
65
+ "split": str(row.get("split", "")).strip(),
66
+ "sib200_id": int(row.get("index_id", -1)) if str(row.get("index_id", "-1")).strip().lstrip("-").isdigit() else -1,
67
+ "sib200_label": int(label) if str(label).strip().lstrip("-").isdigit() else -1,
68
+ "sib200_topic": topic,
69
+ }
70
+
71
+
72
def _frame_from_dataset(config_name: str) -> pd.DataFrame:
    """Load one SIB-200 config and flatten its splits into a deduplicated frame.

    Args:
        config_name: Dataset config to load. It is stored verbatim in the
            ``source_lang`` column; its normalized form fills ``lang_iso2``.

    Returns:
        A frame with the columns ``index_id``, ``text``, ``label``, ``topic``,
        ``source_lang``, ``lang_iso2``, ``lang_iso3``, ``source`` and
        ``split``. Rows with empty text are skipped, and texts that compare
        equal after normalization are kept once, preferring the earliest
        split in ``SIB200_SPLIT_ORDER`` and then the lowest ``index_id``.
        An empty frame is returned when the config has no splits, no usable
        language code, or no usable rows.
    """

    def _int_or_default(value: Any, default: int = -1) -> int:
        # Try the conversion instead of pre-validating with str.isdigit(),
        # which accepts inputs int() rejects and rejects integral floats.
        rendered = str(value).strip()
        try:
            return int(rendered)
        except ValueError:
            pass
        try:
            number = float(rendered)
        except ValueError:
            return default
        return int(number) if number.is_integer() else default

    dataset = load_dataset(SIB200_DATASET, config_name)
    if len(dataset) == 0:
        return pd.DataFrame()

    source_lang = _normalize_source_lang(config_name)
    if not source_lang:
        return pd.DataFrame()

    # Class names are only available when ``label`` is a ClassLabel-style
    # feature; any other feature type simply yields no names.
    label_names: list[str] = []
    for split_name in ("train", "validation", "test"):
        if split_name in dataset and "label" in dataset[split_name].features:
            label_feature = dataset[split_name].features["label"]
            label_names = list(getattr(label_feature, "names", None) or [])
            break

    lang_iso2 = source_lang
    lang_iso3 = LANG_ISO2_TO_ISO3.get(lang_iso2, "")

    records: list[dict[str, Any]] = []
    for split_name, split_ds in dataset.items():
        normalized_split = _normalize_split_name(split_name)
        for row in split_ds:
            # ``or ""`` keeps a null cell from being cached as the text "None".
            text = str(row.get("text") or "").strip()
            if not text:
                continue

            label_int = _int_or_default(row.get("label", -1))
            topic = label_names[label_int] if 0 <= label_int < len(label_names) else ""
            if not topic:
                # NOTE(review): the hosted Davlan/sib200 configs appear to carry
                # the topic as a string ``category`` column rather than a
                # ClassLabel ``label`` -- confirm against the dataset card.
                topic = str(row.get("category") or "").strip()

            records.append(
                {
                    "index_id": _int_or_default(row.get("index_id", -1)),
                    "text": text,
                    "label": label_int,
                    "topic": topic,
                    "source_lang": config_name,
                    "lang_iso2": lang_iso2,
                    "lang_iso3": lang_iso3,
                    "source": "sib200",
                    "split": normalized_split,
                }
            )

    if not records:
        return pd.DataFrame()

    frame = pd.DataFrame.from_records(records)
    # Temporary helper columns drive the "keep the best duplicate" ordering.
    frame["text_key"] = frame["text"].astype(str).map(_normalize_text_key)
    frame["split_rank"] = frame["split"].map(lambda split: SIB200_SPLIT_ORDER.get(str(split), 99))
    frame = frame.sort_values(by=["source_lang", "text_key", "split_rank", "index_id"], kind="stable")
    frame = frame.drop_duplicates(subset=["source_lang", "text_key"], keep="first")
    frame = frame.drop(columns=["text_key", "split_rank"], errors="ignore").reset_index(drop=True)
    return frame
123
+
124
+
125
def build_sib200_text_parquet(parquet_path: str | Path = SIB200_PARQUET_PATH) -> Path:
    """Download every SIB-200 config and write the combined lean parquet cache.

    Args:
        parquet_path: Destination file; its parent directory is created.

    Returns:
        The destination as a ``Path``.

    Raises:
        RuntimeError: If no config produced any rows.
    """
    target = Path(parquet_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    progress = tqdm(get_dataset_config_names(SIB200_DATASET), desc="SIB-200 configs")
    per_config = (_frame_from_dataset(name) for name in progress)
    usable = [table for table in per_config if not table.empty]
    if not usable:
        raise RuntimeError("No usable SIB-200 rows were loaded.")

    def _rank(split: Any) -> int:
        return SIB200_SPLIT_ORDER.get(str(split), 99)

    merged = pd.concat(usable, ignore_index=True)
    # The rank column exists only to order splits; it is dropped before writing.
    combined = (
        merged.assign(split_rank=merged["split"].map(_rank))
        .sort_values(by=["source_lang", "split_rank", "index_id"], kind="stable")
        .reset_index(drop=True)
        .drop(columns=["split_rank"], errors="ignore")
    )
    combined.to_parquet(target, index=False)

    row_count = len(combined)
    column_count = len(combined.columns)
    print(
        f"Built lean SIB-200 parquet with {row_count:,} rows "
        f"and {column_count} columns at {target}."
    )
    return target
150
+
151
+
152
@lru_cache(maxsize=1)
def load_sib200_table(parquet_path: str | Path = SIB200_PARQUET_PATH) -> pd.DataFrame:
    """Read the SIB-200 parquet cache, memoizing the most recent result.

    Raises:
        FileNotFoundError: If the parquet file does not exist.
        RuntimeError: If the file has no ``text`` column.
    """
    cache_file = Path(parquet_path)
    if cache_file.exists():
        table = pd.read_parquet(cache_file)
        if "text" in table.columns:
            return table
        raise RuntimeError("SIB-200 parquet cache is missing the text column.")

    raise FileNotFoundError(
        f"Missing SIB-200 cache at {cache_file}. "
        "Run `./.venv/bin/python sib200_cache.py` once while online to build it."
    )
164
+
165
+
166
def fetch_random_sib200_sentence(
    *,
    attempts: int = 8,
    parquet_path: str | Path = SIB200_PARQUET_PATH,
) -> dict[str, Any]:
    """Sample a single-language SIB-200 example from the parquet cache.

    When the cache has a ``lang_iso2`` column, only rows whose code is in
    ``ALL_LANGS`` are offered to the sampler; rows are grouped by that column.
    """
    table = load_sib200_table(parquet_path)
    if "lang_iso2" in table.columns:
        supported = table["lang_iso2"].isin(ALL_LANGS)
        table = table[supported]
    return sample_single_group_bundle(
        table,
        group_column="lang_iso2",
        row_to_sentence=_row_to_sentence,
        attempts=attempts,
    )
179
+
180
+
181
def fetch_random_sib200_sentence_mix(
    *,
    min_groups: int = 2,
    max_groups: int = 3,
    parquet_path: str | Path = SIB200_PARQUET_PATH,
) -> dict[str, Any]:
    """Sample a mixed-language SIB-200 example from the parquet cache.

    When the cache has a ``lang_iso2`` column, only rows whose code is in
    ``ALL_LANGS`` are offered to the sampler. The returned bundle has its
    ``source`` set to ``"sib200-mix"``.
    """
    table = load_sib200_table(parquet_path)
    if "lang_iso2" in table.columns:
        supported = table["lang_iso2"].isin(ALL_LANGS)
        table = table[supported]
    sampled = sample_multi_group_bundle(
        table,
        group_column="lang_iso2",
        row_to_sentence=_row_to_sentence,
        min_groups=min_groups,
        max_groups=max_groups,
    )
    mixed = {**sampled}
    mixed["source"] = "sib200-mix"
    return mixed
200
+
201
+
202
def main() -> None:
    """Command-line entry point: build the SIB-200 parquet at ``--output``."""
    cli = argparse.ArgumentParser(description="Build the cached text-only SIB-200 parquet.")
    cli.add_argument(
        "--output",
        default=str(SIB200_PARQUET_PATH),
        help="Output parquet path for the cached SIB-200 text rows.",
    )
    options = cli.parse_args()
    written = build_sib200_text_parquet(options.output)
    print(f"Wrote SIB-200 text cache to {written}")
212
+
213
+
214
+ if __name__ == "__main__":
215
+ main()