Spaces:

polyglot-tagger
/

language-extractor-demo

Sleeping

App Files Files Community

DerivedFunction1 committed on Apr 16

Commit

84e2dc1

1 Parent(s): 89f8b1b

add

Browse files

Files changed (3) hide show

README.md +9 -1
app.py +68 -14
sib200_cache.py +215 -0

README.md CHANGED Viewed

@@ -14,7 +14,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 ## Offline caches
-The demo now uses local parquet caches for both FLEURS and Tatoeba.
 Build the FLEURS cache once with:
@@ -32,3 +32,11 @@ Build the Tatoeba cache once with:
 ```
 That converts `sentences.csv` into `data/tatoeba/tatoeba_text.parquet` and keeps only the lean inference columns.

 ## Offline caches
+The demo now uses local parquet caches for FLEURS, Tatoeba, and SIB-200.
 Build the FLEURS cache once with:
 ```
 That converts `sentences.csv` into `data/tatoeba/tatoeba_text.parquet` and keeps only the lean inference columns.
+Build the SIB-200 cache once with:
+```bash
+./.venv/bin/python sib200_cache.py
+```
+That downloads the `Davlan/sib200` configs, keeps the text plus language/topic metadata, and writes a reusable lean parquet file at `data/sib200/sib200_text.parquet`.

app.py CHANGED Viewed

@@ -20,6 +20,7 @@ from transformers import AutoModelForTokenClassification, AutoTokenizer, pipelin
 from fleurs_cache import fetch_random_fleurs_sentence, fetch_random_fleurs_sentence_mix
 from language import ALL_LANGS, LANG_ALIASES, LANG_ISO2_TO_ISO3, canonical_lang, canonical_lang_family
 from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
@@ -34,10 +35,12 @@ ARTIFACT_SPAN_WEIGHT = 0.35
 RANDOM_SENTENCE_SAMPLERS = (
     fetch_random_fleurs_sentence,
     fetch_random_tatoeba_sentence,
 )
 RANDOM_MIX_SAMPLERS = (
     fetch_random_fleurs_sentence_mix,
     fetch_random_tatoeba_sentence_mix,
 )
@@ -367,6 +370,35 @@ def render_tatoeba_validation_html(validation: dict[str, Any]) -> str:
     return render_validation_html(validation, source_label="Tatoeba")
 def _language_name(lang_code: str) -> str:
     """Best-effort human readable language name for a code."""
     code = (lang_code or "").strip()
@@ -556,12 +588,32 @@ def fasttext_alias_hint_for_lang(fasttext_result: dict[str, Any] | None, lang: s
 def fetch_random_cached_sentence() -> dict[str, Any]:
     """Randomly sample a sentence from either cached source."""
-    return random.choice(RANDOM_SENTENCE_SAMPLERS)()
 def fetch_random_cached_sentence_mix() -> dict[str, Any]:
     """Randomly sample a mixed-language example from either cached source."""
-    return random.choice(RANDOM_MIX_SAMPLERS)()
 def render_prediction_summary(
@@ -864,6 +916,7 @@ def load_random_tatoeba_example(fasttext_mode: str = "full") -> tuple[str, str,
     text = sentence["text"]
     summary, spans, raw, ui_state, _, *chip_updates = predict(text, fasttext_mode=fasttext_mode)
     sentence_rows = sentence.get("sentences") or [sentence]
     sentence_langs = [item.get("lang_iso2", "") for item in sentence_rows]
     sentence_lang_iso3s = [item.get("lang_iso3", "") for item in sentence_rows]
     validation = build_example_validation(
@@ -874,8 +927,8 @@ def load_random_tatoeba_example(fasttext_mode: str = "full") -> tuple[str, str,
     raw = {
         **raw,
         "source": "tatoeba",
-        "sentence_id": sentence.get("sentence_id", sentence.get("id")),
-        "sentence_ids": [item.get("sentence_id", item.get("id")) for item in sentence_rows],
         "lang_count": sentence.get("lang_count", len(sentence_rows)),
         "sentence_langs": sentence_langs,
         "sentence_lang_iso3s": sentence_lang_iso3s,
@@ -883,9 +936,9 @@ def load_random_tatoeba_example(fasttext_mode: str = "full") -> tuple[str, str,
         "sentence_lang": sentence.get("source_lang", sentence.get("lang")),
         "sentence_lang_iso2": sentence.get("lang_iso2", sentence.get("source_lang")),
         "sentence_lang_iso3": sentence.get("lang_iso3", ""),
-        "tatoeba_validation": validation,
     }
-    validation_html = render_validation_html(validation, source_label="Tatoeba")
     summary = render_prediction_summary(
         text=text,
         selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
@@ -915,9 +968,9 @@ def load_random_tatoeba_mix_example(fasttext_mode: str = "full") -> tuple[str, s
         "sentence_langs": mix["langs"],
         "sentence_lang_iso3s": mix["lang_iso3s"],
         "sentences": mix["sentences"],
-        "tatoeba_validation": validation,
     }
-    validation_html = render_validation_html(validation, source_label="Tatoeba")
     summary = render_prediction_summary(
         text=text,
         selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
@@ -945,6 +998,7 @@ def load_random_fleurs_example(fasttext_mode: str = "full") -> tuple[str, str, p
     text = sentence["text"]
     summary, spans, raw, ui_state, _, *chip_updates = predict(text, fasttext_mode=fasttext_mode)
     sentence_rows = sentence.get("sentences") or [sentence]
     sentence_langs = [item.get("lang_iso2", "") for item in sentence_rows]
     sentence_lang_iso3s = [item.get("lang_iso3", "") for item in sentence_rows]
     validation = build_example_validation(
@@ -955,8 +1009,8 @@ def load_random_fleurs_example(fasttext_mode: str = "full") -> tuple[str, str, p
     raw = {
         **raw,
         "source": sentence.get("source", "fleurs"),
-        "cached_sentence_id": sentence.get("fleurs_id", sentence.get("sentence_id")),
-        "cached_sentence_ids": [item.get("fleurs_id", item.get("sentence_id")) for item in sentence_rows],
         "lang_count": sentence.get("lang_count", len(sentence_rows)),
         "cached_split": sentence.get("split"),
         "cached_source_lang": sentence.get("source_lang"),
@@ -965,9 +1019,9 @@ def load_random_fleurs_example(fasttext_mode: str = "full") -> tuple[str, str, p
         "sentence_langs": sentence_langs,
         "sentence_lang_iso3s": sentence_lang_iso3s,
         "sentences": sentence_rows,
-        "fleurs_validation": validation if sentence.get("source") == "fleurs" else {},
     }
-    source_label = "FLEURS" if sentence.get("source") == "fleurs" else "Tatoeba"
     validation_html = render_validation_html(validation, source_label=source_label)
     summary = render_prediction_summary(
         text=text,
@@ -1007,9 +1061,9 @@ def load_random_fleurs_mix_example(fasttext_mode: str = "full") -> tuple[str, st
         "sentence_langs": mix["langs"],
         "sentence_lang_iso3s": mix["lang_iso3s"],
         "sentences": mix["sentences"],
-        "fleurs_validation": validation if mix.get("source") == "fleurs-mix" else {},
     }
-    source_label = "FLEURS" if mix.get("source") == "fleurs-mix" else "Tatoeba"
     validation_html = render_validation_html(validation, source_label=source_label)
     summary = render_prediction_summary(
         text=text,

 from fleurs_cache import fetch_random_fleurs_sentence, fetch_random_fleurs_sentence_mix
 from language import ALL_LANGS, LANG_ALIASES, LANG_ISO2_TO_ISO3, canonical_lang, canonical_lang_family
+from sib200_cache import fetch_random_sib200_sentence, fetch_random_sib200_sentence_mix
 from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
+# Single-sentence samplers; fetch_random_cached_sentence shuffles a copy of
+# this tuple and tries each entry until one succeeds.
 RANDOM_SENTENCE_SAMPLERS = (
     fetch_random_fleurs_sentence,
     fetch_random_tatoeba_sentence,
+    fetch_random_sib200_sentence,
 )
+# Mixed-language samplers; fetch_random_cached_sentence_mix consumes them the
+# same way.
 RANDOM_MIX_SAMPLERS = (
     fetch_random_fleurs_sentence_mix,
     fetch_random_tatoeba_sentence_mix,
+    fetch_random_sib200_sentence_mix,
 )
     return render_validation_html(validation, source_label="Tatoeba")
+def _source_key(source: str) -> str:
+    """Return the lowercase base name of a source tag.
+
+    Surrounding whitespace is stripped and everything from the first ``-``
+    onward is discarded, so ``"fleurs-mix"`` yields ``"fleurs"``. A falsy
+    ``source`` (``None`` or ``""``) yields ``""``.
+    """
+    return (source or "").strip().split("-", 1)[0].lower()
+def _source_label(source: str) -> str:
+    """Return the display label for a source tag.
+
+    The tag is reduced with ``_source_key`` first, so mix variants such as
+    ``"sib200-mix"`` share the label of their base source. Unrecognised keys
+    are shown upper-cased; an empty key falls back to ``"Example"``.
+    """
+    key = _source_key(source)
+    if key == "fleurs":
+        return "FLEURS"
+    if key == "tatoeba":
+        return "Tatoeba"
+    if key == "sib200":
+        return "SIB-200"
+    # "".upper() is "", which is falsy, so an empty key becomes "Example".
+    return key.upper() or "Example"
+def _validation_key(source: str) -> str:
+    """Return the dict key used to store a source's validation result.
+
+    Produces ``"<key>_validation"`` where ``<key>`` comes from ``_source_key``;
+    an empty key is replaced by ``"example"``.
+    """
+    key = _source_key(source) or "example"
+    return f"{key}_validation"
+def _sentence_id_keys(sentence: dict[str, Any]) -> list[str]:
+    """Collect the id-like values present on one sentence mapping.
+
+    The fields ``fleurs_id``, ``sentence_id``, ``sib200_id`` and ``id`` are
+    probed in that order and every value that is not ``None`` is appended, so
+    element 0 is the highest-priority id that exists.
+
+    NOTE(review): despite the name and the ``list[str]`` annotation, the
+    result holds the field *values* (whatever type they have), not key names.
+    It describes this single mapping only; it is not a per-row id list for a
+    bundle that carries a ``sentences`` entry.
+    """
+    keys = []
+    for candidate in ("fleurs_id", "sentence_id", "sib200_id", "id"):
+        value = sentence.get(candidate)
+        if value is not None:
+            keys.append(value)
+    return keys
 def _language_name(lang_code: str) -> str:
     """Best-effort human readable language name for a code."""
     code = (lang_code or "").strip()
 def fetch_random_cached_sentence() -> dict[str, Any]:
     """Randomly sample a sentence from either cached source."""
+    # Shuffle a copy so the module-level tuple is never mutated and every
+    # sampler has a chance of being tried first.
+    samplers = list(RANDOM_SENTENCE_SAMPLERS)
+    random.shuffle(samplers)
+    last_error: FileNotFoundError | None = None
+    for sampler in samplers:
+        try:
+            return sampler()
+        except FileNotFoundError as exc:
+            # Only FileNotFoundError means "try the next sampler"; any other
+            # exception propagates immediately.
+            last_error = exc
+    # Every sampler raised FileNotFoundError: surface the most recent one.
+    if last_error is not None:
+        raise last_error
+    # Reached only when the sampler tuple is empty.
+    raise RuntimeError("No cached sentence samplers are registered.")
 def fetch_random_cached_sentence_mix() -> dict[str, Any]:
     """Randomly sample a mixed-language example from either cached source."""
+    # Shuffle a copy so the module-level tuple is never mutated and every
+    # sampler has a chance of being tried first.
+    samplers = list(RANDOM_MIX_SAMPLERS)
+    random.shuffle(samplers)
+    last_error: FileNotFoundError | None = None
+    for sampler in samplers:
+        try:
+            return sampler()
+        except FileNotFoundError as exc:
+            # Only FileNotFoundError means "try the next sampler"; any other
+            # exception propagates immediately.
+            last_error = exc
+    # Every sampler raised FileNotFoundError: surface the most recent one.
+    if last_error is not None:
+        raise last_error
+    # Reached only when the sampler tuple is empty.
+    raise RuntimeError("No cached mix samplers are registered.")
 def render_prediction_summary(
     text = sentence["text"]
     summary, spans, raw, ui_state, _, *chip_updates = predict(text, fasttext_mode=fasttext_mode)
     sentence_rows = sentence.get("sentences") or [sentence]
+    sentence_ids = _sentence_id_keys(sentence)
     sentence_langs = [item.get("lang_iso2", "") for item in sentence_rows]
     sentence_lang_iso3s = [item.get("lang_iso3", "") for item in sentence_rows]
     validation = build_example_validation(
     raw = {
         **raw,
         "source": "tatoeba",
+        "sentence_id": sentence_ids[0] if sentence_ids else sentence.get("sentence_id", sentence.get("id")),
+        "sentence_ids": sentence_ids,
         "lang_count": sentence.get("lang_count", len(sentence_rows)),
         "sentence_langs": sentence_langs,
         "sentence_lang_iso3s": sentence_lang_iso3s,
         "sentence_lang": sentence.get("source_lang", sentence.get("lang")),
         "sentence_lang_iso2": sentence.get("lang_iso2", sentence.get("source_lang")),
         "sentence_lang_iso3": sentence.get("lang_iso3", ""),
+        _validation_key(sentence.get("source", "tatoeba")): validation,
     }
+    validation_html = render_validation_html(validation, source_label=_source_label(sentence.get("source", "tatoeba")))
     summary = render_prediction_summary(
         text=text,
         selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
         "sentence_langs": mix["langs"],
         "sentence_lang_iso3s": mix["lang_iso3s"],
         "sentences": mix["sentences"],
+        _validation_key(mix.get("source", "tatoeba-mix")): validation,
     }
+    validation_html = render_validation_html(validation, source_label=_source_label(mix.get("source", "tatoeba-mix")))
     summary = render_prediction_summary(
         text=text,
         selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
     text = sentence["text"]
     summary, spans, raw, ui_state, _, *chip_updates = predict(text, fasttext_mode=fasttext_mode)
     sentence_rows = sentence.get("sentences") or [sentence]
+    sentence_id_values = _sentence_id_keys(sentence)
     sentence_langs = [item.get("lang_iso2", "") for item in sentence_rows]
     sentence_lang_iso3s = [item.get("lang_iso3", "") for item in sentence_rows]
     validation = build_example_validation(
     raw = {
         **raw,
         "source": sentence.get("source", "fleurs"),
+        "cached_sentence_id": sentence_id_values[0] if sentence_id_values else None,
+        "cached_sentence_ids": [_sentence_id_keys(item)[0] if _sentence_id_keys(item) else None for item in sentence_rows],
         "lang_count": sentence.get("lang_count", len(sentence_rows)),
         "cached_split": sentence.get("split"),
         "cached_source_lang": sentence.get("source_lang"),
         "sentence_langs": sentence_langs,
         "sentence_lang_iso3s": sentence_lang_iso3s,
         "sentences": sentence_rows,
+        _validation_key(sentence.get("source", "fleurs")): validation,
     }
+    source_label = _source_label(sentence.get("source", "fleurs"))
     validation_html = render_validation_html(validation, source_label=source_label)
     summary = render_prediction_summary(
         text=text,
         "sentence_langs": mix["langs"],
         "sentence_lang_iso3s": mix["lang_iso3s"],
         "sentences": mix["sentences"],
+        _validation_key(mix.get("source", "fleurs-mix")): validation,
     }
+    source_label = _source_label(mix.get("source", "fleurs-mix"))
     validation_html = render_validation_html(validation, source_label=source_label)
     summary = render_prediction_summary(
         text=text,

sib200_cache.py ADDED Viewed

	@@ -0,0 +1,215 @@

+from __future__ import annotations
+import argparse
+import unicodedata
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+import pandas as pd
+from datasets import get_dataset_config_names, load_dataset
+import pycountry
+from tqdm.auto import tqdm
+from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
+from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
+# Dataset identifier passed to get_dataset_config_names() and load_dataset().
+SIB200_DATASET = "Davlan/sib200"
+# Cache directory: the "data/sib200" folder that sits next to this module.
+SIB200_CACHE_DIR = Path(__file__).with_name("data") / "sib200"
+# Default parquet location written by the builder and read by the loader.
+SIB200_PARQUET_PATH = SIB200_CACHE_DIR / "sib200_text.parquet"
+# Sort priority of each split; lookups default to 99 for any other split name.
+SIB200_SPLIT_ORDER = {"train": 0, "validation": 1, "test": 2}
+def _normalize_text_key(text: str) -> str:
+    """Return a comparison key for ``text``.
+
+    The text is NFKC-normalised, runs of whitespace are collapsed to single
+    spaces (which also trims both ends), and the result is case-folded.
+    """
+    normalized = unicodedata.normalize("NFKC", text)
+    normalized = " ".join(normalized.split())
+    return normalized.casefold().strip()
+def _normalize_source_lang(config_name: str) -> str:
+    """Derive a language code from a dataset config name.
+
+    Only the part before the first ``_`` is used, lower-cased (presumably
+    config names look like ``<lang>_<script>`` - TODO confirm against the
+    dataset). Returns ``""`` when that part is empty.
+
+    Resolution order:
+      1. A three-letter base that pycountry knows by ``alpha_3`` and that has
+         an ``alpha_2`` code returns ``canonical_lang(alpha_2)``.
+      2. Otherwise ``canonical_lang(base)`` is returned if it is in
+         ``ALL_LANGS``; if not, the raw base is returned unchanged.
+
+    NOTE(review): branch 1 does not check ``ALL_LANGS`` membership, and
+    branch 2 can return a value that is not a two-letter code.
+    """
+    base = (config_name or "").strip().split("_", 1)[0].lower()
+    if not base:
+        return ""
+    if len(base) == 3:
+        language = pycountry.languages.get(alpha_3=base)
+        if language is not None:
+            # Not every pycountry entry has an alpha_2 attribute, hence getattr.
+            alpha_2 = getattr(language, "alpha_2", None)
+            if alpha_2:
+                return canonical_lang(alpha_2.lower())
+    language = canonical_lang(base)
+    return language if language in ALL_LANGS else base
+def _normalize_split_name(split_name: str) -> str:
+    """Return the stripped, lower-cased split name, mapping ``"dev"`` to ``"validation"``.
+
+    A falsy ``split_name`` yields ``""``.
+    """
+    split = (split_name or "").strip().lower()
+    if split == "dev":
+        return "validation"
+    return split
+def _row_to_sentence(row: pd.Series) -> dict[str, Any]:
+    """Convert one cached table row into a sentence dict.
+
+    This function is passed as the ``row_to_sentence`` callback to the
+    sampling helpers used by the fetch functions in this module.
+
+    Args:
+        row: A row exposing ``.get``; the keys read are ``text``,
+            ``source_lang``, ``lang_iso2``, ``lang_iso3``, ``label``,
+            ``topic``, ``split`` and ``index_id``.
+
+    Returns:
+        A dict with ``source`` fixed to ``"sib200"``. ``text`` and ``raw_text``
+        carry the same stripped string, ``language`` mirrors ``source_lang``,
+        and ``sib200_id`` / ``sib200_label`` are ``-1`` when the stored value
+        does not look like an integer.
+
+    NOTE(review): every field goes through ``str()``, so a null/NaN cell would
+    surface as literal text such as ``"nan"`` - confirm the cache has no nulls.
+    """
+    source_lang = str(row.get("source_lang", "")).strip()
+    lang_iso2 = str(row.get("lang_iso2", "")).strip()
+    lang_iso3 = str(row.get("lang_iso3", "")).strip()
+    label = row.get("label", -1)
+    topic = str(row.get("topic", "")).strip()
+    return {
+        "text": str(row.get("text", "")).strip(),
+        "raw_text": str(row.get("text", "")).strip(),
+        "source": "sib200",
+        "source_lang": source_lang,
+        "lang_iso2": lang_iso2,
+        # Fall back to the ISO2 -> ISO3 table when the row has no ISO3 value.
+        "lang_iso3": lang_iso3 or LANG_ISO2_TO_ISO3.get(lang_iso2, ""),
+        "language": source_lang,
+        "split": str(row.get("split", "")).strip(),
+        # Only integer-looking values (optional leading "-") are converted;
+        # anything else becomes -1.
+        "sib200_id": int(row.get("index_id", -1)) if str(row.get("index_id", "-1")).strip().lstrip("-").isdigit() else -1,
+        "sib200_label": int(label) if str(label).strip().lstrip("-").isdigit() else -1,
+        "sib200_topic": topic,
+    }
+def _frame_from_dataset(config_name: str) -> pd.DataFrame:
+    """Load one dataset config and return its de-duplicated rows as a DataFrame.
+
+    Args:
+        config_name: Config to load from ``SIB200_DATASET``. It is stored
+            verbatim in the ``source_lang`` column, while ``lang_iso2`` holds
+            the code produced by ``_normalize_source_lang``.
+
+    Returns:
+        A frame with the columns ``index_id``, ``text``, ``label``, ``topic``,
+        ``source_lang``, ``lang_iso2``, ``lang_iso3``, ``source`` and ``split``.
+        An empty frame is returned when the dataset has no splits, when no
+        language code can be derived, or when no row has non-empty text.
+
+    NOTE(review): the dataset is loaded before the language check, so a config
+    that ends up skipped is still fetched.
+    """
+    dataset = load_dataset(SIB200_DATASET, config_name)
+    if len(dataset) == 0:
+        return pd.DataFrame()
+    # Take label names from the first standard split that has a "label" feature.
+    # NOTE(review): assumes that feature exposes ``.names`` - TODO confirm the
+    # dataset schema; without a "label" feature every topic stays "".
+    label_names: list[str] = []
+    for split_name in ("train", "validation", "test"):
+        if split_name in dataset and "label" in dataset[split_name].features:
+            label_names = list(dataset[split_name].features["label"].names)
+            break
+    records: list[dict[str, Any]] = []
+    source_lang = _normalize_source_lang(config_name)
+    if not source_lang:
+        return pd.DataFrame()
+    for split_name, split_ds in dataset.items():
+        normalized_split = _normalize_split_name(split_name)
+        for row in split_ds:
+            text = str(row.get("text", "")).strip()
+            # Rows without any text are dropped.
+            if not text:
+                continue
+            label = row.get("label", -1)
+            label_int = int(label) if str(label).strip().lstrip("-").isdigit() else -1
+            # Out-of-range or missing labels map to an empty topic.
+            topic = label_names[label_int] if 0 <= label_int < len(label_names) else ""
+            lang_iso2 = source_lang
+            records.append(
+                {
+                    "index_id": int(row.get("index_id", -1)) if str(row.get("index_id", "-1")).strip().lstrip("-").isdigit() else -1,
+                    "text": text,
+                    "label": label_int,
+                    "topic": topic,
+                    "source_lang": config_name,
+                    "lang_iso2": lang_iso2,
+                    "lang_iso3": LANG_ISO2_TO_ISO3.get(lang_iso2, ""),
+                    "source": "sib200",
+                    "split": normalized_split,
+                }
+            )
+    if not records:
+        return pd.DataFrame()
+    frame = pd.DataFrame.from_records(records)
+    # Temporary helper columns used only for de-duplication.
+    frame["text_key"] = frame["text"].astype(str).map(_normalize_text_key)
+    frame["split_rank"] = frame["split"].map(lambda split: SIB200_SPLIT_ORDER.get(str(split), 99))
+    # Sort so that, for each normalised text, the row from the lowest-ranked
+    # split (then the lowest index_id) comes first and is the one kept.
+    frame = frame.sort_values(by=["source_lang", "text_key", "split_rank", "index_id"], kind="stable")
+    frame = frame.drop_duplicates(subset=["source_lang", "text_key"], keep="first")
+    frame = frame.drop(columns=["text_key", "split_rank"], errors="ignore").reset_index(drop=True)
+    return frame
+def build_sib200_text_parquet(parquet_path: str | Path = SIB200_PARQUET_PATH) -> Path:
+    """Download SIB-200 and persist a lean parquet cache for offline sampling.
+
+    Every config name reported for ``SIB200_DATASET`` is loaded through
+    ``_frame_from_dataset``; non-empty frames are concatenated, ordered by
+    ``source_lang``, split rank and ``index_id``, and written without the index.
+
+    Args:
+        parquet_path: Destination file; its parent directory is created if
+            needed.
+
+    Returns:
+        The destination path as a ``Path``.
+
+    Raises:
+        RuntimeError: If no config produced any rows.
+    """
+    parquet_path = Path(parquet_path)
+    parquet_path.parent.mkdir(parents=True, exist_ok=True)
+    config_names = get_dataset_config_names(SIB200_DATASET)
+    frames: list[pd.DataFrame] = []
+    for config_name in tqdm(config_names, desc="SIB-200 configs"):
+        frame = _frame_from_dataset(config_name)
+        if not frame.empty:
+            frames.append(frame)
+    if not frames:
+        raise RuntimeError("No usable SIB-200 rows were loaded.")
+    combined = pd.concat(frames, ignore_index=True)
+    # split_rank is a temporary sort key and is dropped before writing.
+    combined["split_rank"] = combined["split"].map(lambda split: SIB200_SPLIT_ORDER.get(str(split), 99))
+    combined = combined.sort_values(by=["source_lang", "split_rank", "index_id"], kind="stable").reset_index(drop=True)
+    combined = combined.drop(columns=["split_rank"], errors="ignore")
+    combined.to_parquet(parquet_path, index=False)
+    print(
+        f"Built lean SIB-200 parquet with {len(combined):,} rows "
+        f"and {len(combined.columns)} columns at {parquet_path}."
+    )
+    return parquet_path
+@lru_cache(maxsize=1)
+def load_sib200_table(parquet_path: str | Path = SIB200_PARQUET_PATH) -> pd.DataFrame:
+    """Read the SIB-200 parquet cache, memoising the most recent result.
+
+    Args:
+        parquet_path: Location of the parquet cache.
+
+    Returns:
+        The cached table as a DataFrame.
+
+    Raises:
+        FileNotFoundError: If ``parquet_path`` does not exist.
+        RuntimeError: If the file has no ``text`` column.
+
+    NOTE(review): ``lru_cache`` returns the same DataFrame object on repeated
+    calls, so callers should not mutate it in place. The cache is keyed on the
+    argument as passed, so a ``str`` and a ``Path`` for the same file are
+    separate entries and will evict each other with ``maxsize=1``.
+    """
+    parquet_path = Path(parquet_path)
+    if not parquet_path.exists():
+        raise FileNotFoundError(
+            f"Missing SIB-200 cache at {parquet_path}. "
+            "Run `./.venv/bin/python sib200_cache.py` once while online to build it."
+        )
+    frame = pd.read_parquet(parquet_path)
+    if "text" not in frame.columns:
+        raise RuntimeError("SIB-200 parquet cache is missing the text column.")
+    return frame
+def fetch_random_sib200_sentence(
+    *,
+    attempts: int = 8,
+    parquet_path: str | Path = SIB200_PARQUET_PATH,
+) -> dict[str, Any]:
+    """Sample a single-language example from the SIB-200 cache.
+
+    Args:
+        attempts: Forwarded unchanged to ``sample_single_group_bundle``.
+        parquet_path: Cache file handed to ``load_sib200_table``.
+
+    Returns:
+        Whatever ``sample_single_group_bundle`` returns for rows grouped by
+        ``lang_iso2`` (that helper lives in ``sentence_sampling``).
+
+    Raises:
+        FileNotFoundError: Propagated from ``load_sib200_table`` when the cache
+            file is missing.
+    """
+    frame = load_sib200_table(parquet_path)
+    # Keep only rows whose language code is one the app knows about; if the
+    # column is absent the frame is passed through unfiltered.
+    candidate_frame = frame[frame["lang_iso2"].isin(ALL_LANGS)] if "lang_iso2" in frame.columns else frame
+    return sample_single_group_bundle(
+        candidate_frame,
+        group_column="lang_iso2",
+        row_to_sentence=_row_to_sentence,
+        attempts=attempts,
+    )
+def fetch_random_sib200_sentence_mix(
+    *,
+    min_groups: int = 2,
+    max_groups: int = 3,
+    parquet_path: str | Path = SIB200_PARQUET_PATH,
+) -> dict[str, Any]:
+    """Sample a mixed-language example from the SIB-200 cache.
+
+    Args:
+        min_groups: Forwarded unchanged to ``sample_multi_group_bundle``.
+        max_groups: Forwarded unchanged to ``sample_multi_group_bundle``.
+        parquet_path: Cache file handed to ``load_sib200_table``.
+
+    Returns:
+        The bundle produced by ``sample_multi_group_bundle`` with its
+        ``source`` entry set to ``"sib200-mix"``.
+
+    Raises:
+        FileNotFoundError: Propagated from ``load_sib200_table`` when the cache
+            file is missing.
+    """
+    frame = load_sib200_table(parquet_path)
+    # Keep only rows whose language code is one the app knows about; if the
+    # column is absent the frame is passed through unfiltered.
+    candidate_frame = frame[frame["lang_iso2"].isin(ALL_LANGS)] if "lang_iso2" in frame.columns else frame
+    bundle = sample_multi_group_bundle(
+        candidate_frame,
+        group_column="lang_iso2",
+        row_to_sentence=_row_to_sentence,
+        min_groups=min_groups,
+        max_groups=max_groups,
+    )
+    # "source" is written last so it overrides any value already in the bundle.
+    return {
+        **bundle,
+        "source": "sib200-mix",
+    }
+def main() -> None:
+    """Command-line entry point: build the SIB-200 parquet cache.
+
+    Accepts an optional ``--output`` path (default ``SIB200_PARQUET_PATH``),
+    builds the cache there and prints the resulting location.
+    """
+    parser = argparse.ArgumentParser(description="Build the cached text-only SIB-200 parquet.")
+    parser.add_argument(
+        "--output",
+        default=str(SIB200_PARQUET_PATH),
+        help="Output parquet path for the cached SIB-200 text rows.",
+    )
+    args = parser.parse_args()
+    path = build_sib200_text_parquet(args.output)
+    print(f"Wrote SIB-200 text cache to {path}")
+# Run the builder only when this file is executed as a script.
+if __name__ == "__main__":
+    main()