Spaces:

themechanism
/

script_fidelity_rate

Sleeping

App Files Files Community

themechanism committed 23 days ago

Commit

832e0be

verified ·

1 Parent(s): 52d6341

Upload folder using huggingface_hub

Browse files

Files changed (33) hide show

MANIFEST.in +6 -0
README.md +281 -13
dist/.gitignore +1 -0
dist/script_fidelity-0.1.1-py3-none-any.whl +0 -0
dist/script_fidelity-0.1.1.tar.gz +3 -0
examples/ci_gate.py +7 -0
examples/hf_evaluate.py +8 -0
examples/pandas_dataframe.py +7 -0
examples/plain_python.py +7 -0
examples/transformers_compute_metrics.py +14 -0
metrics/script_fidelity_rate/README.md +31 -0
metrics/script_fidelity_rate/requirements.txt +2 -0
metrics/script_fidelity_rate/script_fidelity_rate.py +79 -0
pyproject.toml +50 -0
script_fidelity/__init__.py +40 -0
script_fidelity/__main__.py +3 -0
script_fidelity/cli.py +133 -0
script_fidelity/core.py +132 -0
script_fidelity/data/fleurs_registry.json +210 -0
script_fidelity/dominant.py +75 -0
script_fidelity/registry.py +86 -0
script_fidelity/types.py +33 -0
script_fidelity_rate/.gitattributes +35 -0
script_fidelity_rate/README.md +50 -0
script_fidelity_rate/app.py +6 -0
script_fidelity_rate/requirements.txt +1 -0
script_fidelity_rate/script_fidelity_rate.py +95 -0
script_fidelity_rate/tests.py +17 -0
tests/test_cli.py +81 -0
tests/test_core.py +59 -0
tests/test_evaluate_metric.py +27 -0
tests/test_registry.py +42 -0
uv.lock +0 -0

MANIFEST.in ADDED Viewed

	@@ -0,0 +1,6 @@

+include README.md
+include pyproject.toml
+recursive-include script_fidelity/data *.json
+recursive-include metrics *
+recursive-include examples *
+recursive-include tests *.py

README.md CHANGED Viewed

@@ -1,13 +1,281 @@
----
-title: Script Fidelity Rate
-emoji: 🏆
-colorFrom: green
-colorTo: purple
-sdk: gradio
-sdk_version: 6.14.0
-python_version: '3.13'
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# script-fidelity
+`script-fidelity` is a small Python package for Script Fidelity Rate (SFR), a
+reference-free metric for multilingual ASR. SFR measures the fraction of
+countable hypothesis characters that belong to the expected Unicode script for a
+target language.
+Quick signals:
+- Install with `uv add script-fidelity`
+- Load with HF Evaluate via `themechanism/script_fidelity_rate`
+- Supports 102 FLEURS language configs, excluding `all`
+- PyPI: <https://pypi.org/project/script-fidelity/>
+Use SFR with WER and CER. SFR checks script validity; WER and CER measure
+transcription error against references.
+## install
+For package development in this repo:
+```bash
+uv sync --extra dev
+```
+For a downstream project:
+```bash
+uv add script-fidelity
+```
+Run the CLI without adding it to a project:
+```bash
+uvx --from script-fidelity sfr score --language ps_af --text "کابل کې ښه هوا ده"
+```
+## python use
+```python
+from script_fidelity import compute_sfr, compute_sfr_batch
+score = compute_sfr("کابل کې ښه هوا ده", language="ps_af")
+scores = compute_sfr_batch(
+    ["کابل کې ښه هوا ده", "this is romanized output"],
+    language="pashto",
+)
+```
+Digits are counted in the denominator by default, matching the paper. To treat
+digits as neutral instead, pass `digit_policy="ignore"`.
+```python
+compute_sfr("کابل 2026", language="ps_af", digit_policy="ignore")
+```
+## HF Evaluate use
+Local metric:
+```python
+import evaluate
+sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
+sfr.compute(predictions=["کابل کې ښه هوا ده"], language="ps_af")
+```
+Hub metric after publishing:
+```python
+import evaluate
+sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
+sfr.compute(predictions=["کابل کې ښه هوا ده"], language="ps_af")
+```
+## CLI
+```bash
+sfr score --language ps_af --text "کابل کې ښه هوا ده"
+sfr audit predictions.jsonl --language ps_af --text-column prediction
+sfr audit predictions.csv --language bn_in --text-column transcript --format csv
+```
+## ASR batch example
+```python
+from script_fidelity import compute_corpus_sfr
+predictions = [
+    item["text"]
+    for item in whisper_outputs
+]
+summary = compute_corpus_sfr(predictions, language="bn_in")
+print(summary["sfr_percent"])
+print(summary["dominant_script_counts"])
+```
+## pandas dataframe example
+```python
+import pandas as pd
+from script_fidelity import compute_sfr
+df = pd.read_json("predictions.jsonl", lines=True)
+df["sfr"] = df["prediction"].map(lambda text: compute_sfr(text, language="ps_af"))
+```
+## Transformers compute_metrics example
+```python
+import evaluate
+wer = evaluate.load("wer")
+sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    pred_text = processor.batch_decode(predictions, skip_special_tokens=True)
+    label_text = processor.batch_decode(labels, skip_special_tokens=True)
+    return {
+        "wer": wer.compute(predictions=pred_text, references=label_text),
+        "sfr": sfr.compute(predictions=pred_text, language="ps_af")["sfr"],
+    }
+```
+## CI gate example
+```python
+from script_fidelity import compute_corpus_sfr
+summary = compute_corpus_sfr(predictions, language="ml_in")
+if summary["sfr"] < 0.90:
+    raise SystemExit("SFR regression: Malayalam output is below 90% target script")
+```
+## shared-script caveats
+SFR is a script check, not a language identifier. Pashto, Urdu, Persian, Arabic,
+Central Kurdish, and Sindhi share Arabic-script Unicode blocks. Latin-script
+languages mostly detect romanization or non-Latin substitution, not language
+identity. Pair SFR with language ID or lexical checks when shared-script
+confusions matter.
+Use `dominant_script()` and `script_distribution()` to inspect failures:
+```python
+from script_fidelity import dominant_script, script_distribution
+dominant_script("this is romanized output")
+script_distribution("বাংলা भाषा")
+```
+## FLEURS codes
+The registry covers the 102 FLEURS language configs listed by `sfr languages`.
+These paper languages have short aliases:
+| FLEURS code | Alias | Script |
+|---|---|---|
+| `ps_af` | `pashto` | Arabic |
+| `ur_pk` | `urdu` | Arabic |
+| `ar_eg` | `arabic` | Arabic |
+| `fa_ir` | `persian`, `farsi` | Arabic |
+| `hi_in` | `hindi` | Devanagari |
+| `bn_in` | `bengali`, `bangla` | Bengali |
+| `ml_in` | `malayalam` | Malayalam |
+| `ta_in` | `tamil` | Tamil |
+| `so_so` | `somali` | Latin |
+| `ka_ge` | `georgian` | Georgian |
+For the full reviewed registry, see
+`script_fidelity/data/fleurs_registry.json`.
+Full code table:
+| Code | Language | Script |
+|---|---|---|
+| `af_za` | Afrikaans | Latin |
+| `am_et` | Amharic | Ethiopic |
+| `ar_eg` | Arabic | Arabic |
+| `as_in` | Assamese | Bengali |
+| `ast_es` | Asturian | Latin |
+| `az_az` | Azerbaijani | Latin |
+| `be_by` | Belarusian | Cyrillic |
+| `bg_bg` | Bulgarian | Cyrillic |
+| `bn_in` | Bengali | Bengali |
+| `bs_ba` | Bosnian | Latin |
+| `ca_es` | Catalan | Latin |
+| `ceb_ph` | Cebuano | Latin |
+| `ckb_iq` | Central Kurdish | Arabic |
+| `cmn_hans_cn` | Mandarin Chinese | Han |
+| `cs_cz` | Czech | Latin |
+| `cy_gb` | Welsh | Latin |
+| `da_dk` | Danish | Latin |
+| `de_de` | German | Latin |
+| `el_gr` | Greek | Greek |
+| `en_us` | English | Latin |
+| `es_419` | Spanish | Latin |
+| `et_ee` | Estonian | Latin |
+| `fa_ir` | Persian | Arabic |
+| `ff_sn` | Fulah | Latin |
+| `fi_fi` | Finnish | Latin |
+| `fil_ph` | Filipino | Latin |
+| `fr_fr` | French | Latin |
+| `ga_ie` | Irish | Latin |
+| `gl_es` | Galician | Latin |
+| `gu_in` | Gujarati | Gujarati |
+| `ha_ng` | Hausa | Latin |
+| `he_il` | Hebrew | Hebrew |
+| `hi_in` | Hindi | Devanagari |
+| `hr_hr` | Croatian | Latin |
+| `hu_hu` | Hungarian | Latin |
+| `hy_am` | Armenian | Armenian |
+| `id_id` | Indonesian | Latin |
+| `ig_ng` | Igbo | Latin |
+| `is_is` | Icelandic | Latin |
+| `it_it` | Italian | Latin |
+| `ja_jp` | Japanese | Han, Hiragana, Katakana |
+| `jv_id` | Javanese | Latin |
+| `ka_ge` | Georgian | Georgian |
+| `kam_ke` | Kamba | Latin |
+| `kea_cv` | Kabuverdianu | Latin |
+| `kk_kz` | Kazakh | Cyrillic |
+| `km_kh` | Khmer | Khmer |
+| `kn_in` | Kannada | Kannada |
+| `ko_kr` | Korean | Hangul |
+| `ky_kg` | Kyrgyz | Cyrillic |
+| `lb_lu` | Luxembourgish | Latin |
+| `lg_ug` | Ganda | Latin |
+| `ln_cd` | Lingala | Latin |
+| `lo_la` | Lao | Lao |
+| `lt_lt` | Lithuanian | Latin |
+| `luo_ke` | Luo | Latin |
+| `lv_lv` | Latvian | Latin |
+| `mi_nz` | Maori | Latin |
+| `mk_mk` | Macedonian | Cyrillic |
+| `ml_in` | Malayalam | Malayalam |
+| `mn_mn` | Mongolian | Cyrillic |
+| `mr_in` | Marathi | Devanagari |
+| `ms_my` | Malay | Latin |
+| `mt_mt` | Maltese | Latin |
+| `my_mm` | Burmese | Myanmar |
+| `nb_no` | Norwegian Bokmal | Latin |
+| `ne_np` | Nepali | Devanagari |
+| `nl_nl` | Dutch | Latin |
+| `nso_za` | Northern Sotho | Latin |
+| `ny_mw` | Chichewa | Latin |
+| `oc_fr` | Occitan | Latin |
+| `om_et` | Oromo | Latin |
+| `or_in` | Odia | Odia |
+| `pa_in` | Punjabi | Gurmukhi |
+| `pl_pl` | Polish | Latin |
+| `ps_af` | Pashto | Arabic |
+| `pt_br` | Portuguese | Latin |
+| `ro_ro` | Romanian | Latin |
+| `ru_ru` | Russian | Cyrillic |
+| `sd_in` | Sindhi | Arabic |
+| `sk_sk` | Slovak | Latin |
+| `sl_si` | Slovenian | Latin |
+| `sn_zw` | Shona | Latin |
+| `so_so` | Somali | Latin |
+| `sr_rs` | Serbian | Cyrillic |
+| `sv_se` | Swedish | Latin |
+| `sw_ke` | Swahili | Latin |
+| `ta_in` | Tamil | Tamil |
+| `te_in` | Telugu | Telugu |
+| `tg_tj` | Tajik | Cyrillic |
+| `th_th` | Thai | Thai |
+| `tr_tr` | Turkish | Latin |
+| `uk_ua` | Ukrainian | Cyrillic |
+| `umb_ao` | Umbundu | Latin |
+| `ur_pk` | Urdu | Arabic |
+| `uz_uz` | Uzbek | Latin |
+| `vi_vn` | Vietnamese | Latin |
+| `wo_sn` | Wolof | Latin |
+| `xh_za` | Xhosa | Latin |
+| `yo_ng` | Yoruba | Latin |
+| `yue_hant_hk` | Cantonese | Han |
+| `zu_za` | Zulu | Latin |

dist/.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ *

dist/script_fidelity-0.1.1-py3-none-any.whl ADDED Viewed

Binary file (14.2 kB). View file

dist/script_fidelity-0.1.1.tar.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e36da45cddd306e6794eb59bd06cbd3fe9ae19801791bbe5c02862952aa89a8
+size 18936

examples/ci_gate.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""CI gate example: fail the build when Malayalam script fidelity drops below 90%."""
+from script_fidelity import compute_corpus_sfr
+predictions = ["മലയാളം വാക്യം", "malayalam romanized output"]
+summary = compute_corpus_sfr(predictions, language="ml_in")
+sfr = summary["sfr"]
+# compute_corpus_sfr reports sfr=None when no prediction has countable
+# characters; comparing None with a float raises TypeError, so fail explicitly.
+if sfr is None:
+    raise SystemExit("SFR regression: no scorable Malayalam output")
+if sfr < 0.90:
+    raise SystemExit("SFR regression: Malayalam output is below 90% target script")

examples/hf_evaluate.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# Load the SFR metric module from the local metrics directory and score two
+# hypotheses against the ps_af (Pashto) target script.
+import evaluate
+sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
+result = sfr.compute(
+    predictions=["کابل کې ښه هوا ده", "this is romanized output"],
+    language="ps_af",
+)
+# Corpus-level SFR expressed as a percentage.
+print(result["sfr_percent"])

examples/pandas_dataframe.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# Score every row of a JSON Lines prediction file and show the first rows.
+import pandas as pd
+from script_fidelity import compute_sfr
+# The file is read as JSON Lines; the "prediction" column is used below.
+df = pd.read_json("predictions.jsonl", lines=True)
+# compute_sfr returns None for text that has no countable characters.
+df["sfr"] = df["prediction"].map(lambda text: compute_sfr(text, language="ps_af"))
+print(df[["prediction", "sfr"]].head())

examples/plain_python.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# Minimal usage: score one hypothesis, then summarize a small batch.
+from script_fidelity import compute_corpus_sfr, compute_sfr
+text = "کابل کې ښه هوا ده"
+# Single-text score for the ps_af language code.
+print(compute_sfr(text, language="ps_af"))
+predictions = [text, "this is romanized output"]
+# The language is passed here by its alias "pashto" instead of the FLEURS code.
+print(compute_corpus_sfr(predictions, language="pashto"))

examples/transformers_compute_metrics.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# Report WER and SFR together from a Transformers-style compute_metrics hook.
+import evaluate
+wer = evaluate.load("wer")
+sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
+def compute_metrics(eval_pred):
+    """Decode predictions and labels, then return WER and corpus SFR."""
+    # NOTE(review): `processor` is not defined in this file; it has to be
+    # provided by the surrounding training script before this function runs.
+    predictions, labels = eval_pred
+    pred_text = processor.batch_decode(predictions, skip_special_tokens=True)
+    label_text = processor.batch_decode(labels, skip_special_tokens=True)
+    return {
+        "wer": wer.compute(predictions=pred_text, references=label_text),
+        # SFR is reference-free: only the decoded predictions are passed.
+        "sfr": sfr.compute(predictions=pred_text, language="ps_af")["sfr"],
+    }

metrics/script_fidelity_rate/README.md ADDED Viewed

	@@ -0,0 +1,31 @@

+# Script Fidelity Rate
+This directory is the Hugging Face Evaluate metric module for Script Fidelity
+Rate (SFR).
+The Python package is published as `script-fidelity` on PyPI:
+<https://pypi.org/project/script-fidelity/>. The import name is
+`script_fidelity`.
+```python
+import evaluate
+sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
+result = sfr.compute(
+    predictions=["کابل کې ښه هوا ده", "this is romanized output"],
+    language="ps_af",
+)
+print(result["sfr_percent"])
+```
+Hub use after publishing:
+```python
+import evaluate
+sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
+sfr.compute(predictions=["کابل کې ښه هوا ده"], language="pashto")
+```
+Use SFR with WER and CER, not instead of them. SFR checks whether output is in
+the intended script. It does not measure lexical accuracy.

metrics/script_fidelity_rate/requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ evaluate>=0.4.0,<1.0
2	+ script-fidelity>=0.1.1

metrics/script_fidelity_rate/script_fidelity_rate.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""Hugging Face Evaluate metric for Script Fidelity Rate."""
+from __future__ import annotations
+import sys
+from pathlib import Path
+import datasets
+import evaluate
+CURRENT_DIR = Path(__file__).resolve().parent
+# Look for a ``script_fidelity`` directory next to this file or up to two
+# levels above it; the first directory that contains one is put at the front
+# of sys.path so the import below can find it.
+for parent in (CURRENT_DIR, CURRENT_DIR.parent, CURRENT_DIR.parent.parent):
+    if (parent / "script_fidelity").exists():
+        sys.path.insert(0, str(parent))
+        break
+# Imported after the sys.path adjustment above, hence the E402 suppression.
+from script_fidelity import compute_corpus_sfr  # noqa: E402
+_DESCRIPTION = """
+Script Fidelity Rate (SFR) is a reference-free metric for multilingual ASR.
+It computes the fraction of countable hypothesis characters that belong to the
+expected Unicode script for a target FLEURS language code.
+"""
+_CITATION = """
+@misc{scriptfidelity2026,
+  title = {Script Collapse in Multilingual ASR: A Reference-Free Metric and 100-Pair Benchmark},
+  year = {2026}
+}
+"""
+_KWARGS_DESCRIPTION = """
+Args:
+    predictions: List of ASR hypothesis strings.
+    language: FLEURS language code or alias, for example "ps_af" or "pashto".
+    digit_policy: "count" keeps digits in the denominator. "ignore" treats
+        digits as neutral.
+    return_details: Return per-example SFR details.
+Returns:
+    Corpus SFR, percent SFR, empty counts, low/high SFR rates, and dominant
+    script counts.
+"""
+class ScriptFidelityRate(evaluate.Metric):
+    """Hugging Face Evaluate wrapper that exposes corpus-level SFR as a metric."""
+    def _info(self) -> evaluate.MetricInfo:
+        """Return the metric description, citation, input schema, and reference URL."""
+        input_schema = datasets.Features({"predictions": datasets.Value("string")})
+        benchmark_url = (
+            "https://huggingface.co/datasets/themechanism/script-fidelity-benchmark"
+        )
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=input_schema,
+            reference_urls=[benchmark_url],
+        )
+    def _compute(
+        self,
+        predictions: list[str],
+        language: str,
+        digit_policy: str = "count",
+        return_details: bool = False,
+    ) -> dict:
+        """Validate ``digit_policy`` and delegate to ``compute_corpus_sfr``.
+
+        Raises:
+            ValueError: If ``digit_policy`` is neither "count" nor "ignore".
+        """
+        accepted_policies = {"count", "ignore"}
+        if digit_policy not in accepted_policies:
+            raise ValueError("digit_policy must be 'count' or 'ignore'")
+        options = {
+            "language": language,
+            "digit_policy": digit_policy,
+            "return_details": return_details,
+        }
+        return compute_corpus_sfr(predictions, **options)

pyproject.toml ADDED Viewed

	@@ -0,0 +1,50 @@

+[build-system]
+# [project].license below is a PEP 639 SPDX string ("MIT"); setuptools only
+# accepts that form from 77.0.0 on, and older releases reject the metadata.
+requires = ["setuptools>=77", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "script-fidelity"
+version = "0.1.1"
+description = "Reference-free script fidelity metric for multilingual ASR."
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+authors = [{ name = "Anonymous" }]
+keywords = [
+  "asr",
+  "speech-recognition",
+  "evaluation",
+  "unicode",
+  "script-fidelity",
+  "fleurs",
+]
+classifiers = [
+  "Development Status :: 3 - Alpha",
+  "Intended Audience :: Science/Research",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = []
+[project.optional-dependencies]
+evaluate = ["evaluate>=0.4.0,<1.0"]
+dev = [
+  "evaluate>=0.4.0,<1.0",
+  "pytest>=8.0",
+]
+[project.scripts]
+sfr = "script_fidelity.cli:main"
+[tool.setuptools.packages.find]
+include = ["script_fidelity*"]
+[tool.setuptools.package-data]
+script_fidelity = ["data/*.json"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["."]

script_fidelity/__init__.py ADDED Viewed

	@@ -0,0 +1,40 @@

+"""Reference-free script fidelity metrics for multilingual ASR."""
+from .core import (
+    compute_corpus_sfr,
+    compute_sf,
+    compute_sf_batch,
+    compute_sfr,
+    compute_sfr_batch,
+    score_text,
+)
+from .dominant import dominant_script, script_distribution
+from .registry import (
+    FLEURS_CONFIGS,
+    SCRIPT_CONFIGS,
+    get_script_config,
+    list_languages,
+    resolve_language,
+)
+from .types import DigitPolicy, SFRResult, ScriptConfig
+__all__ = [
+    "DigitPolicy",
+    "FLEURS_CONFIGS",
+    "SCRIPT_CONFIGS",
+    "SFRResult",
+    "ScriptConfig",
+    "compute_corpus_sfr",
+    "compute_sf",
+    "compute_sf_batch",
+    "compute_sfr",
+    "compute_sfr_batch",
+    "dominant_script",
+    "get_script_config",
+    "list_languages",
+    "resolve_language",
+    "score_text",
+    "script_distribution",
+]
+__version__ = "0.1.1"

script_fidelity/__main__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .cli import main
2	+
3	+ raise SystemExit(main())

script_fidelity/cli.py ADDED Viewed

	@@ -0,0 +1,133 @@

+"""Command line interface for Script Fidelity Rate."""
+from __future__ import annotations
+import argparse
+import csv
+import json
+import sys
+from pathlib import Path
+from .core import compute_corpus_sfr, compute_sfr
+from .registry import list_languages
+def _read_predictions(path: Path, text_column: str) -> list[str]:
+    """Load prediction strings from a JSONL or CSV file.
+
+    Files whose suffix is ``.jsonl`` (case-insensitive) are parsed as one JSON
+    object per line, skipping blank lines; every other file is parsed as CSV
+    with a header row.
+
+    Args:
+        path: File to read, decoded as UTF-8.
+        text_column: JSON key or CSV column that holds the hypothesis text.
+
+    Returns:
+        One string per record, in file order. Missing values (JSON ``null`` or
+        a CSV row shorter than the header) become ``""`` instead of the
+        literal text ``"None"``.
+
+    Raises:
+        ValueError: If a JSONL line is not a JSON object, or the column is
+            absent from a JSONL record or from the CSV header.
+    """
+    def _as_text(value: object) -> str:
+        # str(None) would yield "None": four Latin letters that distort SFR.
+        return "" if value is None else str(value)
+    if path.suffix.lower() == ".jsonl":
+        rows = []
+        with path.open("r", encoding="utf-8") as handle:
+            for line_no, line in enumerate(handle, start=1):
+                if not line.strip():
+                    continue
+                item = json.loads(line)
+                if not isinstance(item, dict):
+                    raise ValueError(f"Expected a JSON object on line {line_no}")
+                if text_column not in item:
+                    raise ValueError(f"Missing column '{text_column}' on line {line_no}")
+                rows.append(_as_text(item[text_column]))
+        return rows
+    # newline="" lets the csv module handle embedded line breaks itself.
+    with path.open("r", encoding="utf-8", newline="") as handle:
+        reader = csv.DictReader(handle)
+        if not reader.fieldnames or text_column not in reader.fieldnames:
+            raise ValueError(f"Missing column '{text_column}' in CSV header")
+        return [_as_text(row[text_column]) for row in reader]
+def _emit_summary(summary: dict, output_format: str) -> None:
+    """Print a corpus summary to stdout as JSON or as a one-row CSV.
+
+    Args:
+        summary: Mapping with the summary keys listed in ``fieldnames`` below,
+            plus an optional ``details`` list.
+        output_format: ``"json"`` prints indented JSON with sorted keys; any
+            other value prints CSV.
+
+    In CSV mode the nested ``dominant_script_counts`` value, and ``details``
+    when present, are JSON-encoded into a single cell each.
+    """
+    if output_format == "json":
+        print(json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True))
+        return
+    fieldnames = [
+        "sfr",
+        "sfr_percent",
+        "n",
+        "n_valid",
+        "n_empty",
+        "low_sfr_rate",
+        "high_sfr_rate",
+        "dominant_script_counts",
+    ]
+    row = dict(summary)
+    row["dominant_script_counts"] = json.dumps(
+        row["dominant_script_counts"],
+        ensure_ascii=False,
+        sort_keys=True,
+    )
+    if "details" in row:
+        # Without this column DictWriter raises ValueError on the extra key,
+        # which made CSV output with per-row details crash.
+        fieldnames.append("details")
+        row["details"] = json.dumps(row["details"], ensure_ascii=False, sort_keys=True)
+    # Any other unexpected key is dropped rather than aborting the report.
+    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, extrasaction="ignore")
+    writer.writeheader()
+    writer.writerow(row)
+def build_parser() -> argparse.ArgumentParser:
+    """Create the ``sfr`` parser with its score, audit, and languages subcommands."""
+    # Options declared identically on both the score and audit subcommands.
+    language_opt = {"required": True, "help": "FLEURS code or alias"}
+    digit_opt = {
+        "choices": ["count", "ignore"],
+        "default": "count",
+        "help": "count digits as characters or treat them as neutral",
+    }
+    root = argparse.ArgumentParser(prog="sfr", description="Script Fidelity Rate tools")
+    commands = root.add_subparsers(dest="command", required=True)
+    score_cmd = commands.add_parser("score", help="score one text string")
+    score_cmd.add_argument("--language", **language_opt)
+    score_cmd.add_argument("--text", required=True, help="ASR hypothesis text")
+    score_cmd.add_argument("--digit-policy", **digit_opt)
+    audit_cmd = commands.add_parser("audit", help="audit a CSV or JSONL file")
+    audit_cmd.add_argument("path", type=Path, help="CSV or JSONL file")
+    audit_cmd.add_argument("--language", **language_opt)
+    audit_cmd.add_argument("--text-column", default="prediction", help="prediction column")
+    audit_cmd.add_argument("--digit-policy", **digit_opt)
+    audit_cmd.add_argument("--format", choices=["json", "csv"], default="json")
+    audit_cmd.add_argument("--details", action="store_true", help="include per-row details")
+    list_cmd = commands.add_parser("languages", help="list supported FLEURS codes")
+    list_cmd.add_argument("--format", choices=["plain", "json"], default="plain")
+    return root
+def main(argv: list[str] | None = None) -> int:
+    """Parse *argv*, run the selected subcommand, and return the exit status.
+
+    ``score`` prints one SFR value with six decimals (or ``NA`` when the score
+    is None), ``audit`` prints a corpus summary for a file, and ``languages``
+    prints the supported codes. Each of them returns 0.
+    """
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    command = args.command
+    if command == "score":
+        value = compute_sfr(
+            args.text,
+            language=args.language,
+            digit_policy=args.digit_policy,
+        )
+        if value is None:
+            print("NA")
+        else:
+            print(f"{value:.6f}")
+        return 0
+    if command == "audit":
+        texts = _read_predictions(args.path, args.text_column)
+        report = compute_corpus_sfr(
+            texts,
+            language=args.language,
+            digit_policy=args.digit_policy,
+            return_details=args.details,
+        )
+        _emit_summary(report, args.format)
+        return 0
+    if command == "languages":
+        codes = list_languages()
+        rendered = json.dumps(codes, indent=2) if args.format == "json" else "\n".join(codes)
+        print(rendered)
+        return 0
+    # parser.error() exits the process, so the return below is only a fallback.
+    parser.error("unknown command")
+    return 2
+if __name__ == "__main__":
+    raise SystemExit(main())

script_fidelity/core.py ADDED Viewed

	@@ -0,0 +1,132 @@

+"""Core Script Fidelity Rate implementation."""
+from __future__ import annotations
+import unicodedata
+from collections import Counter
+from statistics import fmean
+from .dominant import dominant_script, is_countable, script_distribution
+from .registry import get_script_config
+from .types import DigitPolicy, SFRResult, ScriptConfig
+def _is_in_range(cp: int, ranges: tuple[tuple[int, int], ...]) -> bool:
+    """Return True when code point *cp* lies inside any inclusive (lo, hi) pair."""
+    for lower, upper in ranges:
+        if lower <= cp <= upper:
+            return True
+    return False
+def score_text(
+    text: str,
+    language: str = "ps_af",
+    *,
+    digit_policy: DigitPolicy = "count",
+    config: ScriptConfig | None = None,
+) -> SFRResult:
+    """Score a single hypothesis against the expected script ranges.
+
+    The text is NFC-normalized (a falsy *text* is treated as ""). Countable
+    characters form the denominator; those whose code point falls in the
+    configured ranges form the numerator. ``sfr`` is None when nothing is
+    countable. *config*, when given and truthy, is used instead of looking
+    up *language*.
+    """
+    cfg = config if config else get_script_config(language)
+    normalized = unicodedata.normalize("NFC", text if text else "")
+    numerator = 0
+    denominator = 0
+    for ch in normalized:
+        if not is_countable(ch, digit_policy=digit_policy):
+            continue
+        denominator += 1
+        if _is_in_range(ord(ch), cfg.ranges):
+            numerator += 1
+    sfr = numerator / denominator if denominator != 0 else None
+    dominant = dominant_script(normalized, digit_policy=digit_policy)
+    distribution = script_distribution(normalized, digit_policy=digit_policy)
+    return SFRResult(
+        language=cfg.code,
+        sfr=sfr,
+        numerator=numerator,
+        denominator=denominator,
+        dominant_script=dominant,
+        script_counts=distribution,
+    )
+def compute_sfr(
+    text: str,
+    language: str = "ps_af",
+    *,
+    digit_policy: DigitPolicy = "count",
+) -> float | None:
+    """Return the Script Fidelity Rate of one hypothesis, or None if it is unscorable."""
+    result = score_text(text, language, digit_policy=digit_policy)
+    return result.sfr
+def compute_sfr_batch(
+    predictions: list[str] | tuple[str, ...],
+    language: str = "ps_af",
+    *,
+    digit_policy: DigitPolicy = "count",
+) -> list[float | None]:
+    """Return one SFR value per hypothesis, in input order."""
+    # Resolve the language once and reuse the config for every text.
+    cfg = get_script_config(language)
+    scores = []
+    for text in predictions:
+        scored = score_text(text, cfg.code, digit_policy=digit_policy, config=cfg)
+        scores.append(scored.sfr)
+    return scores
+def compute_corpus_sfr(
+    predictions: list[str] | tuple[str, ...],
+    language: str = "ps_af",
+    *,
+    digit_policy: DigitPolicy = "count",
+    low_threshold: float = 0.1,
+    high_threshold: float = 0.9,
+    return_details: bool = False,
+) -> dict:
+    """Summarize SFR over a batch of hypotheses.
+
+    ``sfr`` is the mean of the per-text scores that are not None. Rates are
+    fractions of those scored texts: ``low_sfr_rate`` counts scores below
+    *low_threshold*, ``high_sfr_rate`` counts scores at or above
+    *high_threshold*. All four score fields are None when nothing is scored.
+    With *return_details* a ``details`` list of per-text dicts is added.
+    """
+    cfg = get_script_config(language)
+    scored = []
+    for text in predictions:
+        scored.append(score_text(text, cfg.code, digit_policy=digit_policy, config=cfg))
+    valid = [entry.sfr for entry in scored if entry.sfr is not None]
+    total = len(scored)
+    valid_count = len(valid)
+    if valid:
+        mean_sfr = fmean(valid)
+        percent = mean_sfr * 100
+    else:
+        mean_sfr = None
+        percent = None
+    # Dominant scripts are tallied over every text, scored or not.
+    tally = Counter(entry.dominant_script for entry in scored)
+    if valid_count == 0:
+        low_rate = None
+        high_rate = None
+    else:
+        low_rate = sum(1 for value in valid if value < low_threshold) / valid_count
+        high_rate = sum(1 for value in valid if value >= high_threshold) / valid_count
+    result = {
+        "sfr": mean_sfr,
+        "sfr_percent": percent,
+        "n": total,
+        "n_valid": valid_count,
+        "n_empty": total - valid_count,
+        "low_sfr_rate": low_rate,
+        "high_sfr_rate": high_rate,
+        "dominant_script_counts": dict(sorted(tally.items())),
+    }
+    if not return_details:
+        return result
+    detail_fields = (
+        "language",
+        "sfr",
+        "numerator",
+        "denominator",
+        "dominant_script",
+        "script_counts",
+    )
+    result["details"] = [
+        {field: getattr(entry, field) for field in detail_fields}
+        for entry in scored
+    ]
+    return result
+# Shorter names bound to the same function objects as compute_sfr and
+# compute_sfr_batch.
+compute_sf = compute_sfr
+compute_sf_batch = compute_sfr_batch

script_fidelity/data/fleurs_registry.json ADDED Viewed

	@@ -0,0 +1,210 @@

+{
+  "version": "0.1.1",
+  "source": "Reviewed registry for google/fleurs configs as of 2026-05-07. The config named all is excluded.",
+  "scripts": {
+    "arabic": {
+      "name": "Arabic",
+      "ranges": [[1536, 1791], [1872, 1919], [2208, 2303], [64336, 65023], [65136, 65279], [69216, 69247], [126464, 126719]]
+    },
+    "armenian": {
+      "name": "Armenian",
+      "ranges": [[1328, 1423], [64275, 64279]]
+    },
+    "bengali": {
+      "name": "Bengali",
+      "ranges": [[2432, 2559]]
+    },
+    "cyrillic": {
+      "name": "Cyrillic",
+      "ranges": [[1024, 1279], [1280, 1327], [7296, 7311], [11744, 11775], [42560, 42655]]
+    },
+    "devanagari": {
+      "name": "Devanagari",
+      "ranges": [[2304, 2431], [43232, 43263], [72448, 72543]]
+    },
+    "ethiopic": {
+      "name": "Ethiopic",
+      "ranges": [[4608, 4991], [4992, 5023], [11648, 11743], [43776, 43823]]
+    },
+    "georgian": {
+      "name": "Georgian",
+      "ranges": [[4256, 4351], [11520, 11567], [7312, 7359]]
+    },
+    "greek": {
+      "name": "Greek",
+      "ranges": [[880, 1023], [7936, 8191]]
+    },
+    "gujarati": {
+      "name": "Gujarati",
+      "ranges": [[2688, 2815]]
+    },
+    "gurmukhi": {
+      "name": "Gurmukhi",
+      "ranges": [[2560, 2687]]
+    },
+    "han": {
+      "name": "Han",
+      "ranges": [[13312, 19903], [19968, 40959], [63744, 64255], [131072, 173791], [173824, 177983], [177984, 178207], [178208, 183983], [183984, 191471], [196608, 201551]]
+    },
+    "hangul": {
+      "name": "Hangul",
+      "ranges": [[4352, 4607], [12592, 12687], [43360, 43391], [44032, 55215], [55216, 55295]]
+    },
+    "hebrew": {
+      "name": "Hebrew",
+      "ranges": [[1424, 1535], [64285, 64335]]
+    },
+    "hiragana": {
+      "name": "Hiragana",
+      "ranges": [[12352, 12447]]
+    },
+    "kannada": {
+      "name": "Kannada",
+      "ranges": [[3200, 3327]]
+    },
+    "katakana": {
+      "name": "Katakana",
+      "ranges": [[12448, 12543], [12784, 12799], [65381, 65439]]
+    },
+    "khmer": {
+      "name": "Khmer",
+      "ranges": [[6016, 6143], [6624, 6655]]
+    },
+    "lao": {
+      "name": "Lao",
+      "ranges": [[3712, 3839]]
+    },
+    "latin": {
+      "name": "Latin",
+      "ranges": [[65, 90], [97, 122], [192, 591], [7680, 7935], [42784, 43007], [43824, 43887], [122624, 122879]]
+    },
+    "malayalam": {
+      "name": "Malayalam",
+      "ranges": [[3328, 3455]]
+    },
+    "myanmar": {
+      "name": "Myanmar",
+      "ranges": [[4096, 4255], [43392, 43487], [43488, 43519]]
+    },
+    "odia": {
+      "name": "Odia",
+      "ranges": [[2816, 2943]]
+    },
+    "tamil": {
+      "name": "Tamil",
+      "ranges": [[2944, 3071]]
+    },
+    "telugu": {
+      "name": "Telugu",
+      "ranges": [[3072, 3199]]
+    },
+    "thai": {
+      "name": "Thai",
+      "ranges": [[3584, 3711]]
+    }
+  },
+  "languages": {
+    "af_za": {"name": "Afrikaans", "scripts": ["latin"], "aliases": ["afrikaans"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "am_et": {"name": "Amharic", "scripts": ["ethiopic"], "aliases": ["amharic"]},
+    "ar_eg": {"name": "Arabic", "scripts": ["arabic"], "aliases": ["arabic", "msa"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
+    "as_in": {"name": "Assamese", "scripts": ["bengali"], "aliases": ["assamese"], "shared_script": true, "warning": "Bengali-Assamese script SFR does not identify the language."},
+    "ast_es": {"name": "Asturian", "scripts": ["latin"], "aliases": ["asturian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "az_az": {"name": "Azerbaijani", "scripts": ["latin"], "aliases": ["azerbaijani", "azeri"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "be_by": {"name": "Belarusian", "scripts": ["cyrillic"], "aliases": ["belarusian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "bg_bg": {"name": "Bulgarian", "scripts": ["cyrillic"], "aliases": ["bulgarian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "bn_in": {"name": "Bengali", "scripts": ["bengali"], "aliases": ["bengali", "bangla"], "shared_script": true, "warning": "Bengali-script SFR does not distinguish Bengali from Assamese."},
+    "bs_ba": {"name": "Bosnian", "scripts": ["latin"], "aliases": ["bosnian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ca_es": {"name": "Catalan", "scripts": ["latin"], "aliases": ["catalan"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ceb_ph": {"name": "Cebuano", "scripts": ["latin"], "aliases": ["cebuano"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ckb_iq": {"name": "Central Kurdish", "scripts": ["arabic"], "aliases": ["central_kurdish", "sorani", "kurdish_sorani"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
+    "cmn_hans_cn": {"name": "Mandarin Chinese", "scripts": ["han"], "aliases": ["mandarin", "chinese", "simplified_chinese"], "shared_script": true, "warning": "Han SFR does not identify the spoken language or script variant."},
+    "cs_cz": {"name": "Czech", "scripts": ["latin"], "aliases": ["czech"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "cy_gb": {"name": "Welsh", "scripts": ["latin"], "aliases": ["welsh"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "da_dk": {"name": "Danish", "scripts": ["latin"], "aliases": ["danish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "de_de": {"name": "German", "scripts": ["latin"], "aliases": ["german"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "el_gr": {"name": "Greek", "scripts": ["greek"], "aliases": ["greek"]},
+    "en_us": {"name": "English", "scripts": ["latin"], "aliases": ["english"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "es_419": {"name": "Spanish", "scripts": ["latin"], "aliases": ["spanish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "et_ee": {"name": "Estonian", "scripts": ["latin"], "aliases": ["estonian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "fa_ir": {"name": "Persian", "scripts": ["arabic"], "aliases": ["persian", "farsi"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
+    "ff_sn": {"name": "Fulah", "scripts": ["latin"], "aliases": ["fulah", "fulani"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "fi_fi": {"name": "Finnish", "scripts": ["latin"], "aliases": ["finnish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "fil_ph": {"name": "Filipino", "scripts": ["latin"], "aliases": ["filipino", "tagalog"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "fr_fr": {"name": "French", "scripts": ["latin"], "aliases": ["french"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ga_ie": {"name": "Irish", "scripts": ["latin"], "aliases": ["irish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "gl_es": {"name": "Galician", "scripts": ["latin"], "aliases": ["galician"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "gu_in": {"name": "Gujarati", "scripts": ["gujarati"], "aliases": ["gujarati"]},
+    "ha_ng": {"name": "Hausa", "scripts": ["latin"], "aliases": ["hausa"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "he_il": {"name": "Hebrew", "scripts": ["hebrew"], "aliases": ["hebrew"]},
+    "hi_in": {"name": "Hindi", "scripts": ["devanagari"], "aliases": ["hindi"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
+    "hr_hr": {"name": "Croatian", "scripts": ["latin"], "aliases": ["croatian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "hu_hu": {"name": "Hungarian", "scripts": ["latin"], "aliases": ["hungarian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "hy_am": {"name": "Armenian", "scripts": ["armenian"], "aliases": ["armenian"]},
+    "id_id": {"name": "Indonesian", "scripts": ["latin"], "aliases": ["indonesian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ig_ng": {"name": "Igbo", "scripts": ["latin"], "aliases": ["igbo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "is_is": {"name": "Icelandic", "scripts": ["latin"], "aliases": ["icelandic"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "it_it": {"name": "Italian", "scripts": ["latin"], "aliases": ["italian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ja_jp": {"name": "Japanese", "scripts": ["han", "hiragana", "katakana"], "aliases": ["japanese"], "shared_script": true, "warning": "Japanese SFR counts Han and kana; it is not a language identifier."},
+    "jv_id": {"name": "Javanese", "scripts": ["latin"], "aliases": ["javanese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ka_ge": {"name": "Georgian", "scripts": ["georgian"], "aliases": ["georgian"]},
+    "kam_ke": {"name": "Kamba", "scripts": ["latin"], "aliases": ["kamba"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "kea_cv": {"name": "Kabuverdianu", "scripts": ["latin"], "aliases": ["kabuverdianu", "cape_verdean_creole"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "kk_kz": {"name": "Kazakh", "scripts": ["cyrillic"], "aliases": ["kazakh"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "km_kh": {"name": "Khmer", "scripts": ["khmer"], "aliases": ["khmer"]},
+    "kn_in": {"name": "Kannada", "scripts": ["kannada"], "aliases": ["kannada"]},
+    "ko_kr": {"name": "Korean", "scripts": ["hangul"], "aliases": ["korean"]},
+    "ky_kg": {"name": "Kyrgyz", "scripts": ["cyrillic"], "aliases": ["kyrgyz"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "lb_lu": {"name": "Luxembourgish", "scripts": ["latin"], "aliases": ["luxembourgish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "lg_ug": {"name": "Ganda", "scripts": ["latin"], "aliases": ["ganda", "luganda"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ln_cd": {"name": "Lingala", "scripts": ["latin"], "aliases": ["lingala"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "lo_la": {"name": "Lao", "scripts": ["lao"], "aliases": ["lao"]},
+    "lt_lt": {"name": "Lithuanian", "scripts": ["latin"], "aliases": ["lithuanian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "luo_ke": {"name": "Luo", "scripts": ["latin"], "aliases": ["luo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "lv_lv": {"name": "Latvian", "scripts": ["latin"], "aliases": ["latvian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "mi_nz": {"name": "Maori", "scripts": ["latin"], "aliases": ["maori", "māori"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "mk_mk": {"name": "Macedonian", "scripts": ["cyrillic"], "aliases": ["macedonian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "ml_in": {"name": "Malayalam", "scripts": ["malayalam"], "aliases": ["malayalam"]},
+    "mn_mn": {"name": "Mongolian", "scripts": ["cyrillic"], "aliases": ["mongolian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "mr_in": {"name": "Marathi", "scripts": ["devanagari"], "aliases": ["marathi"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
+    "ms_my": {"name": "Malay", "scripts": ["latin"], "aliases": ["malay"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "mt_mt": {"name": "Maltese", "scripts": ["latin"], "aliases": ["maltese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "my_mm": {"name": "Burmese", "scripts": ["myanmar"], "aliases": ["burmese", "myanmar_language"]},
+    "nb_no": {"name": "Norwegian Bokmal", "scripts": ["latin"], "aliases": ["norwegian", "norwegian_bokmal", "bokmal"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ne_np": {"name": "Nepali", "scripts": ["devanagari"], "aliases": ["nepali"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
+    "nl_nl": {"name": "Dutch", "scripts": ["latin"], "aliases": ["dutch"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "nso_za": {"name": "Northern Sotho", "scripts": ["latin"], "aliases": ["northern_sotho", "sepedi"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ny_mw": {"name": "Chichewa", "scripts": ["latin"], "aliases": ["chichewa", "nyanja"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "oc_fr": {"name": "Occitan", "scripts": ["latin"], "aliases": ["occitan"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "om_et": {"name": "Oromo", "scripts": ["latin"], "aliases": ["oromo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "or_in": {"name": "Odia", "scripts": ["odia"], "aliases": ["odia", "oriya"]},
+    "pa_in": {"name": "Punjabi", "scripts": ["gurmukhi"], "aliases": ["punjabi", "eastern_punjabi"], "shared_script": true, "warning": "Gurmukhi SFR checks script, not dialect or language identity."},
+    "pl_pl": {"name": "Polish", "scripts": ["latin"], "aliases": ["polish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ps_af": {"name": "Pashto", "scripts": ["arabic"], "aliases": ["pashto", "pushto", "ps"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
+    "pt_br": {"name": "Portuguese", "scripts": ["latin"], "aliases": ["portuguese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ro_ro": {"name": "Romanian", "scripts": ["latin"], "aliases": ["romanian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ru_ru": {"name": "Russian", "scripts": ["cyrillic"], "aliases": ["russian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "sd_in": {"name": "Sindhi", "scripts": ["arabic"], "aliases": ["sindhi"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
+    "sk_sk": {"name": "Slovak", "scripts": ["latin"], "aliases": ["slovak"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "sl_si": {"name": "Slovenian", "scripts": ["latin"], "aliases": ["slovenian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "sn_zw": {"name": "Shona", "scripts": ["latin"], "aliases": ["shona"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "so_so": {"name": "Somali", "scripts": ["latin"], "aliases": ["somali"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "sr_rs": {"name": "Serbian", "scripts": ["cyrillic"], "aliases": ["serbian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "sv_se": {"name": "Swedish", "scripts": ["latin"], "aliases": ["swedish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "sw_ke": {"name": "Swahili", "scripts": ["latin"], "aliases": ["swahili"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ta_in": {"name": "Tamil", "scripts": ["tamil"], "aliases": ["tamil"]},
+    "te_in": {"name": "Telugu", "scripts": ["telugu"], "aliases": ["telugu"]},
+    "tg_tj": {"name": "Tajik", "scripts": ["cyrillic"], "aliases": ["tajik"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "th_th": {"name": "Thai", "scripts": ["thai"], "aliases": ["thai"]},
+    "tr_tr": {"name": "Turkish", "scripts": ["latin"], "aliases": ["turkish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "uk_ua": {"name": "Ukrainian", "scripts": ["cyrillic"], "aliases": ["ukrainian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
+    "umb_ao": {"name": "Umbundu", "scripts": ["latin"], "aliases": ["umbundu"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "ur_pk": {"name": "Urdu", "scripts": ["arabic"], "aliases": ["urdu"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
+    "uz_uz": {"name": "Uzbek", "scripts": ["latin"], "aliases": ["uzbek"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "vi_vn": {"name": "Vietnamese", "scripts": ["latin"], "aliases": ["vietnamese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "wo_sn": {"name": "Wolof", "scripts": ["latin"], "aliases": ["wolof"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "xh_za": {"name": "Xhosa", "scripts": ["latin"], "aliases": ["xhosa"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "yo_ng": {"name": "Yoruba", "scripts": ["latin"], "aliases": ["yoruba"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
+    "yue_hant_hk": {"name": "Cantonese", "scripts": ["han"], "aliases": ["cantonese", "traditional_chinese"], "shared_script": true, "warning": "Han SFR does not identify the spoken language or script variant."},
+    "zu_za": {"name": "Zulu", "scripts": ["latin"], "aliases": ["zulu"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."}
+  }
+}

script_fidelity/dominant.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""Dominant script helpers for SFR audits."""
+from __future__ import annotations
+import unicodedata
+from collections import Counter
+from .registry import _registry
+from .types import DigitPolicy
+def is_countable(ch: str, digit_policy: DigitPolicy = "count") -> bool:
+    """Return whether a character should count in an SFR denominator."""
+    if digit_policy not in {"count", "ignore"}:
+        raise ValueError("digit_policy must be 'count' or 'ignore'")
+    cat = unicodedata.category(ch)
+    if digit_policy == "ignore" and cat.startswith("N"):
+        return False
+    return (
+        not ch.isspace()
+        and not cat.startswith("P")
+        and not cat.startswith("Z")
+        and not cat.startswith("C")
+        and not cat.startswith("M")
+    )
+def _in_ranges(cp: int, ranges: list[list[int]]) -> bool:
+    return any(int(lo) <= cp <= int(hi) for lo, hi in ranges)
+def script_distribution(
+    text: str,
+    *,
+    digit_policy: DigitPolicy = "count",
+) -> dict[str, int]:
+    """Count broad Unicode script families in text."""
+    normalized = unicodedata.normalize("NFC", text or "")
+    scripts = _registry()["scripts"]
+    counts: Counter[str] = Counter()
+    for ch in normalized:
+        if not is_countable(ch, digit_policy=digit_policy):
+            continue
+        cp = ord(ch)
+        label = "other"
+        for script_id, config in scripts.items():
+            if _in_ranges(cp, config["ranges"]):
+                label = script_id
+                break
+        counts[label] += 1
+    return dict(counts)
+def dominant_script(
+    text: str,
+    *,
+    digit_policy: DigitPolicy = "count",
+    threshold: float = 0.5,
+) -> str:
+    """Return the dominant script label, ``mixed``, or ``empty``."""
+    counts = script_distribution(text, digit_policy=digit_policy)
+    total = sum(counts.values())
+    if total == 0:
+        return "empty"
+    script, count = max(counts.items(), key=lambda item: item[1])
+    if count / total >= threshold:
+        return script
+    return "mixed"

script_fidelity/registry.py ADDED Viewed

	@@ -0,0 +1,86 @@

+"""FLEURS language registry for Script Fidelity Rate."""
+from __future__ import annotations
+import json
+from functools import lru_cache
+from importlib.resources import files
+from .types import ScriptConfig
+@lru_cache(maxsize=1)
+def _registry() -> dict:
+    data_path = files("script_fidelity").joinpath("data/fleurs_registry.json")
+    return json.loads(data_path.read_text(encoding="utf-8"))
+def _script_ranges(script_ids: list[str]) -> tuple[tuple[int, int], ...]:
+    scripts = _registry()["scripts"]
+    ranges: list[tuple[int, int]] = []
+    seen: set[tuple[int, int]] = set()
+    for script_id in script_ids:
+        for lo, hi in scripts[script_id]["ranges"]:
+            item = (int(lo), int(hi))
+            if item not in seen:
+                ranges.append(item)
+                seen.add(item)
+    return tuple(ranges)
+@lru_cache(maxsize=1)
+def _language_configs() -> dict[str, ScriptConfig]:
+    configs: dict[str, ScriptConfig] = {}
+    for code, item in _registry()["languages"].items():
+        script_ids = item["scripts"]
+        configs[code] = ScriptConfig(
+            code=code,
+            name=item["name"],
+            script="+".join(script_ids),
+            ranges=_script_ranges(script_ids),
+            aliases=tuple(item.get("aliases", [])),
+            shared_script=bool(item.get("shared_script", False)),
+            warning=item.get("warning", ""),
+        )
+    return configs
+@lru_cache(maxsize=1)
+def _alias_map() -> dict[str, str]:
+    aliases: dict[str, str] = {}
+    for code, config in _language_configs().items():
+        aliases[code.lower()] = code
+        aliases[code.replace("_", "-").lower()] = code
+        for alias in config.aliases:
+            aliases[alias.lower()] = code
+    return aliases
+def resolve_language(language: str) -> str:
+    """Resolve a FLEURS code or alias to a canonical FLEURS code."""
+    normalized = language.strip().lower().replace(" ", "_")
+    try:
+        return _alias_map()[normalized]
+    except KeyError as exc:
+        known = ", ".join(list_languages()[:12])
+        raise ValueError(
+            f"Unknown language '{language}'. Use a FLEURS code such as ps_af, "
+            f"or an alias such as pashto. Examples: {known}, ..."
+        ) from exc
+def get_script_config(language: str) -> ScriptConfig:
+    """Return the reviewed script configuration for a language."""
+    return _language_configs()[resolve_language(language)]
+def list_languages() -> list[str]:
+    """Return canonical FLEURS language codes supported by the registry."""
+    return sorted(_language_configs())
+# Import-time snapshots of the registry (building them loads the JSON file).
+# FLEURS_CONFIGS: sorted tuple of canonical language codes.
+FLEURS_CONFIGS = tuple(list_languages())
+# SCRIPT_CONFIGS: the same dict object that the cached _language_configs() returns.
+SCRIPT_CONFIGS = _language_configs()

script_fidelity/types.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""Shared types for Script Fidelity Rate."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Literal
+# Allowed digit-handling policy names.
+DigitPolicy = Literal["count", "ignore"]
+@dataclass(frozen=True)
+class ScriptConfig:
+    """Script configuration for one FLEURS language."""
+    # Canonical FLEURS language code (the registry key, e.g. "ps_af").
+    code: str
+    # Human-readable language name.
+    name: str
+    # Script id, or several ids joined with "+" for multi-script languages.
+    script: str
+    # Inclusive (lo, hi) code-point ranges accepted for this language.
+    ranges: tuple[tuple[int, int], ...]
+    # Alternative names that resolve to this language.
+    aliases: tuple[str, ...] = ()
+    # True when the registry marks the script as shared with other languages.
+    shared_script: bool = False
+    # Registry caveat text for this language; empty when there is none.
+    warning: str = ""
+@dataclass(frozen=True)
+class SFRResult:
+    """Per-text Script Fidelity Rate result."""
+    # Language the text was scored against.
+    language: str
+    # Score for the text; None is allowed by the type.
+    # NOTE(review): presumably None when the denominator is 0 - confirm in core.py.
+    sfr: float | None
+    # NOTE(review): presumably the in-script character count - confirm in core.py.
+    numerator: int
+    # NOTE(review): presumably the countable character count - confirm in core.py.
+    denominator: int
+    # Dominant script label for the text.
+    dominant_script: str
+    # Per-script character counts; a mutable dict, so the frozen instance is not hashable.
+    script_counts: dict[str, int]

script_fidelity_rate/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

script_fidelity_rate/README.md ADDED Viewed

	@@ -0,0 +1,50 @@

+---
+title: script_fidelity_rate
+datasets:
+-
+tags:
+- evaluate
+- metric
+description: "TODO: add a description here"
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+---
+# Metric Card for script_fidelity_rate
+***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing metric cards if you'd like examples.*
+## Metric Description
+*Give a brief overview of this metric, including what task(s) it is usually used for, if any.*
+## How to Use
+*Give general statement of how to use the metric*
+*Provide simplest possible example for using the metric*
+### Inputs
+*List all input arguments in the format below*
+- **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
+### Output Values
+*Explain what this metric outputs and provide an example of what the metric output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}*
+*State the range of possible values that the metric's output can take, as well as what in that range is considered good. For example: "This metric can take on any value between 0 and 100, inclusive. Higher scores are better."*
+#### Values from Popular Papers
+*Give examples, preferably with links to leaderboards or publications, of papers that have reported this metric, along with the values they have reported.*
+### Examples
+*Give code examples of the metric being used. Try to include examples that clear up any potential ambiguity left from the metric description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
+## Limitations and Bias
+*Note any known limitations or biases that the metric has, with links and references if possible.*
+## Citation
+*Cite the source where this metric was introduced.*
+## Further References
+*Add any useful further references.*

script_fidelity_rate/app.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import evaluate
+from evaluate.utils import launch_gradio_widget
+# Load the evaluation module by its Hub identifier.
+module = evaluate.load("themechanism/script_fidelity_rate")
+# Hand the loaded module to evaluate's Gradio widget launcher.
+launch_gradio_widget(module)

script_fidelity_rate/requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ git+https://github.com/huggingface/evaluate@main

script_fidelity_rate/script_fidelity_rate.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TODO: Add a description here."""
+import evaluate
+import datasets
+# TODO: Add BibTeX citation
+_CITATION = """\
+@InProceedings{huggingface:module,
+title = {A great new module},
+authors={huggingface, Inc.},
+year={2020}
+}
+"""
+# TODO: Add description of the module here
+_DESCRIPTION = """\
+This new module is designed to solve this great ML task and is crafted with a lot of care.
+"""
+# TODO: Add description of the arguments of the module here
+_KWARGS_DESCRIPTION = """
+Calculates how good are predictions given some references, using certain scores
+Args:
+    predictions: list of predictions to score. Each predictions
+        should be a string with tokens separated by spaces.
+    references: list of reference for each prediction. Each
+        reference should be a string with tokens separated by spaces.
+Returns:
+    accuracy: description of the first score,
+    another_score: description of the second score,
+Examples:
+    Examples should be written in doctest format, and should illustrate how
+    to use the function.
+    >>> my_new_module = evaluate.load("my_new_module")
+    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
+    >>> print(results)
+    {'accuracy': 1.0}
+"""
+# TODO: Define external resources urls if needed
+BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+# NOTE(review): this class is still the unmodified `evaluate` template: it
+# scores int64 predictions against int64 references by exact-match accuracy
+# and contains no script-fidelity logic.
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class script_fidelity_rate(evaluate.Metric):
+    """TODO: Short description of my evaluation module."""
+    def _info(self):
+        # TODO: Specifies the evaluate.EvaluationModuleInfo object
+        return evaluate.MetricInfo(
+            # This is the description that will appear on the modules page.
+            module_type="metric",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            # This defines the format of each prediction and reference
+            # (both are declared as int64 values, not text).
+            features=datasets.Features({
+                'predictions': datasets.Value('int64'),
+                'references': datasets.Value('int64'),
+            }),
+            # Homepage of the module for documentation
+            homepage="http://module.homepage",
+            # Additional links to the codebase or references
+            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
+            reference_urls=["http://path.to.reference.url/new_module"]
+        )
+    def _download_and_prepare(self, dl_manager):
+        """Optional: download external resources useful to compute the scores"""
+        # TODO: Download external resources if needed
+        pass
+    def _compute(self, predictions, references):
+        """Returns the scores"""
+        # TODO: Compute the different scores of the module
+        # Fraction of positions where prediction == reference; zip() stops at
+        # the shorter input, while the divisor is always len(predictions).
+        # NOTE(review): raises ZeroDivisionError when predictions is empty.
+        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
+        return {
+            "accuracy": accuracy,
+        }

script_fidelity_rate/tests.py ADDED Viewed

	@@ -0,0 +1,17 @@

+test_cases = [
+    {
+        "predictions": [0, 0],
+        "references": [1, 1],
+        "result": {"metric_score": 0}
+    },
+    {
+        "predictions": [1, 1],
+        "references": [1, 1],
+        "result": {"metric_score": 1}
+    },
+    {
+        "predictions": [1, 0],
+        "references": [1, 1],
+        "result": {"metric_score": 0.5}
+    }
+]

tests/test_cli.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import csv
+import json
+import subprocess
+import sys
+from pathlib import Path
+def test_cli_score():
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "script_fidelity",
+            "score",
+            "--language",
+            "ps_af",
+            "--text",
+            "کابل کې ښه هوا ده",
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    assert result.stdout.strip() == "1.000000"
+def test_cli_audit_jsonl(tmp_path: Path):
+    path = tmp_path / "predictions.jsonl"
+    rows = [
+        {"prediction": "کابل کې ښه هوا ده"},
+        {"prediction": "romanized output"},
+    ]
+    path.write_text("\n".join(json.dumps(row, ensure_ascii=False) for row in rows))
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "script_fidelity",
+            "audit",
+            str(path),
+            "--language",
+            "ps_af",
+            "--text-column",
+            "prediction",
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    summary = json.loads(result.stdout)
+    assert summary["n"] == 2
+    assert summary["sfr"] == 0.5
+def test_cli_audit_csv_format(tmp_path: Path):
+    path = tmp_path / "predictions.csv"
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=["prediction"])
+        writer.writeheader()
+        writer.writerow({"prediction": "বাংলা ভাষা"})
+        writer.writerow({"prediction": "namaste"})
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "script_fidelity",
+            "audit",
+            str(path),
+            "--language",
+            "bn_in",
+            "--format",
+            "csv",
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    assert "sfr_percent" in result.stdout
+    assert "50.0" in result.stdout

tests/test_core.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from script_fidelity import (
+    compute_corpus_sfr,
+    compute_sfr,
+    compute_sfr_batch,
+    dominant_script,
+    script_distribution,
+)
+def test_pashto_positive_and_latin_collapse():
+    assert compute_sfr("کابل کې ښه هوا ده", language="ps_af") == 1.0
+    assert compute_sfr("this is romanized pashto", language="pashto") == 0.0
+def test_bengali_vs_devanagari_wrong_script():
+    assert compute_sfr("বাংলা ভাষা", language="bn_in") == 1.0
+    assert compute_sfr("नमस्ते दुनिया", language="bengali") == 0.0
+def test_somali_latin_positive_and_arabic_negative():
+    assert compute_sfr("Somali waa luuqad", language="so_so") == 1.0
+    assert compute_sfr("كابل في هواء جيد", language="somali") == 0.0
+def test_empty_punctuation_combining_and_emoji_cases():
+    assert compute_sfr("", language="ps_af") is None
+    assert compute_sfr("...?!", language="ps_af") is None
+    assert compute_sfr("\u0301\u0301", language="ps_af") is None
+    assert compute_sfr("🙂", language="ps_af") == 0.0
+    assert dominant_script("...?!") == "empty"
+def test_mixed_script_and_distribution():
+    score = compute_sfr("বাংলা भाषा", language="bn_in")
+    assert score is not None
+    assert 0.0 < score < 1.0
+    counts = script_distribution("বাংলা भाषा")
+    assert counts["bengali"] > 0
+    assert counts["devanagari"] > 0
+def test_digit_policy_count_and_ignore():
+    counted = compute_sfr("کابل 123", language="ps_af")
+    ignored = compute_sfr("کابل 123", language="ps_af", digit_policy="ignore")
+    assert counted == 4 / 7
+    assert ignored == 1.0
+def test_batch_and_corpus_summary():
+    predictions = ["کابل کې ښه هوا ده", "romanized output", "..."]
+    scores = compute_sfr_batch(predictions, language="pashto")
+    assert scores == [1.0, 0.0, None]
+    summary = compute_corpus_sfr(predictions, language="pashto")
+    assert summary["n"] == 3
+    assert summary["n_valid"] == 2
+    assert summary["n_empty"] == 1
+    assert summary["sfr"] == 0.5
+    assert summary["low_sfr_rate"] == 0.5

tests/test_evaluate_metric.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import pytest
+evaluate = pytest.importorskip("evaluate")
+def test_local_evaluate_metric_matches_package():
+    from script_fidelity import compute_corpus_sfr
+    predictions = ["کابل کې ښه هوا ده", "romanized output", "..."]
+    metric = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
+    actual = metric.compute(predictions=predictions, language="ps_af")
+    expected = compute_corpus_sfr(predictions, language="ps_af")
+    assert actual["sfr"] == expected["sfr"]
+    assert actual["n_empty"] == expected["n_empty"]
+def test_evaluate_metric_details():
+    metric = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
+    result = metric.compute(
+        predictions=["বাংলা ভাষা", "नमस्ते"],
+        language="bn_in",
+        return_details=True,
+    )
+    assert len(result["details"]) == 2
+    assert result["details"][0]["sfr"] == 1.0
+    assert result["details"][1]["sfr"] == 0.0

tests/test_registry.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from script_fidelity import FLEURS_CONFIGS, get_script_config, list_languages, resolve_language
+def test_registry_has_all_fleurs_configs_except_all():
+    codes = list_languages()
+    assert len(codes) == 102
+    assert "all" not in codes
+    assert tuple(codes) == FLEURS_CONFIGS
+def test_every_language_has_ranges():
+    for code in list_languages():
+        config = get_script_config(code)
+        assert config.code == code
+        assert config.ranges
+def test_aliases_for_paper_languages():
+    aliases = {
+        "pashto": "ps_af",
+        "urdu": "ur_pk",
+        "arabic": "ar_eg",
+        "persian": "fa_ir",
+        "farsi": "fa_ir",
+        "hindi": "hi_in",
+        "bengali": "bn_in",
+        "malayalam": "ml_in",
+        "tamil": "ta_in",
+        "somali": "so_so",
+        "georgian": "ka_ge",
+    }
+    for alias, code in aliases.items():
+        assert resolve_language(alias) == code
+def test_shared_script_metadata():
+    for code in ["ps_af", "ur_pk", "fa_ir", "ar_eg", "so_so", "hi_in"]:
+        config = get_script_config(code)
+        assert config.shared_script is True
+        assert config.warning
+    assert get_script_config("ka_ge").shared_script is False

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff