Spaces:

themechanism
/

script_fidelity_rate

Sleeping

App Files Community

themechanism committed 17 days ago

Commit

fa3603e

verified ·

1 Parent(s): 832e0be

Fix metric Space configuration

Browse files

Files changed (33) hide show

MANIFEST.in +0 -6
README.md +31 -267
script_fidelity_rate/app.py → app.py +2 -2
dist/.gitignore +0 -1
dist/script_fidelity-0.1.1-py3-none-any.whl +0 -0
dist/script_fidelity-0.1.1.tar.gz +0 -3
examples/ci_gate.py +0 -7
examples/hf_evaluate.py +0 -8
examples/pandas_dataframe.py +0 -7
examples/plain_python.py +0 -7
examples/transformers_compute_metrics.py +0 -14
metrics/script_fidelity_rate/README.md +0 -31
pyproject.toml +0 -50
metrics/script_fidelity_rate/requirements.txt → requirements.txt +0 -0
script_fidelity/__init__.py +0 -40
script_fidelity/__main__.py +0 -3
script_fidelity/cli.py +0 -133
script_fidelity/core.py +0 -132
script_fidelity/data/fleurs_registry.json +0 -210
script_fidelity/dominant.py +0 -75
script_fidelity/registry.py +0 -86
script_fidelity/types.py +0 -33
metrics/script_fidelity_rate/script_fidelity_rate.py → script_fidelity_rate.py +0 -9
script_fidelity_rate/.gitattributes +0 -35
script_fidelity_rate/README.md +0 -50
script_fidelity_rate/requirements.txt +0 -1
script_fidelity_rate/script_fidelity_rate.py +0 -95
script_fidelity_rate/tests.py +0 -17
tests/test_cli.py +0 -81
tests/test_core.py +0 -59
tests/test_evaluate_metric.py +0 -27
tests/test_registry.py +0 -42
uv.lock +0 -0

MANIFEST.in DELETED Viewed

@@ -1,6 +0,0 @@
-include README.md
-include pyproject.toml
-recursive-include script_fidelity/data *.json
-recursive-include metrics *
-recursive-include examples *
-recursive-include tests *.py

README.md CHANGED Viewed

@@ -1,281 +1,45 @@
-# script-fidelity
-`script-fidelity` is a small Python package for Script Fidelity Rate (SFR), a
-reference-free metric for multilingual ASR. SFR measures the fraction of
-countable hypothesis characters that belong to the expected Unicode script for a
-target language.
-Quick signals:
-- Install with `uv add script-fidelity`
-- Load with HF Evaluate via `themechanism/script_fidelity_rate`
-- Supports 102 FLEURS language configs, excluding `all`
-- PyPI: <https://pypi.org/project/script-fidelity/>
-Use SFR with WER and CER. SFR checks script validity; WER and CER measure
-transcription error against references.
-## install
-For package development in this repo:
-```bash
-uv sync --extra dev
-```
-For a downstream project:
-```bash
-uv add script-fidelity
-```
-Run the CLI without adding it to a project:
-```bash
-uvx --from script-fidelity sfr score --language ps_af --text "کابل کې ښه هوا ده"
-```
-## python use
-```python
-from script_fidelity import compute_sfr, compute_sfr_batch
-score = compute_sfr("کابل کې ښه هوا ده", language="ps_af")
-scores = compute_sfr_batch(
-    ["کابل کې ښه هوا ده", "this is romanized output"],
-    language="pashto",
-)
-```
-Digits count by default, matching the paper. Treat digits as neutral with
-`digit_policy="ignore"`.
-```python
-compute_sfr("کابل 2026", language="ps_af", digit_policy="ignore")
-```
-## HF Evaluate use
-Local metric:
 ```python
 import evaluate
 sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
-sfr.compute(predictions=["کابل کې ښه هوا ده"], language="ps_af")
-```
-Hub metric after publishing:
-```python
-import evaluate
-sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
-sfr.compute(predictions=["کابل کې ښه هوا ده"], language="ps_af")
-```
-## CLI
-```bash
-sfr score --language ps_af --text "کابل کې ښه هوا ده"
-sfr audit predictions.jsonl --language ps_af --text-column prediction
-sfr audit predictions.csv --language bn_in --text-column transcript --format csv
-```
-## ASR batch example
-```python
-from script_fidelity import compute_corpus_sfr
-predictions = [
-    item["text"]
-    for item in whisper_outputs
-]
-summary = compute_corpus_sfr(predictions, language="bn_in")
-print(summary["sfr_percent"])
-print(summary["dominant_script_counts"])
-```
-## pandas dataframe example
-```python
-import pandas as pd
-from script_fidelity import compute_sfr
-df = pd.read_json("predictions.jsonl", lines=True)
-df["sfr"] = df["prediction"].map(lambda text: compute_sfr(text, language="ps_af"))
 ```
-## Transformers compute_metrics example
 ```python
 import evaluate
-wer = evaluate.load("wer")
 sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
-def compute_metrics(eval_pred):
-    predictions, labels = eval_pred
-    pred_text = processor.batch_decode(predictions, skip_special_tokens=True)
-    label_text = processor.batch_decode(labels, skip_special_tokens=True)
-    return {
-        "wer": wer.compute(predictions=pred_text, references=label_text),
-        "sfr": sfr.compute(predictions=pred_text, language="ps_af")["sfr"],
-    }
 ```
-## CI gate example
-```python
-from script_fidelity import compute_corpus_sfr
-summary = compute_corpus_sfr(predictions, language="ml_in")
-if summary["sfr"] < 0.90:
-    raise SystemExit("SFR regression: Malayalam output is below 90% target script")
-```
-## shared-script caveats
-SFR is a script check, not a language identifier. Pashto, Urdu, Persian, Arabic,
-Central Kurdish, and Sindhi share Arabic-script Unicode blocks. Latin-script
-languages mostly detect romanization or non-Latin substitution, not language
-identity. Pair SFR with language ID or lexical checks when shared-script
-confusions matter.
-Use `dominant_script()` and `script_distribution()` to inspect failures:
-```python
-from script_fidelity import dominant_script, script_distribution
-dominant_script("this is romanized output")
-script_distribution("বাংলা भाषा")
-```
-## FLEURS codes
-The registry covers the 102 FLEURS language configs listed by `sfr languages`.
-These paper languages have short aliases:
-| FLEURS code | Alias | Script |
-|---|---|---|
-| `ps_af` | `pashto` | Arabic |
-| `ur_pk` | `urdu` | Arabic |
-| `ar_eg` | `arabic` | Arabic |
-| `fa_ir` | `persian`, `farsi` | Arabic |
-| `hi_in` | `hindi` | Devanagari |
-| `bn_in` | `bengali`, `bangla` | Bengali |
-| `ml_in` | `malayalam` | Malayalam |
-| `ta_in` | `tamil` | Tamil |
-| `so_so` | `somali` | Latin |
-| `ka_ge` | `georgian` | Georgian |
-For the full reviewed registry, see
-`script_fidelity/data/fleurs_registry.json`.
-Full code table:
-| Code | Language | Script |
-|---|---|---|
-| `af_za` | Afrikaans | Latin |
-| `am_et` | Amharic | Ethiopic |
-| `ar_eg` | Arabic | Arabic |
-| `as_in` | Assamese | Bengali |
-| `ast_es` | Asturian | Latin |
-| `az_az` | Azerbaijani | Latin |
-| `be_by` | Belarusian | Cyrillic |
-| `bg_bg` | Bulgarian | Cyrillic |
-| `bn_in` | Bengali | Bengali |
-| `bs_ba` | Bosnian | Latin |
-| `ca_es` | Catalan | Latin |
-| `ceb_ph` | Cebuano | Latin |
-| `ckb_iq` | Central Kurdish | Arabic |
-| `cmn_hans_cn` | Mandarin Chinese | Han |
-| `cs_cz` | Czech | Latin |
-| `cy_gb` | Welsh | Latin |
-| `da_dk` | Danish | Latin |
-| `de_de` | German | Latin |
-| `el_gr` | Greek | Greek |
-| `en_us` | English | Latin |
-| `es_419` | Spanish | Latin |
-| `et_ee` | Estonian | Latin |
-| `fa_ir` | Persian | Arabic |
-| `ff_sn` | Fulah | Latin |
-| `fi_fi` | Finnish | Latin |
-| `fil_ph` | Filipino | Latin |
-| `fr_fr` | French | Latin |
-| `ga_ie` | Irish | Latin |
-| `gl_es` | Galician | Latin |
-| `gu_in` | Gujarati | Gujarati |
-| `ha_ng` | Hausa | Latin |
-| `he_il` | Hebrew | Hebrew |
-| `hi_in` | Hindi | Devanagari |
-| `hr_hr` | Croatian | Latin |
-| `hu_hu` | Hungarian | Latin |
-| `hy_am` | Armenian | Armenian |
-| `id_id` | Indonesian | Latin |
-| `ig_ng` | Igbo | Latin |
-| `is_is` | Icelandic | Latin |
-| `it_it` | Italian | Latin |
-| `ja_jp` | Japanese | Han, Hiragana, Katakana |
-| `jv_id` | Javanese | Latin |
-| `ka_ge` | Georgian | Georgian |
-| `kam_ke` | Kamba | Latin |
-| `kea_cv` | Kabuverdianu | Latin |
-| `kk_kz` | Kazakh | Cyrillic |
-| `km_kh` | Khmer | Khmer |
-| `kn_in` | Kannada | Kannada |
-| `ko_kr` | Korean | Hangul |
-| `ky_kg` | Kyrgyz | Cyrillic |
-| `lb_lu` | Luxembourgish | Latin |
-| `lg_ug` | Ganda | Latin |
-| `ln_cd` | Lingala | Latin |
-| `lo_la` | Lao | Lao |
-| `lt_lt` | Lithuanian | Latin |
-| `luo_ke` | Luo | Latin |
-| `lv_lv` | Latvian | Latin |
-| `mi_nz` | Maori | Latin |
-| `mk_mk` | Macedonian | Cyrillic |
-| `ml_in` | Malayalam | Malayalam |
-| `mn_mn` | Mongolian | Cyrillic |
-| `mr_in` | Marathi | Devanagari |
-| `ms_my` | Malay | Latin |
-| `mt_mt` | Maltese | Latin |
-| `my_mm` | Burmese | Myanmar |
-| `nb_no` | Norwegian Bokmal | Latin |
-| `ne_np` | Nepali | Devanagari |
-| `nl_nl` | Dutch | Latin |
-| `nso_za` | Northern Sotho | Latin |
-| `ny_mw` | Chichewa | Latin |
-| `oc_fr` | Occitan | Latin |
-| `om_et` | Oromo | Latin |
-| `or_in` | Odia | Odia |
-| `pa_in` | Punjabi | Gurmukhi |
-| `pl_pl` | Polish | Latin |
-| `ps_af` | Pashto | Arabic |
-| `pt_br` | Portuguese | Latin |
-| `ro_ro` | Romanian | Latin |
-| `ru_ru` | Russian | Cyrillic |
-| `sd_in` | Sindhi | Arabic |
-| `sk_sk` | Slovak | Latin |
-| `sl_si` | Slovenian | Latin |
-| `sn_zw` | Shona | Latin |
-| `so_so` | Somali | Latin |
-| `sr_rs` | Serbian | Cyrillic |
-| `sv_se` | Swedish | Latin |
-| `sw_ke` | Swahili | Latin |
-| `ta_in` | Tamil | Tamil |
-| `te_in` | Telugu | Telugu |
-| `tg_tj` | Tajik | Cyrillic |
-| `th_th` | Thai | Thai |
-| `tr_tr` | Turkish | Latin |
-| `uk_ua` | Ukrainian | Cyrillic |
-| `umb_ao` | Umbundu | Latin |
-| `ur_pk` | Urdu | Arabic |
-| `uz_uz` | Uzbek | Latin |
-| `vi_vn` | Vietnamese | Latin |
-| `wo_sn` | Wolof | Latin |
-| `xh_za` | Xhosa | Latin |
-| `yo_ng` | Yoruba | Latin |
-| `yue_hant_hk` | Cantonese | Han |
-| `zu_za` | Zulu | Latin |

+---
+title: Script Fidelity Rate
+sdk: gradio
+app_file: app.py
+pinned: false
+license: mit
+tags:
+- evaluate
+- metric
+- automatic-speech-recognition
+- unicode
+- multilingual-asr
+---
+# Script Fidelity Rate
+This directory is the Hugging Face Evaluate metric module for Script Fidelity
+Rate (SFR).
+The Python package is published as `script-fidelity` on PyPI:
+<https://pypi.org/project/script-fidelity/>. The import name is
+`script_fidelity`.
 ```python
 import evaluate
 sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
+result = sfr.compute(
+    predictions=["کابل کې ښه هوا ده", "this is romanized output"],
+    language="ps_af",
+)
+print(result["sfr_percent"])
 ```
+Hub use:
 ```python
 import evaluate
 sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
+sfr.compute(predictions=["کابل کې ښه هوا ده"], language="pashto")
 ```
+Use SFR with WER and CER, not instead of them. SFR checks whether output is in
+the intended script. It does not measure lexical accuracy.

script_fidelity_rate/app.py → app.py RENAMED Viewed

@@ -2,5 +2,5 @@ import evaluate
 from evaluate.utils import launch_gradio_widget
-module = evaluate.load("themechanism/script_fidelity_rate")
-launch_gradio_widget(module)

 from evaluate.utils import launch_gradio_widget
+module = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
+launch_gradio_widget(module)

dist/.gitignore DELETED Viewed

@@ -1 +0,0 @@
-*

dist/script_fidelity-0.1.1-py3-none-any.whl DELETED Viewed

Binary file (14.2 kB)

dist/script_fidelity-0.1.1.tar.gz DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4e36da45cddd306e6794eb59bd06cbd3fe9ae19801791bbe5c02862952aa89a8
-size 18936

examples/ci_gate.py DELETED Viewed

@@ -1,7 +0,0 @@
-from script_fidelity import compute_corpus_sfr
-predictions = ["മലയാളം വാക്യം", "malayalam romanized output"]
-summary = compute_corpus_sfr(predictions, language="ml_in")
-if summary["sfr"] < 0.90:
-    raise SystemExit("SFR regression: Malayalam output is below 90% target script")

examples/hf_evaluate.py DELETED Viewed

@@ -1,8 +0,0 @@
-import evaluate
-sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
-result = sfr.compute(
-    predictions=["کابل کې ښه هوا ده", "this is romanized output"],
-    language="ps_af",
-)
-print(result["sfr_percent"])

examples/pandas_dataframe.py DELETED Viewed

@@ -1,7 +0,0 @@
-import pandas as pd
-from script_fidelity import compute_sfr
-df = pd.read_json("predictions.jsonl", lines=True)
-df["sfr"] = df["prediction"].map(lambda text: compute_sfr(text, language="ps_af"))
-print(df[["prediction", "sfr"]].head())

examples/plain_python.py DELETED Viewed

@@ -1,7 +0,0 @@
-from script_fidelity import compute_corpus_sfr, compute_sfr
-text = "کابل کې ښه هوا ده"
-print(compute_sfr(text, language="ps_af"))
-predictions = [text, "this is romanized output"]
-print(compute_corpus_sfr(predictions, language="pashto"))

examples/transformers_compute_metrics.py DELETED Viewed

@@ -1,14 +0,0 @@
-import evaluate
-wer = evaluate.load("wer")
-sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
-def compute_metrics(eval_pred):
-    predictions, labels = eval_pred
-    pred_text = processor.batch_decode(predictions, skip_special_tokens=True)
-    label_text = processor.batch_decode(labels, skip_special_tokens=True)
-    return {
-        "wer": wer.compute(predictions=pred_text, references=label_text),
-        "sfr": sfr.compute(predictions=pred_text, language="ps_af")["sfr"],
-    }

metrics/script_fidelity_rate/README.md DELETED Viewed

@@ -1,31 +0,0 @@
-# Script Fidelity Rate
-This directory is the Hugging Face Evaluate metric module for Script Fidelity
-Rate (SFR).
-The Python package is published as `script-fidelity` on PyPI:
-<https://pypi.org/project/script-fidelity/>. The import name is
-`script_fidelity`.
-```python
-import evaluate
-sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
-result = sfr.compute(
-    predictions=["کابل کې ښه هوا ده", "this is romanized output"],
-    language="ps_af",
-)
-print(result["sfr_percent"])
-```
-Hub use after publishing:
-```python
-import evaluate
-sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
-sfr.compute(predictions=["کابل کې ښه هوا ده"], language="pashto")
-```
-Use SFR with WER and CER, not instead of them. SFR checks whether output is in
-the intended script. It does not measure lexical accuracy.

pyproject.toml DELETED Viewed

@@ -1,50 +0,0 @@
-[build-system]
-requires = ["setuptools>=69", "wheel"]
-build-backend = "setuptools.build_meta"
-[project]
-name = "script-fidelity"
-version = "0.1.1"
-description = "Reference-free script fidelity metric for multilingual ASR."
-readme = "README.md"
-requires-python = ">=3.10"
-license = "MIT"
-authors = [{ name = "Anonymous" }]
-keywords = [
-  "asr",
-  "speech-recognition",
-  "evaluation",
-  "unicode",
-  "script-fidelity",
-  "fleurs",
-]
-classifiers = [
-  "Development Status :: 3 - Alpha",
-  "Intended Audience :: Science/Research",
-  "Programming Language :: Python :: 3",
-  "Programming Language :: Python :: 3.10",
-  "Programming Language :: Python :: 3.11",
-  "Programming Language :: Python :: 3.12",
-  "Topic :: Scientific/Engineering :: Artificial Intelligence",
-]
-dependencies = []
-[project.optional-dependencies]
-evaluate = ["evaluate>=0.4.0,<1.0"]
-dev = [
-  "evaluate>=0.4.0,<1.0",
-  "pytest>=8.0",
-]
-[project.scripts]
-sfr = "script_fidelity.cli:main"
-[tool.setuptools.packages.find]
-include = ["script_fidelity*"]
-[tool.setuptools.package-data]
-script_fidelity = ["data/*.json"]
-[tool.pytest.ini_options]
-testpaths = ["tests"]
-pythonpath = ["."]

metrics/script_fidelity_rate/requirements.txt → requirements.txt RENAMED Viewed

File without changes

script_fidelity/__init__.py DELETED Viewed

@@ -1,40 +0,0 @@
-"""Reference-free script fidelity metrics for multilingual ASR."""
-from .core import (
-    compute_corpus_sfr,
-    compute_sf,
-    compute_sf_batch,
-    compute_sfr,
-    compute_sfr_batch,
-    score_text,
-)
-from .dominant import dominant_script, script_distribution
-from .registry import (
-    FLEURS_CONFIGS,
-    SCRIPT_CONFIGS,
-    get_script_config,
-    list_languages,
-    resolve_language,
-)
-from .types import DigitPolicy, SFRResult, ScriptConfig
-__all__ = [
-    "DigitPolicy",
-    "FLEURS_CONFIGS",
-    "SCRIPT_CONFIGS",
-    "SFRResult",
-    "ScriptConfig",
-    "compute_corpus_sfr",
-    "compute_sf",
-    "compute_sf_batch",
-    "compute_sfr",
-    "compute_sfr_batch",
-    "dominant_script",
-    "get_script_config",
-    "list_languages",
-    "resolve_language",
-    "score_text",
-    "script_distribution",
-]
-__version__ = "0.1.1"

script_fidelity/__main__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from .cli import main
-raise SystemExit(main())

script_fidelity/cli.py DELETED Viewed

@@ -1,133 +0,0 @@
-"""Command line interface for Script Fidelity Rate."""
-from __future__ import annotations
-import argparse
-import csv
-import json
-import sys
-from pathlib import Path
-from .core import compute_corpus_sfr, compute_sfr
-from .registry import list_languages
-def _read_predictions(path: Path, text_column: str) -> list[str]:
-    if path.suffix.lower() == ".jsonl":
-        rows = []
-        with path.open("r", encoding="utf-8") as handle:
-            for line_no, line in enumerate(handle, start=1):
-                if not line.strip():
-                    continue
-                item = json.loads(line)
-                if text_column not in item:
-                    raise ValueError(f"Missing column '{text_column}' on line {line_no}")
-                rows.append(str(item[text_column]))
-        return rows
-    with path.open("r", encoding="utf-8", newline="") as handle:
-        reader = csv.DictReader(handle)
-        if not reader.fieldnames or text_column not in reader.fieldnames:
-            raise ValueError(f"Missing column '{text_column}' in CSV header")
-        return [str(row[text_column]) for row in reader]
-def _emit_summary(summary: dict, output_format: str) -> None:
-    if output_format == "json":
-        print(json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True))
-        return
-    writer = csv.DictWriter(
-        sys.stdout,
-        fieldnames=[
-            "sfr",
-            "sfr_percent",
-            "n",
-            "n_valid",
-            "n_empty",
-            "low_sfr_rate",
-            "high_sfr_rate",
-            "dominant_script_counts",
-        ],
-    )
-    writer.writeheader()
-    row = dict(summary)
-    row["dominant_script_counts"] = json.dumps(
-        row["dominant_script_counts"],
-        ensure_ascii=False,
-        sort_keys=True,
-    )
-    writer.writerow(row)
-def build_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(prog="sfr", description="Script Fidelity Rate tools")
-    sub = parser.add_subparsers(dest="command", required=True)
-    score = sub.add_parser("score", help="score one text string")
-    score.add_argument("--language", required=True, help="FLEURS code or alias")
-    score.add_argument("--text", required=True, help="ASR hypothesis text")
-    score.add_argument(
-        "--digit-policy",
-        choices=["count", "ignore"],
-        default="count",
-        help="count digits as characters or treat them as neutral",
-    )
-    audit = sub.add_parser("audit", help="audit a CSV or JSONL file")
-    audit.add_argument("path", type=Path, help="CSV or JSONL file")
-    audit.add_argument("--language", required=True, help="FLEURS code or alias")
-    audit.add_argument("--text-column", default="prediction", help="prediction column")
-    audit.add_argument(
-        "--digit-policy",
-        choices=["count", "ignore"],
-        default="count",
-        help="count digits as characters or treat them as neutral",
-    )
-    audit.add_argument("--format", choices=["json", "csv"], default="json")
-    audit.add_argument("--details", action="store_true", help="include per-row details")
-    langs = sub.add_parser("languages", help="list supported FLEURS codes")
-    langs.add_argument("--format", choices=["plain", "json"], default="plain")
-    return parser
-def main(argv: list[str] | None = None) -> int:
-    parser = build_parser()
-    args = parser.parse_args(argv)
-    if args.command == "score":
-        score = compute_sfr(
-            args.text,
-            language=args.language,
-            digit_policy=args.digit_policy,
-        )
-        print("NA" if score is None else f"{score:.6f}")
-        return 0
-    if args.command == "audit":
-        predictions = _read_predictions(args.path, args.text_column)
-        summary = compute_corpus_sfr(
-            predictions,
-            language=args.language,
-            digit_policy=args.digit_policy,
-            return_details=args.details,
-        )
-        _emit_summary(summary, args.format)
-        return 0
-    if args.command == "languages":
-        languages = list_languages()
-        if args.format == "json":
-            print(json.dumps(languages, indent=2))
-        else:
-            print("\n".join(languages))
-        return 0
-    parser.error("unknown command")
-    return 2
-if __name__ == "__main__":
-    raise SystemExit(main())

script_fidelity/core.py DELETED Viewed

@@ -1,132 +0,0 @@
-"""Core Script Fidelity Rate implementation."""
-from __future__ import annotations
-import unicodedata
-from collections import Counter
-from statistics import fmean
-from .dominant import dominant_script, is_countable, script_distribution
-from .registry import get_script_config
-from .types import DigitPolicy, SFRResult, ScriptConfig
-def _is_in_range(cp: int, ranges: tuple[tuple[int, int], ...]) -> bool:
-    return any(lo <= cp <= hi for lo, hi in ranges)
-def score_text(
-    text: str,
-    language: str = "ps_af",
-    *,
-    digit_policy: DigitPolicy = "count",
-    config: ScriptConfig | None = None,
-) -> SFRResult:
-    """Score one ASR hypothesis and return numerator, denominator, and scripts."""
-    cfg = config or get_script_config(language)
-    normalized = unicodedata.normalize("NFC", text or "")
-    chars = [ch for ch in normalized if is_countable(ch, digit_policy=digit_policy)]
-    numerator = sum(1 for ch in chars if _is_in_range(ord(ch), cfg.ranges))
-    denominator = len(chars)
-    sfr = None if denominator == 0 else numerator / denominator
-    return SFRResult(
-        language=cfg.code,
-        sfr=sfr,
-        numerator=numerator,
-        denominator=denominator,
-        dominant_script=dominant_script(
-            normalized,
-            digit_policy=digit_policy,
-        ),
-        script_counts=script_distribution(
-            normalized,
-            digit_policy=digit_policy,
-        ),
-    )
-def compute_sfr(
-    text: str,
-    language: str = "ps_af",
-    *,
-    digit_policy: DigitPolicy = "count",
-) -> float | None:
-    """Compute reference-free Script Fidelity Rate for one ASR hypothesis."""
-    return score_text(text, language, digit_policy=digit_policy).sfr
-def compute_sfr_batch(
-    predictions: list[str] | tuple[str, ...],
-    language: str = "ps_af",
-    *,
-    digit_policy: DigitPolicy = "count",
-) -> list[float | None]:
-    """Compute SFR for a batch of ASR hypotheses."""
-    config = get_script_config(language)
-    return [
-        score_text(text, config.code, digit_policy=digit_policy, config=config).sfr
-        for text in predictions
-    ]
-def compute_corpus_sfr(
-    predictions: list[str] | tuple[str, ...],
-    language: str = "ps_af",
-    *,
-    digit_policy: DigitPolicy = "count",
-    low_threshold: float = 0.1,
-    high_threshold: float = 0.9,
-    return_details: bool = False,
-) -> dict:
-    """Compute corpus SFR and audit counts for a batch."""
-    config = get_script_config(language)
-    details = [
-        score_text(text, config.code, digit_policy=digit_policy, config=config)
-        for text in predictions
-    ]
-    scores = [item.sfr for item in details if item.sfr is not None]
-    n = len(details)
-    n_valid = len(scores)
-    n_empty = n - n_valid
-    corpus = fmean(scores) if scores else None
-    dominant_counts = Counter(item.dominant_script for item in details)
-    result = {
-        "sfr": corpus,
-        "sfr_percent": None if corpus is None else corpus * 100,
-        "n": n,
-        "n_valid": n_valid,
-        "n_empty": n_empty,
-        "low_sfr_rate": None
-        if n_valid == 0
-        else sum(1 for score in scores if score < low_threshold) / n_valid,
-        "high_sfr_rate": None
-        if n_valid == 0
-        else sum(1 for score in scores if score >= high_threshold) / n_valid,
-        "dominant_script_counts": dict(sorted(dominant_counts.items())),
-    }
-    if return_details:
-        result["details"] = [
-            {
-                "language": item.language,
-                "sfr": item.sfr,
-                "numerator": item.numerator,
-                "denominator": item.denominator,
-                "dominant_script": item.dominant_script,
-                "script_counts": item.script_counts,
-            }
-            for item in details
-        ]
-    return result
-compute_sf = compute_sfr
-compute_sf_batch = compute_sfr_batch

script_fidelity/data/fleurs_registry.json DELETED Viewed

@@ -1,210 +0,0 @@
-{
-  "version": "0.1.1",
-  "source": "Reviewed registry for google/fleurs configs as of 2026-05-07. The config named all is excluded.",
-  "scripts": {
-    "arabic": {
-      "name": "Arabic",
-      "ranges": [[1536, 1791], [1872, 1919], [2208, 2303], [64336, 65023], [65136, 65279], [69216, 69247], [126464, 126719]]
-    },
-    "armenian": {
-      "name": "Armenian",
-      "ranges": [[1328, 1423], [64275, 64279]]
-    },
-    "bengali": {
-      "name": "Bengali",
-      "ranges": [[2432, 2559]]
-    },
-    "cyrillic": {
-      "name": "Cyrillic",
-      "ranges": [[1024, 1279], [1280, 1327], [7296, 7311], [11744, 11775], [42560, 42655]]
-    },
-    "devanagari": {
-      "name": "Devanagari",
-      "ranges": [[2304, 2431], [43232, 43263], [72448, 72543]]
-    },
-    "ethiopic": {
-      "name": "Ethiopic",
-      "ranges": [[4608, 4991], [4992, 5023], [11648, 11743], [43776, 43823]]
-    },
-    "georgian": {
-      "name": "Georgian",
-      "ranges": [[4256, 4351], [11520, 11567], [7312, 7359]]
-    },
-    "greek": {
-      "name": "Greek",
-      "ranges": [[880, 1023], [7936, 8191]]
-    },
-    "gujarati": {
-      "name": "Gujarati",
-      "ranges": [[2688, 2815]]
-    },
-    "gurmukhi": {
-      "name": "Gurmukhi",
-      "ranges": [[2560, 2687]]
-    },
-    "han": {
-      "name": "Han",
-      "ranges": [[13312, 19903], [19968, 40959], [63744, 64255], [131072, 173791], [173824, 177983], [177984, 178207], [178208, 183983], [183984, 191471], [196608, 201551]]
-    },
-    "hangul": {
-      "name": "Hangul",
-      "ranges": [[4352, 4607], [12592, 12687], [43360, 43391], [44032, 55215], [55216, 55295]]
-    },
-    "hebrew": {
-      "name": "Hebrew",
-      "ranges": [[1424, 1535], [64285, 64335]]
-    },
-    "hiragana": {
-      "name": "Hiragana",
-      "ranges": [[12352, 12447]]
-    },
-    "kannada": {
-      "name": "Kannada",
-      "ranges": [[3200, 3327]]
-    },
-    "katakana": {
-      "name": "Katakana",
-      "ranges": [[12448, 12543], [12784, 12799], [65381, 65439]]
-    },
-    "khmer": {
-      "name": "Khmer",
-      "ranges": [[6016, 6143], [6624, 6655]]
-    },
-    "lao": {
-      "name": "Lao",
-      "ranges": [[3712, 3839]]
-    },
-    "latin": {
-      "name": "Latin",
-      "ranges": [[65, 90], [97, 122], [192, 591], [7680, 7935], [42784, 43007], [43824, 43887], [122624, 122879]]
-    },
-    "malayalam": {
-      "name": "Malayalam",
-      "ranges": [[3328, 3455]]
-    },
-    "myanmar": {
-      "name": "Myanmar",
-      "ranges": [[4096, 4255], [43392, 43487], [43488, 43519]]
-    },
-    "odia": {
-      "name": "Odia",
-      "ranges": [[2816, 2943]]
-    },
-    "tamil": {
-      "name": "Tamil",
-      "ranges": [[2944, 3071]]
-    },
-    "telugu": {
-      "name": "Telugu",
-      "ranges": [[3072, 3199]]
-    },
-    "thai": {
-      "name": "Thai",
-      "ranges": [[3584, 3711]]
-    }
-  },
-  "languages": {
-    "af_za": {"name": "Afrikaans", "scripts": ["latin"], "aliases": ["afrikaans"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "am_et": {"name": "Amharic", "scripts": ["ethiopic"], "aliases": ["amharic"]},
-    "ar_eg": {"name": "Arabic", "scripts": ["arabic"], "aliases": ["arabic", "msa"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
-    "as_in": {"name": "Assamese", "scripts": ["bengali"], "aliases": ["assamese"], "shared_script": true, "warning": "Bengali-Assamese script SFR does not identify the language."},
-    "ast_es": {"name": "Asturian", "scripts": ["latin"], "aliases": ["asturian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "az_az": {"name": "Azerbaijani", "scripts": ["latin"], "aliases": ["azerbaijani", "azeri"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "be_by": {"name": "Belarusian", "scripts": ["cyrillic"], "aliases": ["belarusian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "bg_bg": {"name": "Bulgarian", "scripts": ["cyrillic"], "aliases": ["bulgarian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "bn_in": {"name": "Bengali", "scripts": ["bengali"], "aliases": ["bengali", "bangla"], "shared_script": true, "warning": "Bengali-script SFR does not distinguish Bengali from Assamese."},
-    "bs_ba": {"name": "Bosnian", "scripts": ["latin"], "aliases": ["bosnian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ca_es": {"name": "Catalan", "scripts": ["latin"], "aliases": ["catalan"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ceb_ph": {"name": "Cebuano", "scripts": ["latin"], "aliases": ["cebuano"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ckb_iq": {"name": "Central Kurdish", "scripts": ["arabic"], "aliases": ["central_kurdish", "sorani", "kurdish_sorani"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
-    "cmn_hans_cn": {"name": "Mandarin Chinese", "scripts": ["han"], "aliases": ["mandarin", "chinese", "simplified_chinese"], "shared_script": true, "warning": "Han SFR does not identify the spoken language or script variant."},
-    "cs_cz": {"name": "Czech", "scripts": ["latin"], "aliases": ["czech"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "cy_gb": {"name": "Welsh", "scripts": ["latin"], "aliases": ["welsh"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "da_dk": {"name": "Danish", "scripts": ["latin"], "aliases": ["danish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "de_de": {"name": "German", "scripts": ["latin"], "aliases": ["german"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "el_gr": {"name": "Greek", "scripts": ["greek"], "aliases": ["greek"]},
-    "en_us": {"name": "English", "scripts": ["latin"], "aliases": ["english"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "es_419": {"name": "Spanish", "scripts": ["latin"], "aliases": ["spanish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "et_ee": {"name": "Estonian", "scripts": ["latin"], "aliases": ["estonian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "fa_ir": {"name": "Persian", "scripts": ["arabic"], "aliases": ["persian", "farsi"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
-    "ff_sn": {"name": "Fulah", "scripts": ["latin"], "aliases": ["fulah", "fulani"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "fi_fi": {"name": "Finnish", "scripts": ["latin"], "aliases": ["finnish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "fil_ph": {"name": "Filipino", "scripts": ["latin"], "aliases": ["filipino", "tagalog"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "fr_fr": {"name": "French", "scripts": ["latin"], "aliases": ["french"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ga_ie": {"name": "Irish", "scripts": ["latin"], "aliases": ["irish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "gl_es": {"name": "Galician", "scripts": ["latin"], "aliases": ["galician"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "gu_in": {"name": "Gujarati", "scripts": ["gujarati"], "aliases": ["gujarati"]},
-    "ha_ng": {"name": "Hausa", "scripts": ["latin"], "aliases": ["hausa"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "he_il": {"name": "Hebrew", "scripts": ["hebrew"], "aliases": ["hebrew"]},
-    "hi_in": {"name": "Hindi", "scripts": ["devanagari"], "aliases": ["hindi"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
-    "hr_hr": {"name": "Croatian", "scripts": ["latin"], "aliases": ["croatian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "hu_hu": {"name": "Hungarian", "scripts": ["latin"], "aliases": ["hungarian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "hy_am": {"name": "Armenian", "scripts": ["armenian"], "aliases": ["armenian"]},
-    "id_id": {"name": "Indonesian", "scripts": ["latin"], "aliases": ["indonesian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ig_ng": {"name": "Igbo", "scripts": ["latin"], "aliases": ["igbo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "is_is": {"name": "Icelandic", "scripts": ["latin"], "aliases": ["icelandic"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "it_it": {"name": "Italian", "scripts": ["latin"], "aliases": ["italian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ja_jp": {"name": "Japanese", "scripts": ["han", "hiragana", "katakana"], "aliases": ["japanese"], "shared_script": true, "warning": "Japanese SFR counts Han and kana; it is not a language identifier."},
-    "jv_id": {"name": "Javanese", "scripts": ["latin"], "aliases": ["javanese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ka_ge": {"name": "Georgian", "scripts": ["georgian"], "aliases": ["georgian"]},
-    "kam_ke": {"name": "Kamba", "scripts": ["latin"], "aliases": ["kamba"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "kea_cv": {"name": "Kabuverdianu", "scripts": ["latin"], "aliases": ["kabuverdianu", "cape_verdean_creole"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "kk_kz": {"name": "Kazakh", "scripts": ["cyrillic"], "aliases": ["kazakh"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "km_kh": {"name": "Khmer", "scripts": ["khmer"], "aliases": ["khmer"]},
-    "kn_in": {"name": "Kannada", "scripts": ["kannada"], "aliases": ["kannada"]},
-    "ko_kr": {"name": "Korean", "scripts": ["hangul"], "aliases": ["korean"]},
-    "ky_kg": {"name": "Kyrgyz", "scripts": ["cyrillic"], "aliases": ["kyrgyz"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "lb_lu": {"name": "Luxembourgish", "scripts": ["latin"], "aliases": ["luxembourgish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "lg_ug": {"name": "Ganda", "scripts": ["latin"], "aliases": ["ganda", "luganda"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ln_cd": {"name": "Lingala", "scripts": ["latin"], "aliases": ["lingala"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "lo_la": {"name": "Lao", "scripts": ["lao"], "aliases": ["lao"]},
-    "lt_lt": {"name": "Lithuanian", "scripts": ["latin"], "aliases": ["lithuanian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "luo_ke": {"name": "Luo", "scripts": ["latin"], "aliases": ["luo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "lv_lv": {"name": "Latvian", "scripts": ["latin"], "aliases": ["latvian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "mi_nz": {"name": "Maori", "scripts": ["latin"], "aliases": ["maori", "māori"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "mk_mk": {"name": "Macedonian", "scripts": ["cyrillic"], "aliases": ["macedonian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "ml_in": {"name": "Malayalam", "scripts": ["malayalam"], "aliases": ["malayalam"]},
-    "mn_mn": {"name": "Mongolian", "scripts": ["cyrillic"], "aliases": ["mongolian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "mr_in": {"name": "Marathi", "scripts": ["devanagari"], "aliases": ["marathi"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
-    "ms_my": {"name": "Malay", "scripts": ["latin"], "aliases": ["malay"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "mt_mt": {"name": "Maltese", "scripts": ["latin"], "aliases": ["maltese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "my_mm": {"name": "Burmese", "scripts": ["myanmar"], "aliases": ["burmese", "myanmar_language"]},
-    "nb_no": {"name": "Norwegian Bokmal", "scripts": ["latin"], "aliases": ["norwegian", "norwegian_bokmal", "bokmal"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ne_np": {"name": "Nepali", "scripts": ["devanagari"], "aliases": ["nepali"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
-    "nl_nl": {"name": "Dutch", "scripts": ["latin"], "aliases": ["dutch"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "nso_za": {"name": "Northern Sotho", "scripts": ["latin"], "aliases": ["northern_sotho", "sepedi"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ny_mw": {"name": "Chichewa", "scripts": ["latin"], "aliases": ["chichewa", "nyanja"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "oc_fr": {"name": "Occitan", "scripts": ["latin"], "aliases": ["occitan"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "om_et": {"name": "Oromo", "scripts": ["latin"], "aliases": ["oromo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "or_in": {"name": "Odia", "scripts": ["odia"], "aliases": ["odia", "oriya"]},
-    "pa_in": {"name": "Punjabi", "scripts": ["gurmukhi"], "aliases": ["punjabi", "eastern_punjabi"], "shared_script": true, "warning": "Gurmukhi SFR checks script, not dialect or language identity."},
-    "pl_pl": {"name": "Polish", "scripts": ["latin"], "aliases": ["polish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ps_af": {"name": "Pashto", "scripts": ["arabic"], "aliases": ["pashto", "pushto", "ps"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
-    "pt_br": {"name": "Portuguese", "scripts": ["latin"], "aliases": ["portuguese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ro_ro": {"name": "Romanian", "scripts": ["latin"], "aliases": ["romanian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ru_ru": {"name": "Russian", "scripts": ["cyrillic"], "aliases": ["russian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "sd_in": {"name": "Sindhi", "scripts": ["arabic"], "aliases": ["sindhi"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
-    "sk_sk": {"name": "Slovak", "scripts": ["latin"], "aliases": ["slovak"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "sl_si": {"name": "Slovenian", "scripts": ["latin"], "aliases": ["slovenian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "sn_zw": {"name": "Shona", "scripts": ["latin"], "aliases": ["shona"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "so_so": {"name": "Somali", "scripts": ["latin"], "aliases": ["somali"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "sr_rs": {"name": "Serbian", "scripts": ["cyrillic"], "aliases": ["serbian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "sv_se": {"name": "Swedish", "scripts": ["latin"], "aliases": ["swedish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "sw_ke": {"name": "Swahili", "scripts": ["latin"], "aliases": ["swahili"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ta_in": {"name": "Tamil", "scripts": ["tamil"], "aliases": ["tamil"]},
-    "te_in": {"name": "Telugu", "scripts": ["telugu"], "aliases": ["telugu"]},
-    "tg_tj": {"name": "Tajik", "scripts": ["cyrillic"], "aliases": ["tajik"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "th_th": {"name": "Thai", "scripts": ["thai"], "aliases": ["thai"]},
-    "tr_tr": {"name": "Turkish", "scripts": ["latin"], "aliases": ["turkish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "uk_ua": {"name": "Ukrainian", "scripts": ["cyrillic"], "aliases": ["ukrainian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
-    "umb_ao": {"name": "Umbundu", "scripts": ["latin"], "aliases": ["umbundu"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "ur_pk": {"name": "Urdu", "scripts": ["arabic"], "aliases": ["urdu"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
-    "uz_uz": {"name": "Uzbek", "scripts": ["latin"], "aliases": ["uzbek"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "vi_vn": {"name": "Vietnamese", "scripts": ["latin"], "aliases": ["vietnamese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "wo_sn": {"name": "Wolof", "scripts": ["latin"], "aliases": ["wolof"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "xh_za": {"name": "Xhosa", "scripts": ["latin"], "aliases": ["xhosa"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "yo_ng": {"name": "Yoruba", "scripts": ["latin"], "aliases": ["yoruba"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
-    "yue_hant_hk": {"name": "Cantonese", "scripts": ["han"], "aliases": ["cantonese", "traditional_chinese"], "shared_script": true, "warning": "Han SFR does not identify the spoken language or script variant."},
-    "zu_za": {"name": "Zulu", "scripts": ["latin"], "aliases": ["zulu"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."}
-  }
-}

script_fidelity/dominant.py DELETED Viewed

@@ -1,75 +0,0 @@
-"""Dominant script helpers for SFR audits."""
-from __future__ import annotations
-import unicodedata
-from collections import Counter
-from .registry import _registry
-from .types import DigitPolicy
-def is_countable(ch: str, digit_policy: DigitPolicy = "count") -> bool:
-    """Return whether a character should count in an SFR denominator."""
-    if digit_policy not in {"count", "ignore"}:
-        raise ValueError("digit_policy must be 'count' or 'ignore'")
-    cat = unicodedata.category(ch)
-    if digit_policy == "ignore" and cat.startswith("N"):
-        return False
-    return (
-        not ch.isspace()
-        and not cat.startswith("P")
-        and not cat.startswith("Z")
-        and not cat.startswith("C")
-        and not cat.startswith("M")
-    )
-def _in_ranges(cp: int, ranges: list[list[int]]) -> bool:
-    return any(int(lo) <= cp <= int(hi) for lo, hi in ranges)
-def script_distribution(
-    text: str,
-    *,
-    digit_policy: DigitPolicy = "count",
-) -> dict[str, int]:
-    """Count broad Unicode script families in text."""
-    normalized = unicodedata.normalize("NFC", text or "")
-    scripts = _registry()["scripts"]
-    counts: Counter[str] = Counter()
-    for ch in normalized:
-        if not is_countable(ch, digit_policy=digit_policy):
-            continue
-        cp = ord(ch)
-        label = "other"
-        for script_id, config in scripts.items():
-            if _in_ranges(cp, config["ranges"]):
-                label = script_id
-                break
-        counts[label] += 1
-    return dict(counts)
-def dominant_script(
-    text: str,
-    *,
-    digit_policy: DigitPolicy = "count",
-    threshold: float = 0.5,
-) -> str:
-    """Return the dominant script label, ``mixed``, or ``empty``."""
-    counts = script_distribution(text, digit_policy=digit_policy)
-    total = sum(counts.values())
-    if total == 0:
-        return "empty"
-    script, count = max(counts.items(), key=lambda item: item[1])
-    if count / total >= threshold:
-        return script
-    return "mixed"

script_fidelity/registry.py DELETED Viewed

@@ -1,86 +0,0 @@
-"""FLEURS language registry for Script Fidelity Rate."""
-from __future__ import annotations
-import json
-from functools import lru_cache
-from importlib.resources import files
-from .types import ScriptConfig
-@lru_cache(maxsize=1)
-def _registry() -> dict:
-    data_path = files("script_fidelity").joinpath("data/fleurs_registry.json")
-    return json.loads(data_path.read_text(encoding="utf-8"))
-def _script_ranges(script_ids: list[str]) -> tuple[tuple[int, int], ...]:
-    scripts = _registry()["scripts"]
-    ranges: list[tuple[int, int]] = []
-    seen: set[tuple[int, int]] = set()
-    for script_id in script_ids:
-        for lo, hi in scripts[script_id]["ranges"]:
-            item = (int(lo), int(hi))
-            if item not in seen:
-                ranges.append(item)
-                seen.add(item)
-    return tuple(ranges)
-@lru_cache(maxsize=1)
-def _language_configs() -> dict[str, ScriptConfig]:
-    configs: dict[str, ScriptConfig] = {}
-    for code, item in _registry()["languages"].items():
-        script_ids = item["scripts"]
-        configs[code] = ScriptConfig(
-            code=code,
-            name=item["name"],
-            script="+".join(script_ids),
-            ranges=_script_ranges(script_ids),
-            aliases=tuple(item.get("aliases", [])),
-            shared_script=bool(item.get("shared_script", False)),
-            warning=item.get("warning", ""),
-        )
-    return configs
-@lru_cache(maxsize=1)
-def _alias_map() -> dict[str, str]:
-    aliases: dict[str, str] = {}
-    for code, config in _language_configs().items():
-        aliases[code.lower()] = code
-        aliases[code.replace("_", "-").lower()] = code
-        for alias in config.aliases:
-            aliases[alias.lower()] = code
-    return aliases
-def resolve_language(language: str) -> str:
-    """Resolve a FLEURS code or alias to a canonical FLEURS code."""
-    normalized = language.strip().lower().replace(" ", "_")
-    try:
-        return _alias_map()[normalized]
-    except KeyError as exc:
-        known = ", ".join(list_languages()[:12])
-        raise ValueError(
-            f"Unknown language '{language}'. Use a FLEURS code such as ps_af, "
-            f"or an alias such as pashto. Examples: {known}, ..."
-        ) from exc
-def get_script_config(language: str) -> ScriptConfig:
-    """Return the reviewed script configuration for a language."""
-    return _language_configs()[resolve_language(language)]
-def list_languages() -> list[str]:
-    """Return canonical FLEURS language codes supported by the registry."""
-    return sorted(_language_configs())
-# Sorted canonical language codes as an immutable tuple; built at import time.
-FLEURS_CONFIGS = tuple(list_languages())
-# Code -> ScriptConfig mapping; this is the same dict object that the cached
-# _language_configs() returns, so mutating it would affect later lookups.
-SCRIPT_CONFIGS = _language_configs()

script_fidelity/types.py DELETED Viewed

@@ -1,33 +0,0 @@
-"""Shared types for Script Fidelity Rate."""
-from __future__ import annotations
-from dataclasses import dataclass
-from typing import Literal
-DigitPolicy = Literal["count", "ignore"]
-@dataclass(frozen=True)
-class ScriptConfig:
-    """Script configuration for one FLEURS language."""
-    # Language code identifying this entry (e.g. "ps_af").
-    code: str
-    # Human-readable language name.
-    name: str
-    # Script label; presumably "+"-joined script ids for multi-script
-    # languages — confirm against the registry builder.
-    script: str
-    # (lo, hi) code point pairs belonging to the language's script(s).
-    ranges: tuple[tuple[int, int], ...]
-    # Alternative names accepted for this language; empty by default.
-    aliases: tuple[str, ...] = ()
-    # True when other languages use the same script; False by default.
-    shared_script: bool = False
-    # Caveat text for this language; empty string when there is none.
-    warning: str = ""
-@dataclass(frozen=True)
-class SFRResult:
-    """Per-text Script Fidelity Rate result."""
-    # Language the text was scored against (presumably the canonical code —
-    # TODO confirm against the code that builds this result).
-    language: str
-    # The score, or None; presumably None when nothing is countable — TODO confirm.
-    sfr: float | None
-    # NOTE(review): looks like sfr == numerator / denominator when
-    # denominator > 0 — confirm against the scoring code.
-    numerator: int
-    denominator: int
-    # Script label for the text as a whole.
-    dominant_script: str
-    # Per-script character counts. NOTE: frozen=True blocks attribute
-    # rebinding only; this dict itself stays mutable and makes instances
-    # unhashable.
-    script_counts: dict[str, int]

metrics/script_fidelity_rate/script_fidelity_rate.py → script_fidelity_rate.py RENAMED Viewed

@@ -2,18 +2,9 @@
 from __future__ import annotations
-import sys
-from pathlib import Path
 import datasets
 import evaluate
-CURRENT_DIR = Path(__file__).resolve().parent
-for parent in (CURRENT_DIR, CURRENT_DIR.parent, CURRENT_DIR.parent.parent):
-    if (parent / "script_fidelity").exists():
-        sys.path.insert(0, str(parent))
-        break
 from script_fidelity import compute_corpus_sfr  # noqa: E402

 from __future__ import annotations
 import datasets
 import evaluate
 from script_fidelity import compute_corpus_sfr  # noqa: E402

script_fidelity_rate/.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

script_fidelity_rate/README.md DELETED Viewed

@@ -1,50 +0,0 @@
----
-title: script_fidelity_rate
-datasets:
--
-tags:
-- evaluate
-- metric
-description: "TODO: add a description here"
-sdk: gradio
-sdk_version: 3.19.1
-app_file: app.py
-pinned: false
----
-# Metric Card for script_fidelity_rate
-***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing metric cards if you'd like examples.*
-## Metric Description
-*Give a brief overview of this metric, including what task(s) it is usually used for, if any.*
-## How to Use
-*Give a general statement of how to use the metric.*
-*Provide simplest possible example for using the metric*
-### Inputs
-*List all input arguments in the format below*
-- **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
-### Output Values
-*Explain what this metric outputs and provide an example of what the metric output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}*
-*State the range of possible values that the metric's output can take, as well as what in that range is considered good. For example: "This metric can take on any value between 0 and 100, inclusive. Higher scores are better."*
-#### Values from Popular Papers
-*Give examples, preferably with links to leaderboards or publications, of papers that have reported this metric, along with the values they have reported.*
-### Examples
-*Give code examples of the metric being used. Try to include examples that clear up any potential ambiguity left from the metric description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
-## Limitations and Bias
-*Note any known limitations or biases that the metric has, with links and references if possible.*
-## Citation
-*Cite the source where this metric was introduced.*
-## Further References
-*Add any useful further references.*

script_fidelity_rate/requirements.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- git+https://github.com/huggingface/evaluate@main

script_fidelity_rate/script_fidelity_rate.py DELETED Viewed

@@ -1,95 +0,0 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""TODO: Add a description here."""
-import evaluate
-import datasets
-# TODO: Add BibTeX citation
-# Placeholder BibTeX entry from the module template; passed to MetricInfo as `citation`.
-_CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
-}
-"""
-# TODO: Add description of the module here
-# Placeholder text; passed to MetricInfo as `description` and to the class decorator.
-_DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
-"""
-# TODO: Add description of the arguments of the module here
-# NOTE(review): this text describes string inputs and an `another_score` output,
-# but the class below declares int64 features and returns only `accuracy`.
-_KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
-Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
-Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
-Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
-    >>> print(results)
-    {'accuracy': 1.0}
-"""
-# TODO: Define external resources urls if needed
-# Placeholder URL; nothing in this file reads it.
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
-@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class script_fidelity_rate(evaluate.Metric):
-    """TODO: Short description of my evaluation module."""
-    def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
-        return evaluate.MetricInfo(
-            # This is the description that will appear on the modules page.
-            module_type="metric",
-            description=_DESCRIPTION,
-            citation=_CITATION,
-            inputs_description=_KWARGS_DESCRIPTION,
-            # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
-            }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
-        )
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-    def _compute(self, predictions, references):
-        """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-        return {
-            "accuracy": accuracy,
-        }

script_fidelity_rate/tests.py DELETED Viewed

@@ -1,17 +0,0 @@
-# Template fixture: each case pairs inputs with the expected result dict.
-# NOTE(review): the expected key is "metric_score", while the template metric
-# class in this same directory returns its score under "accuracy" — confirm
-# which key the consumer of these cases expects.
-test_cases = [
-    {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
-    },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]

tests/test_cli.py DELETED Viewed

@@ -1,81 +0,0 @@
-import csv
-import json
-import subprocess
-import sys
-from pathlib import Path
-def test_cli_score():
-    result = subprocess.run(
-        [
-            sys.executable,
-            "-m",
-            "script_fidelity",
-            "score",
-            "--language",
-            "ps_af",
-            "--text",
-            "کابل کې ښه هوا ده",
-        ],
-        check=True,
-        capture_output=True,
-        text=True,
-    )
-    assert result.stdout.strip() == "1.000000"
-def test_cli_audit_jsonl(tmp_path: Path):
-    path = tmp_path / "predictions.jsonl"
-    rows = [
-        {"prediction": "کابل کې ښه هوا ده"},
-        {"prediction": "romanized output"},
-    ]
-    path.write_text("\n".join(json.dumps(row, ensure_ascii=False) for row in rows))
-    result = subprocess.run(
-        [
-            sys.executable,
-            "-m",
-            "script_fidelity",
-            "audit",
-            str(path),
-            "--language",
-            "ps_af",
-            "--text-column",
-            "prediction",
-        ],
-        check=True,
-        capture_output=True,
-        text=True,
-    )
-    summary = json.loads(result.stdout)
-    assert summary["n"] == 2
-    assert summary["sfr"] == 0.5
-def test_cli_audit_csv_format(tmp_path: Path):
-    path = tmp_path / "predictions.csv"
-    with path.open("w", encoding="utf-8", newline="") as handle:
-        writer = csv.DictWriter(handle, fieldnames=["prediction"])
-        writer.writeheader()
-        writer.writerow({"prediction": "বাংলা ভাষা"})
-        writer.writerow({"prediction": "namaste"})
-    result = subprocess.run(
-        [
-            sys.executable,
-            "-m",
-            "script_fidelity",
-            "audit",
-            str(path),
-            "--language",
-            "bn_in",
-            "--format",
-            "csv",
-        ],
-        check=True,
-        capture_output=True,
-        text=True,
-    )
-    assert "sfr_percent" in result.stdout
-    assert "50.0" in result.stdout

tests/test_core.py DELETED Viewed

@@ -1,59 +0,0 @@
-from script_fidelity import (
-    compute_corpus_sfr,
-    compute_sfr,
-    compute_sfr_batch,
-    dominant_script,
-    script_distribution,
-)
-def test_pashto_positive_and_latin_collapse():
-    assert compute_sfr("کابل کې ښه هوا ده", language="ps_af") == 1.0
-    assert compute_sfr("this is romanized pashto", language="pashto") == 0.0
-def test_bengali_vs_devanagari_wrong_script():
-    assert compute_sfr("বাংলা ভাষা", language="bn_in") == 1.0
-    assert compute_sfr("नमस्ते दुनिया", language="bengali") == 0.0
-def test_somali_latin_positive_and_arabic_negative():
-    assert compute_sfr("Somali waa luuqad", language="so_so") == 1.0
-    assert compute_sfr("كابل في هواء جيد", language="somali") == 0.0
-def test_empty_punctuation_combining_and_emoji_cases():
-    assert compute_sfr("", language="ps_af") is None
-    assert compute_sfr("...?!", language="ps_af") is None
-    assert compute_sfr("\u0301\u0301", language="ps_af") is None
-    assert compute_sfr("🙂", language="ps_af") == 0.0
-    assert dominant_script("...?!") == "empty"
-def test_mixed_script_and_distribution():
-    score = compute_sfr("বাংলা भाषा", language="bn_in")
-    assert score is not None
-    assert 0.0 < score < 1.0
-    counts = script_distribution("বাংলা भाषा")
-    assert counts["bengali"] > 0
-    assert counts["devanagari"] > 0
-def test_digit_policy_count_and_ignore():
-    counted = compute_sfr("کابل 123", language="ps_af")
-    ignored = compute_sfr("کابل 123", language="ps_af", digit_policy="ignore")
-    assert counted == 4 / 7
-    assert ignored == 1.0
-def test_batch_and_corpus_summary():
-    predictions = ["کابل کې ښه هوا ده", "romanized output", "..."]
-    scores = compute_sfr_batch(predictions, language="pashto")
-    assert scores == [1.0, 0.0, None]
-    summary = compute_corpus_sfr(predictions, language="pashto")
-    assert summary["n"] == 3
-    assert summary["n_valid"] == 2
-    assert summary["n_empty"] == 1
-    assert summary["sfr"] == 0.5
-    assert summary["low_sfr_rate"] == 0.5

tests/test_evaluate_metric.py DELETED Viewed

@@ -1,27 +0,0 @@
-import pytest
-evaluate = pytest.importorskip("evaluate")
-def test_local_evaluate_metric_matches_package():
-    from script_fidelity import compute_corpus_sfr
-    predictions = ["کابل کې ښه هوا ده", "romanized output", "..."]
-    metric = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
-    actual = metric.compute(predictions=predictions, language="ps_af")
-    expected = compute_corpus_sfr(predictions, language="ps_af")
-    assert actual["sfr"] == expected["sfr"]
-    assert actual["n_empty"] == expected["n_empty"]
-def test_evaluate_metric_details():
-    metric = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
-    result = metric.compute(
-        predictions=["বাংলা ভাষা", "नमस्ते"],
-        language="bn_in",
-        return_details=True,
-    )
-    assert len(result["details"]) == 2
-    assert result["details"][0]["sfr"] == 1.0
-    assert result["details"][1]["sfr"] == 0.0

tests/test_registry.py DELETED Viewed

@@ -1,42 +0,0 @@
-from script_fidelity import FLEURS_CONFIGS, get_script_config, list_languages, resolve_language
-def test_registry_has_all_fleurs_configs_except_all():
-    codes = list_languages()
-    assert len(codes) == 102
-    assert "all" not in codes
-    assert tuple(codes) == FLEURS_CONFIGS
-def test_every_language_has_ranges():
-    for code in list_languages():
-        config = get_script_config(code)
-        assert config.code == code
-        assert config.ranges
-def test_aliases_for_paper_languages():
-    aliases = {
-        "pashto": "ps_af",
-        "urdu": "ur_pk",
-        "arabic": "ar_eg",
-        "persian": "fa_ir",
-        "farsi": "fa_ir",
-        "hindi": "hi_in",
-        "bengali": "bn_in",
-        "malayalam": "ml_in",
-        "tamil": "ta_in",
-        "somali": "so_so",
-        "georgian": "ka_ge",
-    }
-    for alias, code in aliases.items():
-        assert resolve_language(alias) == code
-def test_shared_script_metadata():
-    for code in ["ps_af", "ur_pk", "fa_ir", "ar_eg", "so_so", "hi_in"]:
-        config = get_script_config(code)
-        assert config.shared_script is True
-        assert config.warning
-    assert get_script_config("ka_ge").shared_script is False

uv.lock DELETED Viewed

The diff for this file is too large to render. See raw diff