Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- MANIFEST.in +6 -0
- README.md +281 -13
- dist/.gitignore +1 -0
- dist/script_fidelity-0.1.1-py3-none-any.whl +0 -0
- dist/script_fidelity-0.1.1.tar.gz +3 -0
- examples/ci_gate.py +7 -0
- examples/hf_evaluate.py +8 -0
- examples/pandas_dataframe.py +7 -0
- examples/plain_python.py +7 -0
- examples/transformers_compute_metrics.py +14 -0
- metrics/script_fidelity_rate/README.md +31 -0
- metrics/script_fidelity_rate/requirements.txt +2 -0
- metrics/script_fidelity_rate/script_fidelity_rate.py +79 -0
- pyproject.toml +50 -0
- script_fidelity/__init__.py +40 -0
- script_fidelity/__main__.py +3 -0
- script_fidelity/cli.py +133 -0
- script_fidelity/core.py +132 -0
- script_fidelity/data/fleurs_registry.json +210 -0
- script_fidelity/dominant.py +75 -0
- script_fidelity/registry.py +86 -0
- script_fidelity/types.py +33 -0
- script_fidelity_rate/.gitattributes +35 -0
- script_fidelity_rate/README.md +50 -0
- script_fidelity_rate/app.py +6 -0
- script_fidelity_rate/requirements.txt +1 -0
- script_fidelity_rate/script_fidelity_rate.py +95 -0
- script_fidelity_rate/tests.py +17 -0
- tests/test_cli.py +81 -0
- tests/test_core.py +59 -0
- tests/test_evaluate_metric.py +27 -0
- tests/test_registry.py +42 -0
- uv.lock +0 -0
MANIFEST.in
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
include README.md
|
| 2 |
+
include pyproject.toml
|
| 3 |
+
recursive-include script_fidelity/data *.json
|
| 4 |
+
recursive-include metrics *
|
| 5 |
+
recursive-include examples *
|
| 6 |
+
recursive-include tests *.py
|
README.md
CHANGED
|
@@ -1,13 +1,281 @@
|
|
| 1 |
-
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# script-fidelity
|
| 2 |
+
|
| 3 |
+
`script-fidelity` is a small Python package for Script Fidelity Rate (SFR), a
|
| 4 |
+
reference-free metric for multilingual ASR. SFR measures the fraction of
|
| 5 |
+
countable hypothesis characters that belong to the expected Unicode script for a
|
| 6 |
+
target language.
|
| 7 |
+
|
| 8 |
+
Quick signals:
|
| 9 |
+
|
| 10 |
+
- Install with `uv add script-fidelity`
|
| 11 |
+
- Load with HF Evaluate via `themechanism/script_fidelity_rate`
|
| 12 |
+
- Supports 102 FLEURS language configs, excluding `all`
|
| 13 |
+
- PyPI: <https://pypi.org/project/script-fidelity/>
|
| 14 |
+
|
| 15 |
+
Use SFR with WER and CER. SFR checks script validity; WER and CER measure
|
| 16 |
+
transcription error against references.
|
| 17 |
+
|
| 18 |
+
## install
|
| 19 |
+
|
| 20 |
+
For package development in this repo:
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
uv sync --extra dev
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
For a downstream project:
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
uv add script-fidelity
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
Run the CLI without adding it to a project:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
uvx --from script-fidelity sfr score --language ps_af --text "کابل کې ښه هوا ده"
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Python use
|
| 39 |
+
|
| 40 |
+
```python
|
| 41 |
+
from script_fidelity import compute_sfr, compute_sfr_batch
|
| 42 |
+
|
| 43 |
+
score = compute_sfr("کابل کې ښه هوا ده", language="ps_af")
|
| 44 |
+
scores = compute_sfr_batch(
|
| 45 |
+
["کابل کې ښه هوا ده", "this is romanized output"],
|
| 46 |
+
language="pashto",
|
| 47 |
+
)
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
Digits count by default, matching the paper. Treat digits as neutral with
|
| 51 |
+
`digit_policy="ignore"`.
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
compute_sfr("کابل 2026", language="ps_af", digit_policy="ignore")
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
## HF Evaluate use
|
| 58 |
+
|
| 59 |
+
Local metric:
|
| 60 |
+
|
| 61 |
+
```python
|
| 62 |
+
import evaluate
|
| 63 |
+
|
| 64 |
+
sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
|
| 65 |
+
sfr.compute(predictions=["کابل کې ښه هوا ده"], language="ps_af")
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
Hub metric after publishing:
|
| 69 |
+
|
| 70 |
+
```python
|
| 71 |
+
import evaluate
|
| 72 |
+
|
| 73 |
+
sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 74 |
+
sfr.compute(predictions=["کابل کې ښه هوا ده"], language="ps_af")
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## CLI
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
sfr score --language ps_af --text "کابل کې ښه هوا ده"
|
| 81 |
+
sfr audit predictions.jsonl --language ps_af --text-column prediction
|
| 82 |
+
sfr audit predictions.csv --language bn_in --text-column transcript --format csv
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
## ASR batch example
|
| 86 |
+
|
| 87 |
+
```python
|
| 88 |
+
from script_fidelity import compute_corpus_sfr
|
| 89 |
+
|
| 90 |
+
predictions = [
|
| 91 |
+
item["text"]
|
| 92 |
+
for item in whisper_outputs
|
| 93 |
+
]
|
| 94 |
+
|
| 95 |
+
summary = compute_corpus_sfr(predictions, language="bn_in")
|
| 96 |
+
print(summary["sfr_percent"])
|
| 97 |
+
print(summary["dominant_script_counts"])
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
## pandas dataframe example
|
| 101 |
+
|
| 102 |
+
```python
|
| 103 |
+
import pandas as pd
|
| 104 |
+
from script_fidelity import compute_sfr
|
| 105 |
+
|
| 106 |
+
df = pd.read_json("predictions.jsonl", lines=True)
|
| 107 |
+
df["sfr"] = df["prediction"].map(lambda text: compute_sfr(text, language="ps_af"))
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
## Transformers compute_metrics example
|
| 111 |
+
|
| 112 |
+
```python
|
| 113 |
+
import evaluate
|
| 114 |
+
|
| 115 |
+
wer = evaluate.load("wer")
|
| 116 |
+
sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 117 |
+
|
| 118 |
+
def compute_metrics(eval_pred):
|
| 119 |
+
predictions, labels = eval_pred
|
| 120 |
+
pred_text = processor.batch_decode(predictions, skip_special_tokens=True)
|
| 121 |
+
label_text = processor.batch_decode(labels, skip_special_tokens=True)
|
| 122 |
+
return {
|
| 123 |
+
"wer": wer.compute(predictions=pred_text, references=label_text),
|
| 124 |
+
"sfr": sfr.compute(predictions=pred_text, language="ps_af")["sfr"],
|
| 125 |
+
}
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
## CI gate example
|
| 129 |
+
|
| 130 |
+
```python
|
| 131 |
+
from script_fidelity import compute_corpus_sfr
|
| 132 |
+
|
| 133 |
+
summary = compute_corpus_sfr(predictions, language="ml_in")
|
| 134 |
+
if summary["sfr"] < 0.90:
|
| 135 |
+
raise SystemExit("SFR regression: Malayalam output is below 90% target script")
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
## shared-script caveats
|
| 139 |
+
|
| 140 |
+
SFR is a script check, not a language identifier. Pashto, Urdu, Persian, Arabic,
|
| 141 |
+
Central Kurdish, and Sindhi share Arabic-script Unicode blocks. For Latin-script
|
| 142 |
+
languages, SFR mostly detects romanization or non-Latin substitution, not language
|
| 143 |
+
identity. Pair SFR with language ID or lexical checks when shared-script
|
| 144 |
+
confusions matter.
|
| 145 |
+
|
| 146 |
+
Use `dominant_script()` and `script_distribution()` to inspect failures:
|
| 147 |
+
|
| 148 |
+
```python
|
| 149 |
+
from script_fidelity import dominant_script, script_distribution
|
| 150 |
+
|
| 151 |
+
dominant_script("this is romanized output")
|
| 152 |
+
script_distribution("বাংলা भाषा")
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
## FLEURS codes
|
| 156 |
+
|
| 157 |
+
The registry covers the 102 FLEURS language configs listed by `sfr languages`.
|
| 158 |
+
These paper languages have short aliases:
|
| 159 |
+
|
| 160 |
+
| FLEURS code | Alias | Script |
|
| 161 |
+
|---|---|---|
|
| 162 |
+
| `ps_af` | `pashto` | Arabic |
|
| 163 |
+
| `ur_pk` | `urdu` | Arabic |
|
| 164 |
+
| `ar_eg` | `arabic` | Arabic |
|
| 165 |
+
| `fa_ir` | `persian`, `farsi` | Arabic |
|
| 166 |
+
| `hi_in` | `hindi` | Devanagari |
|
| 167 |
+
| `bn_in` | `bengali`, `bangla` | Bengali |
|
| 168 |
+
| `ml_in` | `malayalam` | Malayalam |
|
| 169 |
+
| `ta_in` | `tamil` | Tamil |
|
| 170 |
+
| `so_so` | `somali` | Latin |
|
| 171 |
+
| `ka_ge` | `georgian` | Georgian |
|
| 172 |
+
|
| 173 |
+
For the full reviewed registry, see
|
| 174 |
+
`script_fidelity/data/fleurs_registry.json`.
|
| 175 |
+
|
| 176 |
+
Full code table:
|
| 177 |
+
|
| 178 |
+
| Code | Language | Script |
|
| 179 |
+
|---|---|---|
|
| 180 |
+
| `af_za` | Afrikaans | Latin |
|
| 181 |
+
| `am_et` | Amharic | Ethiopic |
|
| 182 |
+
| `ar_eg` | Arabic | Arabic |
|
| 183 |
+
| `as_in` | Assamese | Bengali |
|
| 184 |
+
| `ast_es` | Asturian | Latin |
|
| 185 |
+
| `az_az` | Azerbaijani | Latin |
|
| 186 |
+
| `be_by` | Belarusian | Cyrillic |
|
| 187 |
+
| `bg_bg` | Bulgarian | Cyrillic |
|
| 188 |
+
| `bn_in` | Bengali | Bengali |
|
| 189 |
+
| `bs_ba` | Bosnian | Latin |
|
| 190 |
+
| `ca_es` | Catalan | Latin |
|
| 191 |
+
| `ceb_ph` | Cebuano | Latin |
|
| 192 |
+
| `ckb_iq` | Central Kurdish | Arabic |
|
| 193 |
+
| `cmn_hans_cn` | Mandarin Chinese | Han |
|
| 194 |
+
| `cs_cz` | Czech | Latin |
|
| 195 |
+
| `cy_gb` | Welsh | Latin |
|
| 196 |
+
| `da_dk` | Danish | Latin |
|
| 197 |
+
| `de_de` | German | Latin |
|
| 198 |
+
| `el_gr` | Greek | Greek |
|
| 199 |
+
| `en_us` | English | Latin |
|
| 200 |
+
| `es_419` | Spanish | Latin |
|
| 201 |
+
| `et_ee` | Estonian | Latin |
|
| 202 |
+
| `fa_ir` | Persian | Arabic |
|
| 203 |
+
| `ff_sn` | Fulah | Latin |
|
| 204 |
+
| `fi_fi` | Finnish | Latin |
|
| 205 |
+
| `fil_ph` | Filipino | Latin |
|
| 206 |
+
| `fr_fr` | French | Latin |
|
| 207 |
+
| `ga_ie` | Irish | Latin |
|
| 208 |
+
| `gl_es` | Galician | Latin |
|
| 209 |
+
| `gu_in` | Gujarati | Gujarati |
|
| 210 |
+
| `ha_ng` | Hausa | Latin |
|
| 211 |
+
| `he_il` | Hebrew | Hebrew |
|
| 212 |
+
| `hi_in` | Hindi | Devanagari |
|
| 213 |
+
| `hr_hr` | Croatian | Latin |
|
| 214 |
+
| `hu_hu` | Hungarian | Latin |
|
| 215 |
+
| `hy_am` | Armenian | Armenian |
|
| 216 |
+
| `id_id` | Indonesian | Latin |
|
| 217 |
+
| `ig_ng` | Igbo | Latin |
|
| 218 |
+
| `is_is` | Icelandic | Latin |
|
| 219 |
+
| `it_it` | Italian | Latin |
|
| 220 |
+
| `ja_jp` | Japanese | Han, Hiragana, Katakana |
|
| 221 |
+
| `jv_id` | Javanese | Latin |
|
| 222 |
+
| `ka_ge` | Georgian | Georgian |
|
| 223 |
+
| `kam_ke` | Kamba | Latin |
|
| 224 |
+
| `kea_cv` | Kabuverdianu | Latin |
|
| 225 |
+
| `kk_kz` | Kazakh | Cyrillic |
|
| 226 |
+
| `km_kh` | Khmer | Khmer |
|
| 227 |
+
| `kn_in` | Kannada | Kannada |
|
| 228 |
+
| `ko_kr` | Korean | Hangul |
|
| 229 |
+
| `ky_kg` | Kyrgyz | Cyrillic |
|
| 230 |
+
| `lb_lu` | Luxembourgish | Latin |
|
| 231 |
+
| `lg_ug` | Ganda | Latin |
|
| 232 |
+
| `ln_cd` | Lingala | Latin |
|
| 233 |
+
| `lo_la` | Lao | Lao |
|
| 234 |
+
| `lt_lt` | Lithuanian | Latin |
|
| 235 |
+
| `luo_ke` | Luo | Latin |
|
| 236 |
+
| `lv_lv` | Latvian | Latin |
|
| 237 |
+
| `mi_nz` | Maori | Latin |
|
| 238 |
+
| `mk_mk` | Macedonian | Cyrillic |
|
| 239 |
+
| `ml_in` | Malayalam | Malayalam |
|
| 240 |
+
| `mn_mn` | Mongolian | Cyrillic |
|
| 241 |
+
| `mr_in` | Marathi | Devanagari |
|
| 242 |
+
| `ms_my` | Malay | Latin |
|
| 243 |
+
| `mt_mt` | Maltese | Latin |
|
| 244 |
+
| `my_mm` | Burmese | Myanmar |
|
| 245 |
+
| `nb_no` | Norwegian Bokmål | Latin |
|
| 246 |
+
| `ne_np` | Nepali | Devanagari |
|
| 247 |
+
| `nl_nl` | Dutch | Latin |
|
| 248 |
+
| `nso_za` | Northern Sotho | Latin |
|
| 249 |
+
| `ny_mw` | Chichewa | Latin |
|
| 250 |
+
| `oc_fr` | Occitan | Latin |
|
| 251 |
+
| `om_et` | Oromo | Latin |
|
| 252 |
+
| `or_in` | Odia | Odia |
|
| 253 |
+
| `pa_in` | Punjabi | Gurmukhi |
|
| 254 |
+
| `pl_pl` | Polish | Latin |
|
| 255 |
+
| `ps_af` | Pashto | Arabic |
|
| 256 |
+
| `pt_br` | Portuguese | Latin |
|
| 257 |
+
| `ro_ro` | Romanian | Latin |
|
| 258 |
+
| `ru_ru` | Russian | Cyrillic |
|
| 259 |
+
| `sd_in` | Sindhi | Arabic |
|
| 260 |
+
| `sk_sk` | Slovak | Latin |
|
| 261 |
+
| `sl_si` | Slovenian | Latin |
|
| 262 |
+
| `sn_zw` | Shona | Latin |
|
| 263 |
+
| `so_so` | Somali | Latin |
|
| 264 |
+
| `sr_rs` | Serbian | Cyrillic |
|
| 265 |
+
| `sv_se` | Swedish | Latin |
|
| 266 |
+
| `sw_ke` | Swahili | Latin |
|
| 267 |
+
| `ta_in` | Tamil | Tamil |
|
| 268 |
+
| `te_in` | Telugu | Telugu |
|
| 269 |
+
| `tg_tj` | Tajik | Cyrillic |
|
| 270 |
+
| `th_th` | Thai | Thai |
|
| 271 |
+
| `tr_tr` | Turkish | Latin |
|
| 272 |
+
| `uk_ua` | Ukrainian | Cyrillic |
|
| 273 |
+
| `umb_ao` | Umbundu | Latin |
|
| 274 |
+
| `ur_pk` | Urdu | Arabic |
|
| 275 |
+
| `uz_uz` | Uzbek | Latin |
|
| 276 |
+
| `vi_vn` | Vietnamese | Latin |
|
| 277 |
+
| `wo_sn` | Wolof | Latin |
|
| 278 |
+
| `xh_za` | Xhosa | Latin |
|
| 279 |
+
| `yo_ng` | Yoruba | Latin |
|
| 280 |
+
| `yue_hant_hk` | Cantonese | Han |
|
| 281 |
+
| `zu_za` | Zulu | Latin |
|
dist/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*
|
dist/script_fidelity-0.1.1-py3-none-any.whl
ADDED
|
Binary file (14.2 kB). View file
|
|
|
dist/script_fidelity-0.1.1.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e36da45cddd306e6794eb59bd06cbd3fe9ae19801791bbe5c02862952aa89a8
|
| 3 |
+
size 18936
|
examples/ci_gate.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from script_fidelity import compute_corpus_sfr
|
| 2 |
+
|
| 3 |
+
predictions = ["മലയാളം വാക്യം", "malayalam romanized output"]
|
| 4 |
+
summary = compute_corpus_sfr(predictions, language="ml_in")
|
| 5 |
+
|
| 6 |
+
if summary["sfr"] < 0.90:
|
| 7 |
+
raise SystemExit("SFR regression: Malayalam output is below 90% target script")
|
examples/hf_evaluate.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import evaluate
|
| 2 |
+
|
| 3 |
+
sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
|
| 4 |
+
result = sfr.compute(
|
| 5 |
+
predictions=["کابل کې ښه هوا ده", "this is romanized output"],
|
| 6 |
+
language="ps_af",
|
| 7 |
+
)
|
| 8 |
+
print(result["sfr_percent"])
|
examples/pandas_dataframe.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
from script_fidelity import compute_sfr
|
| 4 |
+
|
| 5 |
+
df = pd.read_json("predictions.jsonl", lines=True)
|
| 6 |
+
df["sfr"] = df["prediction"].map(lambda text: compute_sfr(text, language="ps_af"))
|
| 7 |
+
print(df[["prediction", "sfr"]].head())
|
examples/plain_python.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from script_fidelity import compute_corpus_sfr, compute_sfr
|
| 2 |
+
|
| 3 |
+
text = "کابل کې ښه هوا ده"
|
| 4 |
+
print(compute_sfr(text, language="ps_af"))
|
| 5 |
+
|
| 6 |
+
predictions = [text, "this is romanized output"]
|
| 7 |
+
print(compute_corpus_sfr(predictions, language="pashto"))
|
examples/transformers_compute_metrics.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import evaluate
|
| 2 |
+
|
| 3 |
+
wer = evaluate.load("wer")
|
| 4 |
+
sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def compute_metrics(eval_pred):
    """Decode a ``(predictions, labels)`` pair and report WER together with SFR.

    Relies on the module-level ``wer`` and ``sfr`` metrics loaded above and on
    a ``processor`` object exposing ``batch_decode``. ``processor`` is not
    defined in this example, so the caller must provide it before use.

    Returns:
        A dict with the ``"wer"`` score and the corpus-level ``"sfr"`` value
        for Pashto (``ps_af``).
    """
    predictions, labels = eval_pred
    decode = processor.batch_decode
    pred_text = decode(predictions, skip_special_tokens=True)
    label_text = decode(labels, skip_special_tokens=True)

    # WER needs the references; SFR is reference-free and only sees hypotheses.
    wer_score = wer.compute(predictions=pred_text, references=label_text)
    sfr_summary = sfr.compute(predictions=pred_text, language="ps_af")
    return {"wer": wer_score, "sfr": sfr_summary["sfr"]}
|
metrics/script_fidelity_rate/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Script Fidelity Rate
|
| 2 |
+
|
| 3 |
+
This directory is the Hugging Face Evaluate metric module for Script Fidelity
|
| 4 |
+
Rate (SFR).
|
| 5 |
+
|
| 6 |
+
The Python package is published as `script-fidelity` on PyPI:
|
| 7 |
+
<https://pypi.org/project/script-fidelity/>. The import name is
|
| 8 |
+
`script_fidelity`.
|
| 9 |
+
|
| 10 |
+
```python
|
| 11 |
+
import evaluate
|
| 12 |
+
|
| 13 |
+
sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
|
| 14 |
+
result = sfr.compute(
|
| 15 |
+
predictions=["کابل کې ښه هوا ده", "this is romanized output"],
|
| 16 |
+
language="ps_af",
|
| 17 |
+
)
|
| 18 |
+
print(result["sfr_percent"])
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
Hub use after publishing:
|
| 22 |
+
|
| 23 |
+
```python
|
| 24 |
+
import evaluate
|
| 25 |
+
|
| 26 |
+
sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 27 |
+
sfr.compute(predictions=["کابل کې ښه هوا ده"], language="pashto")
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
Use SFR with WER and CER, not instead of them. SFR checks whether output is in
|
| 31 |
+
the intended script. It does not measure lexical accuracy.
|
metrics/script_fidelity_rate/requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
evaluate>=0.4.0,<1.0
|
| 2 |
+
script-fidelity>=0.1.1
|
metrics/script_fidelity_rate/script_fidelity_rate.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hugging Face Evaluate metric for Script Fidelity Rate."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import datasets
|
| 9 |
+
import evaluate
|
| 10 |
+
|
| 11 |
+
CURRENT_DIR = Path(__file__).resolve().parent
|
| 12 |
+
for parent in (CURRENT_DIR, CURRENT_DIR.parent, CURRENT_DIR.parent.parent):
|
| 13 |
+
if (parent / "script_fidelity").exists():
|
| 14 |
+
sys.path.insert(0, str(parent))
|
| 15 |
+
break
|
| 16 |
+
|
| 17 |
+
from script_fidelity import compute_corpus_sfr # noqa: E402
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
_DESCRIPTION = """
|
| 21 |
+
Script Fidelity Rate (SFR) is a reference-free metric for multilingual ASR.
|
| 22 |
+
It computes the fraction of countable hypothesis characters that belong to the
|
| 23 |
+
expected Unicode script for a target FLEURS language code.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
_CITATION = """
|
| 27 |
+
@misc{scriptfidelity2026,
|
| 28 |
+
title = {Script Collapse in Multilingual ASR: A Reference-Free Metric and 100-Pair Benchmark},
|
| 29 |
+
year = {2026}
|
| 30 |
+
}
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
_KWARGS_DESCRIPTION = """
|
| 34 |
+
Args:
|
| 35 |
+
predictions: List of ASR hypothesis strings.
|
| 36 |
+
language: FLEURS language code or alias, for example "ps_af" or "pashto".
|
| 37 |
+
digit_policy: "count" keeps digits in the denominator. "ignore" treats
|
| 38 |
+
digits as neutral.
|
| 39 |
+
return_details: Return per-example SFR details.
|
| 40 |
+
|
| 41 |
+
Returns:
|
| 42 |
+
Corpus SFR, percent SFR, empty counts, low/high SFR rates, and dominant
|
| 43 |
+
script counts.
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class ScriptFidelityRate(evaluate.Metric):
    """Hugging Face Evaluate wrapper that delegates to ``compute_corpus_sfr``."""

    def _info(self) -> evaluate.MetricInfo:
        """Describe the metric: a single string ``predictions`` feature."""
        input_features = datasets.Features({"predictions": datasets.Value("string")})
        benchmark_urls = [
            "https://huggingface.co/datasets/themechanism/script-fidelity-benchmark"
        ]
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=input_features,
            reference_urls=benchmark_urls,
        )

    def _compute(
        self,
        predictions: list[str],
        language: str,
        digit_policy: str = "count",
        return_details: bool = False,
    ) -> dict:
        """Validate ``digit_policy`` and return the corpus summary.

        Args:
            predictions: Hypothesis strings to score.
            language: Language code or alias forwarded to ``compute_corpus_sfr``.
            digit_policy: Either ``"count"`` or ``"ignore"``.
            return_details: Forwarded unchanged to ``compute_corpus_sfr``.

        Raises:
            ValueError: If ``digit_policy`` is not one of the two allowed values.
        """
        allowed_policies = {"count", "ignore"}
        if digit_policy not in allowed_policies:
            raise ValueError("digit_policy must be 'count' or 'ignore'")

        summary = compute_corpus_sfr(
            predictions,
            language=language,
            digit_policy=digit_policy,  # type: ignore[arg-type]
            return_details=return_details,
        )
        return summary
|
pyproject.toml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=69", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "script-fidelity"
|
| 7 |
+
version = "0.1.1"
|
| 8 |
+
description = "Reference-free script fidelity metric for multilingual ASR."
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
license = "MIT"
|
| 12 |
+
authors = [{ name = "Anonymous" }]
|
| 13 |
+
keywords = [
|
| 14 |
+
"asr",
|
| 15 |
+
"speech-recognition",
|
| 16 |
+
"evaluation",
|
| 17 |
+
"unicode",
|
| 18 |
+
"script-fidelity",
|
| 19 |
+
"fleurs",
|
| 20 |
+
]
|
| 21 |
+
classifiers = [
|
| 22 |
+
"Development Status :: 3 - Alpha",
|
| 23 |
+
"Intended Audience :: Science/Research",
|
| 24 |
+
"Programming Language :: Python :: 3",
|
| 25 |
+
"Programming Language :: Python :: 3.10",
|
| 26 |
+
"Programming Language :: Python :: 3.11",
|
| 27 |
+
"Programming Language :: Python :: 3.12",
|
| 28 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 29 |
+
]
|
| 30 |
+
dependencies = []
|
| 31 |
+
|
| 32 |
+
[project.optional-dependencies]
|
| 33 |
+
evaluate = ["evaluate>=0.4.0,<1.0"]
|
| 34 |
+
dev = [
|
| 35 |
+
"evaluate>=0.4.0,<1.0",
|
| 36 |
+
"pytest>=8.0",
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
[project.scripts]
|
| 40 |
+
sfr = "script_fidelity.cli:main"
|
| 41 |
+
|
| 42 |
+
[tool.setuptools.packages.find]
|
| 43 |
+
include = ["script_fidelity*"]
|
| 44 |
+
|
| 45 |
+
[tool.setuptools.package-data]
|
| 46 |
+
script_fidelity = ["data/*.json"]
|
| 47 |
+
|
| 48 |
+
[tool.pytest.ini_options]
|
| 49 |
+
testpaths = ["tests"]
|
| 50 |
+
pythonpath = ["."]
|
script_fidelity/__init__.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reference-free script fidelity metrics for multilingual ASR."""
|
| 2 |
+
|
| 3 |
+
from .core import (
|
| 4 |
+
compute_corpus_sfr,
|
| 5 |
+
compute_sf,
|
| 6 |
+
compute_sf_batch,
|
| 7 |
+
compute_sfr,
|
| 8 |
+
compute_sfr_batch,
|
| 9 |
+
score_text,
|
| 10 |
+
)
|
| 11 |
+
from .dominant import dominant_script, script_distribution
|
| 12 |
+
from .registry import (
|
| 13 |
+
FLEURS_CONFIGS,
|
| 14 |
+
SCRIPT_CONFIGS,
|
| 15 |
+
get_script_config,
|
| 16 |
+
list_languages,
|
| 17 |
+
resolve_language,
|
| 18 |
+
)
|
| 19 |
+
from .types import DigitPolicy, SFRResult, ScriptConfig
|
| 20 |
+
|
| 21 |
+
__all__ = [
|
| 22 |
+
"DigitPolicy",
|
| 23 |
+
"FLEURS_CONFIGS",
|
| 24 |
+
"SCRIPT_CONFIGS",
|
| 25 |
+
"SFRResult",
|
| 26 |
+
"ScriptConfig",
|
| 27 |
+
"compute_corpus_sfr",
|
| 28 |
+
"compute_sf",
|
| 29 |
+
"compute_sf_batch",
|
| 30 |
+
"compute_sfr",
|
| 31 |
+
"compute_sfr_batch",
|
| 32 |
+
"dominant_script",
|
| 33 |
+
"get_script_config",
|
| 34 |
+
"list_languages",
|
| 35 |
+
"resolve_language",
|
| 36 |
+
"score_text",
|
| 37 |
+
"script_distribution",
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
__version__ = "0.1.1"
|
script_fidelity/__main__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .cli import main
|
| 2 |
+
|
| 3 |
+
raise SystemExit(main())
|
script_fidelity/cli.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Command line interface for Script Fidelity Rate."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import csv
|
| 7 |
+
import json
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from .core import compute_corpus_sfr, compute_sfr
|
| 12 |
+
from .registry import list_languages
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _read_predictions(path: Path, text_column: str) -> list[str]:
|
| 16 |
+
if path.suffix.lower() == ".jsonl":
|
| 17 |
+
rows = []
|
| 18 |
+
with path.open("r", encoding="utf-8") as handle:
|
| 19 |
+
for line_no, line in enumerate(handle, start=1):
|
| 20 |
+
if not line.strip():
|
| 21 |
+
continue
|
| 22 |
+
item = json.loads(line)
|
| 23 |
+
if text_column not in item:
|
| 24 |
+
raise ValueError(f"Missing column '{text_column}' on line {line_no}")
|
| 25 |
+
rows.append(str(item[text_column]))
|
| 26 |
+
return rows
|
| 27 |
+
|
| 28 |
+
with path.open("r", encoding="utf-8", newline="") as handle:
|
| 29 |
+
reader = csv.DictReader(handle)
|
| 30 |
+
if not reader.fieldnames or text_column not in reader.fieldnames:
|
| 31 |
+
raise ValueError(f"Missing column '{text_column}' in CSV header")
|
| 32 |
+
return [str(row[text_column]) for row in reader]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _emit_summary(summary: dict, output_format: str) -> None:
|
| 36 |
+
if output_format == "json":
|
| 37 |
+
print(json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True))
|
| 38 |
+
return
|
| 39 |
+
|
| 40 |
+
writer = csv.DictWriter(
|
| 41 |
+
sys.stdout,
|
| 42 |
+
fieldnames=[
|
| 43 |
+
"sfr",
|
| 44 |
+
"sfr_percent",
|
| 45 |
+
"n",
|
| 46 |
+
"n_valid",
|
| 47 |
+
"n_empty",
|
| 48 |
+
"low_sfr_rate",
|
| 49 |
+
"high_sfr_rate",
|
| 50 |
+
"dominant_script_counts",
|
| 51 |
+
],
|
| 52 |
+
)
|
| 53 |
+
writer.writeheader()
|
| 54 |
+
row = dict(summary)
|
| 55 |
+
row["dominant_script_counts"] = json.dumps(
|
| 56 |
+
row["dominant_script_counts"],
|
| 57 |
+
ensure_ascii=False,
|
| 58 |
+
sort_keys=True,
|
| 59 |
+
)
|
| 60 |
+
writer.writerow(row)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``sfr`` parser with ``score``, ``audit`` and ``languages`` commands."""

    def add_digit_policy(target: argparse.ArgumentParser) -> None:
        # Same option on both scoring commands; defined once to keep them in sync.
        target.add_argument(
            "--digit-policy",
            choices=["count", "ignore"],
            default="count",
            help="count digits as characters or treat them as neutral",
        )

    root = argparse.ArgumentParser(prog="sfr", description="Script Fidelity Rate tools")
    commands = root.add_subparsers(dest="command", required=True)

    # sfr score: one hypothesis passed on the command line.
    score_cmd = commands.add_parser("score", help="score one text string")
    score_cmd.add_argument("--language", required=True, help="FLEURS code or alias")
    score_cmd.add_argument("--text", required=True, help="ASR hypothesis text")
    add_digit_policy(score_cmd)

    # sfr audit: a file of hypotheses summarised as a corpus.
    audit_cmd = commands.add_parser("audit", help="audit a CSV or JSONL file")
    audit_cmd.add_argument("path", type=Path, help="CSV or JSONL file")
    audit_cmd.add_argument("--language", required=True, help="FLEURS code or alias")
    audit_cmd.add_argument("--text-column", default="prediction", help="prediction column")
    add_digit_policy(audit_cmd)
    audit_cmd.add_argument("--format", choices=["json", "csv"], default="json")
    audit_cmd.add_argument("--details", action="store_true", help="include per-row details")

    # sfr languages: list the registry codes.
    languages_cmd = commands.add_parser("languages", help="list supported FLEURS codes")
    languages_cmd.add_argument("--format", choices=["plain", "json"], default="plain")

    return root
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``sfr`` command line and return a process exit status.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` is passed
            through unchanged.

    Returns:
        ``0`` after a handled command. The trailing ``2`` is only a fallback
        behind ``parser.error`` for a command this function does not handle.
    """
    cli = build_parser()
    opts = cli.parse_args(argv)
    command = opts.command

    if command == "score":
        value = compute_sfr(
            opts.text,
            language=opts.language,
            digit_policy=opts.digit_policy,
        )
        # A None score is reported as "NA" instead of a number.
        if value is None:
            print("NA")
        else:
            print(f"{value:.6f}")
        return 0

    if command == "audit":
        texts = _read_predictions(opts.path, opts.text_column)
        report = compute_corpus_sfr(
            texts,
            language=opts.language,
            digit_policy=opts.digit_policy,
            return_details=opts.details,
        )
        _emit_summary(report, opts.format)
        return 0

    if command == "languages":
        codes = list_languages()
        if opts.format == "json":
            rendered = json.dumps(codes, indent=2)
        else:
            rendered = "\n".join(codes)
        print(rendered)
        return 0

    cli.error("unknown command")
    return 2
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
if __name__ == "__main__":
|
| 133 |
+
raise SystemExit(main())
|
script_fidelity/core.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core Script Fidelity Rate implementation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import unicodedata
|
| 6 |
+
from collections import Counter
|
| 7 |
+
from statistics import fmean
|
| 8 |
+
|
| 9 |
+
from .dominant import dominant_script, is_countable, script_distribution
|
| 10 |
+
from .registry import get_script_config
|
| 11 |
+
from .types import DigitPolicy, SFRResult, ScriptConfig
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _is_in_range(cp: int, ranges: tuple[tuple[int, int], ...]) -> bool:
|
| 15 |
+
return any(lo <= cp <= hi for lo, hi in ranges)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def score_text(
    text: str,
    language: str = "ps_af",
    *,
    digit_policy: DigitPolicy = "count",
    config: ScriptConfig | None = None,
) -> SFRResult:
    """Score a single hypothesis and package the counts behind the ratio.

    Args:
        text: Hypothesis to score; a falsy value is treated as ``""``.
        language: Code or alias looked up via ``get_script_config`` when no
            ``config`` is supplied.
        digit_policy: Forwarded to ``is_countable`` and the script helpers.
        config: Pre-resolved configuration; when truthy it is used as-is and
            ``language`` is not looked up.

    Returns:
        An ``SFRResult`` whose ``sfr`` is ``numerator / denominator``, or
        ``None`` when no character was countable.
    """
    cfg = config or get_script_config(language)
    normalized = unicodedata.normalize("NFC", text or "")

    # Denominator: every countable character. Numerator: those that also
    # fall inside one of the configured code point ranges.
    countable = [
        ch for ch in normalized if is_countable(ch, digit_policy=digit_policy)
    ]
    denominator = len(countable)
    numerator = 0
    for ch in countable:
        if _is_in_range(ord(ch), cfg.ranges):
            numerator += 1

    if denominator == 0:
        ratio = None
    else:
        ratio = numerator / denominator

    dominant = dominant_script(normalized, digit_policy=digit_policy)
    distribution = script_distribution(normalized, digit_policy=digit_policy)

    return SFRResult(
        language=cfg.code,
        sfr=ratio,
        numerator=numerator,
        denominator=denominator,
        dominant_script=dominant,
        script_counts=distribution,
    )
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def compute_sfr(
    text: str,
    language: str = "ps_af",
    *,
    digit_policy: DigitPolicy = "count",
) -> float | None:
    """Return only the Script Fidelity Rate of one hypothesis.

    Thin wrapper over ``score_text`` that discards everything except the
    ``sfr`` field, which is ``None`` when nothing in ``text`` was countable.
    """
    outcome = score_text(text, language, digit_policy=digit_policy)
    return outcome.sfr
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def compute_sfr_batch(
    predictions: list[str] | tuple[str, ...],
    language: str = "ps_af",
    *,
    digit_policy: DigitPolicy = "count",
) -> list[float | None]:
    """Return one SFR value per hypothesis, in input order.

    The language is resolved once up front and the resulting configuration
    is reused for every item. Entries are ``None`` wherever ``score_text``
    reports no score.
    """
    cfg = get_script_config(language)
    rates: list[float | None] = []
    for hypothesis in predictions:
        scored = score_text(
            hypothesis,
            cfg.code,
            digit_policy=digit_policy,
            config=cfg,
        )
        rates.append(scored.sfr)
    return rates
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def compute_corpus_sfr(
    predictions: list[str] | tuple[str, ...],
    language: str = "ps_af",
    *,
    digit_policy: DigitPolicy = "count",
    low_threshold: float = 0.1,
    high_threshold: float = 0.9,
    return_details: bool = False,
) -> dict:
    """Summarise SFR over a batch of hypotheses.

    Args:
        predictions: Hypotheses to score.
        language: Code or alias resolved once via ``get_script_config``.
        digit_policy: Forwarded to ``score_text`` for every item.
        low_threshold: Scores strictly below this count towards
            ``low_sfr_rate``.
        high_threshold: Scores at or above this count towards
            ``high_sfr_rate``.
        return_details: When true, add a ``details`` list with one dict per
            hypothesis.

    Returns:
        A dict with ``sfr`` (mean of the non-``None`` per-item scores),
        ``sfr_percent``, ``n``, ``n_valid``, ``n_empty``, ``low_sfr_rate``,
        ``high_sfr_rate`` and ``dominant_script_counts`` (sorted by label).
        The mean, percent and both rates are ``None`` when no item produced
        a score.
    """
    cfg = get_script_config(language)
    rows = []
    for hypothesis in predictions:
        rows.append(
            score_text(hypothesis, cfg.code, digit_policy=digit_policy, config=cfg)
        )

    valid = [row.sfr for row in rows if row.sfr is not None]
    n_total = len(rows)
    n_valid = len(valid)

    if n_valid == 0:
        mean_sfr = percent = low_rate = high_rate = None
    else:
        mean_sfr = fmean(valid)
        percent = mean_sfr * 100
        # Both rates are fractions of the valid items, not of all items.
        low_rate = len([s for s in valid if s < low_threshold]) / n_valid
        high_rate = len([s for s in valid if s >= high_threshold]) / n_valid

    tally = Counter(row.dominant_script for row in rows)

    summary = {
        "sfr": mean_sfr,
        "sfr_percent": percent,
        "n": n_total,
        "n_valid": n_valid,
        "n_empty": n_total - n_valid,
        "low_sfr_rate": low_rate,
        "high_sfr_rate": high_rate,
        "dominant_script_counts": {label: tally[label] for label in sorted(tally)},
    }

    if return_details:
        per_item = []
        for row in rows:
            per_item.append(
                {
                    "language": row.language,
                    "sfr": row.sfr,
                    "numerator": row.numerator,
                    "denominator": row.denominator,
                    "dominant_script": row.dominant_script,
                    "script_counts": row.script_counts,
                }
            )
        summary["details"] = per_item

    return summary
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# Shorter alias names bound to the same function objects.
compute_sf = compute_sfr
compute_sf_batch = compute_sfr_batch
|
script_fidelity/data/fleurs_registry.json
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "0.1.1",
|
| 3 |
+
"source": "Reviewed registry for google/fleurs configs as of 2026-05-07. The config named all is excluded.",
|
| 4 |
+
"scripts": {
|
| 5 |
+
"arabic": {
|
| 6 |
+
"name": "Arabic",
|
| 7 |
+
"ranges": [[1536, 1791], [1872, 1919], [2208, 2303], [64336, 65023], [65136, 65279], [69216, 69247], [126464, 126719]]
|
| 8 |
+
},
|
| 9 |
+
"armenian": {
|
| 10 |
+
"name": "Armenian",
|
| 11 |
+
"ranges": [[1328, 1423], [64275, 64279]]
|
| 12 |
+
},
|
| 13 |
+
"bengali": {
|
| 14 |
+
"name": "Bengali",
|
| 15 |
+
"ranges": [[2432, 2559]]
|
| 16 |
+
},
|
| 17 |
+
"cyrillic": {
|
| 18 |
+
"name": "Cyrillic",
|
| 19 |
+
"ranges": [[1024, 1279], [1280, 1327], [7296, 7311], [11744, 11775], [42560, 42655]]
|
| 20 |
+
},
|
| 21 |
+
"devanagari": {
|
| 22 |
+
"name": "Devanagari",
|
| 23 |
+
"ranges": [[2304, 2431], [43232, 43263], [72448, 72543]]
|
| 24 |
+
},
|
| 25 |
+
"ethiopic": {
|
| 26 |
+
"name": "Ethiopic",
|
| 27 |
+
"ranges": [[4608, 4991], [4992, 5023], [11648, 11743], [43776, 43823]]
|
| 28 |
+
},
|
| 29 |
+
"georgian": {
|
| 30 |
+
"name": "Georgian",
|
| 31 |
+
"ranges": [[4256, 4351], [11520, 11567], [7312, 7359]]
|
| 32 |
+
},
|
| 33 |
+
"greek": {
|
| 34 |
+
"name": "Greek",
|
| 35 |
+
"ranges": [[880, 1023], [7936, 8191]]
|
| 36 |
+
},
|
| 37 |
+
"gujarati": {
|
| 38 |
+
"name": "Gujarati",
|
| 39 |
+
"ranges": [[2688, 2815]]
|
| 40 |
+
},
|
| 41 |
+
"gurmukhi": {
|
| 42 |
+
"name": "Gurmukhi",
|
| 43 |
+
"ranges": [[2560, 2687]]
|
| 44 |
+
},
|
| 45 |
+
"han": {
|
| 46 |
+
"name": "Han",
|
| 47 |
+
"ranges": [[13312, 19903], [19968, 40959], [63744, 64255], [131072, 173791], [173824, 177983], [177984, 178207], [178208, 183983], [183984, 191471], [196608, 201551]]
|
| 48 |
+
},
|
| 49 |
+
"hangul": {
|
| 50 |
+
"name": "Hangul",
|
| 51 |
+
"ranges": [[4352, 4607], [12592, 12687], [43360, 43391], [44032, 55215], [55216, 55295]]
|
| 52 |
+
},
|
| 53 |
+
"hebrew": {
|
| 54 |
+
"name": "Hebrew",
|
| 55 |
+
"ranges": [[1424, 1535], [64285, 64335]]
|
| 56 |
+
},
|
| 57 |
+
"hiragana": {
|
| 58 |
+
"name": "Hiragana",
|
| 59 |
+
"ranges": [[12352, 12447]]
|
| 60 |
+
},
|
| 61 |
+
"kannada": {
|
| 62 |
+
"name": "Kannada",
|
| 63 |
+
"ranges": [[3200, 3327]]
|
| 64 |
+
},
|
| 65 |
+
"katakana": {
|
| 66 |
+
"name": "Katakana",
|
| 67 |
+
"ranges": [[12448, 12543], [12784, 12799], [65381, 65439]]
|
| 68 |
+
},
|
| 69 |
+
"khmer": {
|
| 70 |
+
"name": "Khmer",
|
| 71 |
+
"ranges": [[6016, 6143], [6624, 6655]]
|
| 72 |
+
},
|
| 73 |
+
"lao": {
|
| 74 |
+
"name": "Lao",
|
| 75 |
+
"ranges": [[3712, 3839]]
|
| 76 |
+
},
|
| 77 |
+
"latin": {
|
| 78 |
+
"name": "Latin",
|
| 79 |
+
"ranges": [[65, 90], [97, 122], [192, 591], [7680, 7935], [42784, 43007], [43824, 43887], [122624, 122879]]
|
| 80 |
+
},
|
| 81 |
+
"malayalam": {
|
| 82 |
+
"name": "Malayalam",
|
| 83 |
+
"ranges": [[3328, 3455]]
|
| 84 |
+
},
|
| 85 |
+
"myanmar": {
|
| 86 |
+
"name": "Myanmar",
|
| 87 |
+
"ranges": [[4096, 4255], [43392, 43487], [43488, 43519]]
|
| 88 |
+
},
|
| 89 |
+
"odia": {
|
| 90 |
+
"name": "Odia",
|
| 91 |
+
"ranges": [[2816, 2943]]
|
| 92 |
+
},
|
| 93 |
+
"tamil": {
|
| 94 |
+
"name": "Tamil",
|
| 95 |
+
"ranges": [[2944, 3071]]
|
| 96 |
+
},
|
| 97 |
+
"telugu": {
|
| 98 |
+
"name": "Telugu",
|
| 99 |
+
"ranges": [[3072, 3199]]
|
| 100 |
+
},
|
| 101 |
+
"thai": {
|
| 102 |
+
"name": "Thai",
|
| 103 |
+
"ranges": [[3584, 3711]]
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"languages": {
|
| 107 |
+
"af_za": {"name": "Afrikaans", "scripts": ["latin"], "aliases": ["afrikaans"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 108 |
+
"am_et": {"name": "Amharic", "scripts": ["ethiopic"], "aliases": ["amharic"]},
|
| 109 |
+
"ar_eg": {"name": "Arabic", "scripts": ["arabic"], "aliases": ["arabic", "msa"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 110 |
+
"as_in": {"name": "Assamese", "scripts": ["bengali"], "aliases": ["assamese"], "shared_script": true, "warning": "Bengali-Assamese script SFR does not identify the language."},
|
| 111 |
+
"ast_es": {"name": "Asturian", "scripts": ["latin"], "aliases": ["asturian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 112 |
+
"az_az": {"name": "Azerbaijani", "scripts": ["latin"], "aliases": ["azerbaijani", "azeri"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 113 |
+
"be_by": {"name": "Belarusian", "scripts": ["cyrillic"], "aliases": ["belarusian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 114 |
+
"bg_bg": {"name": "Bulgarian", "scripts": ["cyrillic"], "aliases": ["bulgarian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 115 |
+
"bn_in": {"name": "Bengali", "scripts": ["bengali"], "aliases": ["bengali", "bangla"], "shared_script": true, "warning": "Bengali-script SFR does not distinguish Bengali from Assamese."},
|
| 116 |
+
"bs_ba": {"name": "Bosnian", "scripts": ["latin"], "aliases": ["bosnian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 117 |
+
"ca_es": {"name": "Catalan", "scripts": ["latin"], "aliases": ["catalan"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 118 |
+
"ceb_ph": {"name": "Cebuano", "scripts": ["latin"], "aliases": ["cebuano"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 119 |
+
"ckb_iq": {"name": "Central Kurdish", "scripts": ["arabic"], "aliases": ["central_kurdish", "sorani", "kurdish_sorani"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 120 |
+
"cmn_hans_cn": {"name": "Mandarin Chinese", "scripts": ["han"], "aliases": ["mandarin", "chinese", "simplified_chinese"], "shared_script": true, "warning": "Han SFR does not identify the spoken language or script variant."},
|
| 121 |
+
"cs_cz": {"name": "Czech", "scripts": ["latin"], "aliases": ["czech"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 122 |
+
"cy_gb": {"name": "Welsh", "scripts": ["latin"], "aliases": ["welsh"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 123 |
+
"da_dk": {"name": "Danish", "scripts": ["latin"], "aliases": ["danish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 124 |
+
"de_de": {"name": "German", "scripts": ["latin"], "aliases": ["german"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 125 |
+
"el_gr": {"name": "Greek", "scripts": ["greek"], "aliases": ["greek"]},
|
| 126 |
+
"en_us": {"name": "English", "scripts": ["latin"], "aliases": ["english"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 127 |
+
"es_419": {"name": "Spanish", "scripts": ["latin"], "aliases": ["spanish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 128 |
+
"et_ee": {"name": "Estonian", "scripts": ["latin"], "aliases": ["estonian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 129 |
+
"fa_ir": {"name": "Persian", "scripts": ["arabic"], "aliases": ["persian", "farsi"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 130 |
+
"ff_sn": {"name": "Fulah", "scripts": ["latin"], "aliases": ["fulah", "fulani"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 131 |
+
"fi_fi": {"name": "Finnish", "scripts": ["latin"], "aliases": ["finnish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 132 |
+
"fil_ph": {"name": "Filipino", "scripts": ["latin"], "aliases": ["filipino", "tagalog"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 133 |
+
"fr_fr": {"name": "French", "scripts": ["latin"], "aliases": ["french"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 134 |
+
"ga_ie": {"name": "Irish", "scripts": ["latin"], "aliases": ["irish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 135 |
+
"gl_es": {"name": "Galician", "scripts": ["latin"], "aliases": ["galician"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 136 |
+
"gu_in": {"name": "Gujarati", "scripts": ["gujarati"], "aliases": ["gujarati"]},
|
| 137 |
+
"ha_ng": {"name": "Hausa", "scripts": ["latin"], "aliases": ["hausa"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 138 |
+
"he_il": {"name": "Hebrew", "scripts": ["hebrew"], "aliases": ["hebrew"]},
|
| 139 |
+
"hi_in": {"name": "Hindi", "scripts": ["devanagari"], "aliases": ["hindi"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
|
| 140 |
+
"hr_hr": {"name": "Croatian", "scripts": ["latin"], "aliases": ["croatian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 141 |
+
"hu_hu": {"name": "Hungarian", "scripts": ["latin"], "aliases": ["hungarian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 142 |
+
"hy_am": {"name": "Armenian", "scripts": ["armenian"], "aliases": ["armenian"]},
|
| 143 |
+
"id_id": {"name": "Indonesian", "scripts": ["latin"], "aliases": ["indonesian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 144 |
+
"ig_ng": {"name": "Igbo", "scripts": ["latin"], "aliases": ["igbo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 145 |
+
"is_is": {"name": "Icelandic", "scripts": ["latin"], "aliases": ["icelandic"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 146 |
+
"it_it": {"name": "Italian", "scripts": ["latin"], "aliases": ["italian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 147 |
+
"ja_jp": {"name": "Japanese", "scripts": ["han", "hiragana", "katakana"], "aliases": ["japanese"], "shared_script": true, "warning": "Japanese SFR counts Han and kana; it is not a language identifier."},
|
| 148 |
+
"jv_id": {"name": "Javanese", "scripts": ["latin"], "aliases": ["javanese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 149 |
+
"ka_ge": {"name": "Georgian", "scripts": ["georgian"], "aliases": ["georgian"]},
|
| 150 |
+
"kam_ke": {"name": "Kamba", "scripts": ["latin"], "aliases": ["kamba"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 151 |
+
"kea_cv": {"name": "Kabuverdianu", "scripts": ["latin"], "aliases": ["kabuverdianu", "cape_verdean_creole"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 152 |
+
"kk_kz": {"name": "Kazakh", "scripts": ["cyrillic"], "aliases": ["kazakh"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 153 |
+
"km_kh": {"name": "Khmer", "scripts": ["khmer"], "aliases": ["khmer"]},
|
| 154 |
+
"kn_in": {"name": "Kannada", "scripts": ["kannada"], "aliases": ["kannada"]},
|
| 155 |
+
"ko_kr": {"name": "Korean", "scripts": ["hangul"], "aliases": ["korean"]},
|
| 156 |
+
"ky_kg": {"name": "Kyrgyz", "scripts": ["cyrillic"], "aliases": ["kyrgyz"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 157 |
+
"lb_lu": {"name": "Luxembourgish", "scripts": ["latin"], "aliases": ["luxembourgish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 158 |
+
"lg_ug": {"name": "Ganda", "scripts": ["latin"], "aliases": ["ganda", "luganda"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 159 |
+
"ln_cd": {"name": "Lingala", "scripts": ["latin"], "aliases": ["lingala"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 160 |
+
"lo_la": {"name": "Lao", "scripts": ["lao"], "aliases": ["lao"]},
|
| 161 |
+
"lt_lt": {"name": "Lithuanian", "scripts": ["latin"], "aliases": ["lithuanian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 162 |
+
"luo_ke": {"name": "Luo", "scripts": ["latin"], "aliases": ["luo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 163 |
+
"lv_lv": {"name": "Latvian", "scripts": ["latin"], "aliases": ["latvian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 164 |
+
"mi_nz": {"name": "Maori", "scripts": ["latin"], "aliases": ["maori", "māori"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 165 |
+
"mk_mk": {"name": "Macedonian", "scripts": ["cyrillic"], "aliases": ["macedonian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 166 |
+
"ml_in": {"name": "Malayalam", "scripts": ["malayalam"], "aliases": ["malayalam"]},
|
| 167 |
+
"mn_mn": {"name": "Mongolian", "scripts": ["cyrillic"], "aliases": ["mongolian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 168 |
+
"mr_in": {"name": "Marathi", "scripts": ["devanagari"], "aliases": ["marathi"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
|
| 169 |
+
"ms_my": {"name": "Malay", "scripts": ["latin"], "aliases": ["malay"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 170 |
+
"mt_mt": {"name": "Maltese", "scripts": ["latin"], "aliases": ["maltese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 171 |
+
"my_mm": {"name": "Burmese", "scripts": ["myanmar"], "aliases": ["burmese", "myanmar_language"]},
|
| 172 |
+
"nb_no": {"name": "Norwegian Bokmal", "scripts": ["latin"], "aliases": ["norwegian", "norwegian_bokmal", "bokmal"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 173 |
+
"ne_np": {"name": "Nepali", "scripts": ["devanagari"], "aliases": ["nepali"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
|
| 174 |
+
"nl_nl": {"name": "Dutch", "scripts": ["latin"], "aliases": ["dutch"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 175 |
+
"nso_za": {"name": "Northern Sotho", "scripts": ["latin"], "aliases": ["northern_sotho", "sepedi"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 176 |
+
"ny_mw": {"name": "Chichewa", "scripts": ["latin"], "aliases": ["chichewa", "nyanja"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 177 |
+
"oc_fr": {"name": "Occitan", "scripts": ["latin"], "aliases": ["occitan"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 178 |
+
"om_et": {"name": "Oromo", "scripts": ["latin"], "aliases": ["oromo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 179 |
+
"or_in": {"name": "Odia", "scripts": ["odia"], "aliases": ["odia", "oriya"]},
|
| 180 |
+
"pa_in": {"name": "Punjabi", "scripts": ["gurmukhi"], "aliases": ["punjabi", "eastern_punjabi"], "shared_script": true, "warning": "Gurmukhi SFR checks script, not dialect or language identity."},
|
| 181 |
+
"pl_pl": {"name": "Polish", "scripts": ["latin"], "aliases": ["polish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 182 |
+
"ps_af": {"name": "Pashto", "scripts": ["arabic"], "aliases": ["pashto", "pushto", "ps"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 183 |
+
"pt_br": {"name": "Portuguese", "scripts": ["latin"], "aliases": ["portuguese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 184 |
+
"ro_ro": {"name": "Romanian", "scripts": ["latin"], "aliases": ["romanian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 185 |
+
"ru_ru": {"name": "Russian", "scripts": ["cyrillic"], "aliases": ["russian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 186 |
+
"sd_in": {"name": "Sindhi", "scripts": ["arabic"], "aliases": ["sindhi"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 187 |
+
"sk_sk": {"name": "Slovak", "scripts": ["latin"], "aliases": ["slovak"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 188 |
+
"sl_si": {"name": "Slovenian", "scripts": ["latin"], "aliases": ["slovenian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 189 |
+
"sn_zw": {"name": "Shona", "scripts": ["latin"], "aliases": ["shona"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 190 |
+
"so_so": {"name": "Somali", "scripts": ["latin"], "aliases": ["somali"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 191 |
+
"sr_rs": {"name": "Serbian", "scripts": ["cyrillic"], "aliases": ["serbian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 192 |
+
"sv_se": {"name": "Swedish", "scripts": ["latin"], "aliases": ["swedish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 193 |
+
"sw_ke": {"name": "Swahili", "scripts": ["latin"], "aliases": ["swahili"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 194 |
+
"ta_in": {"name": "Tamil", "scripts": ["tamil"], "aliases": ["tamil"]},
|
| 195 |
+
"te_in": {"name": "Telugu", "scripts": ["telugu"], "aliases": ["telugu"]},
|
| 196 |
+
"tg_tj": {"name": "Tajik", "scripts": ["cyrillic"], "aliases": ["tajik"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 197 |
+
"th_th": {"name": "Thai", "scripts": ["thai"], "aliases": ["thai"]},
|
| 198 |
+
"tr_tr": {"name": "Turkish", "scripts": ["latin"], "aliases": ["turkish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 199 |
+
"uk_ua": {"name": "Ukrainian", "scripts": ["cyrillic"], "aliases": ["ukrainian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 200 |
+
"umb_ao": {"name": "Umbundu", "scripts": ["latin"], "aliases": ["umbundu"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 201 |
+
"ur_pk": {"name": "Urdu", "scripts": ["arabic"], "aliases": ["urdu"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 202 |
+
"uz_uz": {"name": "Uzbek", "scripts": ["latin"], "aliases": ["uzbek"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 203 |
+
"vi_vn": {"name": "Vietnamese", "scripts": ["latin"], "aliases": ["vietnamese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 204 |
+
"wo_sn": {"name": "Wolof", "scripts": ["latin"], "aliases": ["wolof"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 205 |
+
"xh_za": {"name": "Xhosa", "scripts": ["latin"], "aliases": ["xhosa"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 206 |
+
"yo_ng": {"name": "Yoruba", "scripts": ["latin"], "aliases": ["yoruba"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 207 |
+
"yue_hant_hk": {"name": "Cantonese", "scripts": ["han"], "aliases": ["cantonese", "traditional_chinese"], "shared_script": true, "warning": "Han SFR does not identify the spoken language or script variant."},
|
| 208 |
+
"zu_za": {"name": "Zulu", "scripts": ["latin"], "aliases": ["zulu"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."}
|
| 209 |
+
}
|
| 210 |
+
}
|
script_fidelity/dominant.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dominant script helpers for SFR audits."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import unicodedata
|
| 6 |
+
from collections import Counter
|
| 7 |
+
|
| 8 |
+
from .registry import _registry
|
| 9 |
+
from .types import DigitPolicy
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def is_countable(ch: str, digit_policy: DigitPolicy = "count") -> bool:
    """Decide whether ``ch`` contributes to an SFR denominator.

    Args:
        ch: A single character (``unicodedata.category`` rejects anything
            else).
        digit_policy: ``"count"`` keeps numeric characters; ``"ignore"``
            drops every character whose general category starts with ``N``.

    Returns:
        ``False`` for whitespace and for the punctuation, separator,
        control/other and mark categories; ``True`` otherwise.

    Raises:
        ValueError: If ``digit_policy`` is neither ``"count"`` nor
            ``"ignore"``.
    """
    if digit_policy not in {"count", "ignore"}:
        raise ValueError("digit_policy must be 'count' or 'ignore'")

    # First letter of the general category, e.g. "L" for "Lu".
    major = unicodedata.category(ch)[0]

    if major == "N" and digit_policy == "ignore":
        return False
    if ch.isspace():
        return False
    return major not in {"P", "Z", "C", "M"}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _in_ranges(cp: int, ranges: list[list[int]]) -> bool:
|
| 31 |
+
return any(int(lo) <= cp <= int(hi) for lo, hi in ranges)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def script_distribution(
    text: str,
    *,
    digit_policy: DigitPolicy = "count",
) -> dict[str, int]:
    """Tally countable characters by registry script id.

    The text is NFC-normalised (a falsy value is treated as ``""``). Each
    countable character is labelled with the first registry script whose
    ranges contain it, or ``"other"`` when none does.

    Returns:
        A plain dict mapping label to count; empty when nothing is countable.
    """
    normalized = unicodedata.normalize("NFC", text or "")
    script_table = _registry()["scripts"]
    tally: Counter[str] = Counter()

    for ch in normalized:
        if is_countable(ch, digit_policy=digit_policy):
            codepoint = ord(ch)
            # First matching script in registry order wins.
            label = next(
                (
                    script_id
                    for script_id, entry in script_table.items()
                    if _in_ranges(codepoint, entry["ranges"])
                ),
                "other",
            )
            tally[label] += 1

    return dict(tally)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def dominant_script(
    text: str,
    *,
    digit_policy: DigitPolicy = "count",
    threshold: float = 0.5,
) -> str:
    """Name the script that dominates ``text``.

    Returns:
        ``"empty"`` when no character is countable; the most frequent label
        from ``script_distribution`` when its share is at least
        ``threshold``; ``"mixed"`` otherwise. On equal counts the label seen
        first in the distribution is the one tested.
    """
    distribution = script_distribution(text, digit_policy=digit_policy)
    total = sum(distribution.values())
    if total == 0:
        return "empty"

    # max() keeps the first maximal key, so ties resolve by insertion order.
    leader = max(distribution, key=distribution.get)
    share = distribution[leader] / total
    return leader if share >= threshold else "mixed"
|
script_fidelity/registry.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FLEURS language registry for Script Fidelity Rate."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
from importlib.resources import files
|
| 8 |
+
|
| 9 |
+
from .types import ScriptConfig
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@lru_cache(maxsize=1)
def _registry() -> dict:
    """Load and cache the bundled ``data/fleurs_registry.json`` document.

    The file is read as UTF-8 from the ``script_fidelity`` package resources
    and parsed as JSON; ``lru_cache`` keeps the parsed result for later calls.
    """
    package_root = files("script_fidelity")
    raw = package_root.joinpath("data/fleurs_registry.json").read_text(
        encoding="utf-8"
    )
    return json.loads(raw)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _script_ranges(script_ids: list[str]) -> tuple[tuple[int, int], ...]:
    """Gather the ``(lo, hi)`` ranges of the given scripts, without repeats.

    Args:
        script_ids: Keys into the registry's ``scripts`` table.

    Returns:
        Integer pairs in first-seen order; a pair listed by several scripts
        appears only once.

    Raises:
        KeyError: If a script id is missing from the registry.
    """
    script_table = _registry()["scripts"]
    # Dict keys keep first-insertion order and silently absorb duplicates.
    unique_pairs = dict.fromkeys(
        (int(lo), int(hi))
        for script_id in script_ids
        for lo, hi in script_table[script_id]["ranges"]
    )
    return tuple(unique_pairs)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@lru_cache(maxsize=1)
def _language_configs() -> dict[str, ScriptConfig]:
    """Build one ``ScriptConfig`` per entry of the registry's ``languages`` table.

    The mapping is keyed by the language code used in the registry and is
    cached, so every caller shares the same dict object.
    """

    def build(code: str, entry: dict) -> ScriptConfig:
        # ``scripts`` is a list of script ids; several ids are joined with "+".
        script_ids = entry["scripts"]
        return ScriptConfig(
            code=code,
            name=entry["name"],
            script="+".join(script_ids),
            ranges=_script_ranges(script_ids),
            aliases=tuple(entry.get("aliases", [])),
            shared_script=bool(entry.get("shared_script", False)),
            warning=entry.get("warning", ""),
        )

    return {
        code: build(code, entry)
        for code, entry in _registry()["languages"].items()
    }
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@lru_cache(maxsize=1)
def _alias_map() -> dict[str, str]:
    """Map every accepted lower-case spelling to its canonical language code.

    For each language the spellings are, in order: the code itself, the code
    with ``_`` replaced by ``-``, and each configured alias. A later entry
    overwrites an earlier one that uses the same spelling.
    """
    lookup: dict[str, str] = {}
    for code, config in _language_configs().items():
        spellings = [
            code.lower(),
            code.replace("_", "-").lower(),
            *(alias.lower() for alias in config.aliases),
        ]
        for spelling in spellings:
            lookup[spelling] = code
    return lookup
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def resolve_language(language: str) -> str:
    """Resolve a FLEURS code or alias to a canonical FLEURS code.

    Matching ignores case and surrounding whitespace. Inner spaces are first
    treated as underscores (so ``"ps af"`` matches ``ps_af``); if that fails,
    the spelling is tried as typed, so an alias that itself contains a space
    can still be found.

    Args:
        language: A language code or alias.

    Returns:
        The canonical language code.

    Raises:
        ValueError: If no spelling of ``language`` is known; the message lists
            a few valid codes.
    """
    lowered = language.strip().lower()
    table = _alias_map()
    # Alias keys are stored verbatim (lower-cased), so the underscore form
    # alone would never hit an alias that contains a space.
    for candidate in (lowered.replace(" ", "_"), lowered):
        if candidate in table:
            return table[candidate]

    known = ", ".join(list_languages()[:12])
    raise ValueError(
        f"Unknown language '{language}'. Use a FLEURS code such as ps_af, "
        f"or an alias such as pashto. Examples: {known}, ..."
    )
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def get_script_config(language: str) -> ScriptConfig:
    """Look up the ``ScriptConfig`` for a language code or alias.

    Raises:
        ValueError: Propagated from ``resolve_language`` for unknown input.
    """
    canonical = resolve_language(language)
    return _language_configs()[canonical]
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def list_languages() -> list[str]:
    """Return the registry's canonical language codes as a new sorted list."""
    codes = list(_language_configs())
    codes.sort()
    return codes
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# Sorted canonical language codes, frozen into a tuple at import time.
FLEURS_CONFIGS = tuple(list_languages())
# The very dict returned by the cached _language_configs(); mutating it would
# also change what later registry lookups see.
SCRIPT_CONFIGS = _language_configs()
|
script_fidelity/types.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared types for Script Fidelity Rate."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import Literal
|
| 7 |
+
|
| 8 |
+
DigitPolicy = Literal["count", "ignore"]
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass(frozen=True)
class ScriptConfig:
    """Script configuration for one FLEURS language.

    Instances are immutable (``frozen=True``); only ``code``, ``name``,
    ``script`` and ``ranges`` are required.
    """

    # Language code, e.g. "ps_af".
    code: str
    # Human-readable language name.
    name: str
    # Script identifier; multiple script ids are joined with "+".
    script: str
    # (lo, hi) integer code-point pairs covered by the script(s).
    ranges: tuple[tuple[int, int], ...]
    # Alternative spellings accepted for this language.
    aliases: tuple[str, ...] = ()
    # Registry flag; presumably marks scripts shared with other languages.
    shared_script: bool = False
    # Free-text caveat attached to the language; empty when there is none.
    warning: str = ""
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass(frozen=True)
|
| 25 |
+
class SFRResult:
|
| 26 |
+
"""Per-text Script Fidelity Rate result."""
|
| 27 |
+
|
| 28 |
+
language: str
|
| 29 |
+
sfr: float | None
|
| 30 |
+
numerator: int
|
| 31 |
+
denominator: int
|
| 32 |
+
dominant_script: str
|
| 33 |
+
script_counts: dict[str, int]
|
script_fidelity_rate/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
script_fidelity_rate/README.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: script_fidelity_rate
|
| 3 |
+
datasets:
|
| 4 |
+
-
|
| 5 |
+
tags:
|
| 6 |
+
- evaluate
|
| 7 |
+
- metric
|
| 8 |
+
description: "TODO: add a description here"
|
| 9 |
+
sdk: gradio
|
| 10 |
+
sdk_version: 3.19.1
|
| 11 |
+
app_file: app.py
|
| 12 |
+
pinned: false
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Metric Card for script_fidelity_rate
|
| 16 |
+
|
| 17 |
+
***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing metric cards if you'd like examples.*
|
| 18 |
+
|
| 19 |
+
## Metric Description
|
| 20 |
+
*Give a brief overview of this metric, including what task(s) it is usually used for, if any.*
|
| 21 |
+
|
| 22 |
+
## How to Use
|
| 23 |
+
*Give a general statement of how to use the metric.*
|
| 24 |
+
|
| 25 |
+
*Provide the simplest possible example of using the metric.*
|
| 26 |
+
|
| 27 |
+
### Inputs
|
| 28 |
+
*List all input arguments in the format below*
|
| 29 |
+
- **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
|
| 30 |
+
|
| 31 |
+
### Output Values
|
| 32 |
+
|
| 33 |
+
*Explain what this metric outputs and provide an example of what the metric output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}*
|
| 34 |
+
|
| 35 |
+
*State the range of possible values that the metric's output can take, as well as what in that range is considered good. For example: "This metric can take on any value between 0 and 100, inclusive. Higher scores are better."*
|
| 36 |
+
|
| 37 |
+
#### Values from Popular Papers
|
| 38 |
+
*Give examples, preferably with links to leaderboards or publications, of papers that have reported this metric, along with the values they have reported.*
|
| 39 |
+
|
| 40 |
+
### Examples
|
| 41 |
+
*Give code examples of the metric being used. Try to include examples that clear up any potential ambiguity left from the metric description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
|
| 42 |
+
|
| 43 |
+
## Limitations and Bias
|
| 44 |
+
*Note any known limitations or biases that the metric has, with links and references if possible.*
|
| 45 |
+
|
| 46 |
+
## Citation
|
| 47 |
+
*Cite the source where this metric was introduced.*
|
| 48 |
+
|
| 49 |
+
## Further References
|
| 50 |
+
*Add any useful further references.*
|
script_fidelity_rate/app.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import evaluate
|
| 2 |
+
from evaluate.utils import launch_gradio_widget
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# Load the metric under its Hub identifier and serve the stock Gradio widget
# for it.
module = evaluate.load("themechanism/script_fidelity_rate")
launch_gradio_widget(module)
|
script_fidelity_rate/requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
git+https://github.com/huggingface/evaluate@main
|
script_fidelity_rate/script_fidelity_rate.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
"""TODO: Add a description here."""
|
| 15 |
+
|
| 16 |
+
import evaluate
|
| 17 |
+
import datasets
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# TODO: Add BibTeX citation
|
| 21 |
+
_CITATION = """\
|
| 22 |
+
@InProceedings{huggingface:module,
|
| 23 |
+
title = {A great new module},
|
| 24 |
+
authors={huggingface, Inc.},
|
| 25 |
+
year={2020}
|
| 26 |
+
}
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
# TODO: Add description of the module here
|
| 30 |
+
_DESCRIPTION = """\
|
| 31 |
+
This new module is designed to solve this great ML task and is crafted with a lot of care.
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# TODO: Add description of the arguments of the module here
|
| 36 |
+
_KWARGS_DESCRIPTION = """
|
| 37 |
+
Calculates how good are predictions given some references, using certain scores
|
| 38 |
+
Args:
|
| 39 |
+
predictions: list of predictions to score. Each predictions
|
| 40 |
+
should be a string with tokens separated by spaces.
|
| 41 |
+
references: list of reference for each prediction. Each
|
| 42 |
+
reference should be a string with tokens separated by spaces.
|
| 43 |
+
Returns:
|
| 44 |
+
accuracy: description of the first score,
|
| 45 |
+
another_score: description of the second score,
|
| 46 |
+
Examples:
|
| 47 |
+
Examples should be written in doctest format, and should illustrate how
|
| 48 |
+
to use the function.
|
| 49 |
+
|
| 50 |
+
>>> my_new_module = evaluate.load("my_new_module")
|
| 51 |
+
>>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
|
| 52 |
+
>>> print(results)
|
| 53 |
+
{'accuracy': 1.0}
|
| 54 |
+
"""
|
| 55 |
+
|
| 56 |
+
# TODO: Define external resources urls if needed
|
| 57 |
+
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class script_fidelity_rate(evaluate.Metric):
    """Scaffold metric that reports exact-match accuracy.

    NOTE(review): integer features and placeholder URLs suggest this is still
    the generated template — confirm whether it should score text with the
    ``script_fidelity`` package instead.
    """

    def _info(self):
        """Return the module metadata: texts, input features and links."""
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Value('int64'),
                'references': datasets.Value('int64'),
            }),
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        # TODO: Download external resources if needed
        pass

    def _compute(self, predictions, references):
        """Return the share of predictions equal to their paired reference.

        Args:
            predictions: Sequence of predicted values.
            references: Sequence of expected values, paired by position.

        Returns:
            ``{"accuracy": matches / len(predictions)}``.

        Raises:
            ValueError: If ``predictions`` is empty (previously a bare
                ``ZeroDivisionError``).
        """
        # TODO: Compute the different scores of the module
        total = len(predictions)
        if total == 0:
            raise ValueError("predictions must contain at least one item")
        matches = sum(i == j for i, j in zip(predictions, references))
        return {
            "accuracy": matches / total,
        }
|
script_fidelity_rate/tests.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Fixture cases for the metric. Expected results use the "accuracy" key,
# which is the key the module's _compute() actually returns.
test_cases = [
    {
        "predictions": [0, 0],
        "references": [1, 1],
        "result": {"accuracy": 0},
    },
    {
        "predictions": [1, 1],
        "references": [1, 1],
        "result": {"accuracy": 1},
    },
    {
        "predictions": [1, 0],
        "references": [1, 1],
        "result": {"accuracy": 0.5},
    },
]
|
tests/test_cli.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import json
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_cli_score():
    """``score`` prints 1.000000 for text written entirely in Pashto script."""
    command = [
        sys.executable,
        "-m",
        "script_fidelity",
        "score",
        "--language",
        "ps_af",
        "--text",
        "کابل کې ښه هوا ده",
    ]
    completed = subprocess.run(command, check=True, capture_output=True, text=True)
    printed = completed.stdout.strip()
    assert printed == "1.000000"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_cli_audit_jsonl(tmp_path: Path):
    """``audit`` on a two-row JSONL file reports n == 2 and sfr == 0.5."""
    path = tmp_path / "predictions.jsonl"
    rows = [
        {"prediction": "کابل کې ښه هوا ده"},
        {"prediction": "romanized output"},
    ]
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding
    # rather than relying on the platform default.
    path.write_text(
        "\n".join(json.dumps(row, ensure_ascii=False) for row in rows),
        encoding="utf-8",
    )

    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "script_fidelity",
            "audit",
            str(path),
            "--language",
            "ps_af",
            "--text-column",
            "prediction",
        ],
        check=True,
        capture_output=True,
        text=True,
    )
    summary = json.loads(result.stdout)
    assert summary["n"] == 2
    assert summary["sfr"] == 0.5
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def test_cli_audit_csv_format(tmp_path: Path):
    """``audit --format csv`` output includes ``sfr_percent`` and ``50.0``."""
    csv_file = tmp_path / "predictions.csv"
    with csv_file.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=["prediction"])
        writer.writeheader()
        writer.writerows(
            [
                {"prediction": "বাংলা ভাষা"},
                {"prediction": "namaste"},
            ]
        )

    command = [
        sys.executable,
        "-m",
        "script_fidelity",
        "audit",
        str(csv_file),
        "--language",
        "bn_in",
        "--format",
        "csv",
    ]
    completed = subprocess.run(command, check=True, capture_output=True, text=True)
    output = completed.stdout
    assert "sfr_percent" in output
    assert "50.0" in output
|
tests/test_core.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from script_fidelity import (
|
| 2 |
+
compute_corpus_sfr,
|
| 3 |
+
compute_sfr,
|
| 4 |
+
compute_sfr_batch,
|
| 5 |
+
dominant_script,
|
| 6 |
+
script_distribution,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_pashto_positive_and_latin_collapse():
    """Pashto-script text scores 1.0; romanized text scores 0.0."""
    native = compute_sfr("کابل کې ښه هوا ده", language="ps_af")
    assert native == 1.0
    romanized = compute_sfr("this is romanized pashto", language="pashto")
    assert romanized == 0.0
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_bengali_vs_devanagari_wrong_script():
    """Bengali script scores 1.0 for Bengali; Devanagari text scores 0.0."""
    in_script = compute_sfr("বাংলা ভাষা", language="bn_in")
    assert in_script == 1.0
    wrong_script = compute_sfr("नमस्ते दुनिया", language="bengali")
    assert wrong_script == 0.0
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_somali_latin_positive_and_arabic_negative():
    """Latin text scores 1.0 for Somali; Arabic-script text scores 0.0."""
    latin = compute_sfr("Somali waa luuqad", language="so_so")
    assert latin == 1.0
    arabic = compute_sfr("كابل في هواء جيد", language="somali")
    assert arabic == 0.0
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_empty_punctuation_combining_and_emoji_cases():
    """Unscorable inputs give None; an emoji alone scores 0.0."""
    # Empty string, punctuation only, combining marks only.
    for unscorable in ("", "...?!", "\u0301\u0301"):
        assert compute_sfr(unscorable, language="ps_af") is None
    assert compute_sfr("🙂", language="ps_af") == 0.0
    assert dominant_script("...?!") == "empty"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_mixed_script_and_distribution():
    """Mixed Bengali/Devanagari text scores strictly between 0 and 1."""
    sample = "বাংলা भाषा"
    score = compute_sfr(sample, language="bn_in")
    assert score is not None
    assert 0.0 < score < 1.0
    counts = script_distribution(sample)
    for label in ("bengali", "devanagari"):
        assert counts[label] > 0
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_digit_policy_count_and_ignore():
    """Digits count against the score by default and are dropped on "ignore"."""
    sample = "کابل 123"
    with_digits = compute_sfr(sample, language="ps_af")
    without_digits = compute_sfr(sample, language="ps_af", digit_policy="ignore")
    # Four letters out of seven counted characters when digits are included.
    assert with_digits == 4 / 7
    assert without_digits == 1.0
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def test_batch_and_corpus_summary():
    """Batch scores and corpus summary agree on a three-item sample."""
    predictions = ["کابل کې ښه هوا ده", "romanized output", "..."]

    per_text = compute_sfr_batch(predictions, language="pashto")
    assert per_text == [1.0, 0.0, None]

    summary = compute_corpus_sfr(predictions, language="pashto")
    expected_fields = {
        "n": 3,
        "n_valid": 2,
        "n_empty": 1,
        "sfr": 0.5,
        "low_sfr_rate": 0.5,
    }
    for field, value in expected_fields.items():
        assert summary[field] == value
|
tests/test_evaluate_metric.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
evaluate = pytest.importorskip("evaluate")
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_local_evaluate_metric_matches_package():
    """The local ``evaluate`` metric agrees with ``compute_corpus_sfr``."""
    from script_fidelity import compute_corpus_sfr

    predictions = ["کابل کې ښه هوا ده", "romanized output", "..."]
    metric = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
    actual = metric.compute(predictions=predictions, language="ps_af")
    expected = compute_corpus_sfr(predictions, language="ps_af")
    for field in ("sfr", "n_empty"):
        assert actual[field] == expected[field]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_evaluate_metric_details():
    """``return_details=True`` yields one detail entry per prediction."""
    metric = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
    result = metric.compute(
        predictions=["বাংলা ভাষা", "नमस्ते"],
        language="bn_in",
        return_details=True,
    )
    details = result["details"]
    assert len(details) == 2
    assert details[0]["sfr"] == 1.0
    assert details[1]["sfr"] == 0.0
|
tests/test_registry.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from script_fidelity import FLEURS_CONFIGS, get_script_config, list_languages, resolve_language
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_registry_has_all_fleurs_configs_except_all():
    """The registry lists 102 codes, without ``all``, matching FLEURS_CONFIGS."""
    listed = list_languages()
    assert len(listed) == 102
    assert "all" not in listed
    assert tuple(listed) == FLEURS_CONFIGS
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_every_language_has_ranges():
    """Every listed code resolves to a config with that code and some ranges."""
    for language_code in list_languages():
        cfg = get_script_config(language_code)
        assert cfg.code == language_code
        assert cfg.ranges
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_aliases_for_paper_languages():
    """English language names resolve to the expected canonical codes."""
    expected_codes = [
        ("pashto", "ps_af"),
        ("urdu", "ur_pk"),
        ("arabic", "ar_eg"),
        ("persian", "fa_ir"),
        ("farsi", "fa_ir"),
        ("hindi", "hi_in"),
        ("bengali", "bn_in"),
        ("malayalam", "ml_in"),
        ("tamil", "ta_in"),
        ("somali", "so_so"),
        ("georgian", "ka_ge"),
    ]
    for alias, code in expected_codes:
        assert resolve_language(alias) == code
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def test_shared_script_metadata():
    """Shared-script languages carry a warning; Georgian is not shared."""
    shared_codes = ("ps_af", "ur_pk", "fa_ir", "ar_eg", "so_so", "hi_in")
    for language_code in shared_codes:
        cfg = get_script_config(language_code)
        assert cfg.shared_script is True
        assert cfg.warning

    georgian = get_script_config("ka_ge")
    assert georgian.shared_script is False
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|