Spaces:
Sleeping
Sleeping
Fix metric Space configuration
Browse files
- MANIFEST.in +0 -6
- README.md +31 -267
- script_fidelity_rate/app.py → app.py +2 -2
- dist/.gitignore +0 -1
- dist/script_fidelity-0.1.1-py3-none-any.whl +0 -0
- dist/script_fidelity-0.1.1.tar.gz +0 -3
- examples/ci_gate.py +0 -7
- examples/hf_evaluate.py +0 -8
- examples/pandas_dataframe.py +0 -7
- examples/plain_python.py +0 -7
- examples/transformers_compute_metrics.py +0 -14
- metrics/script_fidelity_rate/README.md +0 -31
- pyproject.toml +0 -50
- metrics/script_fidelity_rate/requirements.txt → requirements.txt +0 -0
- script_fidelity/__init__.py +0 -40
- script_fidelity/__main__.py +0 -3
- script_fidelity/cli.py +0 -133
- script_fidelity/core.py +0 -132
- script_fidelity/data/fleurs_registry.json +0 -210
- script_fidelity/dominant.py +0 -75
- script_fidelity/registry.py +0 -86
- script_fidelity/types.py +0 -33
- metrics/script_fidelity_rate/script_fidelity_rate.py → script_fidelity_rate.py +0 -9
- script_fidelity_rate/.gitattributes +0 -35
- script_fidelity_rate/README.md +0 -50
- script_fidelity_rate/requirements.txt +0 -1
- script_fidelity_rate/script_fidelity_rate.py +0 -95
- script_fidelity_rate/tests.py +0 -17
- tests/test_cli.py +0 -81
- tests/test_core.py +0 -59
- tests/test_evaluate_metric.py +0 -27
- tests/test_registry.py +0 -42
- uv.lock +0 -0
MANIFEST.in
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
include README.md
|
| 2 |
-
include pyproject.toml
|
| 3 |
-
recursive-include script_fidelity/data *.json
|
| 4 |
-
recursive-include metrics *
|
| 5 |
-
recursive-include examples *
|
| 6 |
-
recursive-include tests *.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,281 +1,45 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
-
|
| 11 |
-
-
|
| 12 |
-
-
|
| 13 |
-
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
```bash
|
| 23 |
-
uv sync --extra dev
|
| 24 |
-
```
|
| 25 |
-
|
| 26 |
-
For a downstream project:
|
| 27 |
-
|
| 28 |
-
```bash
|
| 29 |
-
uv add script-fidelity
|
| 30 |
-
```
|
| 31 |
-
|
| 32 |
-
Run the CLI without adding it to a project:
|
| 33 |
-
|
| 34 |
-
```bash
|
| 35 |
-
uvx --from script-fidelity sfr score --language ps_af --text "کابل کې ښه هوا ده"
|
| 36 |
-
```
|
| 37 |
-
|
| 38 |
-
## Python use
|
| 39 |
-
|
| 40 |
-
```python
|
| 41 |
-
from script_fidelity import compute_sfr, compute_sfr_batch
|
| 42 |
-
|
| 43 |
-
score = compute_sfr("کابل کې ښه هوا ده", language="ps_af")
|
| 44 |
-
scores = compute_sfr_batch(
|
| 45 |
-
["کابل کې ښه هوا ده", "this is romanized output"],
|
| 46 |
-
language="pashto",
|
| 47 |
-
)
|
| 48 |
-
```
|
| 49 |
-
|
| 50 |
-
Digits count by default, matching the paper. Treat digits as neutral with
|
| 51 |
-
`digit_policy="ignore"`.
|
| 52 |
-
|
| 53 |
-
```python
|
| 54 |
-
compute_sfr("کابل 2026", language="ps_af", digit_policy="ignore")
|
| 55 |
-
```
|
| 56 |
-
|
| 57 |
-
## HF Evaluate use
|
| 58 |
-
|
| 59 |
-
Local metric:
|
| 60 |
|
| 61 |
```python
|
| 62 |
import evaluate
|
| 63 |
|
| 64 |
sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
|
| 65 |
-
sfr.compute(
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
```python
|
| 71 |
-
import evaluate
|
| 72 |
-
|
| 73 |
-
sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 74 |
-
sfr.compute(predictions=["کابل کې ښه هوا ده"], language="ps_af")
|
| 75 |
-
```
|
| 76 |
-
|
| 77 |
-
## CLI
|
| 78 |
-
|
| 79 |
-
```bash
|
| 80 |
-
sfr score --language ps_af --text "کابل کې ښه هوا ده"
|
| 81 |
-
sfr audit predictions.jsonl --language ps_af --text-column prediction
|
| 82 |
-
sfr audit predictions.csv --language bn_in --text-column transcript --format csv
|
| 83 |
-
```
|
| 84 |
-
|
| 85 |
-
## ASR batch example
|
| 86 |
-
|
| 87 |
-
```python
|
| 88 |
-
from script_fidelity import compute_corpus_sfr
|
| 89 |
-
|
| 90 |
-
predictions = [
|
| 91 |
-
item["text"]
|
| 92 |
-
for item in whisper_outputs
|
| 93 |
-
]
|
| 94 |
-
|
| 95 |
-
summary = compute_corpus_sfr(predictions, language="bn_in")
|
| 96 |
-
print(summary["sfr_percent"])
|
| 97 |
-
print(summary["dominant_script_counts"])
|
| 98 |
-
```
|
| 99 |
-
|
| 100 |
-
## pandas dataframe example
|
| 101 |
-
|
| 102 |
-
```python
|
| 103 |
-
import pandas as pd
|
| 104 |
-
from script_fidelity import compute_sfr
|
| 105 |
-
|
| 106 |
-
df = pd.read_json("predictions.jsonl", lines=True)
|
| 107 |
-
df["sfr"] = df["prediction"].map(lambda text: compute_sfr(text, language="ps_af"))
|
| 108 |
```
|
| 109 |
|
| 110 |
-
|
| 111 |
|
| 112 |
```python
|
| 113 |
import evaluate
|
| 114 |
|
| 115 |
-
wer = evaluate.load("wer")
|
| 116 |
sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 117 |
-
|
| 118 |
-
def compute_metrics(eval_pred):
|
| 119 |
-
predictions, labels = eval_pred
|
| 120 |
-
pred_text = processor.batch_decode(predictions, skip_special_tokens=True)
|
| 121 |
-
label_text = processor.batch_decode(labels, skip_special_tokens=True)
|
| 122 |
-
return {
|
| 123 |
-
"wer": wer.compute(predictions=pred_text, references=label_text),
|
| 124 |
-
"sfr": sfr.compute(predictions=pred_text, language="ps_af")["sfr"],
|
| 125 |
-
}
|
| 126 |
```
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
```python
|
| 131 |
-
from script_fidelity import compute_corpus_sfr
|
| 132 |
-
|
| 133 |
-
summary = compute_corpus_sfr(predictions, language="ml_in")
|
| 134 |
-
if summary["sfr"] < 0.90:
|
| 135 |
-
raise SystemExit("SFR regression: Malayalam output is below 90% target script")
|
| 136 |
-
```
|
| 137 |
-
|
| 138 |
-
## shared-script caveats
|
| 139 |
-
|
| 140 |
-
SFR is a script check, not a language identifier. Pashto, Urdu, Persian, Arabic,
|
| 141 |
-
Central Kurdish, and Sindhi share Arabic-script Unicode blocks. Latin-script
|
| 142 |
-
languages mostly detect romanization or non-Latin substitution, not language
|
| 143 |
-
identity. Pair SFR with language ID or lexical checks when shared-script
|
| 144 |
-
confusions matter.
|
| 145 |
-
|
| 146 |
-
Use `dominant_script()` and `script_distribution()` to inspect failures:
|
| 147 |
-
|
| 148 |
-
```python
|
| 149 |
-
from script_fidelity import dominant_script, script_distribution
|
| 150 |
-
|
| 151 |
-
dominant_script("this is romanized output")
|
| 152 |
-
script_distribution("বাংলা भाषा")
|
| 153 |
-
```
|
| 154 |
-
|
| 155 |
-
## FLEURS codes
|
| 156 |
-
|
| 157 |
-
The registry covers the 102 FLEURS language configs listed by `sfr languages`.
|
| 158 |
-
These paper languages have short aliases:
|
| 159 |
-
|
| 160 |
-
| FLEURS code | Alias | Script |
|
| 161 |
-
|---|---|---|
|
| 162 |
-
| `ps_af` | `pashto` | Arabic |
|
| 163 |
-
| `ur_pk` | `urdu` | Arabic |
|
| 164 |
-
| `ar_eg` | `arabic` | Arabic |
|
| 165 |
-
| `fa_ir` | `persian`, `farsi` | Arabic |
|
| 166 |
-
| `hi_in` | `hindi` | Devanagari |
|
| 167 |
-
| `bn_in` | `bengali`, `bangla` | Bengali |
|
| 168 |
-
| `ml_in` | `malayalam` | Malayalam |
|
| 169 |
-
| `ta_in` | `tamil` | Tamil |
|
| 170 |
-
| `so_so` | `somali` | Latin |
|
| 171 |
-
| `ka_ge` | `georgian` | Georgian |
|
| 172 |
-
|
| 173 |
-
For the full reviewed registry, see
|
| 174 |
-
`script_fidelity/data/fleurs_registry.json`.
|
| 175 |
-
|
| 176 |
-
Full code table:
|
| 177 |
-
|
| 178 |
-
| Code | Language | Script |
|
| 179 |
-
|---|---|---|
|
| 180 |
-
| `af_za` | Afrikaans | Latin |
|
| 181 |
-
| `am_et` | Amharic | Ethiopic |
|
| 182 |
-
| `ar_eg` | Arabic | Arabic |
|
| 183 |
-
| `as_in` | Assamese | Bengali |
|
| 184 |
-
| `ast_es` | Asturian | Latin |
|
| 185 |
-
| `az_az` | Azerbaijani | Latin |
|
| 186 |
-
| `be_by` | Belarusian | Cyrillic |
|
| 187 |
-
| `bg_bg` | Bulgarian | Cyrillic |
|
| 188 |
-
| `bn_in` | Bengali | Bengali |
|
| 189 |
-
| `bs_ba` | Bosnian | Latin |
|
| 190 |
-
| `ca_es` | Catalan | Latin |
|
| 191 |
-
| `ceb_ph` | Cebuano | Latin |
|
| 192 |
-
| `ckb_iq` | Central Kurdish | Arabic |
|
| 193 |
-
| `cmn_hans_cn` | Mandarin Chinese | Han |
|
| 194 |
-
| `cs_cz` | Czech | Latin |
|
| 195 |
-
| `cy_gb` | Welsh | Latin |
|
| 196 |
-
| `da_dk` | Danish | Latin |
|
| 197 |
-
| `de_de` | German | Latin |
|
| 198 |
-
| `el_gr` | Greek | Greek |
|
| 199 |
-
| `en_us` | English | Latin |
|
| 200 |
-
| `es_419` | Spanish | Latin |
|
| 201 |
-
| `et_ee` | Estonian | Latin |
|
| 202 |
-
| `fa_ir` | Persian | Arabic |
|
| 203 |
-
| `ff_sn` | Fulah | Latin |
|
| 204 |
-
| `fi_fi` | Finnish | Latin |
|
| 205 |
-
| `fil_ph` | Filipino | Latin |
|
| 206 |
-
| `fr_fr` | French | Latin |
|
| 207 |
-
| `ga_ie` | Irish | Latin |
|
| 208 |
-
| `gl_es` | Galician | Latin |
|
| 209 |
-
| `gu_in` | Gujarati | Gujarati |
|
| 210 |
-
| `ha_ng` | Hausa | Latin |
|
| 211 |
-
| `he_il` | Hebrew | Hebrew |
|
| 212 |
-
| `hi_in` | Hindi | Devanagari |
|
| 213 |
-
| `hr_hr` | Croatian | Latin |
|
| 214 |
-
| `hu_hu` | Hungarian | Latin |
|
| 215 |
-
| `hy_am` | Armenian | Armenian |
|
| 216 |
-
| `id_id` | Indonesian | Latin |
|
| 217 |
-
| `ig_ng` | Igbo | Latin |
|
| 218 |
-
| `is_is` | Icelandic | Latin |
|
| 219 |
-
| `it_it` | Italian | Latin |
|
| 220 |
-
| `ja_jp` | Japanese | Han, Hiragana, Katakana |
|
| 221 |
-
| `jv_id` | Javanese | Latin |
|
| 222 |
-
| `ka_ge` | Georgian | Georgian |
|
| 223 |
-
| `kam_ke` | Kamba | Latin |
|
| 224 |
-
| `kea_cv` | Kabuverdianu | Latin |
|
| 225 |
-
| `kk_kz` | Kazakh | Cyrillic |
|
| 226 |
-
| `km_kh` | Khmer | Khmer |
|
| 227 |
-
| `kn_in` | Kannada | Kannada |
|
| 228 |
-
| `ko_kr` | Korean | Hangul |
|
| 229 |
-
| `ky_kg` | Kyrgyz | Cyrillic |
|
| 230 |
-
| `lb_lu` | Luxembourgish | Latin |
|
| 231 |
-
| `lg_ug` | Ganda | Latin |
|
| 232 |
-
| `ln_cd` | Lingala | Latin |
|
| 233 |
-
| `lo_la` | Lao | Lao |
|
| 234 |
-
| `lt_lt` | Lithuanian | Latin |
|
| 235 |
-
| `luo_ke` | Luo | Latin |
|
| 236 |
-
| `lv_lv` | Latvian | Latin |
|
| 237 |
-
| `mi_nz` | Maori | Latin |
|
| 238 |
-
| `mk_mk` | Macedonian | Cyrillic |
|
| 239 |
-
| `ml_in` | Malayalam | Malayalam |
|
| 240 |
-
| `mn_mn` | Mongolian | Cyrillic |
|
| 241 |
-
| `mr_in` | Marathi | Devanagari |
|
| 242 |
-
| `ms_my` | Malay | Latin |
|
| 243 |
-
| `mt_mt` | Maltese | Latin |
|
| 244 |
-
| `my_mm` | Burmese | Myanmar |
|
| 245 |
-
| `nb_no` | Norwegian Bokmal | Latin |
|
| 246 |
-
| `ne_np` | Nepali | Devanagari |
|
| 247 |
-
| `nl_nl` | Dutch | Latin |
|
| 248 |
-
| `nso_za` | Northern Sotho | Latin |
|
| 249 |
-
| `ny_mw` | Chichewa | Latin |
|
| 250 |
-
| `oc_fr` | Occitan | Latin |
|
| 251 |
-
| `om_et` | Oromo | Latin |
|
| 252 |
-
| `or_in` | Odia | Odia |
|
| 253 |
-
| `pa_in` | Punjabi | Gurmukhi |
|
| 254 |
-
| `pl_pl` | Polish | Latin |
|
| 255 |
-
| `ps_af` | Pashto | Arabic |
|
| 256 |
-
| `pt_br` | Portuguese | Latin |
|
| 257 |
-
| `ro_ro` | Romanian | Latin |
|
| 258 |
-
| `ru_ru` | Russian | Cyrillic |
|
| 259 |
-
| `sd_in` | Sindhi | Arabic |
|
| 260 |
-
| `sk_sk` | Slovak | Latin |
|
| 261 |
-
| `sl_si` | Slovenian | Latin |
|
| 262 |
-
| `sn_zw` | Shona | Latin |
|
| 263 |
-
| `so_so` | Somali | Latin |
|
| 264 |
-
| `sr_rs` | Serbian | Cyrillic |
|
| 265 |
-
| `sv_se` | Swedish | Latin |
|
| 266 |
-
| `sw_ke` | Swahili | Latin |
|
| 267 |
-
| `ta_in` | Tamil | Tamil |
|
| 268 |
-
| `te_in` | Telugu | Telugu |
|
| 269 |
-
| `tg_tj` | Tajik | Cyrillic |
|
| 270 |
-
| `th_th` | Thai | Thai |
|
| 271 |
-
| `tr_tr` | Turkish | Latin |
|
| 272 |
-
| `uk_ua` | Ukrainian | Cyrillic |
|
| 273 |
-
| `umb_ao` | Umbundu | Latin |
|
| 274 |
-
| `ur_pk` | Urdu | Arabic |
|
| 275 |
-
| `uz_uz` | Uzbek | Latin |
|
| 276 |
-
| `vi_vn` | Vietnamese | Latin |
|
| 277 |
-
| `wo_sn` | Wolof | Latin |
|
| 278 |
-
| `xh_za` | Xhosa | Latin |
|
| 279 |
-
| `yo_ng` | Yoruba | Latin |
|
| 280 |
-
| `yue_hant_hk` | Cantonese | Han |
|
| 281 |
-
| `zu_za` | Zulu | Latin |
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Script Fidelity Rate
|
| 3 |
+
sdk: gradio
|
| 4 |
+
app_file: app.py
|
| 5 |
+
pinned: false
|
| 6 |
+
license: mit
|
| 7 |
+
tags:
|
| 8 |
+
- evaluate
|
| 9 |
+
- metric
|
| 10 |
+
- automatic-speech-recognition
|
| 11 |
+
- unicode
|
| 12 |
+
- multilingual-asr
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Script Fidelity Rate
|
| 16 |
+
|
| 17 |
+
This directory is the Hugging Face Evaluate metric module for Script Fidelity
|
| 18 |
+
Rate (SFR).
|
| 19 |
+
|
| 20 |
+
The Python package is published as `script-fidelity` on PyPI:
|
| 21 |
+
<https://pypi.org/project/script-fidelity/>. The import name is
|
| 22 |
+
`script_fidelity`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
```python
|
| 25 |
import evaluate
|
| 26 |
|
| 27 |
sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
|
| 28 |
+
result = sfr.compute(
|
| 29 |
+
predictions=["کابل کې ښه هوا ده", "this is romanized output"],
|
| 30 |
+
language="ps_af",
|
| 31 |
+
)
|
| 32 |
+
print(result["sfr_percent"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
```
|
| 34 |
|
| 35 |
+
Hub use:
|
| 36 |
|
| 37 |
```python
|
| 38 |
import evaluate
|
| 39 |
|
|
|
|
| 40 |
sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 41 |
+
sfr.compute(predictions=["کابل کې ښه هوا ده"], language="pashto")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
```
|
| 43 |
|
| 44 |
+
Use SFR with WER and CER, not instead of them. SFR checks whether output is in
|
| 45 |
+
the intended script. It does not measure lexical accuracy.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity_rate/app.py → app.py
RENAMED
|
@@ -2,5 +2,5 @@ import evaluate
|
|
| 2 |
from evaluate.utils import launch_gradio_widget
|
| 3 |
|
| 4 |
|
| 5 |
-
module = evaluate.load("themechanism/script_fidelity_rate")
|
| 6 |
-
launch_gradio_widget(module)
|
|
|
|
| 2 |
from evaluate.utils import launch_gradio_widget
|
| 3 |
|
| 4 |
|
| 5 |
+
module = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 6 |
+
launch_gradio_widget(module)
|
dist/.gitignore
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
*
|
|
|
|
|
|
dist/script_fidelity-0.1.1-py3-none-any.whl
DELETED
|
Binary file (14.2 kB)
|
|
|
dist/script_fidelity-0.1.1.tar.gz
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4e36da45cddd306e6794eb59bd06cbd3fe9ae19801791bbe5c02862952aa89a8
|
| 3 |
-
size 18936
|
|
|
|
|
|
|
|
|
|
|
|
examples/ci_gate.py
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
from script_fidelity import compute_corpus_sfr
|
| 2 |
-
|
| 3 |
-
predictions = ["മലയാളം വാക്യം", "malayalam romanized output"]
|
| 4 |
-
summary = compute_corpus_sfr(predictions, language="ml_in")
|
| 5 |
-
|
| 6 |
-
if summary["sfr"] < 0.90:
|
| 7 |
-
raise SystemExit("SFR regression: Malayalam output is below 90% target script")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples/hf_evaluate.py
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
import evaluate
|
| 2 |
-
|
| 3 |
-
sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
|
| 4 |
-
result = sfr.compute(
|
| 5 |
-
predictions=["کابل کې ښه هوا ده", "this is romanized output"],
|
| 6 |
-
language="ps_af",
|
| 7 |
-
)
|
| 8 |
-
print(result["sfr_percent"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples/pandas_dataframe.py
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
|
| 3 |
-
from script_fidelity import compute_sfr
|
| 4 |
-
|
| 5 |
-
df = pd.read_json("predictions.jsonl", lines=True)
|
| 6 |
-
df["sfr"] = df["prediction"].map(lambda text: compute_sfr(text, language="ps_af"))
|
| 7 |
-
print(df[["prediction", "sfr"]].head())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples/plain_python.py
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
from script_fidelity import compute_corpus_sfr, compute_sfr
|
| 2 |
-
|
| 3 |
-
text = "کابل کې ښه هوا ده"
|
| 4 |
-
print(compute_sfr(text, language="ps_af"))
|
| 5 |
-
|
| 6 |
-
predictions = [text, "this is romanized output"]
|
| 7 |
-
print(compute_corpus_sfr(predictions, language="pashto"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples/transformers_compute_metrics.py
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
import evaluate
|
| 2 |
-
|
| 3 |
-
wer = evaluate.load("wer")
|
| 4 |
-
sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
def compute_metrics(eval_pred):
|
| 8 |
-
predictions, labels = eval_pred
|
| 9 |
-
pred_text = processor.batch_decode(predictions, skip_special_tokens=True)
|
| 10 |
-
label_text = processor.batch_decode(labels, skip_special_tokens=True)
|
| 11 |
-
return {
|
| 12 |
-
"wer": wer.compute(predictions=pred_text, references=label_text),
|
| 13 |
-
"sfr": sfr.compute(predictions=pred_text, language="ps_af")["sfr"],
|
| 14 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
metrics/script_fidelity_rate/README.md
DELETED
|
@@ -1,31 +0,0 @@
|
|
| 1 |
-
# Script Fidelity Rate
|
| 2 |
-
|
| 3 |
-
This directory is the Hugging Face Evaluate metric module for Script Fidelity
|
| 4 |
-
Rate (SFR).
|
| 5 |
-
|
| 6 |
-
The Python package is published as `script-fidelity` on PyPI:
|
| 7 |
-
<https://pypi.org/project/script-fidelity/>. The import name is
|
| 8 |
-
`script_fidelity`.
|
| 9 |
-
|
| 10 |
-
```python
|
| 11 |
-
import evaluate
|
| 12 |
-
|
| 13 |
-
sfr = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
|
| 14 |
-
result = sfr.compute(
|
| 15 |
-
predictions=["کابل کې ښه هوا ده", "this is romanized output"],
|
| 16 |
-
language="ps_af",
|
| 17 |
-
)
|
| 18 |
-
print(result["sfr_percent"])
|
| 19 |
-
```
|
| 20 |
-
|
| 21 |
-
Hub use after publishing:
|
| 22 |
-
|
| 23 |
-
```python
|
| 24 |
-
import evaluate
|
| 25 |
-
|
| 26 |
-
sfr = evaluate.load("themechanism/script_fidelity_rate", module_type="metric")
|
| 27 |
-
sfr.compute(predictions=["کابل کې ښه هوا ده"], language="pashto")
|
| 28 |
-
```
|
| 29 |
-
|
| 30 |
-
Use SFR with WER and CER, not instead of them. SFR checks whether output is in
|
| 31 |
-
the intended script. It does not measure lexical accuracy.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
DELETED
|
@@ -1,50 +0,0 @@
|
|
| 1 |
-
[build-system]
|
| 2 |
-
requires = ["setuptools>=69", "wheel"]
|
| 3 |
-
build-backend = "setuptools.build_meta"
|
| 4 |
-
|
| 5 |
-
[project]
|
| 6 |
-
name = "script-fidelity"
|
| 7 |
-
version = "0.1.1"
|
| 8 |
-
description = "Reference-free script fidelity metric for multilingual ASR."
|
| 9 |
-
readme = "README.md"
|
| 10 |
-
requires-python = ">=3.10"
|
| 11 |
-
license = "MIT"
|
| 12 |
-
authors = [{ name = "Anonymous" }]
|
| 13 |
-
keywords = [
|
| 14 |
-
"asr",
|
| 15 |
-
"speech-recognition",
|
| 16 |
-
"evaluation",
|
| 17 |
-
"unicode",
|
| 18 |
-
"script-fidelity",
|
| 19 |
-
"fleurs",
|
| 20 |
-
]
|
| 21 |
-
classifiers = [
|
| 22 |
-
"Development Status :: 3 - Alpha",
|
| 23 |
-
"Intended Audience :: Science/Research",
|
| 24 |
-
"Programming Language :: Python :: 3",
|
| 25 |
-
"Programming Language :: Python :: 3.10",
|
| 26 |
-
"Programming Language :: Python :: 3.11",
|
| 27 |
-
"Programming Language :: Python :: 3.12",
|
| 28 |
-
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 29 |
-
]
|
| 30 |
-
dependencies = []
|
| 31 |
-
|
| 32 |
-
[project.optional-dependencies]
|
| 33 |
-
evaluate = ["evaluate>=0.4.0,<1.0"]
|
| 34 |
-
dev = [
|
| 35 |
-
"evaluate>=0.4.0,<1.0",
|
| 36 |
-
"pytest>=8.0",
|
| 37 |
-
]
|
| 38 |
-
|
| 39 |
-
[project.scripts]
|
| 40 |
-
sfr = "script_fidelity.cli:main"
|
| 41 |
-
|
| 42 |
-
[tool.setuptools.packages.find]
|
| 43 |
-
include = ["script_fidelity*"]
|
| 44 |
-
|
| 45 |
-
[tool.setuptools.package-data]
|
| 46 |
-
script_fidelity = ["data/*.json"]
|
| 47 |
-
|
| 48 |
-
[tool.pytest.ini_options]
|
| 49 |
-
testpaths = ["tests"]
|
| 50 |
-
pythonpath = ["."]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
metrics/script_fidelity_rate/requirements.txt → requirements.txt
RENAMED
|
File without changes
|
script_fidelity/__init__.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
"""Reference-free script fidelity metrics for multilingual ASR."""
|
| 2 |
-
|
| 3 |
-
from .core import (
|
| 4 |
-
compute_corpus_sfr,
|
| 5 |
-
compute_sf,
|
| 6 |
-
compute_sf_batch,
|
| 7 |
-
compute_sfr,
|
| 8 |
-
compute_sfr_batch,
|
| 9 |
-
score_text,
|
| 10 |
-
)
|
| 11 |
-
from .dominant import dominant_script, script_distribution
|
| 12 |
-
from .registry import (
|
| 13 |
-
FLEURS_CONFIGS,
|
| 14 |
-
SCRIPT_CONFIGS,
|
| 15 |
-
get_script_config,
|
| 16 |
-
list_languages,
|
| 17 |
-
resolve_language,
|
| 18 |
-
)
|
| 19 |
-
from .types import DigitPolicy, SFRResult, ScriptConfig
|
| 20 |
-
|
| 21 |
-
__all__ = [
|
| 22 |
-
"DigitPolicy",
|
| 23 |
-
"FLEURS_CONFIGS",
|
| 24 |
-
"SCRIPT_CONFIGS",
|
| 25 |
-
"SFRResult",
|
| 26 |
-
"ScriptConfig",
|
| 27 |
-
"compute_corpus_sfr",
|
| 28 |
-
"compute_sf",
|
| 29 |
-
"compute_sf_batch",
|
| 30 |
-
"compute_sfr",
|
| 31 |
-
"compute_sfr_batch",
|
| 32 |
-
"dominant_script",
|
| 33 |
-
"get_script_config",
|
| 34 |
-
"list_languages",
|
| 35 |
-
"resolve_language",
|
| 36 |
-
"score_text",
|
| 37 |
-
"script_distribution",
|
| 38 |
-
]
|
| 39 |
-
|
| 40 |
-
__version__ = "0.1.1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity/__main__.py
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
from .cli import main
|
| 2 |
-
|
| 3 |
-
raise SystemExit(main())
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity/cli.py
DELETED
|
@@ -1,133 +0,0 @@
|
|
| 1 |
-
"""Command line interface for Script Fidelity Rate."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import argparse
|
| 6 |
-
import csv
|
| 7 |
-
import json
|
| 8 |
-
import sys
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
|
| 11 |
-
from .core import compute_corpus_sfr, compute_sfr
|
| 12 |
-
from .registry import list_languages
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def _read_predictions(path: Path, text_column: str) -> list[str]:
|
| 16 |
-
if path.suffix.lower() == ".jsonl":
|
| 17 |
-
rows = []
|
| 18 |
-
with path.open("r", encoding="utf-8") as handle:
|
| 19 |
-
for line_no, line in enumerate(handle, start=1):
|
| 20 |
-
if not line.strip():
|
| 21 |
-
continue
|
| 22 |
-
item = json.loads(line)
|
| 23 |
-
if text_column not in item:
|
| 24 |
-
raise ValueError(f"Missing column '{text_column}' on line {line_no}")
|
| 25 |
-
rows.append(str(item[text_column]))
|
| 26 |
-
return rows
|
| 27 |
-
|
| 28 |
-
with path.open("r", encoding="utf-8", newline="") as handle:
|
| 29 |
-
reader = csv.DictReader(handle)
|
| 30 |
-
if not reader.fieldnames or text_column not in reader.fieldnames:
|
| 31 |
-
raise ValueError(f"Missing column '{text_column}' in CSV header")
|
| 32 |
-
return [str(row[text_column]) for row in reader]
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def _emit_summary(summary: dict, output_format: str) -> None:
|
| 36 |
-
if output_format == "json":
|
| 37 |
-
print(json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True))
|
| 38 |
-
return
|
| 39 |
-
|
| 40 |
-
writer = csv.DictWriter(
|
| 41 |
-
sys.stdout,
|
| 42 |
-
fieldnames=[
|
| 43 |
-
"sfr",
|
| 44 |
-
"sfr_percent",
|
| 45 |
-
"n",
|
| 46 |
-
"n_valid",
|
| 47 |
-
"n_empty",
|
| 48 |
-
"low_sfr_rate",
|
| 49 |
-
"high_sfr_rate",
|
| 50 |
-
"dominant_script_counts",
|
| 51 |
-
],
|
| 52 |
-
)
|
| 53 |
-
writer.writeheader()
|
| 54 |
-
row = dict(summary)
|
| 55 |
-
row["dominant_script_counts"] = json.dumps(
|
| 56 |
-
row["dominant_script_counts"],
|
| 57 |
-
ensure_ascii=False,
|
| 58 |
-
sort_keys=True,
|
| 59 |
-
)
|
| 60 |
-
writer.writerow(row)
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
def build_parser() -> argparse.ArgumentParser:
|
| 64 |
-
parser = argparse.ArgumentParser(prog="sfr", description="Script Fidelity Rate tools")
|
| 65 |
-
sub = parser.add_subparsers(dest="command", required=True)
|
| 66 |
-
|
| 67 |
-
score = sub.add_parser("score", help="score one text string")
|
| 68 |
-
score.add_argument("--language", required=True, help="FLEURS code or alias")
|
| 69 |
-
score.add_argument("--text", required=True, help="ASR hypothesis text")
|
| 70 |
-
score.add_argument(
|
| 71 |
-
"--digit-policy",
|
| 72 |
-
choices=["count", "ignore"],
|
| 73 |
-
default="count",
|
| 74 |
-
help="count digits as characters or treat them as neutral",
|
| 75 |
-
)
|
| 76 |
-
|
| 77 |
-
audit = sub.add_parser("audit", help="audit a CSV or JSONL file")
|
| 78 |
-
audit.add_argument("path", type=Path, help="CSV or JSONL file")
|
| 79 |
-
audit.add_argument("--language", required=True, help="FLEURS code or alias")
|
| 80 |
-
audit.add_argument("--text-column", default="prediction", help="prediction column")
|
| 81 |
-
audit.add_argument(
|
| 82 |
-
"--digit-policy",
|
| 83 |
-
choices=["count", "ignore"],
|
| 84 |
-
default="count",
|
| 85 |
-
help="count digits as characters or treat them as neutral",
|
| 86 |
-
)
|
| 87 |
-
audit.add_argument("--format", choices=["json", "csv"], default="json")
|
| 88 |
-
audit.add_argument("--details", action="store_true", help="include per-row details")
|
| 89 |
-
|
| 90 |
-
langs = sub.add_parser("languages", help="list supported FLEURS codes")
|
| 91 |
-
langs.add_argument("--format", choices=["plain", "json"], default="plain")
|
| 92 |
-
|
| 93 |
-
return parser
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
def main(argv: list[str] | None = None) -> int:
|
| 97 |
-
parser = build_parser()
|
| 98 |
-
args = parser.parse_args(argv)
|
| 99 |
-
|
| 100 |
-
if args.command == "score":
|
| 101 |
-
score = compute_sfr(
|
| 102 |
-
args.text,
|
| 103 |
-
language=args.language,
|
| 104 |
-
digit_policy=args.digit_policy,
|
| 105 |
-
)
|
| 106 |
-
print("NA" if score is None else f"{score:.6f}")
|
| 107 |
-
return 0
|
| 108 |
-
|
| 109 |
-
if args.command == "audit":
|
| 110 |
-
predictions = _read_predictions(args.path, args.text_column)
|
| 111 |
-
summary = compute_corpus_sfr(
|
| 112 |
-
predictions,
|
| 113 |
-
language=args.language,
|
| 114 |
-
digit_policy=args.digit_policy,
|
| 115 |
-
return_details=args.details,
|
| 116 |
-
)
|
| 117 |
-
_emit_summary(summary, args.format)
|
| 118 |
-
return 0
|
| 119 |
-
|
| 120 |
-
if args.command == "languages":
|
| 121 |
-
languages = list_languages()
|
| 122 |
-
if args.format == "json":
|
| 123 |
-
print(json.dumps(languages, indent=2))
|
| 124 |
-
else:
|
| 125 |
-
print("\n".join(languages))
|
| 126 |
-
return 0
|
| 127 |
-
|
| 128 |
-
parser.error("unknown command")
|
| 129 |
-
return 2
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
if __name__ == "__main__":
|
| 133 |
-
raise SystemExit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity/core.py
DELETED
|
@@ -1,132 +0,0 @@
|
|
| 1 |
-
"""Core Script Fidelity Rate implementation."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import unicodedata
|
| 6 |
-
from collections import Counter
|
| 7 |
-
from statistics import fmean
|
| 8 |
-
|
| 9 |
-
from .dominant import dominant_script, is_countable, script_distribution
|
| 10 |
-
from .registry import get_script_config
|
| 11 |
-
from .types import DigitPolicy, SFRResult, ScriptConfig
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
def _is_in_range(cp: int, ranges: tuple[tuple[int, int], ...]) -> bool:
|
| 15 |
-
return any(lo <= cp <= hi for lo, hi in ranges)
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def score_text(
|
| 19 |
-
text: str,
|
| 20 |
-
language: str = "ps_af",
|
| 21 |
-
*,
|
| 22 |
-
digit_policy: DigitPolicy = "count",
|
| 23 |
-
config: ScriptConfig | None = None,
|
| 24 |
-
) -> SFRResult:
|
| 25 |
-
"""Score one ASR hypothesis and return numerator, denominator, and scripts."""
|
| 26 |
-
|
| 27 |
-
cfg = config or get_script_config(language)
|
| 28 |
-
normalized = unicodedata.normalize("NFC", text or "")
|
| 29 |
-
chars = [ch for ch in normalized if is_countable(ch, digit_policy=digit_policy)]
|
| 30 |
-
|
| 31 |
-
numerator = sum(1 for ch in chars if _is_in_range(ord(ch), cfg.ranges))
|
| 32 |
-
denominator = len(chars)
|
| 33 |
-
sfr = None if denominator == 0 else numerator / denominator
|
| 34 |
-
|
| 35 |
-
return SFRResult(
|
| 36 |
-
language=cfg.code,
|
| 37 |
-
sfr=sfr,
|
| 38 |
-
numerator=numerator,
|
| 39 |
-
denominator=denominator,
|
| 40 |
-
dominant_script=dominant_script(
|
| 41 |
-
normalized,
|
| 42 |
-
digit_policy=digit_policy,
|
| 43 |
-
),
|
| 44 |
-
script_counts=script_distribution(
|
| 45 |
-
normalized,
|
| 46 |
-
digit_policy=digit_policy,
|
| 47 |
-
),
|
| 48 |
-
)
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
def compute_sfr(
|
| 52 |
-
text: str,
|
| 53 |
-
language: str = "ps_af",
|
| 54 |
-
*,
|
| 55 |
-
digit_policy: DigitPolicy = "count",
|
| 56 |
-
) -> float | None:
|
| 57 |
-
"""Compute reference-free Script Fidelity Rate for one ASR hypothesis."""
|
| 58 |
-
|
| 59 |
-
return score_text(text, language, digit_policy=digit_policy).sfr
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
def compute_sfr_batch(
|
| 63 |
-
predictions: list[str] | tuple[str, ...],
|
| 64 |
-
language: str = "ps_af",
|
| 65 |
-
*,
|
| 66 |
-
digit_policy: DigitPolicy = "count",
|
| 67 |
-
) -> list[float | None]:
|
| 68 |
-
"""Compute SFR for a batch of ASR hypotheses."""
|
| 69 |
-
|
| 70 |
-
config = get_script_config(language)
|
| 71 |
-
return [
|
| 72 |
-
score_text(text, config.code, digit_policy=digit_policy, config=config).sfr
|
| 73 |
-
for text in predictions
|
| 74 |
-
]
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def compute_corpus_sfr(
    predictions: list[str] | tuple[str, ...],
    language: str = "ps_af",
    *,
    digit_policy: DigitPolicy = "count",
    low_threshold: float = 0.1,
    high_threshold: float = 0.9,
    return_details: bool = False,
) -> dict:
    """Aggregate per-text SFR scores into a corpus-level summary.

    Args:
        predictions: ASR hypotheses to score.
        language: Language code or alias, resolved via ``get_script_config``.
        digit_policy: ``"count"`` or ``"ignore"``; forwarded to ``score_text``.
        low_threshold: Scores strictly below this value count as "low".
        high_threshold: Scores at or above this value count as "high".
        return_details: When true, add a ``"details"`` list with one dict
            per prediction.

    Returns:
        A dict with the mean SFR (``"sfr"``, ``"sfr_percent"``), the item
        counts (``"n"``, ``"n_valid"``, ``"n_empty"``), the low/high score
        rates over valid items, and ``"dominant_script_counts"``. The mean
        and both rates are ``None`` when no prediction has a score.
    """

    cfg = get_script_config(language)
    per_text = [
        score_text(hypothesis, cfg.code, digit_policy=digit_policy, config=cfg)
        for hypothesis in predictions
    ]

    # Items whose SFR is None are excluded from every average below.
    valid_scores = [entry.sfr for entry in per_text if entry.sfr is not None]
    total = len(per_text)
    valid = len(valid_scores)

    if valid_scores:
        mean_sfr = fmean(valid_scores)
        mean_percent = mean_sfr * 100
    else:
        mean_sfr = None
        mean_percent = None

    if valid == 0:
        low_rate = None
        high_rate = None
    else:
        low_rate = sum(1 for value in valid_scores if value < low_threshold) / valid
        high_rate = sum(1 for value in valid_scores if value >= high_threshold) / valid

    script_tally = Counter(entry.dominant_script for entry in per_text)

    summary = {
        "sfr": mean_sfr,
        "sfr_percent": mean_percent,
        "n": total,
        "n_valid": valid,
        "n_empty": total - valid,
        "low_sfr_rate": low_rate,
        "high_sfr_rate": high_rate,
        "dominant_script_counts": dict(sorted(script_tally.items())),
    }

    if return_details:
        breakdown = []
        for entry in per_text:
            breakdown.append(
                {
                    "language": entry.language,
                    "sfr": entry.sfr,
                    "numerator": entry.numerator,
                    "denominator": entry.denominator,
                    "dominant_script": entry.dominant_script,
                    "script_counts": entry.script_counts,
                }
            )
        summary["details"] = breakdown

    return summary
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
# Shorter alias names bound to the same function objects.
compute_sf, compute_sf_batch = compute_sfr, compute_sfr_batch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity/data/fleurs_registry.json
DELETED
|
@@ -1,210 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"version": "0.1.1",
|
| 3 |
-
"source": "Reviewed registry for google/fleurs configs as of 2026-05-07. The config named all is excluded.",
|
| 4 |
-
"scripts": {
|
| 5 |
-
"arabic": {
|
| 6 |
-
"name": "Arabic",
|
| 7 |
-
"ranges": [[1536, 1791], [1872, 1919], [2208, 2303], [64336, 65023], [65136, 65279], [69216, 69247], [126464, 126719]]
|
| 8 |
-
},
|
| 9 |
-
"armenian": {
|
| 10 |
-
"name": "Armenian",
|
| 11 |
-
"ranges": [[1328, 1423], [64275, 64279]]
|
| 12 |
-
},
|
| 13 |
-
"bengali": {
|
| 14 |
-
"name": "Bengali",
|
| 15 |
-
"ranges": [[2432, 2559]]
|
| 16 |
-
},
|
| 17 |
-
"cyrillic": {
|
| 18 |
-
"name": "Cyrillic",
|
| 19 |
-
"ranges": [[1024, 1279], [1280, 1327], [7296, 7311], [11744, 11775], [42560, 42655]]
|
| 20 |
-
},
|
| 21 |
-
"devanagari": {
|
| 22 |
-
"name": "Devanagari",
|
| 23 |
-
"ranges": [[2304, 2431], [43232, 43263], [72448, 72543]]
|
| 24 |
-
},
|
| 25 |
-
"ethiopic": {
|
| 26 |
-
"name": "Ethiopic",
|
| 27 |
-
"ranges": [[4608, 4991], [4992, 5023], [11648, 11743], [43776, 43823]]
|
| 28 |
-
},
|
| 29 |
-
"georgian": {
|
| 30 |
-
"name": "Georgian",
|
| 31 |
-
"ranges": [[4256, 4351], [11520, 11567], [7312, 7359]]
|
| 32 |
-
},
|
| 33 |
-
"greek": {
|
| 34 |
-
"name": "Greek",
|
| 35 |
-
"ranges": [[880, 1023], [7936, 8191]]
|
| 36 |
-
},
|
| 37 |
-
"gujarati": {
|
| 38 |
-
"name": "Gujarati",
|
| 39 |
-
"ranges": [[2688, 2815]]
|
| 40 |
-
},
|
| 41 |
-
"gurmukhi": {
|
| 42 |
-
"name": "Gurmukhi",
|
| 43 |
-
"ranges": [[2560, 2687]]
|
| 44 |
-
},
|
| 45 |
-
"han": {
|
| 46 |
-
"name": "Han",
|
| 47 |
-
"ranges": [[13312, 19903], [19968, 40959], [63744, 64255], [131072, 173791], [173824, 177983], [177984, 178207], [178208, 183983], [183984, 191471], [196608, 201551]]
|
| 48 |
-
},
|
| 49 |
-
"hangul": {
|
| 50 |
-
"name": "Hangul",
|
| 51 |
-
"ranges": [[4352, 4607], [12592, 12687], [43360, 43391], [44032, 55215], [55216, 55295]]
|
| 52 |
-
},
|
| 53 |
-
"hebrew": {
|
| 54 |
-
"name": "Hebrew",
|
| 55 |
-
"ranges": [[1424, 1535], [64285, 64335]]
|
| 56 |
-
},
|
| 57 |
-
"hiragana": {
|
| 58 |
-
"name": "Hiragana",
|
| 59 |
-
"ranges": [[12352, 12447]]
|
| 60 |
-
},
|
| 61 |
-
"kannada": {
|
| 62 |
-
"name": "Kannada",
|
| 63 |
-
"ranges": [[3200, 3327]]
|
| 64 |
-
},
|
| 65 |
-
"katakana": {
|
| 66 |
-
"name": "Katakana",
|
| 67 |
-
"ranges": [[12448, 12543], [12784, 12799], [65381, 65439]]
|
| 68 |
-
},
|
| 69 |
-
"khmer": {
|
| 70 |
-
"name": "Khmer",
|
| 71 |
-
"ranges": [[6016, 6143], [6624, 6655]]
|
| 72 |
-
},
|
| 73 |
-
"lao": {
|
| 74 |
-
"name": "Lao",
|
| 75 |
-
"ranges": [[3712, 3839]]
|
| 76 |
-
},
|
| 77 |
-
"latin": {
|
| 78 |
-
"name": "Latin",
|
| 79 |
-
"ranges": [[65, 90], [97, 122], [192, 591], [7680, 7935], [42784, 43007], [43824, 43887], [122624, 122879]]
|
| 80 |
-
},
|
| 81 |
-
"malayalam": {
|
| 82 |
-
"name": "Malayalam",
|
| 83 |
-
"ranges": [[3328, 3455]]
|
| 84 |
-
},
|
| 85 |
-
"myanmar": {
|
| 86 |
-
"name": "Myanmar",
|
| 87 |
-
"ranges": [[4096, 4255], [43392, 43487], [43488, 43519]]
|
| 88 |
-
},
|
| 89 |
-
"odia": {
|
| 90 |
-
"name": "Odia",
|
| 91 |
-
"ranges": [[2816, 2943]]
|
| 92 |
-
},
|
| 93 |
-
"tamil": {
|
| 94 |
-
"name": "Tamil",
|
| 95 |
-
"ranges": [[2944, 3071]]
|
| 96 |
-
},
|
| 97 |
-
"telugu": {
|
| 98 |
-
"name": "Telugu",
|
| 99 |
-
"ranges": [[3072, 3199]]
|
| 100 |
-
},
|
| 101 |
-
"thai": {
|
| 102 |
-
"name": "Thai",
|
| 103 |
-
"ranges": [[3584, 3711]]
|
| 104 |
-
}
|
| 105 |
-
},
|
| 106 |
-
"languages": {
|
| 107 |
-
"af_za": {"name": "Afrikaans", "scripts": ["latin"], "aliases": ["afrikaans"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 108 |
-
"am_et": {"name": "Amharic", "scripts": ["ethiopic"], "aliases": ["amharic"]},
|
| 109 |
-
"ar_eg": {"name": "Arabic", "scripts": ["arabic"], "aliases": ["arabic", "msa"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 110 |
-
"as_in": {"name": "Assamese", "scripts": ["bengali"], "aliases": ["assamese"], "shared_script": true, "warning": "Bengali-Assamese script SFR does not identify the language."},
|
| 111 |
-
"ast_es": {"name": "Asturian", "scripts": ["latin"], "aliases": ["asturian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 112 |
-
"az_az": {"name": "Azerbaijani", "scripts": ["latin"], "aliases": ["azerbaijani", "azeri"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 113 |
-
"be_by": {"name": "Belarusian", "scripts": ["cyrillic"], "aliases": ["belarusian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 114 |
-
"bg_bg": {"name": "Bulgarian", "scripts": ["cyrillic"], "aliases": ["bulgarian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 115 |
-
"bn_in": {"name": "Bengali", "scripts": ["bengali"], "aliases": ["bengali", "bangla"], "shared_script": true, "warning": "Bengali-script SFR does not distinguish Bengali from Assamese."},
|
| 116 |
-
"bs_ba": {"name": "Bosnian", "scripts": ["latin"], "aliases": ["bosnian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 117 |
-
"ca_es": {"name": "Catalan", "scripts": ["latin"], "aliases": ["catalan"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 118 |
-
"ceb_ph": {"name": "Cebuano", "scripts": ["latin"], "aliases": ["cebuano"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 119 |
-
"ckb_iq": {"name": "Central Kurdish", "scripts": ["arabic"], "aliases": ["central_kurdish", "sorani", "kurdish_sorani"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 120 |
-
"cmn_hans_cn": {"name": "Mandarin Chinese", "scripts": ["han"], "aliases": ["mandarin", "chinese", "simplified_chinese"], "shared_script": true, "warning": "Han SFR does not identify the spoken language or script variant."},
|
| 121 |
-
"cs_cz": {"name": "Czech", "scripts": ["latin"], "aliases": ["czech"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 122 |
-
"cy_gb": {"name": "Welsh", "scripts": ["latin"], "aliases": ["welsh"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 123 |
-
"da_dk": {"name": "Danish", "scripts": ["latin"], "aliases": ["danish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 124 |
-
"de_de": {"name": "German", "scripts": ["latin"], "aliases": ["german"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 125 |
-
"el_gr": {"name": "Greek", "scripts": ["greek"], "aliases": ["greek"]},
|
| 126 |
-
"en_us": {"name": "English", "scripts": ["latin"], "aliases": ["english"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 127 |
-
"es_419": {"name": "Spanish", "scripts": ["latin"], "aliases": ["spanish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 128 |
-
"et_ee": {"name": "Estonian", "scripts": ["latin"], "aliases": ["estonian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 129 |
-
"fa_ir": {"name": "Persian", "scripts": ["arabic"], "aliases": ["persian", "farsi"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 130 |
-
"ff_sn": {"name": "Fulah", "scripts": ["latin"], "aliases": ["fulah", "fulani"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 131 |
-
"fi_fi": {"name": "Finnish", "scripts": ["latin"], "aliases": ["finnish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 132 |
-
"fil_ph": {"name": "Filipino", "scripts": ["latin"], "aliases": ["filipino", "tagalog"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 133 |
-
"fr_fr": {"name": "French", "scripts": ["latin"], "aliases": ["french"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 134 |
-
"ga_ie": {"name": "Irish", "scripts": ["latin"], "aliases": ["irish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 135 |
-
"gl_es": {"name": "Galician", "scripts": ["latin"], "aliases": ["galician"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 136 |
-
"gu_in": {"name": "Gujarati", "scripts": ["gujarati"], "aliases": ["gujarati"]},
|
| 137 |
-
"ha_ng": {"name": "Hausa", "scripts": ["latin"], "aliases": ["hausa"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 138 |
-
"he_il": {"name": "Hebrew", "scripts": ["hebrew"], "aliases": ["hebrew"]},
|
| 139 |
-
"hi_in": {"name": "Hindi", "scripts": ["devanagari"], "aliases": ["hindi"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
|
| 140 |
-
"hr_hr": {"name": "Croatian", "scripts": ["latin"], "aliases": ["croatian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 141 |
-
"hu_hu": {"name": "Hungarian", "scripts": ["latin"], "aliases": ["hungarian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 142 |
-
"hy_am": {"name": "Armenian", "scripts": ["armenian"], "aliases": ["armenian"]},
|
| 143 |
-
"id_id": {"name": "Indonesian", "scripts": ["latin"], "aliases": ["indonesian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 144 |
-
"ig_ng": {"name": "Igbo", "scripts": ["latin"], "aliases": ["igbo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 145 |
-
"is_is": {"name": "Icelandic", "scripts": ["latin"], "aliases": ["icelandic"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 146 |
-
"it_it": {"name": "Italian", "scripts": ["latin"], "aliases": ["italian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 147 |
-
"ja_jp": {"name": "Japanese", "scripts": ["han", "hiragana", "katakana"], "aliases": ["japanese"], "shared_script": true, "warning": "Japanese SFR counts Han and kana; it is not a language identifier."},
|
| 148 |
-
"jv_id": {"name": "Javanese", "scripts": ["latin"], "aliases": ["javanese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 149 |
-
"ka_ge": {"name": "Georgian", "scripts": ["georgian"], "aliases": ["georgian"]},
|
| 150 |
-
"kam_ke": {"name": "Kamba", "scripts": ["latin"], "aliases": ["kamba"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 151 |
-
"kea_cv": {"name": "Kabuverdianu", "scripts": ["latin"], "aliases": ["kabuverdianu", "cape_verdean_creole"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 152 |
-
"kk_kz": {"name": "Kazakh", "scripts": ["cyrillic"], "aliases": ["kazakh"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 153 |
-
"km_kh": {"name": "Khmer", "scripts": ["khmer"], "aliases": ["khmer"]},
|
| 154 |
-
"kn_in": {"name": "Kannada", "scripts": ["kannada"], "aliases": ["kannada"]},
|
| 155 |
-
"ko_kr": {"name": "Korean", "scripts": ["hangul"], "aliases": ["korean"]},
|
| 156 |
-
"ky_kg": {"name": "Kyrgyz", "scripts": ["cyrillic"], "aliases": ["kyrgyz"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 157 |
-
"lb_lu": {"name": "Luxembourgish", "scripts": ["latin"], "aliases": ["luxembourgish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 158 |
-
"lg_ug": {"name": "Ganda", "scripts": ["latin"], "aliases": ["ganda", "luganda"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 159 |
-
"ln_cd": {"name": "Lingala", "scripts": ["latin"], "aliases": ["lingala"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 160 |
-
"lo_la": {"name": "Lao", "scripts": ["lao"], "aliases": ["lao"]},
|
| 161 |
-
"lt_lt": {"name": "Lithuanian", "scripts": ["latin"], "aliases": ["lithuanian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 162 |
-
"luo_ke": {"name": "Luo", "scripts": ["latin"], "aliases": ["luo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 163 |
-
"lv_lv": {"name": "Latvian", "scripts": ["latin"], "aliases": ["latvian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 164 |
-
"mi_nz": {"name": "Maori", "scripts": ["latin"], "aliases": ["maori", "māori"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 165 |
-
"mk_mk": {"name": "Macedonian", "scripts": ["cyrillic"], "aliases": ["macedonian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 166 |
-
"ml_in": {"name": "Malayalam", "scripts": ["malayalam"], "aliases": ["malayalam"]},
|
| 167 |
-
"mn_mn": {"name": "Mongolian", "scripts": ["cyrillic"], "aliases": ["mongolian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 168 |
-
"mr_in": {"name": "Marathi", "scripts": ["devanagari"], "aliases": ["marathi"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
|
| 169 |
-
"ms_my": {"name": "Malay", "scripts": ["latin"], "aliases": ["malay"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 170 |
-
"mt_mt": {"name": "Maltese", "scripts": ["latin"], "aliases": ["maltese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 171 |
-
"my_mm": {"name": "Burmese", "scripts": ["myanmar"], "aliases": ["burmese", "myanmar_language"]},
|
| 172 |
-
"nb_no": {"name": "Norwegian Bokmal", "scripts": ["latin"], "aliases": ["norwegian", "norwegian_bokmal", "bokmal"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 173 |
-
"ne_np": {"name": "Nepali", "scripts": ["devanagari"], "aliases": ["nepali"], "shared_script": true, "warning": "Devanagari SFR does not identify the language."},
|
| 174 |
-
"nl_nl": {"name": "Dutch", "scripts": ["latin"], "aliases": ["dutch"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 175 |
-
"nso_za": {"name": "Northern Sotho", "scripts": ["latin"], "aliases": ["northern_sotho", "sepedi"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 176 |
-
"ny_mw": {"name": "Chichewa", "scripts": ["latin"], "aliases": ["chichewa", "nyanja"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 177 |
-
"oc_fr": {"name": "Occitan", "scripts": ["latin"], "aliases": ["occitan"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 178 |
-
"om_et": {"name": "Oromo", "scripts": ["latin"], "aliases": ["oromo"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 179 |
-
"or_in": {"name": "Odia", "scripts": ["odia"], "aliases": ["odia", "oriya"]},
|
| 180 |
-
"pa_in": {"name": "Punjabi", "scripts": ["gurmukhi"], "aliases": ["punjabi", "eastern_punjabi"], "shared_script": true, "warning": "Gurmukhi SFR checks script, not dialect or language identity."},
|
| 181 |
-
"pl_pl": {"name": "Polish", "scripts": ["latin"], "aliases": ["polish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 182 |
-
"ps_af": {"name": "Pashto", "scripts": ["arabic"], "aliases": ["pashto", "pushto", "ps"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 183 |
-
"pt_br": {"name": "Portuguese", "scripts": ["latin"], "aliases": ["portuguese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 184 |
-
"ro_ro": {"name": "Romanian", "scripts": ["latin"], "aliases": ["romanian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 185 |
-
"ru_ru": {"name": "Russian", "scripts": ["cyrillic"], "aliases": ["russian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 186 |
-
"sd_in": {"name": "Sindhi", "scripts": ["arabic"], "aliases": ["sindhi"], "shared_script": true, "warning": "Arabic-script languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 187 |
-
"sk_sk": {"name": "Slovak", "scripts": ["latin"], "aliases": ["slovak"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 188 |
-
"sl_si": {"name": "Slovenian", "scripts": ["latin"], "aliases": ["slovenian"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 189 |
-
"sn_zw": {"name": "Shona", "scripts": ["latin"], "aliases": ["shona"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 190 |
-
"so_so": {"name": "Somali", "scripts": ["latin"], "aliases": ["somali"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 191 |
-
"sr_rs": {"name": "Serbian", "scripts": ["cyrillic"], "aliases": ["serbian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 192 |
-
"sv_se": {"name": "Swedish", "scripts": ["latin"], "aliases": ["swedish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 193 |
-
"sw_ke": {"name": "Swahili", "scripts": ["latin"], "aliases": ["swahili"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 194 |
-
"ta_in": {"name": "Tamil", "scripts": ["tamil"], "aliases": ["tamil"]},
|
| 195 |
-
"te_in": {"name": "Telugu", "scripts": ["telugu"], "aliases": ["telugu"]},
|
| 196 |
-
"tg_tj": {"name": "Tajik", "scripts": ["cyrillic"], "aliases": ["tajik"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 197 |
-
"th_th": {"name": "Thai", "scripts": ["thai"], "aliases": ["thai"]},
|
| 198 |
-
"tr_tr": {"name": "Turkish", "scripts": ["latin"], "aliases": ["turkish"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 199 |
-
"uk_ua": {"name": "Ukrainian", "scripts": ["cyrillic"], "aliases": ["ukrainian"], "shared_script": true, "warning": "Cyrillic SFR does not identify the language."},
|
| 200 |
-
"umb_ao": {"name": "Umbundu", "scripts": ["latin"], "aliases": ["umbundu"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 201 |
-
"ur_pk": {"name": "Urdu", "scripts": ["arabic"], "aliases": ["urdu"], "shared_script": true, "warning": "Perso-Arabic languages share Unicode blocks; use LID or lexical checks for wrong-language cases."},
|
| 202 |
-
"uz_uz": {"name": "Uzbek", "scripts": ["latin"], "aliases": ["uzbek"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 203 |
-
"vi_vn": {"name": "Vietnamese", "scripts": ["latin"], "aliases": ["vietnamese"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 204 |
-
"wo_sn": {"name": "Wolof", "scripts": ["latin"], "aliases": ["wolof"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 205 |
-
"xh_za": {"name": "Xhosa", "scripts": ["latin"], "aliases": ["xhosa"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 206 |
-
"yo_ng": {"name": "Yoruba", "scripts": ["latin"], "aliases": ["yoruba"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."},
|
| 207 |
-
"yue_hant_hk": {"name": "Cantonese", "scripts": ["han"], "aliases": ["cantonese", "traditional_chinese"], "shared_script": true, "warning": "Han SFR does not identify the spoken language or script variant."},
|
| 208 |
-
"zu_za": {"name": "Zulu", "scripts": ["latin"], "aliases": ["zulu"], "shared_script": true, "warning": "High SFR checks Latin output, not language identity."}
|
| 209 |
-
}
|
| 210 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity/dominant.py
DELETED
|
@@ -1,75 +0,0 @@
|
|
| 1 |
-
"""Dominant script helpers for SFR audits."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import unicodedata
|
| 6 |
-
from collections import Counter
|
| 7 |
-
|
| 8 |
-
from .registry import _registry
|
| 9 |
-
from .types import DigitPolicy
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def is_countable(ch: str, digit_policy: DigitPolicy = "count") -> bool:
    """Decide whether a single character contributes to an SFR denominator.

    Args:
        ch: A single character.
        digit_policy: ``"count"`` keeps numeric characters; ``"ignore"``
            excludes every character whose Unicode category starts with ``N``.

    Returns:
        ``False`` for whitespace and for punctuation, separator, control/other
        and mark categories (``P``, ``Z``, ``C``, ``M``); ``True`` otherwise,
        subject to ``digit_policy``.

    Raises:
        ValueError: If ``digit_policy`` is neither ``"count"`` nor ``"ignore"``.
    """

    if digit_policy not in {"count", "ignore"}:
        raise ValueError("digit_policy must be 'count' or 'ignore'")

    # Only the major class (first letter of the two-letter category) matters.
    major = unicodedata.category(ch)[0]
    if major == "N" and digit_policy == "ignore":
        return False
    if ch.isspace():
        return False
    return major not in ("P", "Z", "C", "M")
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def _in_ranges(cp: int, ranges: list[list[int]]) -> bool:
|
| 31 |
-
return any(int(lo) <= cp <= int(hi) for lo, hi in ranges)
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
def script_distribution(
    text: str,
    *,
    digit_policy: DigitPolicy = "count",
) -> dict[str, int]:
    """Tally countable characters by the registry script that contains them.

    The text is NFC-normalized first. Each countable character is credited
    to the first registry script whose ranges contain its code point, or to
    ``"other"`` when no script matches.

    Args:
        text: Text to analyse; a falsy value is treated as the empty string.
        digit_policy: ``"count"`` or ``"ignore"``; forwarded to ``is_countable``.

    Returns:
        Mapping from script id (or ``"other"``) to character count.
    """

    cleaned = unicodedata.normalize("NFC", text or "")
    script_table = _registry()["scripts"]
    tally: Counter[str] = Counter()

    for char in cleaned:
        if is_countable(char, digit_policy=digit_policy):
            codepoint = ord(char)
            # First matching script in registry order wins.
            matched = next(
                (
                    script_id
                    for script_id, spec in script_table.items()
                    if _in_ranges(codepoint, spec["ranges"])
                ),
                "other",
            )
            tally[matched] += 1

    return dict(tally)
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def dominant_script(
    text: str,
    *,
    digit_policy: DigitPolicy = "count",
    threshold: float = 0.5,
) -> str:
    """Name the script that dominates ``text``, or ``mixed`` / ``empty``.

    Args:
        text: Text to analyse.
        digit_policy: ``"count"`` or ``"ignore"``; forwarded to
            ``script_distribution``.
        threshold: Minimum share of countable characters the most frequent
            script needs in order to be reported.

    Returns:
        ``"empty"`` when nothing is countable, the most frequent script label
        when its share reaches ``threshold``, otherwise ``"mixed"``.
    """

    tally = script_distribution(text, digit_policy=digit_policy)
    grand_total = sum(tally.values())
    if not grand_total:
        return "empty"

    # On ties, max() keeps the first label in the tally's iteration order.
    leader = max(tally, key=tally.get)
    share = tally[leader] / grand_total
    return leader if share >= threshold else "mixed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity/registry.py
DELETED
|
@@ -1,86 +0,0 @@
|
|
| 1 |
-
"""FLEURS language registry for Script Fidelity Rate."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import json
|
| 6 |
-
from functools import lru_cache
|
| 7 |
-
from importlib.resources import files
|
| 8 |
-
|
| 9 |
-
from .types import ScriptConfig
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
@lru_cache(maxsize=1)
def _registry() -> dict:
    """Load and parse the bundled ``data/fleurs_registry.json`` resource.

    The result is memoized by ``lru_cache``, so callers share one dict.
    """
    package_root = files("script_fidelity")
    resource = package_root.joinpath("data/fleurs_registry.json")
    raw_json = resource.read_text(encoding="utf-8")
    return json.loads(raw_json)
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def _script_ranges(script_ids: list[str]) -> tuple[tuple[int, int], ...]:
    """Collect the code point ranges of the given scripts, without duplicates.

    Ranges keep first-seen order across ``script_ids``; each bound is
    coerced to ``int``.
    """
    script_table = _registry()["scripts"]
    # dict.fromkeys drops repeated pairs while preserving insertion order.
    unique_pairs = dict.fromkeys(
        (int(lo), int(hi))
        for script_id in script_ids
        for lo, hi in script_table[script_id]["ranges"]
    )
    return tuple(unique_pairs)
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
@lru_cache(maxsize=1)
def _language_configs() -> dict[str, ScriptConfig]:
    """Build one ``ScriptConfig`` per language entry in the registry.

    The mapping is keyed by the registry's language code and memoized by
    ``lru_cache``.
    """

    def _build(code: str, entry: dict) -> ScriptConfig:
        # Translate one raw registry entry into its typed configuration.
        script_ids = entry["scripts"]
        return ScriptConfig(
            code=code,
            name=entry["name"],
            script="+".join(script_ids),
            ranges=_script_ranges(script_ids),
            aliases=tuple(entry.get("aliases", [])),
            shared_script=bool(entry.get("shared_script", False)),
            warning=entry.get("warning", ""),
        )

    return {
        code: _build(code, entry)
        for code, entry in _registry()["languages"].items()
    }
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
@lru_cache(maxsize=1)
def _alias_map() -> dict[str, str]:
    """Map every accepted lowercase spelling of a language to its canonical code.

    For each language the code itself, its hyphenated form, and all declared
    aliases are registered; a later entry overwrites an earlier one that
    uses the same key.
    """
    lookup: dict[str, str] = {}
    for code, cfg in _language_configs().items():
        spellings = (
            code.lower(),
            code.replace("_", "-").lower(),
            *(alias.lower() for alias in cfg.aliases),
        )
        for spelling in spellings:
            lookup[spelling] = code
    return lookup
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def resolve_language(language: str) -> str:
    """Translate a FLEURS code or alias into its canonical FLEURS code.

    The input is stripped, lowercased, and has spaces replaced by
    underscores before lookup.

    Raises:
        ValueError: If the normalized name is not a known code or alias; the
            message lists the first twelve supported codes as examples.
    """

    lookup_key = language.strip().lower().replace(" ", "_")
    try:
        canonical = _alias_map()[lookup_key]
    except KeyError as missing:
        examples = ", ".join(list_languages()[:12])
        message = (
            f"Unknown language '{language}'. Use a FLEURS code such as ps_af, "
            f"or an alias such as pashto. Examples: {examples}, ..."
        )
        raise ValueError(message) from missing
    return canonical
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
def get_script_config(language: str) -> ScriptConfig:
    """Look up the script configuration for a language code or alias.

    Raises:
        ValueError: Propagated from ``resolve_language`` for unknown names.
    """

    canonical_code = resolve_language(language)
    configs = _language_configs()
    return configs[canonical_code]
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def list_languages() -> list[str]:
    """List the registry's canonical language codes in sorted order."""

    codes = list(_language_configs())
    codes.sort()
    return codes
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
# Module-level snapshots computed once at import time.
# SCRIPT_CONFIGS is the same dict object that _language_configs() caches.
SCRIPT_CONFIGS = _language_configs()
# Sorted canonical language codes as an immutable tuple.
FLEURS_CONFIGS = tuple(list_languages())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity/types.py
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
"""Shared types for Script Fidelity Rate."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
from dataclasses import dataclass
|
| 6 |
-
from typing import Literal
|
| 7 |
-
|
| 8 |
-
DigitPolicy = Literal["count", "ignore"]
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
@dataclass(frozen=True)
|
| 12 |
-
class ScriptConfig:
|
| 13 |
-
"""Script configuration for one FLEURS language."""
|
| 14 |
-
|
| 15 |
-
code: str
|
| 16 |
-
name: str
|
| 17 |
-
script: str
|
| 18 |
-
ranges: tuple[tuple[int, int], ...]
|
| 19 |
-
aliases: tuple[str, ...] = ()
|
| 20 |
-
shared_script: bool = False
|
| 21 |
-
warning: str = ""
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
@dataclass(frozen=True)
|
| 25 |
-
class SFRResult:
|
| 26 |
-
"""Per-text Script Fidelity Rate result."""
|
| 27 |
-
|
| 28 |
-
language: str
|
| 29 |
-
sfr: float | None
|
| 30 |
-
numerator: int
|
| 31 |
-
denominator: int
|
| 32 |
-
dominant_script: str
|
| 33 |
-
script_counts: dict[str, int]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
metrics/script_fidelity_rate/script_fidelity_rate.py → script_fidelity_rate.py
RENAMED
|
@@ -2,18 +2,9 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
import sys
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
import datasets
|
| 9 |
import evaluate
|
| 10 |
|
| 11 |
-
CURRENT_DIR = Path(__file__).resolve().parent
|
| 12 |
-
for parent in (CURRENT_DIR, CURRENT_DIR.parent, CURRENT_DIR.parent.parent):
|
| 13 |
-
if (parent / "script_fidelity").exists():
|
| 14 |
-
sys.path.insert(0, str(parent))
|
| 15 |
-
break
|
| 16 |
-
|
| 17 |
from script_fidelity import compute_corpus_sfr # noqa: E402
|
| 18 |
|
| 19 |
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
|
|
|
|
|
|
| 5 |
import datasets
|
| 6 |
import evaluate
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
from script_fidelity import compute_corpus_sfr # noqa: E402
|
| 9 |
|
| 10 |
|
script_fidelity_rate/.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity_rate/README.md
DELETED
|
@@ -1,50 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: script_fidelity_rate
|
| 3 |
-
datasets:
|
| 4 |
-
-
|
| 5 |
-
tags:
|
| 6 |
-
- evaluate
|
| 7 |
-
- metric
|
| 8 |
-
description: "TODO: add a description here"
|
| 9 |
-
sdk: gradio
|
| 10 |
-
sdk_version: 3.19.1
|
| 11 |
-
app_file: app.py
|
| 12 |
-
pinned: false
|
| 13 |
-
---
|
| 14 |
-
|
| 15 |
-
# Metric Card for script_fidelity_rate
|
| 16 |
-
|
| 17 |
-
***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing metric cards if you'd like examples.*
|
| 18 |
-
|
| 19 |
-
## Metric Description
|
| 20 |
-
*Give a brief overview of this metric, including what task(s) it is usually used for, if any.*
|
| 21 |
-
|
| 22 |
-
## How to Use
|
| 23 |
-
*Give a general statement of how to use the metric*
|
| 24 |
-
|
| 25 |
-
*Provide the simplest possible example of using the metric*
|
| 26 |
-
|
| 27 |
-
### Inputs
|
| 28 |
-
*List all input arguments in the format below*
|
| 29 |
-
- **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
|
| 30 |
-
|
| 31 |
-
### Output Values
|
| 32 |
-
|
| 33 |
-
*Explain what this metric outputs and provide an example of what the metric output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}*
|
| 34 |
-
|
| 35 |
-
*State the range of possible values that the metric's output can take, as well as what in that range is considered good. For example: "This metric can take on any value between 0 and 100, inclusive. Higher scores are better."*
|
| 36 |
-
|
| 37 |
-
#### Values from Popular Papers
|
| 38 |
-
*Give examples, preferably with links to leaderboards or publications, to papers that have reported this metric, along with the values they have reported.*
|
| 39 |
-
|
| 40 |
-
### Examples
|
| 41 |
-
*Give code examples of the metric being used. Try to include examples that clear up any potential ambiguity left from the metric description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
|
| 42 |
-
|
| 43 |
-
## Limitations and Bias
|
| 44 |
-
*Note any known limitations or biases that the metric has, with links and references if possible.*
|
| 45 |
-
|
| 46 |
-
## Citation
|
| 47 |
-
*Cite the source where this metric was introduced.*
|
| 48 |
-
|
| 49 |
-
## Further References
|
| 50 |
-
*Add any useful further references.*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity_rate/requirements.txt
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
git+https://github.com/huggingface/evaluate@main
|
|
|
|
|
|
script_fidelity_rate/script_fidelity_rate.py
DELETED
|
@@ -1,95 +0,0 @@
|
|
| 1 |
-
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
| 2 |
-
#
|
| 3 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
-
# you may not use this file except in compliance with the License.
|
| 5 |
-
# You may obtain a copy of the License at
|
| 6 |
-
#
|
| 7 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
-
#
|
| 9 |
-
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
-
# See the License for the specific language governing permissions and
|
| 13 |
-
# limitations under the License.
|
| 14 |
-
"""TODO: Add a description here."""
|
| 15 |
-
|
| 16 |
-
import evaluate
|
| 17 |
-
import datasets
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
# TODO: Add BibTeX citation
|
| 21 |
-
_CITATION = """\
|
| 22 |
-
@InProceedings{huggingface:module,
|
| 23 |
-
title = {A great new module},
|
| 24 |
-
authors={huggingface, Inc.},
|
| 25 |
-
year={2020}
|
| 26 |
-
}
|
| 27 |
-
"""
|
| 28 |
-
|
| 29 |
-
# TODO: Add description of the module here
|
| 30 |
-
_DESCRIPTION = """\
|
| 31 |
-
This new module is designed to solve this great ML task and is crafted with a lot of care.
|
| 32 |
-
"""
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
# TODO: Add description of the arguments of the module here
|
| 36 |
-
_KWARGS_DESCRIPTION = """
|
| 37 |
-
Calculates how good are predictions given some references, using certain scores
|
| 38 |
-
Args:
|
| 39 |
-
predictions: list of predictions to score. Each predictions
|
| 40 |
-
should be a string with tokens separated by spaces.
|
| 41 |
-
references: list of reference for each prediction. Each
|
| 42 |
-
reference should be a string with tokens separated by spaces.
|
| 43 |
-
Returns:
|
| 44 |
-
accuracy: description of the first score,
|
| 45 |
-
another_score: description of the second score,
|
| 46 |
-
Examples:
|
| 47 |
-
Examples should be written in doctest format, and should illustrate how
|
| 48 |
-
to use the function.
|
| 49 |
-
|
| 50 |
-
>>> my_new_module = evaluate.load("my_new_module")
|
| 51 |
-
>>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
|
| 52 |
-
>>> print(results)
|
| 53 |
-
{'accuracy': 1.0}
|
| 54 |
-
"""
|
| 55 |
-
|
| 56 |
-
# TODO: Define external resources urls if needed
|
| 57 |
-
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
| 61 |
-
class script_fidelity_rate(evaluate.Metric):
|
| 62 |
-
"""TODO: Short description of my evaluation module."""
|
| 63 |
-
|
| 64 |
-
def _info(self):
|
| 65 |
-
# TODO: Specifies the evaluate.EvaluationModuleInfo object
|
| 66 |
-
return evaluate.MetricInfo(
|
| 67 |
-
# This is the description that will appear on the modules page.
|
| 68 |
-
module_type="metric",
|
| 69 |
-
description=_DESCRIPTION,
|
| 70 |
-
citation=_CITATION,
|
| 71 |
-
inputs_description=_KWARGS_DESCRIPTION,
|
| 72 |
-
# This defines the format of each prediction and reference
|
| 73 |
-
features=datasets.Features({
|
| 74 |
-
'predictions': datasets.Value('int64'),
|
| 75 |
-
'references': datasets.Value('int64'),
|
| 76 |
-
}),
|
| 77 |
-
# Homepage of the module for documentation
|
| 78 |
-
homepage="http://module.homepage",
|
| 79 |
-
# Additional links to the codebase or references
|
| 80 |
-
codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
|
| 81 |
-
reference_urls=["http://path.to.reference.url/new_module"]
|
| 82 |
-
)
|
| 83 |
-
|
| 84 |
-
def _download_and_prepare(self, dl_manager):
|
| 85 |
-
"""Optional: download external resources useful to compute the scores"""
|
| 86 |
-
# TODO: Download external resources if needed
|
| 87 |
-
pass
|
| 88 |
-
|
| 89 |
-
def _compute(self, predictions, references):
|
| 90 |
-
"""Returns the scores"""
|
| 91 |
-
# TODO: Compute the different scores of the module
|
| 92 |
-
accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
|
| 93 |
-
return {
|
| 94 |
-
"accuracy": accuracy,
|
| 95 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
script_fidelity_rate/tests.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
test_cases = [
|
| 2 |
-
{
|
| 3 |
-
"predictions": [0, 0],
|
| 4 |
-
"references": [1, 1],
|
| 5 |
-
"result": {"metric_score": 0}
|
| 6 |
-
},
|
| 7 |
-
{
|
| 8 |
-
"predictions": [1, 1],
|
| 9 |
-
"references": [1, 1],
|
| 10 |
-
"result": {"metric_score": 1}
|
| 11 |
-
},
|
| 12 |
-
{
|
| 13 |
-
"predictions": [1, 0],
|
| 14 |
-
"references": [1, 1],
|
| 15 |
-
"result": {"metric_score": 0.5}
|
| 16 |
-
}
|
| 17 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_cli.py
DELETED
|
@@ -1,81 +0,0 @@
|
|
| 1 |
-
import csv
|
| 2 |
-
import json
|
| 3 |
-
import subprocess
|
| 4 |
-
import sys
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def test_cli_score():
|
| 9 |
-
result = subprocess.run(
|
| 10 |
-
[
|
| 11 |
-
sys.executable,
|
| 12 |
-
"-m",
|
| 13 |
-
"script_fidelity",
|
| 14 |
-
"score",
|
| 15 |
-
"--language",
|
| 16 |
-
"ps_af",
|
| 17 |
-
"--text",
|
| 18 |
-
"کابل کې ښه هوا ده",
|
| 19 |
-
],
|
| 20 |
-
check=True,
|
| 21 |
-
capture_output=True,
|
| 22 |
-
text=True,
|
| 23 |
-
)
|
| 24 |
-
assert result.stdout.strip() == "1.000000"
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
def test_cli_audit_jsonl(tmp_path: Path):
|
| 28 |
-
path = tmp_path / "predictions.jsonl"
|
| 29 |
-
rows = [
|
| 30 |
-
{"prediction": "کابل کې ښه هوا ده"},
|
| 31 |
-
{"prediction": "romanized output"},
|
| 32 |
-
]
|
| 33 |
-
path.write_text("\n".join(json.dumps(row, ensure_ascii=False) for row in rows))
|
| 34 |
-
|
| 35 |
-
result = subprocess.run(
|
| 36 |
-
[
|
| 37 |
-
sys.executable,
|
| 38 |
-
"-m",
|
| 39 |
-
"script_fidelity",
|
| 40 |
-
"audit",
|
| 41 |
-
str(path),
|
| 42 |
-
"--language",
|
| 43 |
-
"ps_af",
|
| 44 |
-
"--text-column",
|
| 45 |
-
"prediction",
|
| 46 |
-
],
|
| 47 |
-
check=True,
|
| 48 |
-
capture_output=True,
|
| 49 |
-
text=True,
|
| 50 |
-
)
|
| 51 |
-
summary = json.loads(result.stdout)
|
| 52 |
-
assert summary["n"] == 2
|
| 53 |
-
assert summary["sfr"] == 0.5
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def test_cli_audit_csv_format(tmp_path: Path):
|
| 57 |
-
path = tmp_path / "predictions.csv"
|
| 58 |
-
with path.open("w", encoding="utf-8", newline="") as handle:
|
| 59 |
-
writer = csv.DictWriter(handle, fieldnames=["prediction"])
|
| 60 |
-
writer.writeheader()
|
| 61 |
-
writer.writerow({"prediction": "বাংলা ভাষা"})
|
| 62 |
-
writer.writerow({"prediction": "namaste"})
|
| 63 |
-
|
| 64 |
-
result = subprocess.run(
|
| 65 |
-
[
|
| 66 |
-
sys.executable,
|
| 67 |
-
"-m",
|
| 68 |
-
"script_fidelity",
|
| 69 |
-
"audit",
|
| 70 |
-
str(path),
|
| 71 |
-
"--language",
|
| 72 |
-
"bn_in",
|
| 73 |
-
"--format",
|
| 74 |
-
"csv",
|
| 75 |
-
],
|
| 76 |
-
check=True,
|
| 77 |
-
capture_output=True,
|
| 78 |
-
text=True,
|
| 79 |
-
)
|
| 80 |
-
assert "sfr_percent" in result.stdout
|
| 81 |
-
assert "50.0" in result.stdout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_core.py
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
from script_fidelity import (
|
| 2 |
-
compute_corpus_sfr,
|
| 3 |
-
compute_sfr,
|
| 4 |
-
compute_sfr_batch,
|
| 5 |
-
dominant_script,
|
| 6 |
-
script_distribution,
|
| 7 |
-
)
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def test_pashto_positive_and_latin_collapse():
|
| 11 |
-
assert compute_sfr("کابل کې ښه هوا ده", language="ps_af") == 1.0
|
| 12 |
-
assert compute_sfr("this is romanized pashto", language="pashto") == 0.0
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def test_bengali_vs_devanagari_wrong_script():
|
| 16 |
-
assert compute_sfr("বাংলা ভাষা", language="bn_in") == 1.0
|
| 17 |
-
assert compute_sfr("नमस्ते दुनिया", language="bengali") == 0.0
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def test_somali_latin_positive_and_arabic_negative():
|
| 21 |
-
assert compute_sfr("Somali waa luuqad", language="so_so") == 1.0
|
| 22 |
-
assert compute_sfr("كابل في هواء جيد", language="somali") == 0.0
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def test_empty_punctuation_combining_and_emoji_cases():
|
| 26 |
-
assert compute_sfr("", language="ps_af") is None
|
| 27 |
-
assert compute_sfr("...?!", language="ps_af") is None
|
| 28 |
-
assert compute_sfr("\u0301\u0301", language="ps_af") is None
|
| 29 |
-
assert compute_sfr("🙂", language="ps_af") == 0.0
|
| 30 |
-
assert dominant_script("...?!") == "empty"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def test_mixed_script_and_distribution():
|
| 34 |
-
score = compute_sfr("বাংলা भाषा", language="bn_in")
|
| 35 |
-
assert score is not None
|
| 36 |
-
assert 0.0 < score < 1.0
|
| 37 |
-
counts = script_distribution("বাংলা भाषा")
|
| 38 |
-
assert counts["bengali"] > 0
|
| 39 |
-
assert counts["devanagari"] > 0
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
def test_digit_policy_count_and_ignore():
|
| 43 |
-
counted = compute_sfr("کابل 123", language="ps_af")
|
| 44 |
-
ignored = compute_sfr("کابل 123", language="ps_af", digit_policy="ignore")
|
| 45 |
-
assert counted == 4 / 7
|
| 46 |
-
assert ignored == 1.0
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
def test_batch_and_corpus_summary():
|
| 50 |
-
predictions = ["کابل کې ښه هوا ده", "romanized output", "..."]
|
| 51 |
-
scores = compute_sfr_batch(predictions, language="pashto")
|
| 52 |
-
assert scores == [1.0, 0.0, None]
|
| 53 |
-
|
| 54 |
-
summary = compute_corpus_sfr(predictions, language="pashto")
|
| 55 |
-
assert summary["n"] == 3
|
| 56 |
-
assert summary["n_valid"] == 2
|
| 57 |
-
assert summary["n_empty"] == 1
|
| 58 |
-
assert summary["sfr"] == 0.5
|
| 59 |
-
assert summary["low_sfr_rate"] == 0.5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_evaluate_metric.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
import pytest
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
evaluate = pytest.importorskip("evaluate")
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
def test_local_evaluate_metric_matches_package():
|
| 8 |
-
from script_fidelity import compute_corpus_sfr
|
| 9 |
-
|
| 10 |
-
predictions = ["کابل کې ښه هوا ده", "romanized output", "..."]
|
| 11 |
-
metric = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
|
| 12 |
-
actual = metric.compute(predictions=predictions, language="ps_af")
|
| 13 |
-
expected = compute_corpus_sfr(predictions, language="ps_af")
|
| 14 |
-
assert actual["sfr"] == expected["sfr"]
|
| 15 |
-
assert actual["n_empty"] == expected["n_empty"]
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def test_evaluate_metric_details():
|
| 19 |
-
metric = evaluate.load("./metrics/script_fidelity_rate", module_type="metric")
|
| 20 |
-
result = metric.compute(
|
| 21 |
-
predictions=["বাংলা ভাষা", "नमस्ते"],
|
| 22 |
-
language="bn_in",
|
| 23 |
-
return_details=True,
|
| 24 |
-
)
|
| 25 |
-
assert len(result["details"]) == 2
|
| 26 |
-
assert result["details"][0]["sfr"] == 1.0
|
| 27 |
-
assert result["details"][1]["sfr"] == 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_registry.py
DELETED
|
@@ -1,42 +0,0 @@
|
|
| 1 |
-
from script_fidelity import FLEURS_CONFIGS, get_script_config, list_languages, resolve_language
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
def test_registry_has_all_fleurs_configs_except_all():
|
| 5 |
-
codes = list_languages()
|
| 6 |
-
assert len(codes) == 102
|
| 7 |
-
assert "all" not in codes
|
| 8 |
-
assert tuple(codes) == FLEURS_CONFIGS
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def test_every_language_has_ranges():
|
| 12 |
-
for code in list_languages():
|
| 13 |
-
config = get_script_config(code)
|
| 14 |
-
assert config.code == code
|
| 15 |
-
assert config.ranges
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def test_aliases_for_paper_languages():
|
| 19 |
-
aliases = {
|
| 20 |
-
"pashto": "ps_af",
|
| 21 |
-
"urdu": "ur_pk",
|
| 22 |
-
"arabic": "ar_eg",
|
| 23 |
-
"persian": "fa_ir",
|
| 24 |
-
"farsi": "fa_ir",
|
| 25 |
-
"hindi": "hi_in",
|
| 26 |
-
"bengali": "bn_in",
|
| 27 |
-
"malayalam": "ml_in",
|
| 28 |
-
"tamil": "ta_in",
|
| 29 |
-
"somali": "so_so",
|
| 30 |
-
"georgian": "ka_ge",
|
| 31 |
-
}
|
| 32 |
-
for alias, code in aliases.items():
|
| 33 |
-
assert resolve_language(alias) == code
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def test_shared_script_metadata():
|
| 37 |
-
for code in ["ps_af", "ur_pk", "fa_ir", "ar_eg", "so_so", "hi_in"]:
|
| 38 |
-
config = get_script_config(code)
|
| 39 |
-
assert config.shared_script is True
|
| 40 |
-
assert config.warning
|
| 41 |
-
|
| 42 |
-
assert get_script_config("ka_ge").shared_script is False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uv.lock
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|