SARFTokenizer / test_comprehensive_results.json
almaghrabima's picture
Upload test_comprehensive_results.json with huggingface_hub
1e8911f verified
{
"version": "0.3.8",
"rust_available": true,
"tokenizer": "almaghrabima/SARFTokenizer",
"samples": 1000000,
"roundtrip": {
"Arabic": {
"category": "Arabic",
"total": 333333,
"success": 333333,
"failed": 0,
"accuracy": 1.0,
"accuracy_pct": "100.00%",
"encode_time": 128.8216183092445,
"decode_time": 23.831407640129328,
"failures": []
},
"English": {
"category": "English",
"total": 333333,
"success": 333333,
"failed": 0,
"accuracy": 1.0,
"accuracy_pct": "100.00%",
"encode_time": 19.489008927717805,
"decode_time": 4.273356601595879,
"failures": []
},
"Mixed": {
"category": "Mixed",
"total": 333333,
"success": 333333,
"failed": 0,
"accuracy": 1.0,
"accuracy_pct": "100.00%",
"encode_time": 70.115827139467,
"decode_time": 13.394427740946412,
"failures": []
},
"TOTAL": {
"category": "TOTAL",
"total": 999999,
"success": 999999,
"failed": 0,
"accuracy": 1.0,
"accuracy_pct": "100.00%"
}
},
"edge_cases": {
"Unicode Normalization": {
"tests": 6,
"passed": 6,
"failed": 0,
"failures": []
},
"Zero-Width Characters": {
"tests": 6,
"passed": 6,
"failed": 0,
"failures": []
},
"Unicode Whitespace": {
"tests": 6,
"passed": 6,
"failed": 0,
"failures": []
},
"Grapheme Clusters": {
"tests": 6,
"passed": 6,
"failed": 0,
"failures": []
},
"Apostrophes": {
"tests": 4,
"passed": 4,
"failed": 0,
"failures": []
},
"Dashes": {
"tests": 4,
"passed": 4,
"failed": 0,
"failures": []
},
"Decimal Separators": {
"tests": 3,
"passed": 3,
"failed": 0,
"failures": []
},
"URLs/Emails": {
"tests": 4,
"passed": 4,
"failed": 0,
"failures": []
},
"File Paths": {
"tests": 3,
"passed": 3,
"failed": 0,
"failures": []
},
"Code Identifiers": {
"tests": 4,
"passed": 4,
"failed": 0,
"failures": []
},
"Mixed Scripts/RTL": {
"tests": 6,
"passed": 6,
"failed": 0,
"failures": []
},
"Robustness": {
"tests": 6,
"passed": 6,
"failed": 0,
"failures": []
},
"TOTAL": {
"tests": 58,
"passed": 58,
"failed": 0
}
}
}