{
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_df_arc.DFArcTokenizer",
      null
    ]
  },
  "tokenizer_class": "DFArcTokenizer",
  "phrases_file": "phrases.json",
  "normalization": {
    "unify_alef": true,
    "unify_yeh": true,
    "unify_teh_marbuta": true,
    "remove_diacritics": true,
    "remove_tatweel": true,
    "remove_repeats": true
  },
  "min_stem_length": 2,
  "vocab_size": 256000,
  "model_max_length": 4096
}