Arabic
arabic
tokenizer
morphology
nlp
dialect
df-arc / tokenizer_config.json
fr3on's picture
Upload folder using huggingface_hub
3b90e9e verified
raw
history blame
446 Bytes
{
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_df_arc.DFArcTokenizer",
      null
    ]
  },
  "tokenizer_class": "DFArcTokenizer",
  "phrases_file": "phrases.json",
  "normalization": {
    "unify_alef": true,
    "unify_yeh": true,
    "unify_teh_marbuta": true,
    "remove_diacritics": true,
    "remove_tatweel": true,
    "remove_repeats": true
  },
  "min_stem_length": 2,
  "vocab_size": 256000,
  "model_max_length": 4096
}