---
# Data settings. Only the tokenizer selection is visible in this section.
data:
  tokenizer:
    # Tokenizer kind; presumably selects the loader backend used by the
    # consumer of this config -- TODO confirm against the reading code.
    name: huggingface
    # Tokenizer location; looks like an "<org>/<repo>" identifier rather than
    # a local filesystem path -- TODO confirm how the loader resolves it.
    path: flexitok/bpe_script_Arab_16000