indic_language_stock_tokenizer / tokenizer_summary.json
malarsaravanan's picture
Upload 6 files
765ce6a verified
raw
history blame contribute delete
259 Bytes
{
"language": "Tamil",
"algorithm": "BPE",
"vocabulary_size": 8000,
"compression_ratio": 4.6671,
"meets_vocab_requirement": true,
"meets_compression_requirement": true,
"dataset_size": 50000,
"dataset_source": "HuggingFace (Real Tamil Data)"
}