Upload kashmiri_unigram_tokenizer/tokenizer_config.json with huggingface_hub
Browse files
kashmiri_unigram_tokenizer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tokenizer_class": "UnigramLMTokenizer",
|
| 3 |
+
"vocab_size": 32000,
|
| 4 |
+
"model_max_length": 512,
|
| 5 |
+
"special_tokens": [
|
| 6 |
+
"[PAD]",
|
| 7 |
+
"[UNK]",
|
| 8 |
+
"[CLS]",
|
| 9 |
+
"[SEP]",
|
| 10 |
+
"[MASK]",
|
| 11 |
+
"[BOS]",
|
| 12 |
+
"[EOS]"
|
| 13 |
+
],
|
| 14 |
+
"language": "ks",
|
| 15 |
+
"script": "Arab",
|
| 16 |
+
"normalization": "NFC + Kashmiri-specific mappings",
|
| 17 |
+
"training_corpus": "KS-LIT-3M (3.1M words)",
|
| 18 |
+
"kashmiri_unique_chars_supported": true
|
| 19 |
+
}
|