Omarrran committed on
Commit
5980847
·
verified ·
1 Parent(s): 3f08c87

Upload kashmiri_unigram_tokenizer/tokenizer_config.json with huggingface_hub

Browse files
kashmiri_unigram_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "UnigramLMTokenizer",
3
+ "vocab_size": 32000,
4
+ "model_max_length": 512,
5
+ "special_tokens": [
6
+ "[PAD]",
7
+ "[UNK]",
8
+ "[CLS]",
9
+ "[SEP]",
10
+ "[MASK]",
11
+ "[BOS]",
12
+ "[EOS]"
13
+ ],
14
+ "language": "ks",
15
+ "script": "Arab",
16
+ "normalization": "NFC + Kashmiri-specific mappings",
17
+ "training_corpus": "KS-LIT-3M (3.1M words)",
18
+ "kashmiri_unique_chars_supported": true
19
+ }