PraneetNS committed on
Commit
8d85ec0
·
verified ·
1 Parent(s): d37d5bd

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +21 -0
  2. special_tokens_map.json +1 -0
  3. tokenizer.py +27 -0
README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Indic Tokenizer v2
2
+
3
+ Custom SentencePiece Unigram tokenizer trained on:
4
+ - Hindi, Tamil, Telugu corpora
5
+ - Code-mixed Hinglish data
6
+
7
+ ## Features
8
+ - 40–70% fewer tokens vs GPT-2
9
+ - Script-aware tokenization
10
+ - Better handling of Indic languages
11
+
12
+ ## Usage
13
+
14
+ from transformers import AutoTokenizer
15
+
16
+ tokenizer = AutoTokenizer.from_pretrained(
17
+ "your-username/indic-tokenizer-v2",
18
+ trust_remote_code=True
19
+ )
20
+
21
+ print(tokenizer.tokenize("नमस्ते मित्र, कैसे हो?"))
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>"}
tokenizer.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sentencepiece as spm
2
+ from transformers import PreTrainedTokenizer
3
+
4
class IndicTokenizer(PreTrainedTokenizer):
    """Slow tokenizer that delegates segmentation to a SentencePiece model.

    The SentencePiece processor is loaded from ``vocab_file`` and used for
    tokenization and for piece <-> id conversion. Tokens registered later
    through the ``PreTrainedTokenizer`` added-token machinery are layered on
    top of the SentencePiece vocabulary.

    NOTE(review): this class declares no ``vocab_files_names``, so
    ``from_pretrained`` cannot locate the model file by itself; ``vocab_file``
    has to reach ``__init__`` some other way (e.g. via the tokenizer config).
    Confirm the uploaded model filename and declare it if appropriate.
    """

    def __init__(self, vocab_file, **kwargs):
        """Load the SentencePiece model, then initialise the base tokenizer.

        Args:
            vocab_file: Path to the serialized SentencePiece model.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Kept so that save_vocabulary() can copy the model file later.
        self.vocab_file = vocab_file
        # The processor must exist before super().__init__() runs, because the
        # base initialiser may already query the vocabulary.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.load(vocab_file)
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Number of pieces in the SentencePiece model (added tokens excluded)."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Return the token -> id mapping, including added tokens."""
        vocab = {
            self.sp_model.id_to_piece(index): index
            for index in range(self.sp_model.get_piece_size())
        }
        # Without this, tokens registered via add_tokens()/special tokens would
        # be invisible to callers that inspect the vocabulary.
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __len__(self):
        """Return the full vocabulary size, added tokens included."""
        # Counting the merged mapping avoids double-counting added tokens that
        # already exist as SentencePiece pieces.
        return len(self.get_vocab())

    def _tokenize(self, text):
        """Split ``text`` into SentencePiece pieces (as strings)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Map a piece string to its id in the SentencePiece model."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Map a SentencePiece id back to its piece string."""
        return self.sp_model.id_to_piece(index)

    def convert_tokens_to_string(self, tokens):
        """Join pieces back into text using the SentencePiece decoder.

        The inherited implementation joins tokens with spaces, which would
        leave the SentencePiece word-boundary markers in the decoded text.
        """
        return self.sp_model.decode(list(tokens))

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the SentencePiece model into ``save_directory``.

        Args:
            save_directory: Existing directory to write into.
            filename_prefix: Optional prefix, joined to the file name with "-".

        Returns:
            A one-element tuple with the path of the written model file.

        Raises:
            NotADirectoryError: If ``save_directory`` is not a directory.
        """
        # Local imports keep this block self-contained within the file.
        import os
        import shutil

        if not os.path.isdir(save_directory):
            raise NotADirectoryError(
                f"Vocabulary path ({save_directory}) should be a directory"
            )

        prefix = f"{filename_prefix}-" if filename_prefix else ""
        # Reuse the name of the file the model was loaded from.
        target = os.path.join(
            save_directory, prefix + os.path.basename(self.vocab_file)
        )

        if os.path.isfile(self.vocab_file):
            # Skip the copy when saving back into the source location.
            if os.path.abspath(self.vocab_file) != os.path.abspath(target):
                shutil.copyfile(self.vocab_file, target)
        else:
            # The source file is gone; dump the in-memory model instead.
            with open(target, "wb") as handle:
                handle.write(self.sp_model.serialized_model_proto())

        return (target,)