Transformers
English
Hindi
Sanskrit
sovereign-ai
ecological-intelligence
indian-llm
environmental-protection
iamkoder001 committed on
Commit
cd37343
·
verified ·
1 Parent(s): 288c6c2

Create data/tokenizer_train.py

Browse files
Files changed (1) hide show
  1. data/tokenizer_train.py +58 -0
data/tokenizer_train.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from tokenizers import Tokenizer
3
+ from tokenizers.models import BPE
4
+ from tokenizers.trainers import BpeTrainer
5
+ from tokenizers.pre_tokenizers import Whitespace, ByteLevel
6
+ from tokenizers.processors import TemplateProcessing
7
+
8
def train_sovereign_tokenizer(corpus_path, vocab_size=50257):
    """
    Train a custom BPE tokenizer optimized for Indic and Ecological technical terms.

    Args:
        corpus_path: Directory containing the ``.txt`` training corpus files.
        vocab_size: Target vocabulary size (default 50,257 to match
            model_config.yaml).

    Raises:
        FileNotFoundError: If ``corpus_path`` contains no ``.txt`` files.
    """
    # 1. Initialize an empty BPE model.
    # ByteLevel pre-tokenization (below) covers every UTF-8 input byte, so
    # <|unk|> is kept only as a defensive fallback.
    tokenizer = Tokenizer(BPE(unk_token="<|unk|>"))

    # 2. Set the pre-tokenizer: treat text as a byte sequence so no character
    # can fall outside the trained alphabet.
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)

    # 3. Initialize the trainer.
    # Special tokens include markers for GOEC protocols (SN, NE, IPN).
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[
            "<|endoftext|>",
            "<|unk|>",
            "<|pad|>",
            "CATEGORY_SN",
            "CATEGORY_NE",
            "CATEGORY_IPN",
        ],
        show_progress=True,
        initial_alphabet=ByteLevel.alphabet(),
    )

    # 4. Train on the Sovereign Corpus.
    files = [
        os.path.join(corpus_path, f)
        for f in os.listdir(corpus_path)
        if f.endswith(".txt")
    ]
    # Fail fast with a clear message instead of letting tokenizer.train()
    # crash opaquely on an empty file list.
    if not files:
        raise FileNotFoundError(
            f"No .txt corpus files found in {corpus_path!r}; add training data first."
        )
    print(f"Commencing Tokenizer Training on {corpus_path}...")
    tokenizer.train(files, trainer)

    # 5. Post-Processing: append <|endoftext|> to every encoded sequence for
    # the Secretariat. Look the id up via token_to_id instead of hard-coding 0,
    # so a change in special-token ordering cannot silently corrupt output.
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>",
        special_tokens=[("<|endoftext|>", eot_id)],
    )

    # 6. Save the Sovereign Lens. Create the output directory first so
    # tokenizer.save() does not fail on a fresh checkout.
    os.makedirs("data/processed", exist_ok=True)
    tokenizer.save("data/processed/aravalli_tokenizer.json")
    print("Sovereign Tokenizer Enacted and Saved to data/processed/")
54
if __name__ == "__main__":
    # Ensure the raw-data directory exists before training.
    # exist_ok=True replaces the race-prone exists()/makedirs() pair.
    os.makedirs("data/raw/", exist_ok=True)
    train_sovereign_tokenizer("data/raw/")