Create data/tokenizer_train.py
data/tokenizer_train.py  ADDED  (+58, -0)
import os

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer


def train_sovereign_tokenizer(corpus_path, vocab_size=50257):
    """
    Train a custom byte-level BPE tokenizer optimized for Indic and
    ecological technical terms.

    Target vocabulary: 50,257 tokens (matches model_config.yaml).
    """
    # 1. Initialize an empty BPE model.
    # The byte-level pre-tokenizer below can represent any UTF-8 input,
    # so <|unk|> should never fire in practice; it is kept as a safety net.
    tokenizer = Tokenizer(BPE(unk_token="<|unk|>"))

    # 2. Set the pre-tokenizer: treat the text as a sequence of bytes.
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)

    # A matching byte-level decoder is needed so decoded text does not
    # retain internal byte-level markers (e.g. "Ġ" in place of spaces).
    tokenizer.decoder = ByteLevelDecoder()

    # 3. Initialize the trainer.
    # The special tokens cover the GOEC protocol categories (SN, NE, IPN).
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[
            "<|endoftext|>",
            "<|unk|>",
            "<|pad|>",
            "CATEGORY_SN",
            "CATEGORY_NE",
            "CATEGORY_IPN",
        ],
        show_progress=True,
        initial_alphabet=ByteLevel.alphabet(),
    )

    # 4. Train on the Sovereign Corpus.
    files = [
        os.path.join(corpus_path, f)
        for f in os.listdir(corpus_path)
        if f.endswith(".txt")
    ]
    if not files:
        raise FileNotFoundError(f"No .txt corpus files found in {corpus_path}")
    print(f"Commencing tokenizer training on {len(files)} file(s) in {corpus_path}...")
    tokenizer.train(files, trainer)

    # 5. Post-processing: append <|endoftext|> to every sequence for the
    # Secretariat. Look the token id up rather than hard-coding 0.
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>",
        special_tokens=[("<|endoftext|>", eot_id)],
    )

    # 6. Save the Sovereign Lens (create the output directory if needed).
    os.makedirs("data/processed", exist_ok=True)
    tokenizer.save("data/processed/aravalli_tokenizer.json")
    print("Sovereign Tokenizer enacted and saved to data/processed/")


if __name__ == "__main__":
    # Ensure the raw-data directory exists before training.
    os.makedirs("data/raw/", exist_ok=True)
    train_sovereign_tokenizer("data/raw/")
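For reference, a minimal sketch of loading and exercising the trained tokenizer with the standard tokenizers API. It assumes training has already produced data/processed/aravalli_tokenizer.json; the sample sentence is purely illustrative.

from tokenizers import Tokenizer

# Load the tokenizer saved by tokenizer_train.py.
tokenizer = Tokenizer.from_file("data/processed/aravalli_tokenizer.json")

# Encode a sample string (illustrative only); the post-processor
# appends <|endoftext|> to the sequence.
encoding = tokenizer.encode("CATEGORY_SN Aravalli ridge survey")
print(encoding.tokens)  # byte-level sub-tokens plus trailing <|endoftext|>
print(encoding.ids)

# Round-trip: the byte-level decoder restores the original surface text
# (special tokens are skipped by default).
print(tokenizer.decode(encoding.ids))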