Upload scripts/build_tokenizer.py with huggingface_hub
Browse files- scripts/build_tokenizer.py +413 -0
scripts/build_tokenizer.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Scientific Tokenizer Builder for Crowe Logic Mini
|
| 4 |
+
Builds a 32,000 token BPE tokenizer optimized for scientific domains
|
| 5 |
+
|
| 6 |
+
Domains covered:
|
| 7 |
+
- Mycology (mushroom cultivation, species, techniques)
|
| 8 |
+
- Drug Discovery (chemistry, compounds, protocols)
|
| 9 |
+
- AI Systems (architecture, training, deployment)
|
| 10 |
+
- Business Strategy (metrics, methodologies)
|
| 11 |
+
- General Scientific (research, analysis, statistics)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import json
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import List, Dict, Optional
|
| 18 |
+
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
|
| 19 |
+
from tokenizers.normalizers import NFKC, Sequence
|
| 20 |
+
from tqdm import tqdm
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class ScientificTokenizerBuilder:
    """Build a domain-optimized BPE tokenizer for scientific text.

    Produces a byte-level BPE tokenizer (HuggingFace ``tokenizers``) with
    chat-template and domain-tag special tokens, plus curated term lists for
    mycology, chemistry/drug discovery, biology, AI/ML, business, and general
    science that can be used to seed a training corpus.
    """

    def __init__(
        self,
        vocab_size: int = 32000,
        min_frequency: int = 2,
        special_tokens: Optional[List[str]] = None,
    ):
        """Configure the builder.

        Args:
            vocab_size: Target BPE vocabulary size (includes special tokens).
            min_frequency: Minimum pair frequency for a BPE merge.
            special_tokens: Override for the default chat/control special
                tokens; the domain tags below are always appended.
        """
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency

        if special_tokens is None:
            # ChatML-style control tokens plus reasoning and classic BOS/EOS/pad.
            self.special_tokens = [
                "<|endoftext|>",
                "<|im_start|>",
                "<|im_end|>",
                "<think>",
                "</think>",
                "<pad>",
                "<unk>",
                "<s>",
                "</s>",
            ]
        else:
            self.special_tokens = special_tokens

        # Domain-specific paired tags for corpus section labeling.
        self.domain_tokens = [
            "<mycology>", "</mycology>",
            "<drug_discovery>", "</drug_discovery>",
            "<ai_systems>", "</ai_systems>",
            "<prologic>", "</prologic>",
            "<business>", "</business>",
            "<scientific>", "</scientific>",
        ]

        self.all_special_tokens = self.special_tokens + self.domain_tokens

    def create_scientific_vocabulary(self) -> List[str]:
        """Return the flattened list of curated domain terms.

        NOTE(review): these terms are NOT injected into the BPE trainer by
        ``build_tokenizer``; callers use them as seed text for a training
        corpus (see ``main``). If guaranteed whole-token coverage is desired,
        the terms would need to be added via ``tokenizer.add_tokens`` or the
        trainer — TODO confirm intended behavior.
        """
        vocab_lists = {
            "mycology": self._mycology_terms(),
            "chemistry": self._chemistry_terms(),
            "biology": self._biology_terms(),
            "ai_ml": self._ai_ml_terms(),
            "business": self._business_terms(),
            "scientific_general": self._scientific_general_terms(),
        }

        # Flatten all terms, reporting per-category counts as we go.
        all_terms = []
        for category, terms in vocab_lists.items():
            all_terms.extend(terms)
            print(f"✓ {category}: {len(terms)} terms")

        print(f"\nTotal domain-specific terms: {len(all_terms)}")
        return all_terms

    def _mycology_terms(self) -> List[str]:
        """Mycology-specific vocabulary: taxa, cultivation, substrates."""
        return [
            # Genus names
            "Pleurotus", "Agaricus", "Lentinula", "Ganoderma", "Psilocybe",
            "Hericium", "Trametes", "Cordyceps", "Morchella", "Boletus",
            "Cantharellus", "Armillaria", "Inonotus", "Fomitopsis",

            # Species
            "ostreatus", "bisporus", "edodes", "lucidum", "cubensis",
            "erinaceus", "versicolor", "militaris", "esculenta",

            # Technical terms
            "mycelium", "mycelia", "hyphae", "hypha", "fruiting", "primordia",
            "colonization", "inoculation", "spawn", "substrate", "agar",
            "lamellae", "gills", "basidiocarp", "basidiospore", "spore",
            "pinning", "flush", "canopy", "veil", "annulus",

            # Cultivation
            "autoclave", "sterilization", "pasteurization", "HEPA",
            "flowhood", "monotub", "shotgun", "casing", "incubation",
            "fruiting_chamber", "humidity", "FAE", "fresh_air_exchange",
            "contamination", "trichoderma", "cobweb", "bacterial",

            # Substrates
            "sawdust", "hardwood", "supplemented", "straw", "manure",
            "coco_coir", "vermiculite", "perlite", "gypsum", "bran",
            "masters_mix", "CVG", "grain_spawn", "rye", "millet",

            # Measurements
            "yields", "biological_efficiency", "flush_timing",
        ]

    def _chemistry_terms(self) -> List[str]:
        """Chemistry and drug discovery terms."""
        return [
            # Functional groups
            "hydroxyl", "carboxyl", "amino", "methyl", "ethyl", "phenyl",
            "benzene", "acetyl", "carbonyl", "aldehyde", "ketone",
            "ester", "ether", "amide", "thiol", "sulfide", "phosphate",

            # Common compounds
            "glucose", "fructose", "ATP", "NADH", "acetate", "lactate",
            "pyruvate", "citrate", "succinate", "malate", "oxaloacetate",

            # Drug discovery
            "ligand", "receptor", "kinase", "inhibitor", "agonist", "antagonist",
            "bioavailability", "pharmacokinetics", "ADME", "IC50", "EC50",
            "Kd", "binding_affinity", "selectivity", "cytotoxicity",
            "assay", "screening", "HTS", "hit", "lead", "optimization",

            # Techniques
            "HPLC", "NMR", "MS", "mass_spec", "chromatography", "electrophoresis",
            "crystallography", "spectroscopy", "titration", "synthesis",
            "purification", "extraction", "distillation",

            # Biochemistry
            "protein", "enzyme", "substrate", "catalyst", "peptide",
            "nucleotide", "DNA", "RNA", "genome", "transcription",
            "translation", "metabolism", "pathway", "metabolite",
        ]

    def _biology_terms(self) -> List[str]:
        """Biology and life sciences terms."""
        return [
            # Cell biology
            "mitochondria", "ribosome", "endoplasmic", "reticulum",
            "Golgi", "lysosome", "peroxisome", "cytoplasm", "nucleus",
            "membrane", "cytoskeleton", "vesicle", "organelle",

            # Molecular biology
            "PCR", "qPCR", "sequencing", "cloning", "plasmid", "vector",
            "transfection", "transformation", "expression", "CRISPR",
            "genome_editing", "mutagenesis", "recombinant",

            # Physiology
            "neuron", "synapse", "neurotransmitter", "hormone", "receptor",
            "signaling", "cascade", "phosphorylation", "ubiquitination",

            # Microbiology
            "bacteria", "bacterial", "fungal", "viral", "prokaryote",
            "eukaryote", "microbiome", "culture", "fermentation",
            "antibiotic", "resistance", "pathogen", "strain",
        ]

    def _ai_ml_terms(self) -> List[str]:
        """AI and machine learning terminology."""
        return [
            # Architectures
            "transformer", "attention", "self_attention", "cross_attention",
            "feedforward", "embedding", "encoder", "decoder", "BERT",
            "GPT", "LLaMA", "diffusion", "GAN", "VAE", "RNN", "LSTM",
            "CNN", "ResNet", "ViT", "MoE", "mixture_of_experts",

            # Training
            "backpropagation", "gradient", "optimizer", "Adam", "SGD",
            "learning_rate", "batch_size", "epoch", "iteration", "loss",
            "cross_entropy", "MSE", "regularization", "dropout", "batch_norm",
            "layer_norm", "activation", "ReLU", "GELU", "SiLU", "softmax",

            # Concepts
            "overfitting", "underfitting", "generalization", "bias", "variance",
            "precision", "recall", "F1", "accuracy", "perplexity", "BLEU",
            "fine_tuning", "pretraining", "transfer_learning", "few_shot",
            "zero_shot", "prompt", "inference", "latency", "throughput",

            # Infrastructure
            "GPU", "TPU", "CUDA", "distributed", "parallel", "quantization",
            "pruning", "distillation", "ONNX", "TensorRT", "deployment",
            "serving", "endpoint", "API", "REST", "gRPC",

            # Frameworks
            "PyTorch", "TensorFlow", "JAX", "Keras", "HuggingFace",
            "Transformers", "datasets", "tokenizers", "scikit", "numpy",
            "pandas", "matplotlib", "wandb", "tensorboard",
        ]

    def _business_terms(self) -> List[str]:
        """Business and strategy terminology."""
        return [
            # Metrics
            "ROI", "EBITDA", "CAC", "LTV", "CLTV", "churn", "retention",
            "ARR", "MRR", "revenue", "profit", "margin", "gross", "net",
            "CAGR", "YoY", "MoM", "QoQ", "KPI", "OKR",

            # Strategy
            "GTM", "go_to_market", "TAM", "SAM", "SOM", "PMF",
            "product_market_fit", "MVP", "POC", "proof_of_concept",
            "competitive_advantage", "moat", "differentiation",
            "positioning", "segmentation", "targeting",

            # Operations
            "agile", "scrum", "sprint", "kanban", "OKRs", "roadmap",
            "milestone", "deliverable", "stakeholder", "synergy",
            "optimization", "efficiency", "scalability", "throughput",

            # Methodologies
            "Prologic", "intercept", "annotate", "correlate", "Six_Sigma",
            "Lean", "methodology", "framework", "best_practices",
        ]

    def _scientific_general_terms(self) -> List[str]:
        """General scientific terminology."""
        return [
            # Research
            "hypothesis", "experiment", "control", "variable", "correlation",
            "causation", "significance", "p_value", "statistical", "analysis",
            "methodology", "protocol", "procedure", "reproducibility",
            "validation", "verification", "peer_review", "publication",

            # Statistics
            "mean", "median", "mode", "standard_deviation", "variance",
            "distribution", "normal", "Gaussian", "regression", "ANOVA",
            "t_test", "chi_square", "confidence_interval", "bootstrap",

            # Math
            "logarithm", "exponential", "polynomial", "derivative", "integral",
            "matrix", "vector", "tensor", "eigenvalue", "optimization",
            "convex", "gradient_descent", "stochastic",

            # Units
            "micromolar", "nanomolar", "milligram", "microgram", "nanogram",
            "celsius", "fahrenheit", "kelvin", "molarity", "pH",
        ]

    def build_tokenizer(self, training_files: List[str]) -> "Tokenizer":
        """Train a byte-level BPE tokenizer on *training_files*.

        Args:
            training_files: Paths to plain-text corpus files.

        Returns:
            The trained ``tokenizers.Tokenizer`` instance.
        """
        print("=" * 70)
        print("Building Scientific Tokenizer for Crowe Logic Mini")
        print("=" * 70)
        print(f"\nVocabulary size: {self.vocab_size:,}")
        print(f"Special tokens: {len(self.all_special_tokens)}")
        print(f"Training files: {len(training_files)}")

        # Create BPE tokenizer
        tokenizer = Tokenizer(models.BPE())

        # Normalization (NFKC unicode normalization)
        tokenizer.normalizer = NFKC()

        # Byte-level pre-tokenization: no unknown tokens possible, and
        # scientific notation survives as reversible byte sequences.
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

        # Decoder must mirror the byte-level pre-tokenizer for round-tripping.
        tokenizer.decoder = decoders.ByteLevel()

        # Trainer seeded with the full byte-level alphabet so every byte is
        # representable even if absent from the training corpus.
        trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            special_tokens=self.all_special_tokens,
            show_progress=True,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )

        # Train
        print("\nTraining tokenizer...")
        tokenizer.train(files=training_files, trainer=trainer)

        # Post-processing (keep byte-level offsets untouched)
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

        print("✓ Tokenizer training complete!")

        return tokenizer

    def save_tokenizer(self, tokenizer: "Tokenizer", output_dir: str):
        """Write ``tokenizer.json`` and a transformers-compatible
        ``tokenizer_config.json`` to *output_dir*, then smoke-test.

        Args:
            tokenizer: A trained ``tokenizers.Tokenizer``.
            output_dir: Directory to create/write into.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save tokenizer JSON
        tokenizer.save(str(output_path / "tokenizer.json"))
        print(f"✓ Saved tokenizer.json to {output_path}")

        # Config consumed by transformers' PreTrainedTokenizerFast.
        # NOTE(review): pad/unk are mapped to "<|endoftext|>" even though
        # "<pad>"/"<unk>" exist in the special-token list — confirm this is
        # the intended convention before training.
        config = {
            "add_bos_token": False,
            "add_eos_token": False,
            "add_prefix_space": False,
            "added_tokens_decoder": {
                str(i): {
                    "content": token,
                    "lstrip": False,
                    "normalized": False,
                    "rstrip": False,
                    "single_word": False,
                    "special": True
                }
                for i, token in enumerate(self.all_special_tokens)
            },
            "bos_token": "<|im_start|>",
            "eos_token": "<|im_end|>",
            "pad_token": "<|endoftext|>",
            "unk_token": "<|endoftext|>",
            "model_max_length": 16384,
            "tokenizer_class": "PreTrainedTokenizerFast",
            "clean_up_tokenization_spaces": False,
        }

        # Explicit UTF-8: the config/tokens may contain non-ASCII and the
        # platform default encoding is not guaranteed to be UTF-8.
        with open(output_path / "tokenizer_config.json", "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
        print("✓ Saved tokenizer_config.json")

        # Test the tokenizer
        self._test_tokenizer(tokenizer)

    def _test_tokenizer(self, tokenizer: "Tokenizer"):
        """Smoke-test the tokenizer on domain-specific example sentences."""
        print("\n" + "=" * 70)
        print("Testing Tokenizer")
        print("=" * 70)

        test_cases = [
            "Pleurotus ostreatus cultivation on supplemented sawdust substrate",
            "IC50 determination for kinase inhibitor using HPLC-MS analysis",
            "Transformer architecture with multi-head attention mechanism",
            "ROI analysis shows 340% increase in operational efficiency",
            "<think>Let me analyze this step by step using Prologic methodology</think>",
        ]

        for text in test_cases:
            encoding = tokenizer.encode(text)
            tokens = encoding.tokens
            ids = encoding.ids

            print(f"\nText: {text}")
            print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
            print(f"Token IDs: {ids[:10]}{'...' if len(ids) > 10 else ''}")

        # Vocabulary statistics
        vocab = tokenizer.get_vocab()
        print(f"\n✓ Total vocabulary size: {len(vocab):,}")
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def main():
    """Build and save the 32k scientific tokenizer.

    Looks for plain-text corpus files under ``./data/tokenizer_training``;
    if none exist, seeds a minimal example corpus from the curated domain
    term lists so the pipeline can be exercised end-to-end.
    """
    print("\n🚀 Crowe Logic Mini - Scientific Tokenizer Builder\n")

    # Single builder instance reused for corpus seeding, training, and
    # saving (the original constructed it twice with identical arguments).
    builder = ScientificTokenizerBuilder(vocab_size=32000)

    # Check if we have training data
    data_dir = Path("./data/tokenizer_training")

    if not data_dir.exists() or not list(data_dir.glob("*.txt")):
        print("⚠️  No training data found!")
        print(f"   Expected: {data_dir}/*.txt files")
        print("\n📋 Next steps:")
        print("   1. Run data collection script to gather training corpus")
        print("   2. Place text files in ./data/tokenizer_training/")
        print("   3. Run this script again")
        print("\n   For now, creating a minimal example tokenizer...")

        # Create minimal training data for demonstration
        data_dir.mkdir(parents=True, exist_ok=True)

        example_text = " ".join(builder.create_scientific_vocabulary())

        # Explicit UTF-8 so the example corpus writes the same bytes on
        # every platform regardless of the locale's default encoding.
        with open(data_dir / "example.txt", "w", encoding="utf-8") as f:
            f.write(example_text)

        print(f"\n✓ Created example training file with {len(example_text.split())} terms")

    # Get all training files
    training_files = [str(f) for f in data_dir.glob("*.txt")]

    # Build tokenizer
    tokenizer = builder.build_tokenizer(training_files)

    # Save tokenizer
    output_dir = "./model/tokenizer_32k"
    builder.save_tokenizer(tokenizer, output_dir)

    print("\n" + "=" * 70)
    print("✅ Tokenizer build complete!")
    print("=" * 70)
    print(f"\nTokenizer saved to: {output_dir}")
    print(f"Vocabulary size: 32,000 tokens")
    print(f"Optimized for: mycology, drug discovery, AI systems, business strategy")
    print("\n📋 Next steps:")
    print("   1. Collect 1-2B tokens of training data")
    print("   2. Train the model from scratch with new tokenizer")
    print("   3. Evaluate on domain-specific benchmarks")


if __name__ == "__main__":
    main()