mike1210 committed on
Commit
e9a0cd5
·
verified ·
1 Parent(s): 22bf3c3

Upload scripts/build_tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/build_tokenizer.py +413 -0
scripts/build_tokenizer.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Scientific Tokenizer Builder for Crowe Logic Mini
4
+ Builds a 32,000 token BPE tokenizer optimized for scientific domains
5
+
6
+ Domains covered:
7
+ - Mycology (mushroom cultivation, species, techniques)
8
+ - Drug Discovery (chemistry, compounds, protocols)
9
+ - AI Systems (architecture, training, deployment)
10
+ - Business Strategy (metrics, methodologies)
11
+ - General Scientific (research, analysis, statistics)
12
+ """
13
+
14
+ import os
15
+ import json
16
+ from pathlib import Path
17
+ from typing import List, Dict, Optional
18
+ from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
19
+ from tokenizers.normalizers import NFKC, Sequence
20
+ from tqdm import tqdm
21
+
22
+
23
+ class ScientificTokenizerBuilder:
24
+ """Build a domain-optimized tokenizer for scientific text"""
25
+
26
+ def __init__(
27
+ self,
28
+ vocab_size: int = 32000,
29
+ min_frequency: int = 2,
30
+ special_tokens: Optional[List[str]] = None
31
+ ):
32
+ self.vocab_size = vocab_size
33
+ self.min_frequency = min_frequency
34
+
35
+ if special_tokens is None:
36
+ self.special_tokens = [
37
+ "<|endoftext|>",
38
+ "<|im_start|>",
39
+ "<|im_end|>",
40
+ "<think>",
41
+ "</think>",
42
+ "<pad>",
43
+ "<unk>",
44
+ "<s>",
45
+ "</s>",
46
+ ]
47
+ else:
48
+ self.special_tokens = special_tokens
49
+
50
+ # Add domain-specific special tokens
51
+ self.domain_tokens = [
52
+ "<mycology>", "</mycology>",
53
+ "<drug_discovery>", "</drug_discovery>",
54
+ "<ai_systems>", "</ai_systems>",
55
+ "<prologic>", "</prologic>",
56
+ "<business>", "</business>",
57
+ "<scientific>", "</scientific>",
58
+ ]
59
+
60
+ self.all_special_tokens = self.special_tokens + self.domain_tokens
61
+
62
+ def create_scientific_vocabulary(self) -> List[str]:
63
+ """Create a comprehensive list of scientific terms to ensure they're in the vocabulary"""
64
+
65
+ vocab_lists = {
66
+ "mycology": self._mycology_terms(),
67
+ "chemistry": self._chemistry_terms(),
68
+ "biology": self._biology_terms(),
69
+ "ai_ml": self._ai_ml_terms(),
70
+ "business": self._business_terms(),
71
+ "scientific_general": self._scientific_general_terms(),
72
+ }
73
+
74
+ # Flatten all terms
75
+ all_terms = []
76
+ for category, terms in vocab_lists.items():
77
+ all_terms.extend(terms)
78
+ print(f"✓ {category}: {len(terms)} terms")
79
+
80
+ print(f"\nTotal domain-specific terms: {len(all_terms)}")
81
+ return all_terms
82
+
83
+ def _mycology_terms(self) -> List[str]:
84
+ """Mycology-specific vocabulary"""
85
+ return [
86
+ # Genus names
87
+ "Pleurotus", "Agaricus", "Lentinula", "Ganoderma", "Psilocybe",
88
+ "Hericium", "Trametes", "Cordyceps", "Morchella", "Boletus",
89
+ "Cantharellus", "Armillaria", "Inonotus", "Fomitopsis",
90
+
91
+ # Species
92
+ "ostreatus", "bisporus", "edodes", "lucidum", "cubensis",
93
+ "erinaceus", "versicolor", "militaris", "esculenta",
94
+
95
+ # Technical terms
96
+ "mycelium", "mycelia", "hyphae", "hypha", "fruiting", "primordia",
97
+ "colonization", "inoculation", "spawn", "substrate", "agar",
98
+ "lamellae", "gills", "basidiocarp", "basidiospore", "spore",
99
+ "pinning", "flush", "canopy", "veil", "annulus",
100
+
101
+ # Cultivation
102
+ "autoclave", "sterilization", "pasteurization", "HEPA",
103
+ "flowhood", "monotub", "shotgun", "casing", "incubation",
104
+ "fruiting_chamber", "humidity", "FAE", "fresh_air_exchange",
105
+ "contamination", "trichoderma", "cobweb", "bacterial",
106
+
107
+ # Substrates
108
+ "sawdust", "hardwood", "supplemented", "straw", "manure",
109
+ "coco_coir", "vermiculite", "perlite", "gypsum", "bran",
110
+ "masters_mix", "CVG", "grain_spawn", "rye", "millet",
111
+
112
+ # Measurements
113
+ "yields", "biological_efficiency", "flush_timing",
114
+ ]
115
+
116
+ def _chemistry_terms(self) -> List[str]:
117
+ """Chemistry and drug discovery terms"""
118
+ return [
119
+ # Functional groups
120
+ "hydroxyl", "carboxyl", "amino", "methyl", "ethyl", "phenyl",
121
+ "benzene", "acetyl", "carbonyl", "aldehyde", "ketone",
122
+ "ester", "ether", "amide", "thiol", "sulfide", "phosphate",
123
+
124
+ # Common compounds
125
+ "glucose", "fructose", "ATP", "NADH", "acetate", "lactate",
126
+ "pyruvate", "citrate", "succinate", "malate", "oxaloacetate",
127
+
128
+ # Drug discovery
129
+ "ligand", "receptor", "kinase", "inhibitor", "agonist", "antagonist",
130
+ "bioavailability", "pharmacokinetics", "ADME", "IC50", "EC50",
131
+ "Kd", "binding_affinity", "selectivity", "cytotoxicity",
132
+ "assay", "screening", "HTS", "hit", "lead", "optimization",
133
+
134
+ # Techniques
135
+ "HPLC", "NMR", "MS", "mass_spec", "chromatography", "electrophoresis",
136
+ "crystallography", "spectroscopy", "titration", "synthesis",
137
+ "purification", "extraction", "distillation",
138
+
139
+ # Biochemistry
140
+ "protein", "enzyme", "substrate", "catalyst", "peptide",
141
+ "nucleotide", "DNA", "RNA", "genome", "transcription",
142
+ "translation", "metabolism", "pathway", "metabolite",
143
+ ]
144
+
145
+ def _biology_terms(self) -> List[str]:
146
+ """Biology and life sciences terms"""
147
+ return [
148
+ # Cell biology
149
+ "mitochondria", "ribosome", "endoplasmic", "reticulum",
150
+ "Golgi", "lysosome", "peroxisome", "cytoplasm", "nucleus",
151
+ "membrane", "cytoskeleton", "vesicle", "organelle",
152
+
153
+ # Molecular biology
154
+ "PCR", "qPCR", "sequencing", "cloning", "plasmid", "vector",
155
+ "transfection", "transformation", "expression", "CRISPR",
156
+ "genome_editing", "mutagenesis", "recombinant",
157
+
158
+ # Physiology
159
+ "neuron", "synapse", "neurotransmitter", "hormone", "receptor",
160
+ "signaling", "cascade", "phosphorylation", "ubiquitination",
161
+
162
+ # Microbiology
163
+ "bacteria", "bacterial", "fungal", "viral", "prokaryote",
164
+ "eukaryote", "microbiome", "culture", "fermentation",
165
+ "antibiotic", "resistance", "pathogen", "strain",
166
+ ]
167
+
168
+ def _ai_ml_terms(self) -> List[str]:
169
+ """AI and machine learning terminology"""
170
+ return [
171
+ # Architectures
172
+ "transformer", "attention", "self_attention", "cross_attention",
173
+ "feedforward", "embedding", "encoder", "decoder", "BERT",
174
+ "GPT", "LLaMA", "diffusion", "GAN", "VAE", "RNN", "LSTM",
175
+ "CNN", "ResNet", "ViT", "MoE", "mixture_of_experts",
176
+
177
+ # Training
178
+ "backpropagation", "gradient", "optimizer", "Adam", "SGD",
179
+ "learning_rate", "batch_size", "epoch", "iteration", "loss",
180
+ "cross_entropy", "MSE", "regularization", "dropout", "batch_norm",
181
+ "layer_norm", "activation", "ReLU", "GELU", "SiLU", "softmax",
182
+
183
+ # Concepts
184
+ "overfitting", "underfitting", "generalization", "bias", "variance",
185
+ "precision", "recall", "F1", "accuracy", "perplexity", "BLEU",
186
+ "fine_tuning", "pretraining", "transfer_learning", "few_shot",
187
+ "zero_shot", "prompt", "inference", "latency", "throughput",
188
+
189
+ # Infrastructure
190
+ "GPU", "TPU", "CUDA", "distributed", "parallel", "quantization",
191
+ "pruning", "distillation", "ONNX", "TensorRT", "deployment",
192
+ "serving", "endpoint", "API", "REST", "gRPC",
193
+
194
+ # Frameworks
195
+ "PyTorch", "TensorFlow", "JAX", "Keras", "HuggingFace",
196
+ "Transformers", "datasets", "tokenizers", "scikit", "numpy",
197
+ "pandas", "matplotlib", "wandb", "tensorboard",
198
+ ]
199
+
200
+ def _business_terms(self) -> List[str]:
201
+ """Business and strategy terminology"""
202
+ return [
203
+ # Metrics
204
+ "ROI", "EBITDA", "CAC", "LTV", "CLTV", "churn", "retention",
205
+ "ARR", "MRR", "revenue", "profit", "margin", "gross", "net",
206
+ "CAGR", "YoY", "MoM", "QoQ", "KPI", "OKR",
207
+
208
+ # Strategy
209
+ "GTM", "go_to_market", "TAM", "SAM", "SOM", "PMF",
210
+ "product_market_fit", "MVP", "POC", "proof_of_concept",
211
+ "competitive_advantage", "moat", "differentiation",
212
+ "positioning", "segmentation", "targeting",
213
+
214
+ # Operations
215
+ "agile", "scrum", "sprint", "kanban", "OKRs", "roadmap",
216
+ "milestone", "deliverable", "stakeholder", "synergy",
217
+ "optimization", "efficiency", "scalability", "throughput",
218
+
219
+ # Methodologies
220
+ "Prologic", "intercept", "annotate", "correlate", "Six_Sigma",
221
+ "Lean", "methodology", "framework", "best_practices",
222
+ ]
223
+
224
+ def _scientific_general_terms(self) -> List[str]:
225
+ """General scientific terminology"""
226
+ return [
227
+ # Research
228
+ "hypothesis", "experiment", "control", "variable", "correlation",
229
+ "causation", "significance", "p_value", "statistical", "analysis",
230
+ "methodology", "protocol", "procedure", "reproducibility",
231
+ "validation", "verification", "peer_review", "publication",
232
+
233
+ # Statistics
234
+ "mean", "median", "mode", "standard_deviation", "variance",
235
+ "distribution", "normal", "Gaussian", "regression", "ANOVA",
236
+ "t_test", "chi_square", "confidence_interval", "bootstrap",
237
+
238
+ # Math
239
+ "logarithm", "exponential", "polynomial", "derivative", "integral",
240
+ "matrix", "vector", "tensor", "eigenvalue", "optimization",
241
+ "convex", "gradient_descent", "stochastic",
242
+
243
+ # Units
244
+ "micromolar", "nanomolar", "milligram", "microgram", "nanogram",
245
+ "celsius", "fahrenheit", "kelvin", "molarity", "pH",
246
+ ]
247
+
248
+ def build_tokenizer(self, training_files: List[str]) -> Tokenizer:
249
+ """Build the tokenizer from training files"""
250
+
251
+ print("=" * 70)
252
+ print("Building Scientific Tokenizer for Crowe Logic Mini")
253
+ print("=" * 70)
254
+ print(f"\nVocabulary size: {self.vocab_size:,}")
255
+ print(f"Special tokens: {len(self.all_special_tokens)}")
256
+ print(f"Training files: {len(training_files)}")
257
+
258
+ # Create BPE tokenizer
259
+ tokenizer = Tokenizer(models.BPE())
260
+
261
+ # Normalization (NFKC unicode normalization)
262
+ tokenizer.normalizer = NFKC()
263
+
264
+ # Pre-tokenization (split on whitespace and punctuation, but keep scientific notation together)
265
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
266
+
267
+ # Decoder
268
+ tokenizer.decoder = decoders.ByteLevel()
269
+
270
+ # Create trainer
271
+ trainer = trainers.BpeTrainer(
272
+ vocab_size=self.vocab_size,
273
+ min_frequency=self.min_frequency,
274
+ special_tokens=self.all_special_tokens,
275
+ show_progress=True,
276
+ initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
277
+ )
278
+
279
+ # Train
280
+ print("\nTraining tokenizer...")
281
+ tokenizer.train(files=training_files, trainer=trainer)
282
+
283
+ # Post-processing (add special tokens in the right format)
284
+ tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
285
+
286
+ print("✓ Tokenizer training complete!")
287
+
288
+ return tokenizer
289
+
290
+ def save_tokenizer(self, tokenizer: Tokenizer, output_dir: str):
291
+ """Save tokenizer files"""
292
+
293
+ output_path = Path(output_dir)
294
+ output_path.mkdir(parents=True, exist_ok=True)
295
+
296
+ # Save tokenizer JSON
297
+ tokenizer.save(str(output_path / "tokenizer.json"))
298
+ print(f"✓ Saved tokenizer.json to {output_path}")
299
+
300
+ # Create tokenizer config
301
+ config = {
302
+ "add_bos_token": False,
303
+ "add_eos_token": False,
304
+ "add_prefix_space": False,
305
+ "added_tokens_decoder": {
306
+ str(i): {
307
+ "content": token,
308
+ "lstrip": False,
309
+ "normalized": False,
310
+ "rstrip": False,
311
+ "single_word": False,
312
+ "special": True
313
+ }
314
+ for i, token in enumerate(self.all_special_tokens)
315
+ },
316
+ "bos_token": "<|im_start|>",
317
+ "eos_token": "<|im_end|>",
318
+ "pad_token": "<|endoftext|>",
319
+ "unk_token": "<|endoftext|>",
320
+ "model_max_length": 16384,
321
+ "tokenizer_class": "PreTrainedTokenizerFast",
322
+ "clean_up_tokenization_spaces": False,
323
+ }
324
+
325
+ with open(output_path / "tokenizer_config.json", "w") as f:
326
+ json.dump(config, f, indent=2)
327
+ print(f"✓ Saved tokenizer_config.json")
328
+
329
+ # Test the tokenizer
330
+ self._test_tokenizer(tokenizer)
331
+
332
+ def _test_tokenizer(self, tokenizer: Tokenizer):
333
+ """Test tokenizer on domain-specific examples"""
334
+
335
+ print("\n" + "=" * 70)
336
+ print("Testing Tokenizer")
337
+ print("=" * 70)
338
+
339
+ test_cases = [
340
+ "Pleurotus ostreatus cultivation on supplemented sawdust substrate",
341
+ "IC50 determination for kinase inhibitor using HPLC-MS analysis",
342
+ "Transformer architecture with multi-head attention mechanism",
343
+ "ROI analysis shows 340% increase in operational efficiency",
344
+ "<think>Let me analyze this step by step using Prologic methodology</think>",
345
+ ]
346
+
347
+ for text in test_cases:
348
+ encoding = tokenizer.encode(text)
349
+ tokens = encoding.tokens
350
+ ids = encoding.ids
351
+
352
+ print(f"\nText: {text}")
353
+ print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
354
+ print(f"Token IDs: {ids[:10]}{'...' if len(ids) > 10 else ''}")
355
+
356
+ # Vocabulary statistics
357
+ vocab = tokenizer.get_vocab()
358
+ print(f"\n✓ Total vocabulary size: {len(vocab):,}")
359
+
360
+
361
def main():
    """Main function to build tokenizer.

    Uses ``./data/tokenizer_training/*.txt`` as the training corpus; when
    no files exist, writes a minimal example corpus built from the curated
    domain vocabulary so the pipeline still runs end to end.
    """

    print("\n🚀 Crowe Logic Mini - Scientific Tokenizer Builder\n")

    # One builder instance reused throughout (the original constructed
    # two identical builders when no training data was present).
    builder = ScientificTokenizerBuilder(vocab_size=32000)

    # Check if we have training data
    data_dir = Path("./data/tokenizer_training")

    if not data_dir.exists() or not list(data_dir.glob("*.txt")):
        print("⚠️ No training data found!")
        print(f" Expected: {data_dir}/*.txt files")
        print("\n📋 Next steps:")
        print(" 1. Run data collection script to gather training corpus")
        print(" 2. Place text files in ./data/tokenizer_training/")
        print(" 3. Run this script again")
        print("\n For now, creating a minimal example tokenizer...")

        # Create minimal training data for demonstration
        data_dir.mkdir(parents=True, exist_ok=True)

        example_text = " ".join(builder.create_scientific_vocabulary())

        # Explicit UTF-8 so the example corpus round-trips on any locale.
        with open(data_dir / "example.txt", "w", encoding="utf-8") as f:
            f.write(example_text)

        print(f"\n✓ Created example training file with {len(example_text.split())} terms")

    # Get all training files
    training_files = [str(f) for f in data_dir.glob("*.txt")]

    # Build tokenizer
    tokenizer = builder.build_tokenizer(training_files)

    # Save tokenizer
    output_dir = "./model/tokenizer_32k"
    builder.save_tokenizer(tokenizer, output_dir)

    print("\n" + "=" * 70)
    print("✅ Tokenizer build complete!")
    print("=" * 70)
    print(f"\nTokenizer saved to: {output_dir}")
    # Report the configured size rather than a hard-coded literal.
    print(f"Vocabulary size: {builder.vocab_size:,} tokens")
    print("Optimized for: mycology, drug discovery, AI systems, business strategy")
    print("\n📋 Next steps:")
    print(" 1. Collect 1-2B tokens of training data")
    print(" 2. Train the model from scratch with new tokenizer")
    print(" 3. Evaluate on domain-specific benchmarks")
410
+
411
+
412
# Script entry point: build and save the tokenizer when run directly.
if __name__ == "__main__":
    main()