#!/usr/bin/env python3
"""
Scientific Tokenizer Builder for Crowe Logic Mini

Builds a 32,000 token BPE tokenizer optimized for scientific domains.

Domains covered:
- Mycology (mushroom cultivation, species, techniques)
- Drug Discovery (chemistry, compounds, protocols)
- AI Systems (architecture, training, deployment)
- Business Strategy (metrics, methodologies)
- General Scientific (research, analysis, statistics)
"""

import os
import json
from pathlib import Path
from typing import List, Dict, Optional

from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
from tokenizers.normalizers import NFKC, Sequence
from tqdm import tqdm  # NOTE(review): imported but unused in this file — kept in case other code relies on it


class ScientificTokenizerBuilder:
    """Build a domain-optimized BPE tokenizer for scientific text."""

    def __init__(
        self,
        vocab_size: int = 32000,
        min_frequency: int = 2,
        special_tokens: Optional[List[str]] = None
    ):
        """
        Args:
            vocab_size: Target BPE vocabulary size (including special tokens).
            min_frequency: Minimum pair frequency for a BPE merge.
            special_tokens: Override for the default chat-style special tokens.
        """
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency

        if special_tokens is None:
            # NOTE(review): the original default list also contained several
            # empty-string placeholders (likely angle-bracket tokens lost in a
            # text-extraction step). Confirm against the project's chat
            # template and restore any missing tokens here.
            self.special_tokens = [
                "<|endoftext|>",
                "<|im_start|>",
                "<|im_end|>",
            ]
        else:
            self.special_tokens = special_tokens

        # Domain-specific special tokens.
        # NOTE(review): the original entries were all empty strings (content
        # apparently stripped during extraction); left empty until the real
        # domain markers are recovered.
        self.domain_tokens: List[str] = []

        # Deduplicate while preserving order, and drop empty strings —
        # duplicate/empty special tokens would corrupt both the trainer's
        # special-token handling and the added_tokens_decoder config below.
        self.all_special_tokens = [
            token
            for token in dict.fromkeys(self.special_tokens + self.domain_tokens)
            if token
        ]

    def create_scientific_vocabulary(self) -> List[str]:
        """Create a comprehensive list of scientific terms to ensure they're in the vocabulary."""
        vocab_lists = {
            "mycology": self._mycology_terms(),
            "chemistry": self._chemistry_terms(),
            "biology": self._biology_terms(),
            "ai_ml": self._ai_ml_terms(),
            "business": self._business_terms(),
            "scientific_general": self._scientific_general_terms(),
        }

        # Flatten all terms (duplicates across categories are intentional;
        # they only weight the term more heavily in the demo corpus).
        all_terms: List[str] = []
        for category, terms in vocab_lists.items():
            all_terms.extend(terms)
            print(f"✓ {category}: {len(terms)} terms")

        print(f"\nTotal domain-specific terms: {len(all_terms)}")
        return all_terms

    def _mycology_terms(self) -> List[str]:
        """Mycology-specific vocabulary."""
        return [
            # Genus names
            "Pleurotus", "Agaricus", "Lentinula", "Ganoderma", "Psilocybe",
            "Hericium", "Trametes", "Cordyceps", "Morchella", "Boletus",
            "Cantharellus", "Armillaria", "Inonotus", "Fomitopsis",
            # Species
            "ostreatus", "bisporus", "edodes", "lucidum", "cubensis",
            "erinaceus", "versicolor", "militaris", "esculenta",
            # Technical terms
            "mycelium", "mycelia", "hyphae", "hypha", "fruiting",
            "primordia", "colonization", "inoculation", "spawn",
            "substrate", "agar", "lamellae", "gills", "basidiocarp",
            "basidiospore", "spore", "pinning", "flush", "canopy",
            "veil", "annulus",
            # Cultivation
            "autoclave", "sterilization", "pasteurization", "HEPA",
            "flowhood", "monotub", "shotgun", "casing", "incubation",
            "fruiting_chamber", "humidity", "FAE", "fresh_air_exchange",
            "contamination", "trichoderma", "cobweb", "bacterial",
            # Substrates
            "sawdust", "hardwood", "supplemented", "straw", "manure",
            "coco_coir", "vermiculite", "perlite", "gypsum", "bran",
            "masters_mix", "CVG", "grain_spawn", "rye", "millet",
            # Measurements
            "yields", "biological_efficiency", "flush_timing",
        ]

    def _chemistry_terms(self) -> List[str]:
        """Chemistry and drug discovery terms."""
        return [
            # Functional groups
            "hydroxyl", "carboxyl", "amino", "methyl", "ethyl",
            "phenyl", "benzene", "acetyl", "carbonyl", "aldehyde",
            "ketone", "ester", "ether", "amide", "thiol", "sulfide",
            "phosphate",
            # Common compounds
            "glucose", "fructose", "ATP", "NADH", "acetate", "lactate",
            "pyruvate", "citrate", "succinate", "malate", "oxaloacetate",
            # Drug discovery
            "ligand", "receptor", "kinase", "inhibitor", "agonist",
            "antagonist", "bioavailability", "pharmacokinetics", "ADME",
            "IC50", "EC50", "Kd", "binding_affinity", "selectivity",
            "cytotoxicity", "assay", "screening", "HTS", "hit", "lead",
            "optimization",
            # Techniques
            "HPLC", "NMR", "MS", "mass_spec", "chromatography",
            "electrophoresis", "crystallography", "spectroscopy",
            "titration", "synthesis", "purification", "extraction",
            "distillation",
            # Biochemistry
            "protein", "enzyme", "substrate", "catalyst",
            "peptide", "nucleotide", "DNA", "RNA", "genome",
            "transcription", "translation", "metabolism", "pathway",
            "metabolite",
        ]

    def _biology_terms(self) -> List[str]:
        """Biology and life sciences terms."""
        return [
            # Cell biology
            "mitochondria", "ribosome", "endoplasmic", "reticulum",
            "Golgi", "lysosome", "peroxisome", "cytoplasm", "nucleus",
            "membrane", "cytoskeleton", "vesicle", "organelle",
            # Molecular biology
            "PCR", "qPCR", "sequencing", "cloning", "plasmid", "vector",
            "transfection", "transformation", "expression", "CRISPR",
            "genome_editing", "mutagenesis", "recombinant",
            # Physiology
            "neuron", "synapse", "neurotransmitter", "hormone",
            "receptor", "signaling", "cascade", "phosphorylation",
            "ubiquitination",
            # Microbiology
            "bacteria", "bacterial", "fungal", "viral", "prokaryote",
            "eukaryote", "microbiome", "culture", "fermentation",
            "antibiotic", "resistance", "pathogen", "strain",
        ]

    def _ai_ml_terms(self) -> List[str]:
        """AI and machine learning terminology."""
        return [
            # Architectures
            "transformer", "attention", "self_attention", "cross_attention",
            "feedforward", "embedding", "encoder", "decoder", "BERT",
            "GPT", "LLaMA", "diffusion", "GAN", "VAE", "RNN", "LSTM",
            "CNN", "ResNet", "ViT", "MoE", "mixture_of_experts",
            # Training
            "backpropagation", "gradient", "optimizer", "Adam", "SGD",
            "learning_rate", "batch_size", "epoch", "iteration", "loss",
            "cross_entropy", "MSE", "regularization", "dropout",
            "batch_norm", "layer_norm", "activation", "ReLU", "GELU",
            "SiLU", "softmax",
            # Concepts
            "overfitting", "underfitting", "generalization", "bias",
            "variance", "precision", "recall", "F1", "accuracy",
            "perplexity", "BLEU", "fine_tuning", "pretraining",
            "transfer_learning", "few_shot", "zero_shot", "prompt",
            "inference", "latency", "throughput",
            # Infrastructure
            "GPU", "TPU", "CUDA", "distributed", "parallel",
            "quantization", "pruning", "distillation", "ONNX",
            "TensorRT", "deployment", "serving", "endpoint", "API",
            "REST", "gRPC",
            # Frameworks
            "PyTorch", "TensorFlow", "JAX",
            "Keras", "HuggingFace", "Transformers", "datasets",
            "tokenizers", "scikit", "numpy", "pandas", "matplotlib",
            "wandb", "tensorboard",
        ]

    def _business_terms(self) -> List[str]:
        """Business and strategy terminology."""
        return [
            # Metrics
            "ROI", "EBITDA", "CAC", "LTV", "CLTV", "churn", "retention",
            "ARR", "MRR", "revenue", "profit", "margin", "gross", "net",
            "CAGR", "YoY", "MoM", "QoQ", "KPI", "OKR",
            # Strategy
            "GTM", "go_to_market", "TAM", "SAM", "SOM", "PMF",
            "product_market_fit", "MVP", "POC", "proof_of_concept",
            "competitive_advantage", "moat", "differentiation",
            "positioning", "segmentation", "targeting",
            # Operations
            "agile", "scrum", "sprint", "kanban", "OKRs", "roadmap",
            "milestone", "deliverable", "stakeholder", "synergy",
            "optimization", "efficiency", "scalability", "throughput",
            # Methodologies
            "Prologic", "intercept", "annotate", "correlate",
            "Six_Sigma", "Lean", "methodology", "framework",
            "best_practices",
        ]

    def _scientific_general_terms(self) -> List[str]:
        """General scientific terminology."""
        return [
            # Research
            "hypothesis", "experiment", "control", "variable",
            "correlation", "causation", "significance", "p_value",
            "statistical", "analysis", "methodology", "protocol",
            "procedure", "reproducibility", "validation", "verification",
            "peer_review", "publication",
            # Statistics
            "mean", "median", "mode", "standard_deviation", "variance",
            "distribution", "normal", "Gaussian", "regression", "ANOVA",
            "t_test", "chi_square", "confidence_interval", "bootstrap",
            # Math
            "logarithm", "exponential", "polynomial", "derivative",
            "integral", "matrix", "vector", "tensor", "eigenvalue",
            "optimization", "convex", "gradient_descent", "stochastic",
            # Units
            "micromolar", "nanomolar", "milligram", "microgram",
            "nanogram", "celsius", "fahrenheit", "kelvin", "molarity",
            "pH",
        ]

    def build_tokenizer(self, training_files: List[str]) -> Tokenizer:
        """Build and train the BPE tokenizer from plain-text training files.

        Args:
            training_files: Paths to *.txt corpus files.

        Returns:
            The trained ``tokenizers.Tokenizer`` instance.
        """
        print("=" * 70)
        print("Building Scientific Tokenizer for Crowe Logic Mini")
        print("=" * 70)
        print(f"\nVocabulary size: {self.vocab_size:,}")
        print(f"Special tokens: {len(self.all_special_tokens)}")
        print(f"Training files: {len(training_files)}")

        # Create BPE tokenizer
        tokenizer = Tokenizer(models.BPE())

        # Normalization (NFKC unicode normalization)
        tokenizer.normalizer = NFKC()

        # Pre-tokenization: byte-level, no prefix space so leading tokens
        # are not altered (scientific notation stays intact byte-for-byte).
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

        # Decoder must mirror the byte-level pre-tokenizer
        tokenizer.decoder = decoders.ByteLevel()

        # Create trainer; special tokens are assigned the first vocab ids.
        trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            special_tokens=self.all_special_tokens,
            show_progress=True,
            # Seed with the full byte-level alphabet so every byte is encodable
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )

        # Train
        print("\nTraining tokenizer...")
        tokenizer.train(files=training_files, trainer=trainer)

        # Post-processing (keep raw byte-level offsets)
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

        print("✓ Tokenizer training complete!")
        return tokenizer

    def save_tokenizer(self, tokenizer: Tokenizer, output_dir: str):
        """Save tokenizer.json and a HuggingFace-style tokenizer_config.json.

        Args:
            tokenizer: The trained tokenizer to persist.
            output_dir: Directory to write the files into (created if missing).
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save tokenizer JSON
        tokenizer.save(str(output_path / "tokenizer.json"))
        print(f"✓ Saved tokenizer.json to {output_path}")

        # Map each special token to the id the trained tokenizer actually
        # assigned it (BpeTrainer places special tokens first, but looking
        # the id up is robust to any future reordering). Fall back to the
        # enumeration index if the token is somehow absent.
        added_tokens_decoder: Dict[str, Dict] = {}
        for index, token in enumerate(self.all_special_tokens):
            token_id = tokenizer.token_to_id(token)
            added_tokens_decoder[str(token_id if token_id is not None else index)] = {
                "content": token,
                "lstrip": False,
                "normalized": False,
                "rstrip": False,
                "single_word": False,
                "special": True,
            }

        # Create tokenizer config
        config = {
            "add_bos_token": False,
            "add_eos_token": False,
            "add_prefix_space": False,
            "added_tokens_decoder": added_tokens_decoder,
            "bos_token": "<|im_start|>",
            "eos_token": "<|im_end|>",
            "pad_token": "<|endoftext|>",
            "unk_token": "<|endoftext|>",
            "model_max_length": 16384,
            "tokenizer_class": "PreTrainedTokenizerFast",
            "clean_up_tokenization_spaces": False,
        }

        with open(output_path / "tokenizer_config.json", "w") as f:
            json.dump(config, f, indent=2)
        print(f"✓ Saved tokenizer_config.json")

        # Test the tokenizer
        self._test_tokenizer(tokenizer)

    def _test_tokenizer(self, tokenizer: Tokenizer):
        """Smoke-test the tokenizer on domain-specific example sentences."""
        print("\n" + "=" * 70)
        print("Testing Tokenizer")
        print("=" * 70)

        test_cases = [
            "Pleurotus ostreatus cultivation on supplemented sawdust substrate",
            "IC50 determination for kinase inhibitor using HPLC-MS analysis",
            "Transformer architecture with multi-head attention mechanism",
            "ROI analysis shows 340% increase in operational efficiency",
            "Let me analyze this step by step using Prologic methodology",
        ]

        for text in test_cases:
            encoding = tokenizer.encode(text)
            tokens = encoding.tokens
            ids = encoding.ids
            print(f"\nText: {text}")
            print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
            print(f"Token IDs: {ids[:10]}{'...' if len(ids) > 10 else ''}")

        # Vocabulary statistics
        vocab = tokenizer.get_vocab()
        print(f"\n✓ Total vocabulary size: {len(vocab):,}")


def main():
    """Build the tokenizer, creating a minimal demo corpus if none exists."""
    print("\n🚀 Crowe Logic Mini - Scientific Tokenizer Builder\n")

    # Single builder instance used for both corpus seeding and training
    # (the original constructed it twice and discarded the first).
    builder = ScientificTokenizerBuilder(vocab_size=32000)

    # Check if we have training data
    data_dir = Path("./data/tokenizer_training")
    if not data_dir.exists() or not list(data_dir.glob("*.txt")):
        print("⚠️ No training data found!")
        print(f" Expected: {data_dir}/*.txt files")
        print("\n📋 Next steps:")
        print(" 1. Run data collection script to gather training corpus")
        print(" 2. Place text files in ./data/tokenizer_training/")
        print(" 3. Run this script again")
        print("\n For now, creating a minimal example tokenizer...")

        # Create minimal training data for demonstration
        data_dir.mkdir(parents=True, exist_ok=True)
        example_text = " ".join(builder.create_scientific_vocabulary())
        with open(data_dir / "example.txt", "w") as f:
            f.write(example_text)
        print(f"\n✓ Created example training file with {len(example_text.split())} terms")

    # Get all training files
    training_files = [str(f) for f in data_dir.glob("*.txt")]

    # Build tokenizer
    tokenizer = builder.build_tokenizer(training_files)

    # Save tokenizer
    output_dir = "./model/tokenizer_32k"
    builder.save_tokenizer(tokenizer, output_dir)

    print("\n" + "=" * 70)
    print("✅ Tokenizer build complete!")
    print("=" * 70)
    print(f"\nTokenizer saved to: {output_dir}")
    # Report the configured size rather than a hard-coded "32,000"
    print(f"Vocabulary size: {builder.vocab_size:,} tokens")
    print(f"Optimized for: mycology, drug discovery, AI systems, business strategy")
    print("\n📋 Next steps:")
    print(" 1. Collect 1-2B tokens of training data")
    print(" 2. Train the model from scratch with new tokenizer")
    print(" 3. Evaluate on domain-specific benchmarks")


if __name__ == "__main__":
    main()