"""
Scientific Tokenizer Builder for Crowe Logic Mini
Builds a 32,000 token BPE tokenizer optimized for scientific domains

Domains covered:
- Mycology (mushroom cultivation, species, techniques)
- Drug Discovery (chemistry, compounds, protocols)
- AI Systems (architecture, training, deployment)
- Business Strategy (metrics, methodologies)
- General Scientific (research, analysis, statistics)
"""
|
|
| import os |
| import json |
| from pathlib import Path |
| from typing import List, Dict, Optional |
| from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors |
| from tokenizers.normalizers import NFKC, Sequence |
| from tqdm import tqdm |
|
|
|
|
class ScientificTokenizerBuilder:
    """Build a domain-optimized byte-level BPE tokenizer for scientific text.

    Wraps the HuggingFace ``tokenizers`` library: trains a BPE model whose
    vocabulary is seeded with ChatML-style control tokens and paired domain
    marker tokens, then saves ``tokenizer.json`` plus a
    ``tokenizer_config.json`` compatible with ``PreTrainedTokenizerFast``.
    """

    def __init__(
        self,
        vocab_size: int = 32000,
        min_frequency: int = 2,
        special_tokens: Optional[List[str]] = None
    ) -> None:
        """Configure the builder.

        Args:
            vocab_size: Target total vocabulary size (includes special tokens).
            min_frequency: Minimum pair frequency required for a BPE merge.
            special_tokens: Optional override of the default control-token
                set; when ``None`` a ChatML-style default list is used.
        """
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency

        # Default control tokens: ChatML chat markers, <think> reasoning
        # delimiters, and classic pad/unk/BOS/EOS markers.
        if special_tokens is None:
            self.special_tokens = [
                "<|endoftext|>",
                "<|im_start|>",
                "<|im_end|>",
                "<think>",
                "</think>",
                "<pad>",
                "<unk>",
                "<s>",
                "</s>",
            ]
        else:
            self.special_tokens = special_tokens

        # Paired open/close markers used to tag training text by domain.
        self.domain_tokens = [
            "<mycology>", "</mycology>",
            "<drug_discovery>", "</drug_discovery>",
            "<ai_systems>", "</ai_systems>",
            "<prologic>", "</prologic>",
            "<business>", "</business>",
            "<scientific>", "</scientific>",
        ]

        # Combined list handed to the trainer; list order determines the
        # token ids assigned to special tokens (control tokens first).
        self.all_special_tokens = self.special_tokens + self.domain_tokens

    def create_scientific_vocabulary(self) -> List[str]:
        """Create a comprehensive list of scientific terms to ensure they're in the vocabulary"""

        # One curated term list per domain category.
        vocab_lists = {
            "mycology": self._mycology_terms(),
            "chemistry": self._chemistry_terms(),
            "biology": self._biology_terms(),
            "ai_ml": self._ai_ml_terms(),
            "business": self._business_terms(),
            "scientific_general": self._scientific_general_terms(),
        }

        # Flatten all categories into one list, reporting per-category counts.
        all_terms = []
        for category, terms in vocab_lists.items():
            all_terms.extend(terms)
            print(f"โ {category}: {len(terms)} terms")

        print(f"\nTotal domain-specific terms: {len(all_terms)}")
        return all_terms

    def _mycology_terms(self) -> List[str]:
        """Mycology-specific vocabulary"""
        return [
            # Genera
            "Pleurotus", "Agaricus", "Lentinula", "Ganoderma", "Psilocybe",
            "Hericium", "Trametes", "Cordyceps", "Morchella", "Boletus",
            "Cantharellus", "Armillaria", "Inonotus", "Fomitopsis",

            # Species epithets
            "ostreatus", "bisporus", "edodes", "lucidum", "cubensis",
            "erinaceus", "versicolor", "militaris", "esculenta",

            # Anatomy and life cycle
            "mycelium", "mycelia", "hyphae", "hypha", "fruiting", "primordia",
            "colonization", "inoculation", "spawn", "substrate", "agar",
            "lamellae", "gills", "basidiocarp", "basidiospore", "spore",
            "pinning", "flush", "canopy", "veil", "annulus",

            # Cultivation techniques and equipment
            "autoclave", "sterilization", "pasteurization", "HEPA",
            "flowhood", "monotub", "shotgun", "casing", "incubation",
            "fruiting_chamber", "humidity", "FAE", "fresh_air_exchange",
            "contamination", "trichoderma", "cobweb", "bacterial",

            # Substrates and amendments
            "sawdust", "hardwood", "supplemented", "straw", "manure",
            "coco_coir", "vermiculite", "perlite", "gypsum", "bran",
            "masters_mix", "CVG", "grain_spawn", "rye", "millet",

            # Production metrics
            "yields", "biological_efficiency", "flush_timing",
        ]

    def _chemistry_terms(self) -> List[str]:
        """Chemistry and drug discovery terms"""
        return [
            # Functional groups
            "hydroxyl", "carboxyl", "amino", "methyl", "ethyl", "phenyl",
            "benzene", "acetyl", "carbonyl", "aldehyde", "ketone",
            "ester", "ether", "amide", "thiol", "sulfide", "phosphate",

            # Metabolites and cofactors
            "glucose", "fructose", "ATP", "NADH", "acetate", "lactate",
            "pyruvate", "citrate", "succinate", "malate", "oxaloacetate",

            # Drug discovery and pharmacology
            "ligand", "receptor", "kinase", "inhibitor", "agonist", "antagonist",
            "bioavailability", "pharmacokinetics", "ADME", "IC50", "EC50",
            "Kd", "binding_affinity", "selectivity", "cytotoxicity",
            "assay", "screening", "HTS", "hit", "lead", "optimization",

            # Analytical and preparative techniques
            "HPLC", "NMR", "MS", "mass_spec", "chromatography", "electrophoresis",
            "crystallography", "spectroscopy", "titration", "synthesis",
            "purification", "extraction", "distillation",

            # Biochemistry
            "protein", "enzyme", "substrate", "catalyst", "peptide",
            "nucleotide", "DNA", "RNA", "genome", "transcription",
            "translation", "metabolism", "pathway", "metabolite",
        ]

    def _biology_terms(self) -> List[str]:
        """Biology and life sciences terms"""
        return [
            # Cell biology / organelles
            "mitochondria", "ribosome", "endoplasmic", "reticulum",
            "Golgi", "lysosome", "peroxisome", "cytoplasm", "nucleus",
            "membrane", "cytoskeleton", "vesicle", "organelle",

            # Molecular biology techniques
            "PCR", "qPCR", "sequencing", "cloning", "plasmid", "vector",
            "transfection", "transformation", "expression", "CRISPR",
            "genome_editing", "mutagenesis", "recombinant",

            # Physiology and signaling
            "neuron", "synapse", "neurotransmitter", "hormone", "receptor",
            "signaling", "cascade", "phosphorylation", "ubiquitination",

            # Microbiology
            "bacteria", "bacterial", "fungal", "viral", "prokaryote",
            "eukaryote", "microbiome", "culture", "fermentation",
            "antibiotic", "resistance", "pathogen", "strain",
        ]

    def _ai_ml_terms(self) -> List[str]:
        """AI and machine learning terminology"""
        return [
            # Architectures
            "transformer", "attention", "self_attention", "cross_attention",
            "feedforward", "embedding", "encoder", "decoder", "BERT",
            "GPT", "LLaMA", "diffusion", "GAN", "VAE", "RNN", "LSTM",
            "CNN", "ResNet", "ViT", "MoE", "mixture_of_experts",

            # Training
            "backpropagation", "gradient", "optimizer", "Adam", "SGD",
            "learning_rate", "batch_size", "epoch", "iteration", "loss",
            "cross_entropy", "MSE", "regularization", "dropout", "batch_norm",
            "layer_norm", "activation", "ReLU", "GELU", "SiLU", "softmax",

            # Evaluation and concepts
            "overfitting", "underfitting", "generalization", "bias", "variance",
            "precision", "recall", "F1", "accuracy", "perplexity", "BLEU",
            "fine_tuning", "pretraining", "transfer_learning", "few_shot",
            "zero_shot", "prompt", "inference", "latency", "throughput",

            # Infrastructure and deployment
            "GPU", "TPU", "CUDA", "distributed", "parallel", "quantization",
            "pruning", "distillation", "ONNX", "TensorRT", "deployment",
            "serving", "endpoint", "API", "REST", "gRPC",

            # Frameworks and tooling
            "PyTorch", "TensorFlow", "JAX", "Keras", "HuggingFace",
            "Transformers", "datasets", "tokenizers", "scikit", "numpy",
            "pandas", "matplotlib", "wandb", "tensorboard",
        ]

    def _business_terms(self) -> List[str]:
        """Business and strategy terminology"""
        return [
            # Financial / growth metrics
            "ROI", "EBITDA", "CAC", "LTV", "CLTV", "churn", "retention",
            "ARR", "MRR", "revenue", "profit", "margin", "gross", "net",
            "CAGR", "YoY", "MoM", "QoQ", "KPI", "OKR",

            # Strategy
            "GTM", "go_to_market", "TAM", "SAM", "SOM", "PMF",
            "product_market_fit", "MVP", "POC", "proof_of_concept",
            "competitive_advantage", "moat", "differentiation",
            "positioning", "segmentation", "targeting",

            # Operations / project management
            "agile", "scrum", "sprint", "kanban", "OKRs", "roadmap",
            "milestone", "deliverable", "stakeholder", "synergy",
            "optimization", "efficiency", "scalability", "throughput",

            # Prologic methodology and frameworks
            "Prologic", "intercept", "annotate", "correlate", "Six_Sigma",
            "Lean", "methodology", "framework", "best_practices",
        ]

    def _scientific_general_terms(self) -> List[str]:
        """General scientific terminology"""
        return [
            # Scientific method
            "hypothesis", "experiment", "control", "variable", "correlation",
            "causation", "significance", "p_value", "statistical", "analysis",
            "methodology", "protocol", "procedure", "reproducibility",
            "validation", "verification", "peer_review", "publication",

            # Statistics
            "mean", "median", "mode", "standard_deviation", "variance",
            "distribution", "normal", "Gaussian", "regression", "ANOVA",
            "t_test", "chi_square", "confidence_interval", "bootstrap",

            # Mathematics
            "logarithm", "exponential", "polynomial", "derivative", "integral",
            "matrix", "vector", "tensor", "eigenvalue", "optimization",
            "convex", "gradient_descent", "stochastic",

            # Units and measurement
            "micromolar", "nanomolar", "milligram", "microgram", "nanogram",
            "celsius", "fahrenheit", "kelvin", "molarity", "pH",
        ]

    def build_tokenizer(self, training_files: List[str]) -> Tokenizer:
        """Build the tokenizer from training files"""

        print("=" * 70)
        print("Building Scientific Tokenizer for Crowe Logic Mini")
        print("=" * 70)
        print(f"\nVocabulary size: {self.vocab_size:,}")
        print(f"Special tokens: {len(self.all_special_tokens)}")
        print(f"Training files: {len(training_files)}")

        # Byte-level BPE model (GPT-2 style).
        tokenizer = Tokenizer(models.BPE())

        # NFKC normalization folds compatibility characters into canonical
        # forms before pre-tokenization.
        tokenizer.normalizer = NFKC()

        # Byte-level pre-tokenization: every input byte is representable, so
        # unknown characters cannot occur at encode time.
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

        # Matching byte-level decoder so decode(encode(x)) round-trips.
        tokenizer.decoder = decoders.ByteLevel()

        # Seed the trainer with the full byte alphabet plus all special
        # tokens; special tokens are inserted first, in list order.
        trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            special_tokens=self.all_special_tokens,
            show_progress=True,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )

        print("\nTraining tokenizer...")
        tokenizer.train(files=training_files, trainer=trainer)

        # trim_offsets=False keeps character offsets aligned with raw bytes.
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

        print("โ Tokenizer training complete!")

        return tokenizer

    def save_tokenizer(self, tokenizer: Tokenizer, output_dir: str) -> None:
        """Save tokenizer files"""

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Serialized fast-tokenizer model consumed by `tokenizers`/HF.
        tokenizer.save(str(output_path / "tokenizer.json"))
        print(f"โ Saved tokenizer.json to {output_path}")

        # Config for transformers' PreTrainedTokenizerFast.
        # NOTE(review): assumes the trainer assigned ids 0..N-1 to
        # all_special_tokens in order (str(i) keys) — confirm against the
        # saved vocab. Also pad/unk map to <|endoftext|> even though <pad>
        # and <unk> exist in special_tokens — confirm that is intended.
        config = {
            "add_bos_token": False,
            "add_eos_token": False,
            "add_prefix_space": False,
            "added_tokens_decoder": {
                str(i): {
                    "content": token,
                    "lstrip": False,
                    "normalized": False,
                    "rstrip": False,
                    "single_word": False,
                    "special": True
                }
                for i, token in enumerate(self.all_special_tokens)
            },
            "bos_token": "<|im_start|>",
            "eos_token": "<|im_end|>",
            "pad_token": "<|endoftext|>",
            "unk_token": "<|endoftext|>",
            "model_max_length": 16384,
            "tokenizer_class": "PreTrainedTokenizerFast",
            "clean_up_tokenization_spaces": False,
        }

        with open(output_path / "tokenizer_config.json", "w") as f:
            json.dump(config, f, indent=2)
        print(f"โ Saved tokenizer_config.json")

        # Smoke-test the freshly saved tokenizer on domain examples.
        self._test_tokenizer(tokenizer)

    def _test_tokenizer(self, tokenizer: Tokenizer) -> None:
        """Test tokenizer on domain-specific examples"""

        print("\n" + "=" * 70)
        print("Testing Tokenizer")
        print("=" * 70)

        # One representative sentence per target domain, plus a <think> span.
        test_cases = [
            "Pleurotus ostreatus cultivation on supplemented sawdust substrate",
            "IC50 determination for kinase inhibitor using HPLC-MS analysis",
            "Transformer architecture with multi-head attention mechanism",
            "ROI analysis shows 340% increase in operational efficiency",
            "<think>Let me analyze this step by step using Prologic methodology</think>",
        ]

        for text in test_cases:
            encoding = tokenizer.encode(text)
            tokens = encoding.tokens
            ids = encoding.ids

            # Show at most the first 10 tokens/ids per example.
            print(f"\nText: {text}")
            print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
            print(f"Token IDs: {ids[:10]}{'...' if len(ids) > 10 else ''}")

        vocab = tokenizer.get_vocab()
        print(f"\nโ Total vocabulary size: {len(vocab):,}")
|
|
|
|
def main():
    """Build the 32k scientific tokenizer and save it to ./model/tokenizer_32k.

    If no ``*.txt`` corpus is found under ``./data/tokenizer_training``, a
    minimal example corpus is bootstrapped from the curated term lists so
    the script remains runnable end-to-end.
    """

    print("\n๐ Crowe Logic Mini - Scientific Tokenizer Builder\n")

    data_dir = Path("./data/tokenizer_training")

    # Single builder instance reused for corpus bootstrap, training, and
    # saving (previously constructed twice).
    builder = ScientificTokenizerBuilder(vocab_size=32000)

    if not data_dir.exists() or not list(data_dir.glob("*.txt")):
        print("โ ๏ธ No training data found!")
        print(f" Expected: {data_dir}/*.txt files")
        print("\n๐ Next steps:")
        print(" 1. Run data collection script to gather training corpus")
        print(" 2. Place text files in ./data/tokenizer_training/")
        print(" 3. Run this script again")
        print("\n For now, creating a minimal example tokenizer...")

        data_dir.mkdir(parents=True, exist_ok=True)

        # Bootstrap: space-join the curated domain vocabulary as a tiny corpus.
        example_text = " ".join(builder.create_scientific_vocabulary())

        # utf-8 explicitly: terms/corpus may contain non-ASCII characters
        # and the platform default encoding is not guaranteed.
        with open(data_dir / "example.txt", "w", encoding="utf-8") as f:
            f.write(example_text)

        print(f"\nโ Created example training file with {len(example_text.split())} terms")

    training_files = [str(f) for f in data_dir.glob("*.txt")]

    tokenizer = builder.build_tokenizer(training_files)

    output_dir = "./model/tokenizer_32k"
    builder.save_tokenizer(tokenizer, output_dir)

    print("\n" + "=" * 70)
    # Fixed: this literal was previously split across two physical lines
    # (a SyntaxError in a single-quoted string).
    print("✅ Tokenizer build complete!")
    print("=" * 70)
    print(f"\nTokenizer saved to: {output_dir}")
    print(f"Vocabulary size: 32,000 tokens")
    print(f"Optimized for: mycology, drug discovery, AI systems, business strategy")
    print("\n๐ Next steps:")
    print(" 1. Collect 1-2B tokens of training data")
    print(" 2. Train the model from scratch with new tokenizer")
    print(" 3. Evaluate on domain-specific benchmarks")
|
|
|
# Script entry point: build and save the tokenizer when run directly.
if __name__ == "__main__":
    main()
|
|