#!/usr/bin/env python3
"""
Scientific Tokenizer Builder for Crowe Logic Mini
Builds a 32,000 token BPE tokenizer optimized for scientific domains
Domains covered:
- Mycology (mushroom cultivation, species, techniques)
- Drug Discovery (chemistry, compounds, protocols)
- AI Systems (architecture, training, deployment)
- Business Strategy (metrics, methodologies)
- General Scientific (research, analysis, statistics)
"""
import os
import json
from pathlib import Path
from typing import List, Dict, Optional
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
from tokenizers.normalizers import NFKC, Sequence
from tqdm import tqdm
class ScientificTokenizerBuilder:
    """Build a domain-optimized byte-level BPE tokenizer for scientific text.

    The builder curates domain term lists (mycology, chemistry, biology,
    AI/ML, business, general science), trains a byte-level BPE model on the
    supplied corpus files, and writes ``tokenizer.json`` plus a
    ``PreTrainedTokenizerFast``-compatible ``tokenizer_config.json``.
    """

    def __init__(
        self,
        vocab_size: int = 32000,
        min_frequency: int = 2,
        special_tokens: Optional[List[str]] = None
    ):
        """
        Args:
            vocab_size: Target size of the trained vocabulary
                (special tokens included).
            min_frequency: Minimum pair frequency required for a BPE merge.
            special_tokens: Optional override of the default control tokens.
        """
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency
        if special_tokens is None:
            # ChatML-style control tokens used by the chat training format.
            self.special_tokens = [
                "<|endoftext|>",
                "<|im_start|>",
                "<|im_end|>",
            ]
        else:
            self.special_tokens = special_tokens
        # Domain-specific special tokens; currently none are defined.
        # Extend this list with domain marker tokens as needed.
        self.domain_tokens: List[str] = []
        # Deduplicate while preserving order, and drop empty strings.
        # Empty or duplicate special tokens would make the token IDs written
        # to tokenizer_config.json disagree with the trained vocabulary.
        seen = set()
        self.all_special_tokens: List[str] = []
        for token in self.special_tokens + self.domain_tokens:
            if token and token not in seen:
                seen.add(token)
                self.all_special_tokens.append(token)

    def create_scientific_vocabulary(self) -> List[str]:
        """Create a comprehensive list of scientific terms to ensure they're in the vocabulary.

        Returns:
            A flat list of all curated domain terms (duplicates across
            categories are not removed).
        """
        vocab_lists = {
            "mycology": self._mycology_terms(),
            "chemistry": self._chemistry_terms(),
            "biology": self._biology_terms(),
            "ai_ml": self._ai_ml_terms(),
            "business": self._business_terms(),
            "scientific_general": self._scientific_general_terms(),
        }
        # Flatten all terms
        all_terms: List[str] = []
        for category, terms in vocab_lists.items():
            all_terms.extend(terms)
            print(f"✓ {category}: {len(terms)} terms")
        print(f"\nTotal domain-specific terms: {len(all_terms)}")
        return all_terms

    def _mycology_terms(self) -> List[str]:
        """Mycology-specific vocabulary"""
        return [
            # Genus names
            "Pleurotus", "Agaricus", "Lentinula", "Ganoderma", "Psilocybe",
            "Hericium", "Trametes", "Cordyceps", "Morchella", "Boletus",
            "Cantharellus", "Armillaria", "Inonotus", "Fomitopsis",
            # Species
            "ostreatus", "bisporus", "edodes", "lucidum", "cubensis",
            "erinaceus", "versicolor", "militaris", "esculenta",
            # Technical terms
            "mycelium", "mycelia", "hyphae", "hypha", "fruiting", "primordia",
            "colonization", "inoculation", "spawn", "substrate", "agar",
            "lamellae", "gills", "basidiocarp", "basidiospore", "spore",
            "pinning", "flush", "canopy", "veil", "annulus",
            # Cultivation
            "autoclave", "sterilization", "pasteurization", "HEPA",
            "flowhood", "monotub", "shotgun", "casing", "incubation",
            "fruiting_chamber", "humidity", "FAE", "fresh_air_exchange",
            "contamination", "trichoderma", "cobweb", "bacterial",
            # Substrates
            "sawdust", "hardwood", "supplemented", "straw", "manure",
            "coco_coir", "vermiculite", "perlite", "gypsum", "bran",
            "masters_mix", "CVG", "grain_spawn", "rye", "millet",
            # Measurements
            "yields", "biological_efficiency", "flush_timing",
        ]

    def _chemistry_terms(self) -> List[str]:
        """Chemistry and drug discovery terms"""
        return [
            # Functional groups
            "hydroxyl", "carboxyl", "amino", "methyl", "ethyl", "phenyl",
            "benzene", "acetyl", "carbonyl", "aldehyde", "ketone",
            "ester", "ether", "amide", "thiol", "sulfide", "phosphate",
            # Common compounds
            "glucose", "fructose", "ATP", "NADH", "acetate", "lactate",
            "pyruvate", "citrate", "succinate", "malate", "oxaloacetate",
            # Drug discovery
            "ligand", "receptor", "kinase", "inhibitor", "agonist", "antagonist",
            "bioavailability", "pharmacokinetics", "ADME", "IC50", "EC50",
            "Kd", "binding_affinity", "selectivity", "cytotoxicity",
            "assay", "screening", "HTS", "hit", "lead", "optimization",
            # Techniques
            "HPLC", "NMR", "MS", "mass_spec", "chromatography", "electrophoresis",
            "crystallography", "spectroscopy", "titration", "synthesis",
            "purification", "extraction", "distillation",
            # Biochemistry
            "protein", "enzyme", "substrate", "catalyst", "peptide",
            "nucleotide", "DNA", "RNA", "genome", "transcription",
            "translation", "metabolism", "pathway", "metabolite",
        ]

    def _biology_terms(self) -> List[str]:
        """Biology and life sciences terms"""
        return [
            # Cell biology
            "mitochondria", "ribosome", "endoplasmic", "reticulum",
            "Golgi", "lysosome", "peroxisome", "cytoplasm", "nucleus",
            "membrane", "cytoskeleton", "vesicle", "organelle",
            # Molecular biology
            "PCR", "qPCR", "sequencing", "cloning", "plasmid", "vector",
            "transfection", "transformation", "expression", "CRISPR",
            "genome_editing", "mutagenesis", "recombinant",
            # Physiology
            "neuron", "synapse", "neurotransmitter", "hormone", "receptor",
            "signaling", "cascade", "phosphorylation", "ubiquitination",
            # Microbiology
            "bacteria", "bacterial", "fungal", "viral", "prokaryote",
            "eukaryote", "microbiome", "culture", "fermentation",
            "antibiotic", "resistance", "pathogen", "strain",
        ]

    def _ai_ml_terms(self) -> List[str]:
        """AI and machine learning terminology"""
        return [
            # Architectures
            "transformer", "attention", "self_attention", "cross_attention",
            "feedforward", "embedding", "encoder", "decoder", "BERT",
            "GPT", "LLaMA", "diffusion", "GAN", "VAE", "RNN", "LSTM",
            "CNN", "ResNet", "ViT", "MoE", "mixture_of_experts",
            # Training
            "backpropagation", "gradient", "optimizer", "Adam", "SGD",
            "learning_rate", "batch_size", "epoch", "iteration", "loss",
            "cross_entropy", "MSE", "regularization", "dropout", "batch_norm",
            "layer_norm", "activation", "ReLU", "GELU", "SiLU", "softmax",
            # Concepts
            "overfitting", "underfitting", "generalization", "bias", "variance",
            "precision", "recall", "F1", "accuracy", "perplexity", "BLEU",
            "fine_tuning", "pretraining", "transfer_learning", "few_shot",
            "zero_shot", "prompt", "inference", "latency", "throughput",
            # Infrastructure
            "GPU", "TPU", "CUDA", "distributed", "parallel", "quantization",
            "pruning", "distillation", "ONNX", "TensorRT", "deployment",
            "serving", "endpoint", "API", "REST", "gRPC",
            # Frameworks
            "PyTorch", "TensorFlow", "JAX", "Keras", "HuggingFace",
            "Transformers", "datasets", "tokenizers", "scikit", "numpy",
            "pandas", "matplotlib", "wandb", "tensorboard",
        ]

    def _business_terms(self) -> List[str]:
        """Business and strategy terminology"""
        return [
            # Metrics
            "ROI", "EBITDA", "CAC", "LTV", "CLTV", "churn", "retention",
            "ARR", "MRR", "revenue", "profit", "margin", "gross", "net",
            "CAGR", "YoY", "MoM", "QoQ", "KPI", "OKR",
            # Strategy
            "GTM", "go_to_market", "TAM", "SAM", "SOM", "PMF",
            "product_market_fit", "MVP", "POC", "proof_of_concept",
            "competitive_advantage", "moat", "differentiation",
            "positioning", "segmentation", "targeting",
            # Operations
            "agile", "scrum", "sprint", "kanban", "OKRs", "roadmap",
            "milestone", "deliverable", "stakeholder", "synergy",
            "optimization", "efficiency", "scalability", "throughput",
            # Methodologies
            "Prologic", "intercept", "annotate", "correlate", "Six_Sigma",
            "Lean", "methodology", "framework", "best_practices",
        ]

    def _scientific_general_terms(self) -> List[str]:
        """General scientific terminology"""
        return [
            # Research
            "hypothesis", "experiment", "control", "variable", "correlation",
            "causation", "significance", "p_value", "statistical", "analysis",
            "methodology", "protocol", "procedure", "reproducibility",
            "validation", "verification", "peer_review", "publication",
            # Statistics
            "mean", "median", "mode", "standard_deviation", "variance",
            "distribution", "normal", "Gaussian", "regression", "ANOVA",
            "t_test", "chi_square", "confidence_interval", "bootstrap",
            # Math
            "logarithm", "exponential", "polynomial", "derivative", "integral",
            "matrix", "vector", "tensor", "eigenvalue", "optimization",
            "convex", "gradient_descent", "stochastic",
            # Units
            "micromolar", "nanomolar", "milligram", "microgram", "nanogram",
            "celsius", "fahrenheit", "kelvin", "molarity", "pH",
        ]

    def build_tokenizer(self, training_files: List[str]) -> "Tokenizer":
        """Build the tokenizer from training files.

        Args:
            training_files: Paths to plain-text corpus files.

        Returns:
            The trained ``tokenizers.Tokenizer`` instance.
        """
        print("=" * 70)
        print("Building Scientific Tokenizer for Crowe Logic Mini")
        print("=" * 70)
        print(f"\nVocabulary size: {self.vocab_size:,}")
        print(f"Special tokens: {len(self.all_special_tokens)}")
        print(f"Training files: {len(training_files)}")
        # Create BPE tokenizer
        tokenizer = Tokenizer(models.BPE())
        # Normalization (NFKC unicode normalization)
        tokenizer.normalizer = NFKC()
        # Pre-tokenization: byte-level splitting keeps every input byte
        # representable (no unknown tokens needed).
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        # Decoder must mirror the byte-level pre-tokenizer.
        tokenizer.decoder = decoders.ByteLevel()
        # Create trainer; special tokens are assigned the first vocabulary IDs.
        trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            special_tokens=self.all_special_tokens,
            show_progress=True,
            # Seed with the full byte alphabet so rare bytes are still covered.
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        # Train
        print("\nTraining tokenizer...")
        tokenizer.train(files=training_files, trainer=trainer)
        # Post-processing (byte-level, offsets left untrimmed)
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
        print("✓ Tokenizer training complete!")
        return tokenizer

    def save_tokenizer(self, tokenizer: "Tokenizer", output_dir: str):
        """Save ``tokenizer.json`` and ``tokenizer_config.json`` to *output_dir*.

        Args:
            tokenizer: A trained tokenizer returned by :meth:`build_tokenizer`.
            output_dir: Directory to create/write into.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        # Save tokenizer JSON
        tokenizer.save(str(output_path / "tokenizer.json"))
        print(f"✓ Saved tokenizer.json to {output_path}")
        # Create tokenizer config for transformers' PreTrainedTokenizerFast.
        config = {
            "add_bos_token": False,
            "add_eos_token": False,
            "add_prefix_space": False,
            "added_tokens_decoder": {
                # Look up the IDs actually assigned during training so the
                # config stays consistent with tokenizer.json (a plain
                # enumerate() would break if any token were missing/reordered).
                str(tokenizer.token_to_id(token)): {
                    "content": token,
                    "lstrip": False,
                    "normalized": False,
                    "rstrip": False,
                    "single_word": False,
                    "special": True
                }
                for token in self.all_special_tokens
            },
            "bos_token": "<|im_start|>",
            "eos_token": "<|im_end|>",
            "pad_token": "<|endoftext|>",
            "unk_token": "<|endoftext|>",
            "model_max_length": 16384,
            "tokenizer_class": "PreTrainedTokenizerFast",
            "clean_up_tokenization_spaces": False,
        }
        with open(output_path / "tokenizer_config.json", "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
        print(f"✓ Saved tokenizer_config.json")
        # Test the tokenizer
        self._test_tokenizer(tokenizer)

    def _test_tokenizer(self, tokenizer: "Tokenizer"):
        """Smoke-test the tokenizer on domain-specific example sentences."""
        print("\n" + "=" * 70)
        print("Testing Tokenizer")
        print("=" * 70)
        test_cases = [
            "Pleurotus ostreatus cultivation on supplemented sawdust substrate",
            "IC50 determination for kinase inhibitor using HPLC-MS analysis",
            "Transformer architecture with multi-head attention mechanism",
            "ROI analysis shows 340% increase in operational efficiency",
            "Let me analyze this step by step using Prologic methodology",
        ]
        for text in test_cases:
            encoding = tokenizer.encode(text)
            tokens = encoding.tokens
            ids = encoding.ids
            print(f"\nText: {text}")
            print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
            print(f"Token IDs: {ids[:10]}{'...' if len(ids) > 10 else ''}")
        # Vocabulary statistics
        vocab = tokenizer.get_vocab()
        print(f"\n✓ Total vocabulary size: {len(vocab):,}")
def main():
    """Build the scientific tokenizer, bootstrapping example data if needed.

    Looks for *.txt corpus files under ./data/tokenizer_training; if none
    exist, writes a minimal example corpus from the curated term lists so
    the build can still complete, then trains and saves the tokenizer.
    """
    print("\n🚀 Crowe Logic Mini - Scientific Tokenizer Builder\n")
    # Single builder instance reused for both corpus bootstrap and training
    # (previously two identical builders were constructed).
    builder = ScientificTokenizerBuilder(vocab_size=32000)
    # Check if we have training data
    data_dir = Path("./data/tokenizer_training")
    if not data_dir.exists() or not list(data_dir.glob("*.txt")):
        print("⚠️ No training data found!")
        print(f" Expected: {data_dir}/*.txt files")
        print("\n📋 Next steps:")
        print(" 1. Run data collection script to gather training corpus")
        print(" 2. Place text files in ./data/tokenizer_training/")
        print(" 3. Run this script again")
        print("\n For now, creating a minimal example tokenizer...")
        # Create minimal training data for demonstration
        data_dir.mkdir(parents=True, exist_ok=True)
        example_text = " ".join(builder.create_scientific_vocabulary())
        # Explicit encoding: the default platform encoding may not be UTF-8.
        with open(data_dir / "example.txt", "w", encoding="utf-8") as f:
            f.write(example_text)
        print(f"\n✓ Created example training file with {len(example_text.split())} terms")
    # Get all training files
    training_files = [str(f) for f in data_dir.glob("*.txt")]
    # Build tokenizer
    tokenizer = builder.build_tokenizer(training_files)
    # Save tokenizer
    output_dir = "./model/tokenizer_32k"
    builder.save_tokenizer(tokenizer, output_dir)
    print("\n" + "=" * 70)
    print("✅ Tokenizer build complete!")
    print("=" * 70)
    print(f"\nTokenizer saved to: {output_dir}")
    print(f"Vocabulary size: 32,000 tokens")
    print(f"Optimized for: mycology, drug discovery, AI systems, business strategy")
    print("\n📋 Next steps:")
    print(" 1. Collect 1-2B tokens of training data")
    print(" 2. Train the model from scratch with new tokenizer")
    print(" 3. Evaluate on domain-specific benchmarks")
# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()