"""
Scientific Tokenizer Builder for Crowe Logic Mini
Builds a 32,000 token BPE tokenizer optimized for scientific domains

Domains covered:
- Mycology (mushroom cultivation, species, techniques)
- Drug Discovery (chemistry, compounds, protocols)
- AI Systems (architecture, training, deployment)
- Business Strategy (metrics, methodologies)
- General Scientific (research, analysis, statistics)
"""
|
|
| import os |
| import json |
| from pathlib import Path |
| from typing import List, Dict, Optional |
| from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors |
| from tokenizers.normalizers import NFKC, Sequence |
| from tqdm import tqdm |
|
|
|
|
class ScientificTokenizerBuilder:
    """Build a domain-optimized byte-level BPE tokenizer for scientific text.

    Wraps the HuggingFace ``tokenizers`` library: trains a BPE model whose
    vocabulary is seeded with ChatML-style control tokens and paired domain
    marker tokens, then saves ``tokenizer.json`` plus a
    ``tokenizer_config.json`` compatible with ``PreTrainedTokenizerFast``.
    """

    def __init__(
        self,
        vocab_size: int = 32000,
        min_frequency: int = 2,
        special_tokens: Optional[List[str]] = None
    ) -> None:
        """Configure the builder.

        Args:
            vocab_size: Target total vocabulary size (includes special tokens).
            min_frequency: Minimum pair frequency required for a BPE merge.
            special_tokens: Optional override of the default control-token
                set; when ``None`` a ChatML-style default list is used.
        """
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency

        # Default control tokens: ChatML chat markers, <think> reasoning
        # delimiters, and classic pad/unk/BOS/EOS markers.
        if special_tokens is None:
            self.special_tokens = [
                "<|endoftext|>",
                "<|im_start|>",
                "<|im_end|>",
                "<think>",
                "</think>",
                "<pad>",
                "<unk>",
                "<s>",
                "</s>",
            ]
        else:
            self.special_tokens = special_tokens

        # Paired open/close markers used to tag training text by domain.
        self.domain_tokens = [
            "<mycology>", "</mycology>",
            "<drug_discovery>", "</drug_discovery>",
            "<ai_systems>", "</ai_systems>",
            "<prologic>", "</prologic>",
            "<business>", "</business>",
            "<scientific>", "</scientific>",
        ]

        # Combined list handed to the trainer; list order determines the
        # token ids assigned to special tokens (control tokens first).
        self.all_special_tokens = self.special_tokens + self.domain_tokens

    def create_scientific_vocabulary(self) -> List[str]:
        """Create a comprehensive list of scientific terms to ensure they're in the vocabulary"""

        # One curated term list per domain category.
        vocab_lists = {
            "mycology": self._mycology_terms(),
            "chemistry": self._chemistry_terms(),
            "biology": self._biology_terms(),
            "ai_ml": self._ai_ml_terms(),
            "business": self._business_terms(),
            "scientific_general": self._scientific_general_terms(),
        }

        # Flatten all categories into one list, reporting per-category counts.
        all_terms = []
        for category, terms in vocab_lists.items():
            all_terms.extend(terms)
            print(f"โ {category}: {len(terms)} terms")

        print(f"\nTotal domain-specific terms: {len(all_terms)}")
        return all_terms

    def _mycology_terms(self) -> List[str]:
        """Mycology-specific vocabulary"""
        return [
            # Genera
            "Pleurotus", "Agaricus", "Lentinula", "Ganoderma", "Psilocybe",
            "Hericium", "Trametes", "Cordyceps", "Morchella", "Boletus",
            "Cantharellus", "Armillaria", "Inonotus", "Fomitopsis",

            # Species epithets
            "ostreatus", "bisporus", "edodes", "lucidum", "cubensis",
            "erinaceus", "versicolor", "militaris", "esculenta",

            # Anatomy and life cycle
            "mycelium", "mycelia", "hyphae", "hypha", "fruiting", "primordia",
            "colonization", "inoculation", "spawn", "substrate", "agar",
            "lamellae", "gills", "basidiocarp", "basidiospore", "spore",
            "pinning", "flush", "canopy", "veil", "annulus",

            # Cultivation techniques and equipment
            "autoclave", "sterilization", "pasteurization", "HEPA",
            "flowhood", "monotub", "shotgun", "casing", "incubation",
            "fruiting_chamber", "humidity", "FAE", "fresh_air_exchange",
            "contamination", "trichoderma", "cobweb", "bacterial",

            # Substrates and amendments
            "sawdust", "hardwood", "supplemented", "straw", "manure",
            "coco_coir", "vermiculite", "perlite", "gypsum", "bran",
            "masters_mix", "CVG", "grain_spawn", "rye", "millet",

            # Production metrics
            "yields", "biological_efficiency", "flush_timing",
        ]

    def _chemistry_terms(self) -> List[str]:
        """Chemistry and drug discovery terms"""
        return [
            # Functional groups
            "hydroxyl", "carboxyl", "amino", "methyl", "ethyl", "phenyl",
            "benzene", "acetyl", "carbonyl", "aldehyde", "ketone",
            "ester", "ether", "amide", "thiol", "sulfide", "phosphate",

            # Metabolites and cofactors
            "glucose", "fructose", "ATP", "NADH", "acetate", "lactate",
            "pyruvate", "citrate", "succinate", "malate", "oxaloacetate",

            # Drug discovery and pharmacology
            "ligand", "receptor", "kinase", "inhibitor", "agonist", "antagonist",
            "bioavailability", "pharmacokinetics", "ADME", "IC50", "EC50",
            "Kd", "binding_affinity", "selectivity", "cytotoxicity",
            "assay", "screening", "HTS", "hit", "lead", "optimization",

            # Analytical and preparative techniques
            "HPLC", "NMR", "MS", "mass_spec", "chromatography", "electrophoresis",
            "crystallography", "spectroscopy", "titration", "synthesis",
            "purification", "extraction", "distillation",

            # Biochemistry
            "protein", "enzyme", "substrate", "catalyst", "peptide",
            "nucleotide", "DNA", "RNA", "genome", "transcription",
            "translation", "metabolism", "pathway", "metabolite",
        ]

    def _biology_terms(self) -> List[str]:
        """Biology and life sciences terms"""
        return [
            # Cell biology / organelles
            "mitochondria", "ribosome", "endoplasmic", "reticulum",
            "Golgi", "lysosome", "peroxisome", "cytoplasm", "nucleus",
            "membrane", "cytoskeleton", "vesicle", "organelle",

            # Molecular biology techniques
            "PCR", "qPCR", "sequencing", "cloning", "plasmid", "vector",
            "transfection", "transformation", "expression", "CRISPR",
            "genome_editing", "mutagenesis", "recombinant",

            # Physiology and signaling
            "neuron", "synapse", "neurotransmitter", "hormone", "receptor",
            "signaling", "cascade", "phosphorylation", "ubiquitination",

            # Microbiology
            "bacteria", "bacterial", "fungal", "viral", "prokaryote",
            "eukaryote", "microbiome", "culture", "fermentation",
            "antibiotic", "resistance", "pathogen", "strain",
        ]

    def _ai_ml_terms(self) -> List[str]:
        """AI and machine learning terminology"""
        return [
            # Architectures
            "transformer", "attention", "self_attention", "cross_attention",
            "feedforward", "embedding", "encoder", "decoder", "BERT",
            "GPT", "LLaMA", "diffusion", "GAN", "VAE", "RNN", "LSTM",
            "CNN", "ResNet", "ViT", "MoE", "mixture_of_experts",

            # Training
            "backpropagation", "gradient", "optimizer", "Adam", "SGD",
            "learning_rate", "batch_size", "epoch", "iteration", "loss",
            "cross_entropy", "MSE", "regularization", "dropout", "batch_norm",
            "layer_norm", "activation", "ReLU", "GELU", "SiLU", "softmax",

            # Evaluation and concepts
            "overfitting", "underfitting", "generalization", "bias", "variance",
            "precision", "recall", "F1", "accuracy", "perplexity", "BLEU",
            "fine_tuning", "pretraining", "transfer_learning", "few_shot",
            "zero_shot", "prompt", "inference", "latency", "throughput",

            # Infrastructure and deployment
            "GPU", "TPU", "CUDA", "distributed", "parallel", "quantization",
            "pruning", "distillation", "ONNX", "TensorRT", "deployment",
            "serving", "endpoint", "API", "REST", "gRPC",

            # Frameworks and tooling
            "PyTorch", "TensorFlow", "JAX", "Keras", "HuggingFace",
            "Transformers", "datasets", "tokenizers", "scikit", "numpy",
            "pandas", "matplotlib", "wandb", "tensorboard",
        ]

    def _business_terms(self) -> List[str]:
        """Business and strategy terminology"""
        return [
            # Financial / growth metrics
            "ROI", "EBITDA", "CAC", "LTV", "CLTV", "churn", "retention",
            "ARR", "MRR", "revenue", "profit", "margin", "gross", "net",
            "CAGR", "YoY", "MoM", "QoQ", "KPI", "OKR",

            # Strategy
            "GTM", "go_to_market", "TAM", "SAM", "SOM", "PMF",
            "product_market_fit", "MVP", "POC", "proof_of_concept",
            "competitive_advantage", "moat", "differentiation",
            "positioning", "segmentation", "targeting",

            # Operations / project management
            "agile", "scrum", "sprint", "kanban", "OKRs", "roadmap",
            "milestone", "deliverable", "stakeholder", "synergy",
            "optimization", "efficiency", "scalability", "throughput",

            # Prologic methodology and frameworks
            "Prologic", "intercept", "annotate", "correlate", "Six_Sigma",
            "Lean", "methodology", "framework", "best_practices",
        ]

    def _scientific_general_terms(self) -> List[str]:
        """General scientific terminology"""
        return [
            # Scientific method
            "hypothesis", "experiment", "control", "variable", "correlation",
            "causation", "significance", "p_value", "statistical", "analysis",
            "methodology", "protocol", "procedure", "reproducibility",
            "validation", "verification", "peer_review", "publication",

            # Statistics
            "mean", "median", "mode", "standard_deviation", "variance",
            "distribution", "normal", "Gaussian", "regression", "ANOVA",
            "t_test", "chi_square", "confidence_interval", "bootstrap",

            # Mathematics
            "logarithm", "exponential", "polynomial", "derivative", "integral",
            "matrix", "vector", "tensor", "eigenvalue", "optimization",
            "convex", "gradient_descent", "stochastic",

            # Units and measurement
            "micromolar", "nanomolar", "milligram", "microgram", "nanogram",
            "celsius", "fahrenheit", "kelvin", "molarity", "pH",
        ]

    def build_tokenizer(self, training_files: List[str]) -> Tokenizer:
        """Build the tokenizer from training files"""

        print("=" * 70)
        print("Building Scientific Tokenizer for Crowe Logic Mini")
        print("=" * 70)
        print(f"\nVocabulary size: {self.vocab_size:,}")
        print(f"Special tokens: {len(self.all_special_tokens)}")
        print(f"Training files: {len(training_files)}")

        # Byte-level BPE model (GPT-2 style).
        tokenizer = Tokenizer(models.BPE())

        # NFKC normalization folds compatibility characters into canonical
        # forms before pre-tokenization.
        tokenizer.normalizer = NFKC()

        # Byte-level pre-tokenization: every input byte is representable, so
        # unknown characters cannot occur at encode time.
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

        # Matching byte-level decoder so decode(encode(x)) round-trips.
        tokenizer.decoder = decoders.ByteLevel()

        # Seed the trainer with the full byte alphabet plus all special
        # tokens; special tokens are inserted first, in list order.
        trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            special_tokens=self.all_special_tokens,
            show_progress=True,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )

        print("\nTraining tokenizer...")
        tokenizer.train(files=training_files, trainer=trainer)

        # trim_offsets=False keeps character offsets aligned with raw bytes.
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

        print("โ Tokenizer training complete!")

        return tokenizer

    def save_tokenizer(self, tokenizer: Tokenizer, output_dir: str) -> None:
        """Save tokenizer files"""

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Serialized fast-tokenizer model consumed by `tokenizers`/HF.
        tokenizer.save(str(output_path / "tokenizer.json"))
        print(f"โ Saved tokenizer.json to {output_path}")

        # Config for transformers' PreTrainedTokenizerFast.
        # NOTE(review): assumes the trainer assigned ids 0..N-1 to
        # all_special_tokens in order (str(i) keys) — confirm against the
        # saved vocab. Also pad/unk map to <|endoftext|> even though <pad>
        # and <unk> exist in special_tokens — confirm that is intended.
        config = {
            "add_bos_token": False,
            "add_eos_token": False,
            "add_prefix_space": False,
            "added_tokens_decoder": {
                str(i): {
                    "content": token,
                    "lstrip": False,
                    "normalized": False,
                    "rstrip": False,
                    "single_word": False,
                    "special": True
                }
                for i, token in enumerate(self.all_special_tokens)
            },
            "bos_token": "<|im_start|>",
            "eos_token": "<|im_end|>",
            "pad_token": "<|endoftext|>",
            "unk_token": "<|endoftext|>",
            "model_max_length": 16384,
            "tokenizer_class": "PreTrainedTokenizerFast",
            "clean_up_tokenization_spaces": False,
        }

        with open(output_path / "tokenizer_config.json", "w") as f:
            json.dump(config, f, indent=2)
        print(f"โ Saved tokenizer_config.json")

        # Smoke-test the freshly saved tokenizer on domain examples.
        self._test_tokenizer(tokenizer)

    def _test_tokenizer(self, tokenizer: Tokenizer) -> None:
        """Test tokenizer on domain-specific examples"""

        print("\n" + "=" * 70)
        print("Testing Tokenizer")
        print("=" * 70)

        # One representative sentence per target domain, plus a <think> span.
        test_cases = [
            "Pleurotus ostreatus cultivation on supplemented sawdust substrate",
            "IC50 determination for kinase inhibitor using HPLC-MS analysis",
            "Transformer architecture with multi-head attention mechanism",
            "ROI analysis shows 340% increase in operational efficiency",
            "<think>Let me analyze this step by step using Prologic methodology</think>",
        ]

        for text in test_cases:
            encoding = tokenizer.encode(text)
            tokens = encoding.tokens
            ids = encoding.ids

            # Show at most the first 10 tokens/ids per example.
            print(f"\nText: {text}")
            print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
            print(f"Token IDs: {ids[:10]}{'...' if len(ids) > 10 else ''}")

        vocab = tokenizer.get_vocab()
        print(f"\nโ Total vocabulary size: {len(vocab):,}")
|
|
|
|
def main():
    """Build the 32k scientific tokenizer and save it to ./model/tokenizer_32k.

    If no ``*.txt`` corpus is found under ``./data/tokenizer_training``, a
    minimal example corpus is bootstrapped from the curated term lists so
    the script remains runnable end-to-end.
    """

    print("\n๐ Crowe Logic Mini - Scientific Tokenizer Builder\n")

    data_dir = Path("./data/tokenizer_training")

    # Single builder instance reused for corpus bootstrap, training, and
    # saving (previously constructed twice).
    builder = ScientificTokenizerBuilder(vocab_size=32000)

    if not data_dir.exists() or not list(data_dir.glob("*.txt")):
        print("โ ๏ธ No training data found!")
        print(f" Expected: {data_dir}/*.txt files")
        print("\n๐ Next steps:")
        print(" 1. Run data collection script to gather training corpus")
        print(" 2. Place text files in ./data/tokenizer_training/")
        print(" 3. Run this script again")
        print("\n For now, creating a minimal example tokenizer...")

        data_dir.mkdir(parents=True, exist_ok=True)

        # Bootstrap: space-join the curated domain vocabulary as a tiny corpus.
        example_text = " ".join(builder.create_scientific_vocabulary())

        # utf-8 explicitly: terms/corpus may contain non-ASCII characters
        # and the platform default encoding is not guaranteed.
        with open(data_dir / "example.txt", "w", encoding="utf-8") as f:
            f.write(example_text)

        print(f"\nโ Created example training file with {len(example_text.split())} terms")

    training_files = [str(f) for f in data_dir.glob("*.txt")]

    tokenizer = builder.build_tokenizer(training_files)

    output_dir = "./model/tokenizer_32k"
    builder.save_tokenizer(tokenizer, output_dir)

    print("\n" + "=" * 70)
    # Fixed: this literal was previously split across two physical lines
    # (a SyntaxError in a single-quoted string).
    print("✅ Tokenizer build complete!")
    print("=" * 70)
    print(f"\nTokenizer saved to: {output_dir}")
    print(f"Vocabulary size: 32,000 tokens")
    print(f"Optimized for: mycology, drug discovery, AI systems, business strategy")
    print("\n๐ Next steps:")
    print(" 1. Collect 1-2B tokens of training data")
    print(" 2. Train the model from scratch with new tokenizer")
    print(" 3. Evaluate on domain-specific benchmarks")
|
|
|
# Script entry point: build and save the tokenizer when run directly.
if __name__ == "__main__":
    main()
|
|