mike1210 committed on
Commit
e9a0cd5
·
verified ·
1 Parent(s): 22bf3c3

Upload scripts/build_tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/build_tokenizer.py +413 -0
scripts/build_tokenizer.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Scientific Tokenizer Builder for Crowe Logic Mini
4
+ Builds a 32,000 token BPE tokenizer optimized for scientific domains
5
+
6
+ Domains covered:
7
+ - Mycology (mushroom cultivation, species, techniques)
8
+ - Drug Discovery (chemistry, compounds, protocols)
9
+ - AI Systems (architecture, training, deployment)
10
+ - Business Strategy (metrics, methodologies)
11
+ - General Scientific (research, analysis, statistics)
12
+ """
13
+
14
+ import os
15
+ import json
16
+ from pathlib import Path
17
+ from typing import List, Dict, Optional
18
+ from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
19
+ from tokenizers.normalizers import NFKC, Sequence
20
+ from tqdm import tqdm
21
+
22
+
23
+ class ScientificTokenizerBuilder:
24
+ """Build a domain-optimized tokenizer for scientific text"""
25
+
26
+ def __init__(
27
+ self,
28
+ vocab_size: int = 32000,
29
+ min_frequency: int = 2,
30
+ special_tokens: Optional[List[str]] = None
31
+ ):
32
+ self.vocab_size = vocab_size
33
+ self.min_frequency = min_frequency
34
+
35
+ if special_tokens is None:
36
+ self.special_tokens = [
37
+ "<|endoftext|>",
38
+ "<|im_start|>",
39
+ "<|im_end|>",
40
+ "<think>",
41
+ "</think>",
42
+ "<pad>",
43
+ "<unk>",
44
+ "<s>",
45
+ "</s>",
46
+ ]
47
+ else:
48
+ self.special_tokens = special_tokens
49
+
50
+ # Add domain-specific special tokens
51
+ self.domain_tokens = [
52
+ "<mycology>", "</mycology>",
53
+ "<drug_discovery>", "</drug_discovery>",
54
+ "<ai_systems>", "</ai_systems>",
55
+ "<prologic>", "</prologic>",
56
+ "<business>", "</business>",
57
+ "<scientific>", "</scientific>",
58
+ ]
59
+
60
+ self.all_special_tokens = self.special_tokens + self.domain_tokens
61
+
62
+ def create_scientific_vocabulary(self) -> List[str]:
63
+ """Create a comprehensive list of scientific terms to ensure they're in the vocabulary"""
64
+
65
+ vocab_lists = {
66
+ "mycology": self._mycology_terms(),
67
+ "chemistry": self._chemistry_terms(),
68
+ "biology": self._biology_terms(),
69
+ "ai_ml": self._ai_ml_terms(),
70
+ "business": self._business_terms(),
71
+ "scientific_general": self._scientific_general_terms(),
72
+ }
73
+
74
+ # Flatten all terms
75
+ all_terms = []
76
+ for category, terms in vocab_lists.items():
77
+ all_terms.extend(terms)
78
+ print(f"✓ {category}: {len(terms)} terms")
79
+
80
+ print(f"\nTotal domain-specific terms: {len(all_terms)}")
81
+ return all_terms
82
+
83
+ def _mycology_terms(self) -> List[str]:
84
+ """Mycology-specific vocabulary"""
85
+ return [
86
+ # Genus names
87
+ "Pleurotus", "Agaricus", "Lentinula", "Ganoderma", "Psilocybe",
88
+ "Hericium", "Trametes", "Cordyceps", "Morchella", "Boletus",
89
+ "Cantharellus", "Armillaria", "Inonotus", "Fomitopsis",
90
+
91
+ # Species
92
+ "ostreatus", "bisporus", "edodes", "lucidum", "cubensis",
93
+ "erinaceus", "versicolor", "militaris", "esculenta",
94
+
95
+ # Technical terms
96
+ "mycelium", "mycelia", "hyphae", "hypha", "fruiting", "primordia",
97
+ "colonization", "inoculation", "spawn", "substrate", "agar",
98
+ "lamellae", "gills", "basidiocarp", "basidiospore", "spore",
99
+ "pinning", "flush", "canopy", "veil", "annulus",
100
+
101
+ # Cultivation
102
+ "autoclave", "sterilization", "pasteurization", "HEPA",
103
+ "flowhood", "monotub", "shotgun", "casing", "incubation",
104
+ "fruiting_chamber", "humidity", "FAE", "fresh_air_exchange",
105
+ "contamination", "trichoderma", "cobweb", "bacterial",
106
+
107
+ # Substrates
108
+ "sawdust", "hardwood", "supplemented", "straw", "manure",
109
+ "coco_coir", "vermiculite", "perlite", "gypsum", "bran",
110
+ "masters_mix", "CVG", "grain_spawn", "rye", "millet",
111
+
112
+ # Measurements
113
+ "yields", "biological_efficiency", "flush_timing",
114
+ ]
115
+
116
+ def _chemistry_terms(self) -> List[str]:
117
+ """Chemistry and drug discovery terms"""
118
+ return [
119
+ # Functional groups
120
+ "hydroxyl", "carboxyl", "amino", "methyl", "ethyl", "phenyl",
121
+ "benzene", "acetyl", "carbonyl", "aldehyde", "ketone",
122
+ "ester", "ether", "amide", "thiol", "sulfide", "phosphate",
123
+
124
+ # Common compounds
125
+ "glucose", "fructose", "ATP", "NADH", "acetate", "lactate",
126
+ "pyruvate", "citrate", "succinate", "malate", "oxaloacetate",
127
+
128
+ # Drug discovery
129
+ "ligand", "receptor", "kinase", "inhibitor", "agonist", "antagonist",
130
+ "bioavailability", "pharmacokinetics", "ADME", "IC50", "EC50",
131
+ "Kd", "binding_affinity", "selectivity", "cytotoxicity",
132
+ "assay", "screening", "HTS", "hit", "lead", "optimization",
133
+
134
+ # Techniques
135
+ "HPLC", "NMR", "MS", "mass_spec", "chromatography", "electrophoresis",
136
+ "crystallography", "spectroscopy", "titration", "synthesis",
137
+ "purification", "extraction", "distillation",
138
+
139
+ # Biochemistry
140
+ "protein", "enzyme", "substrate", "catalyst", "peptide",
141
+ "nucleotide", "DNA", "RNA", "genome", "transcription",
142
+ "translation", "metabolism", "pathway", "metabolite",
143
+ ]
144
+
145
+ def _biology_terms(self) -> List[str]:
146
+ """Biology and life sciences terms"""
147
+ return [
148
+ # Cell biology
149
+ "mitochondria", "ribosome", "endoplasmic", "reticulum",
150
+ "Golgi", "lysosome", "peroxisome", "cytoplasm", "nucleus",
151
+ "membrane", "cytoskeleton", "vesicle", "organelle",
152
+
153
+ # Molecular biology
154
+ "PCR", "qPCR", "sequencing", "cloning", "plasmid", "vector",
155
+ "transfection", "transformation", "expression", "CRISPR",
156
+ "genome_editing", "mutagenesis", "recombinant",
157
+
158
+ # Physiology
159
+ "neuron", "synapse", "neurotransmitter", "hormone", "receptor",
160
+ "signaling", "cascade", "phosphorylation", "ubiquitination",
161
+
162
+ # Microbiology
163
+ "bacteria", "bacterial", "fungal", "viral", "prokaryote",
164
+ "eukaryote", "microbiome", "culture", "fermentation",
165
+ "antibiotic", "resistance", "pathogen", "strain",
166
+ ]
167
+
168
+ def _ai_ml_terms(self) -> List[str]:
169
+ """AI and machine learning terminology"""
170
+ return [
171
+ # Architectures
172
+ "transformer", "attention", "self_attention", "cross_attention",
173
+ "feedforward", "embedding", "encoder", "decoder", "BERT",
174
+ "GPT", "LLaMA", "diffusion", "GAN", "VAE", "RNN", "LSTM",
175
+ "CNN", "ResNet", "ViT", "MoE", "mixture_of_experts",
176
+
177
+ # Training
178
+ "backpropagation", "gradient", "optimizer", "Adam", "SGD",
179
+ "learning_rate", "batch_size", "epoch", "iteration", "loss",
180
+ "cross_entropy", "MSE", "regularization", "dropout", "batch_norm",
181
+ "layer_norm", "activation", "ReLU", "GELU", "SiLU", "softmax",
182
+
183
+ # Concepts
184
+ "overfitting", "underfitting", "generalization", "bias", "variance",
185
+ "precision", "recall", "F1", "accuracy", "perplexity", "BLEU",
186
+ "fine_tuning", "pretraining", "transfer_learning", "few_shot",
187
+ "zero_shot", "prompt", "inference", "latency", "throughput",
188
+
189
+ # Infrastructure
190
+ "GPU", "TPU", "CUDA", "distributed", "parallel", "quantization",
191
+ "pruning", "distillation", "ONNX", "TensorRT", "deployment",
192
+ "serving", "endpoint", "API", "REST", "gRPC",
193
+
194
+ # Frameworks
195
+ "PyTorch", "TensorFlow", "JAX", "Keras", "HuggingFace",
196
+ "Transformers", "datasets", "tokenizers", "scikit", "numpy",
197
+ "pandas", "matplotlib", "wandb", "tensorboard",
198
+ ]
199
+
200
+ def _business_terms(self) -> List[str]:
201
+ """Business and strategy terminology"""
202
+ return [
203
+ # Metrics
204
+ "ROI", "EBITDA", "CAC", "LTV", "CLTV", "churn", "retention",
205
+ "ARR", "MRR", "revenue", "profit", "margin", "gross", "net",
206
+ "CAGR", "YoY", "MoM", "QoQ", "KPI", "OKR",
207
+
208
+ # Strategy
209
+ "GTM", "go_to_market", "TAM", "SAM", "SOM", "PMF",
210
+ "product_market_fit", "MVP", "POC", "proof_of_concept",
211
+ "competitive_advantage", "moat", "differentiation",
212
+ "positioning", "segmentation", "targeting",
213
+
214
+ # Operations
215
+ "agile", "scrum", "sprint", "kanban", "OKRs", "roadmap",
216
+ "milestone", "deliverable", "stakeholder", "synergy",
217
+ "optimization", "efficiency", "scalability", "throughput",
218
+
219
+ # Methodologies
220
+ "Prologic", "intercept", "annotate", "correlate", "Six_Sigma",
221
+ "Lean", "methodology", "framework", "best_practices",
222
+ ]
223
+
224
+ def _scientific_general_terms(self) -> List[str]:
225
+ """General scientific terminology"""
226
+ return [
227
+ # Research
228
+ "hypothesis", "experiment", "control", "variable", "correlation",
229
+ "causation", "significance", "p_value", "statistical", "analysis",
230
+ "methodology", "protocol", "procedure", "reproducibility",
231
+ "validation", "verification", "peer_review", "publication",
232
+
233
+ # Statistics
234
+ "mean", "median", "mode", "standard_deviation", "variance",
235
+ "distribution", "normal", "Gaussian", "regression", "ANOVA",
236
+ "t_test", "chi_square", "confidence_interval", "bootstrap",
237
+
238
+ # Math
239
+ "logarithm", "exponential", "polynomial", "derivative", "integral",
240
+ "matrix", "vector", "tensor", "eigenvalue", "optimization",
241
+ "convex", "gradient_descent", "stochastic",
242
+
243
+ # Units
244
+ "micromolar", "nanomolar", "milligram", "microgram", "nanogram",
245
+ "celsius", "fahrenheit", "kelvin", "molarity", "pH",
246
+ ]
247
+
248
+ def build_tokenizer(self, training_files: List[str]) -> Tokenizer:
249
+ """Build the tokenizer from training files"""
250
+
251
+ print("=" * 70)
252
+ print("Building Scientific Tokenizer for Crowe Logic Mini")
253
+ print("=" * 70)
254
+ print(f"\nVocabulary size: {self.vocab_size:,}")
255
+ print(f"Special tokens: {len(self.all_special_tokens)}")
256
+ print(f"Training files: {len(training_files)}")
257
+
258
+ # Create BPE tokenizer
259
+ tokenizer = Tokenizer(models.BPE())
260
+
261
+ # Normalization (NFKC unicode normalization)
262
+ tokenizer.normalizer = NFKC()
263
+
264
+ # Pre-tokenization (split on whitespace and punctuation, but keep scientific notation together)
265
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
266
+
267
+ # Decoder
268
+ tokenizer.decoder = decoders.ByteLevel()
269
+
270
+ # Create trainer
271
+ trainer = trainers.BpeTrainer(
272
+ vocab_size=self.vocab_size,
273
+ min_frequency=self.min_frequency,
274
+ special_tokens=self.all_special_tokens,
275
+ show_progress=True,
276
+ initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
277
+ )
278
+
279
+ # Train
280
+ print("\nTraining tokenizer...")
281
+ tokenizer.train(files=training_files, trainer=trainer)
282
+
283
+ # Post-processing (add special tokens in the right format)
284
+ tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
285
+
286
+ print("✓ Tokenizer training complete!")
287
+
288
+ return tokenizer
289
+
290
+ def save_tokenizer(self, tokenizer: Tokenizer, output_dir: str):
291
+ """Save tokenizer files"""
292
+
293
+ output_path = Path(output_dir)
294
+ output_path.mkdir(parents=True, exist_ok=True)
295
+
296
+ # Save tokenizer JSON
297
+ tokenizer.save(str(output_path / "tokenizer.json"))
298
+ print(f"✓ Saved tokenizer.json to {output_path}")
299
+
300
+ # Create tokenizer config
301
+ config = {
302
+ "add_bos_token": False,
303
+ "add_eos_token": False,
304
+ "add_prefix_space": False,
305
+ "added_tokens_decoder": {
306
+ str(i): {
307
+ "content": token,
308
+ "lstrip": False,
309
+ "normalized": False,
310
+ "rstrip": False,
311
+ "single_word": False,
312
+ "special": True
313
+ }
314
+ for i, token in enumerate(self.all_special_tokens)
315
+ },
316
+ "bos_token": "<|im_start|>",
317
+ "eos_token": "<|im_end|>",
318
+ "pad_token": "<|endoftext|>",
319
+ "unk_token": "<|endoftext|>",
320
+ "model_max_length": 16384,
321
+ "tokenizer_class": "PreTrainedTokenizerFast",
322
+ "clean_up_tokenization_spaces": False,
323
+ }
324
+
325
+ with open(output_path / "tokenizer_config.json", "w") as f:
326
+ json.dump(config, f, indent=2)
327
+ print(f"✓ Saved tokenizer_config.json")
328
+
329
+ # Test the tokenizer
330
+ self._test_tokenizer(tokenizer)
331
+
332
+ def _test_tokenizer(self, tokenizer: Tokenizer):
333
+ """Test tokenizer on domain-specific examples"""
334
+
335
+ print("\n" + "=" * 70)
336
+ print("Testing Tokenizer")
337
+ print("=" * 70)
338
+
339
+ test_cases = [
340
+ "Pleurotus ostreatus cultivation on supplemented sawdust substrate",
341
+ "IC50 determination for kinase inhibitor using HPLC-MS analysis",
342
+ "Transformer architecture with multi-head attention mechanism",
343
+ "ROI analysis shows 340% increase in operational efficiency",
344
+ "<think>Let me analyze this step by step using Prologic methodology</think>",
345
+ ]
346
+
347
+ for text in test_cases:
348
+ encoding = tokenizer.encode(text)
349
+ tokens = encoding.tokens
350
+ ids = encoding.ids
351
+
352
+ print(f"\nText: {text}")
353
+ print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
354
+ print(f"Token IDs: {ids[:10]}{'...' if len(ids) > 10 else ''}")
355
+
356
+ # Vocabulary statistics
357
+ vocab = tokenizer.get_vocab()
358
+ print(f"\n✓ Total vocabulary size: {len(vocab):,}")
359
+
360
+
361
def main():
    """Main function to build tokenizer.

    Uses ``./data/tokenizer_training/*.txt`` as the training corpus; when
    no files exist, writes a minimal example corpus built from the curated
    domain vocabulary so the pipeline still runs end to end.
    """

    print("\n🚀 Crowe Logic Mini - Scientific Tokenizer Builder\n")

    # One builder instance reused throughout (the original constructed
    # two identical builders when no training data was present).
    builder = ScientificTokenizerBuilder(vocab_size=32000)

    # Check if we have training data
    data_dir = Path("./data/tokenizer_training")

    if not data_dir.exists() or not list(data_dir.glob("*.txt")):
        print("⚠️ No training data found!")
        print(f" Expected: {data_dir}/*.txt files")
        print("\n📋 Next steps:")
        print(" 1. Run data collection script to gather training corpus")
        print(" 2. Place text files in ./data/tokenizer_training/")
        print(" 3. Run this script again")
        print("\n For now, creating a minimal example tokenizer...")

        # Create minimal training data for demonstration
        data_dir.mkdir(parents=True, exist_ok=True)

        example_text = " ".join(builder.create_scientific_vocabulary())

        # Explicit UTF-8 so the example corpus round-trips on any locale.
        with open(data_dir / "example.txt", "w", encoding="utf-8") as f:
            f.write(example_text)

        print(f"\n✓ Created example training file with {len(example_text.split())} terms")

    # Get all training files
    training_files = [str(f) for f in data_dir.glob("*.txt")]

    # Build tokenizer
    tokenizer = builder.build_tokenizer(training_files)

    # Save tokenizer
    output_dir = "./model/tokenizer_32k"
    builder.save_tokenizer(tokenizer, output_dir)

    print("\n" + "=" * 70)
    print("✅ Tokenizer build complete!")
    print("=" * 70)
    print(f"\nTokenizer saved to: {output_dir}")
    # Report the configured size rather than a hard-coded literal.
    print(f"Vocabulary size: {builder.vocab_size:,} tokens")
    print("Optimized for: mycology, drug discovery, AI systems, business strategy")
    print("\n📋 Next steps:")
    print(" 1. Collect 1-2B tokens of training data")
    print(" 2. Train the model from scratch with new tokenizer")
    print(" 3. Evaluate on domain-specific benchmarks")
410
+
411
+
412
# Script entry point: build and save the tokenizer when run directly.
if __name__ == "__main__":
    main()