#!/usr/bin/env python3
"""
Scientific Tokenizer Builder for Crowe Logic Mini
Builds a 32,000-token BPE tokenizer optimized for scientific domains

Domains covered:
- Mycology (mushroom cultivation, species, techniques)
- Drug Discovery (chemistry, compounds, protocols)
- AI Systems (architecture, training, deployment)
- Business Strategy (metrics, methodologies)
- General Scientific (research, analysis, statistics)
"""

import json
from pathlib import Path
from typing import List, Optional
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
from tokenizers.normalizers import NFKC


class ScientificTokenizerBuilder:
    """Build a domain-optimized tokenizer for scientific text"""

    def __init__(
        self,
        vocab_size: int = 32000,
        min_frequency: int = 2,
        special_tokens: Optional[List[str]] = None
    ):
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency

        if special_tokens is None:
            self.special_tokens = [
                "<|endoftext|>",
                "<|im_start|>",
                "<|im_end|>",
                "<think>",
                "</think>",
                "<pad>",
                "<unk>",
                "<s>",
                "</s>",
            ]
        else:
            self.special_tokens = special_tokens

        # Add domain-specific special tokens
        self.domain_tokens = [
            "<mycology>", "</mycology>",
            "<drug_discovery>", "</drug_discovery>",
            "<ai_systems>", "</ai_systems>",
            "<prologic>", "</prologic>",
            "<business>", "</business>",
            "<scientific>", "</scientific>",
        ]

        self.all_special_tokens = self.special_tokens + self.domain_tokens
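
        # BpeTrainer places special tokens at the start of the vocabulary in
        # the order listed here, so they receive IDs 0..N-1; save_tokenizer()
        # relies on that ordering when writing added_tokens_decoder.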

    def create_scientific_vocabulary(self) -> List[str]:
        """Create a comprehensive list of scientific terms to ensure they're in the vocabulary"""

        vocab_lists = {
            "mycology": self._mycology_terms(),
            "chemistry": self._chemistry_terms(),
            "biology": self._biology_terms(),
            "ai_ml": self._ai_ml_terms(),
            "business": self._business_terms(),
            "scientific_general": self._scientific_general_terms(),
        }

        # Flatten all terms
        all_terms = []
        for category, terms in vocab_lists.items():
            all_terms.extend(terms)
            print(f"โœ“ {category}: {len(terms)} terms")

        print(f"\nTotal domain-specific terms: {len(all_terms)}")
        return all_terms

    def _mycology_terms(self) -> List[str]:
        """Mycology-specific vocabulary"""
        return [
            # Genus names
            "Pleurotus", "Agaricus", "Lentinula", "Ganoderma", "Psilocybe",
            "Hericium", "Trametes", "Cordyceps", "Morchella", "Boletus",
            "Cantharellus", "Armillaria", "Inonotus", "Fomitopsis",

            # Species
            "ostreatus", "bisporus", "edodes", "lucidum", "cubensis",
            "erinaceus", "versicolor", "militaris", "esculenta",

            # Technical terms
            "mycelium", "mycelia", "hyphae", "hypha", "fruiting", "primordia",
            "colonization", "inoculation", "spawn", "substrate", "agar",
            "lamellae", "gills", "basidiocarp", "basidiospore", "spore",
            "pinning", "flush", "canopy", "veil", "annulus",

            # Cultivation
            "autoclave", "sterilization", "pasteurization", "HEPA",
            "flowhood", "monotub", "shotgun", "casing", "incubation",
            "fruiting_chamber", "humidity", "FAE", "fresh_air_exchange",
            "contamination", "trichoderma", "cobweb", "bacterial",

            # Substrates
            "sawdust", "hardwood", "supplemented", "straw", "manure",
            "coco_coir", "vermiculite", "perlite", "gypsum", "bran",
            "masters_mix", "CVG", "grain_spawn", "rye", "millet",

            # Measurements
            "yields", "biological_efficiency", "flush_timing",
        ]

    def _chemistry_terms(self) -> List[str]:
        """Chemistry and drug discovery terms"""
        return [
            # Functional groups
            "hydroxyl", "carboxyl", "amino", "methyl", "ethyl", "phenyl",
            "benzene", "acetyl", "carbonyl", "aldehyde", "ketone",
            "ester", "ether", "amide", "thiol", "sulfide", "phosphate",

            # Common compounds
            "glucose", "fructose", "ATP", "NADH", "acetate", "lactate",
            "pyruvate", "citrate", "succinate", "malate", "oxaloacetate",

            # Drug discovery
            "ligand", "receptor", "kinase", "inhibitor", "agonist", "antagonist",
            "bioavailability", "pharmacokinetics", "ADME", "IC50", "EC50",
            "Kd", "binding_affinity", "selectivity", "cytotoxicity",
            "assay", "screening", "HTS", "hit", "lead", "optimization",

            # Techniques
            "HPLC", "NMR", "MS", "mass_spec", "chromatography", "electrophoresis",
            "crystallography", "spectroscopy", "titration", "synthesis",
            "purification", "extraction", "distillation",

            # Biochemistry
            "protein", "enzyme", "substrate", "catalyst", "peptide",
            "nucleotide", "DNA", "RNA", "genome", "transcription",
            "translation", "metabolism", "pathway", "metabolite",
        ]

    def _biology_terms(self) -> List[str]:
        """Biology and life sciences terms"""
        return [
            # Cell biology
            "mitochondria", "ribosome", "endoplasmic", "reticulum",
            "Golgi", "lysosome", "peroxisome", "cytoplasm", "nucleus",
            "membrane", "cytoskeleton", "vesicle", "organelle",

            # Molecular biology
            "PCR", "qPCR", "sequencing", "cloning", "plasmid", "vector",
            "transfection", "transformation", "expression", "CRISPR",
            "genome_editing", "mutagenesis", "recombinant",

            # Physiology
            "neuron", "synapse", "neurotransmitter", "hormone", "receptor",
            "signaling", "cascade", "phosphorylation", "ubiquitination",

            # Microbiology
            "bacteria", "bacterial", "fungal", "viral", "prokaryote",
            "eukaryote", "microbiome", "culture", "fermentation",
            "antibiotic", "resistance", "pathogen", "strain",
        ]

    def _ai_ml_terms(self) -> List[str]:
        """AI and machine learning terminology"""
        return [
            # Architectures
            "transformer", "attention", "self_attention", "cross_attention",
            "feedforward", "embedding", "encoder", "decoder", "BERT",
            "GPT", "LLaMA", "diffusion", "GAN", "VAE", "RNN", "LSTM",
            "CNN", "ResNet", "ViT", "MoE", "mixture_of_experts",

            # Training
            "backpropagation", "gradient", "optimizer", "Adam", "SGD",
            "learning_rate", "batch_size", "epoch", "iteration", "loss",
            "cross_entropy", "MSE", "regularization", "dropout", "batch_norm",
            "layer_norm", "activation", "ReLU", "GELU", "SiLU", "softmax",

            # Concepts
            "overfitting", "underfitting", "generalization", "bias", "variance",
            "precision", "recall", "F1", "accuracy", "perplexity", "BLEU",
            "fine_tuning", "pretraining", "transfer_learning", "few_shot",
            "zero_shot", "prompt", "inference", "latency", "throughput",

            # Infrastructure
            "GPU", "TPU", "CUDA", "distributed", "parallel", "quantization",
            "pruning", "distillation", "ONNX", "TensorRT", "deployment",
            "serving", "endpoint", "API", "REST", "gRPC",

            # Frameworks
            "PyTorch", "TensorFlow", "JAX", "Keras", "HuggingFace",
            "Transformers", "datasets", "tokenizers", "scikit", "numpy",
            "pandas", "matplotlib", "wandb", "tensorboard",
        ]

    def _business_terms(self) -> List[str]:
        """Business and strategy terminology"""
        return [
            # Metrics
            "ROI", "EBITDA", "CAC", "LTV", "CLTV", "churn", "retention",
            "ARR", "MRR", "revenue", "profit", "margin", "gross", "net",
            "CAGR", "YoY", "MoM", "QoQ", "KPI", "OKR",

            # Strategy
            "GTM", "go_to_market", "TAM", "SAM", "SOM", "PMF",
            "product_market_fit", "MVP", "POC", "proof_of_concept",
            "competitive_advantage", "moat", "differentiation",
            "positioning", "segmentation", "targeting",

            # Operations
            "agile", "scrum", "sprint", "kanban", "OKRs", "roadmap",
            "milestone", "deliverable", "stakeholder", "synergy",
            "optimization", "efficiency", "scalability", "throughput",

            # Methodologies
            "Prologic", "intercept", "annotate", "correlate", "Six_Sigma",
            "Lean", "methodology", "framework", "best_practices",
        ]

    def _scientific_general_terms(self) -> List[str]:
        """General scientific terminology"""
        return [
            # Research
            "hypothesis", "experiment", "control", "variable", "correlation",
            "causation", "significance", "p_value", "statistical", "analysis",
            "methodology", "protocol", "procedure", "reproducibility",
            "validation", "verification", "peer_review", "publication",

            # Statistics
            "mean", "median", "mode", "standard_deviation", "variance",
            "distribution", "normal", "Gaussian", "regression", "ANOVA",
            "t_test", "chi_square", "confidence_interval", "bootstrap",

            # Math
            "logarithm", "exponential", "polynomial", "derivative", "integral",
            "matrix", "vector", "tensor", "eigenvalue", "optimization",
            "convex", "gradient_descent", "stochastic",

            # Units
            "micromolar", "nanomolar", "milligram", "microgram", "nanogram",
            "celsius", "fahrenheit", "kelvin", "molarity", "pH",
        ]

    def build_tokenizer(self, training_files: List[str]) -> Tokenizer:
        """Build the tokenizer from training files"""

        print("=" * 70)
        print("Building Scientific Tokenizer for Crowe Logic Mini")
        print("=" * 70)
        print(f"\nVocabulary size: {self.vocab_size:,}")
        print(f"Special tokens: {len(self.all_special_tokens)}")
        print(f"Training files: {len(training_files)}")

        # Create BPE tokenizer
        tokenizer = Tokenizer(models.BPE())

        # Normalization (NFKC unicode normalization)
        tokenizer.normalizer = NFKC()

        # Byte-level pre-tokenization (GPT-2 style): every input byte maps to a
        # printable symbol, so no out-of-vocabulary fallback is ever needed
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

        # Decoder
        tokenizer.decoder = decoders.ByteLevel()

        # Create trainer; seeding the initial alphabet with all 256 byte-level
        # symbols guarantees full byte coverage from the start
        trainer = trainers.BpeTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            special_tokens=self.all_special_tokens,
            show_progress=True,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )

        # Train
        print("\nTraining tokenizer...")
        tokenizer.train(files=training_files, trainer=trainer)

        # Post-processing: keep byte-level offsets consistent (this does not
        # insert special tokens; chat formatting is applied downstream)
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

        print("โœ“ Tokenizer training complete!")

        return tokenizer

    def save_tokenizer(self, tokenizer: Tokenizer, output_dir: str):
        """Save tokenizer files"""

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save tokenizer JSON
        tokenizer.save(str(output_path / "tokenizer.json"))
        print(f"โœ“ Saved tokenizer.json to {output_path}")

        # Create tokenizer config
        config = {
            "add_bos_token": False,
            "add_eos_token": False,
            "add_prefix_space": False,
            "added_tokens_decoder": {
                str(i): {
                    "content": token,
                    "lstrip": False,
                    "normalized": False,
                    "rstrip": False,
                    "single_word": False,
                    "special": True
                }
                for i, token in enumerate(self.all_special_tokens)
            },
            "bos_token": "<|im_start|>",
            "eos_token": "<|im_end|>",
            "pad_token": "<|endoftext|>",
            "unk_token": "<|endoftext|>",
            "model_max_length": 16384,
            "tokenizer_class": "PreTrainedTokenizerFast",
            "clean_up_tokenization_spaces": False,
        }

        with open(output_path / "tokenizer_config.json", "w") as f:
            json.dump(config, f, indent=2)
        print(f"โœ“ Saved tokenizer_config.json")

        # Test the tokenizer
        self._test_tokenizer(tokenizer)

    def _test_tokenizer(self, tokenizer: Tokenizer):
        """Test tokenizer on domain-specific examples"""

        print("\n" + "=" * 70)
        print("Testing Tokenizer")
        print("=" * 70)

        test_cases = [
            "Pleurotus ostreatus cultivation on supplemented sawdust substrate",
            "IC50 determination for kinase inhibitor using HPLC-MS analysis",
            "Transformer architecture with multi-head attention mechanism",
            "ROI analysis shows 340% increase in operational efficiency",
            "<think>Let me analyze this step by step using Prologic methodology</think>",
        ]

        for text in test_cases:
            encoding = tokenizer.encode(text)
            tokens = encoding.tokens
            ids = encoding.ids

            print(f"\nText: {text}")
            print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
            print(f"Token IDs: {ids[:10]}{'...' if len(ids) > 10 else ''}")

        # Vocabulary statistics
        vocab = tokenizer.get_vocab()
        print(f"\nโœ“ Total vocabulary size: {len(vocab):,}")


def main():
    """Main function to build tokenizer"""

    print("\n๐Ÿš€ Crowe Logic Mini - Scientific Tokenizer Builder\n")

    # Check if we have training data
    data_dir = Path("./data/tokenizer_training")

    if not data_dir.exists() or not list(data_dir.glob("*.txt")):
        print("โš ๏ธ  No training data found!")
        print(f"   Expected: {data_dir}/*.txt files")
        print("\n๐Ÿ“‹ Next steps:")
        print("   1. Run data collection script to gather training corpus")
        print("   2. Place text files in ./data/tokenizer_training/")
        print("   3. Run this script again")
        print("\n   For now, creating a minimal example tokenizer...")

        # Create minimal training data for demonstration
        data_dir.mkdir(parents=True, exist_ok=True)

        builder = ScientificTokenizerBuilder(vocab_size=32000)
        example_text = " ".join(builder.create_scientific_vocabulary())

        with open(data_dir / "example.txt", "w") as f:
            f.write(example_text)

        print(f"\nโœ“ Created example training file with {len(example_text.split())} terms")

    # Get all training files
    training_files = [str(f) for f in data_dir.glob("*.txt")]

    # Build tokenizer
    builder = ScientificTokenizerBuilder(vocab_size=32000)
    tokenizer = builder.build_tokenizer(training_files)

    # Save tokenizer
    output_dir = "./model/tokenizer_32k"
    builder.save_tokenizer(tokenizer, output_dir)

    print("\n" + "=" * 70)
    print("โœ… Tokenizer build complete!")
    print("=" * 70)
    print(f"\nTokenizer saved to: {output_dir}")
    print(f"Vocabulary size: 32,000 tokens")
    print(f"Optimized for: mycology, drug discovery, AI systems, business strategy")
    print("\n๐Ÿ“‹ Next steps:")
    print("   1. Collect 1-2B tokens of training data")
    print("   2. Train the model from scratch with new tokenizer")
    print("   3. Evaluate on domain-specific benchmarks")


if __name__ == "__main__":
    main()
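

# ---------------------------------------------------------------------------
# Usage sketch (assumes the `transformers` package is installed): the
# tokenizer_config.json written above sets tokenizer_class to
# PreTrainedTokenizerFast, so the saved directory can be loaded directly:
#
#   from transformers import PreTrainedTokenizerFast
#   tok = PreTrainedTokenizerFast.from_pretrained("./model/tokenizer_32k")
#   print(tok.tokenize("Pleurotus ostreatus fruiting on supplemented sawdust"))
# ---------------------------------------------------------------------------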