lemms committed on
Commit
ef6446c
·
verified ·
1 Parent(s): 4152f38

Upload folder using huggingface_hub

core/src/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Core source package for OpenLLM
2
+ # This file makes the core/src directory a Python package
3
+
4
+ """
5
+ OpenLLM Core Source Package
6
+
7
+ This package contains the core implementation of the OpenLLM language model,
8
+ including model architecture, training, inference, and data processing components.
9
+
10
+ Author: Louis Chua Bean Chong
11
+ License: GPLv3
12
+ """
13
+
14
+ __version__ = "1.0.0"
15
+ __author__ = "Louis Chua Bean Chong"
16
+ __license__ = "GPLv3"
core/src/data_loader.py ADDED
@@ -0,0 +1,493 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ Training Data Loader for Language Model Training
14
+
15
+ This module provides efficient data loading and batching for training GPT-style
16
+ language models. It handles text preprocessing, tokenization, and creates
17
+ batches suitable for autoregressive language modeling.
18
+
19
+ FEATURES:
20
+ - Memory-efficient text loading with sliding window
21
+ - Automatic tokenization using trained SentencePiece model
22
+ - Configurable sequence length and batch size
23
+ - CPU-optimized data loading for limited hardware
24
+ - Support for training data validation and statistics
25
+
26
+ MEMORY OPTIMIZATION:
27
+ - Streaming data loading (doesn't load entire dataset to memory)
28
+ - Configurable chunk sizes for large files
29
+ - Efficient tensor creation and batching
30
+ - Garbage collection hints for memory management
31
+
32
+ Usage:
33
+ from data_loader import TextDataLoader
34
+
35
+ loader = TextDataLoader(
36
+ data_file="data/clean/training_data.txt",
37
+ tokenizer_path="data/tokenizer/tokenizer.model",
38
+ seq_len=512,
39
+ batch_size=4
40
+ )
41
+
42
+ for batch in loader:
43
+ input_ids, targets = batch
44
+ # input_ids: (batch_size, seq_len)
45
+ # targets: (batch_size, seq_len) - shifted by 1 for next token prediction
46
+
47
+ Author: Louis Chua Bean Chong
48
+ License: GPLv3
49
+ """
50
+
51
+ import gc
52
+ import os
53
+ import random
54
+ import time
55
+ from typing import Iterator, List, Optional, Tuple
56
+
57
+ import torch
58
+
59
+ try:
60
+ import sentencepiece as spm
61
+ except ImportError:
62
+ print("ERROR: SentencePiece not installed. Run: pip install sentencepiece")
63
+ exit(1)
64
+
65
+
66
+ class TextDataLoader:
67
+ """
68
+ Efficient data loader for autoregressive language model training.
69
+
70
+ This class handles loading text data, tokenizing it using SentencePiece,
71
+ and creating batches suitable for next-token prediction training.
72
+ """
73
+
74
+ def __init__(
75
+ self,
76
+ data_file: str,
77
+ tokenizer_path: str,
78
+ seq_len: int = 512,
79
+ batch_size: int = 4,
80
+ chunk_size: int = 1000000, # Lines to read at once
81
+ shuffle: bool = True,
82
+ seed: int = 42,
83
+ ):
84
+ """
85
+ Initialize the data loader.
86
+
87
+ Args:
88
+ data_file: Path to training text file (one passage per line)
89
+ tokenizer_path: Path to trained SentencePiece model
90
+ seq_len: Maximum sequence length for training
91
+ batch_size: Batch size for training
92
+ chunk_size: Number of lines to read in memory at once
93
+ shuffle: Whether to shuffle training examples
94
+ seed: Random seed for reproducibility
95
+ """
96
+ self.data_file = data_file
97
+ self.tokenizer_path = tokenizer_path
98
+ self.seq_len = seq_len
99
+ self.batch_size = batch_size
100
+ self.chunk_size = chunk_size
101
+ self.shuffle = shuffle
102
+ self.seed = seed
103
+
104
+ # Validate inputs
105
+ self._validate_inputs()
106
+
107
+ # Load tokenizer
108
+ self.tokenizer = self._load_tokenizer()
109
+
110
+ # Get data statistics
111
+ self.total_lines = self._count_lines()
112
+ self.current_line = 0
113
+
114
+ # Initialize data attribute for testing compatibility
115
+ # Load a small sample of data for testing purposes
116
+ self.data = self._read_chunk(
117
+ 0, min(self.chunk_size, 100)
118
+ ) # Load up to 100 passages for testing
119
+
120
+ # Set random seed for reproducibility
121
+ random.seed(seed)
122
+
123
+ print("📊 TextDataLoader initialized")
124
+ print(f" Data file: {data_file}")
125
+ print(f" Total passages: {self.total_lines:,}")
126
+ print(f" Sequence length: {seq_len}")
127
+ print(f" Batch size: {batch_size}")
128
+ print(f" Vocabulary size: {self.tokenizer.vocab_size():,}")
129
+
130
+ def _validate_inputs(self) -> None:
131
+ """Validate input parameters and file paths."""
132
+ if not os.path.exists(self.data_file):
133
+ raise FileNotFoundError(f"Training data file not found: {self.data_file}")
134
+
135
+ if not os.path.exists(self.tokenizer_path):
136
+ raise FileNotFoundError(f"Tokenizer model not found: {self.tokenizer_path}")
137
+
138
+ if self.seq_len <= 0:
139
+ raise ValueError(f"Sequence length must be positive, got {self.seq_len}")
140
+
141
+ if self.batch_size <= 0:
142
+ raise ValueError(f"Batch size must be positive, got {self.batch_size}")
143
+
144
+ if self.chunk_size <= 0:
145
+ raise ValueError(f"Chunk size must be positive, got {self.chunk_size}")
146
+
147
+ def _load_tokenizer(self) -> spm.SentencePieceProcessor:
148
+ """Load the trained SentencePiece tokenizer."""
149
+ try:
150
+ tokenizer = spm.SentencePieceProcessor()
151
+ tokenizer.load(self.tokenizer_path)
152
+ return tokenizer
153
+ except Exception as e:
154
+ raise RuntimeError(f"Failed to load tokenizer: {e}")
155
+
156
+ def _count_lines(self) -> int:
157
+ """Count total number of lines in the data file."""
158
+ print("📝 Counting training passages...")
159
+ start_time = time.time()
160
+
161
+ line_count = 0
162
+ with open(self.data_file, "r", encoding="utf-8") as f:
163
+ for line in f:
164
+ if line.strip(): # Only count non-empty lines
165
+ line_count += 1
166
+
167
+ count_time = time.time() - start_time
168
+ print(f"✓ Found {line_count:,} passages in {count_time:.1f}s")
169
+
170
+ return line_count
171
+
172
+ def _read_chunk(self, start_line: int = 0, limit: Optional[int] = None) -> List[str]:
173
+ """
174
+ Read a chunk of lines from the data file.
175
+
176
+ Args:
177
+ start_line: Line number to start reading from
178
+ limit: Maximum number of lines to read (None for default chunk_size)
179
+
180
+ Returns:
181
+ List of text passages
182
+ """
183
+ chunk = []
184
+ current_line = 0
185
+ lines_read = 0
186
+ max_lines = limit if limit is not None else self.chunk_size
187
+
188
+ with open(self.data_file, "r", encoding="utf-8") as f:
189
+ for line in f:
190
+ if current_line < start_line:
191
+ current_line += 1
192
+ continue
193
+
194
+ text = line.strip()
195
+ if text: # Only include non-empty lines
196
+ chunk.append(text)
197
+ lines_read += 1
198
+
199
+ if lines_read >= max_lines:
200
+ break
201
+
202
+ current_line += 1
203
+
204
+ return chunk
205
+
206
+ def _tokenize_texts(self, texts: List[str]) -> List[List[int]]:
207
+ """
208
+ Tokenize a list of text passages using SentencePiece tokenizer.
209
+
210
+ This method converts raw text into token ID sequences suitable for language model training.
211
+ It handles special tokens (BOS/EOS) and length constraints for efficient training.
212
+
213
+ Text processing pipeline:
214
+ 1. Add BOS (Beginning of Sequence) token to mark sequence start
215
+ 2. Tokenize text using trained SentencePiece model (subword tokenization)
216
+ 3. Truncate sequences that exceed maximum length
217
+ 4. Add EOS (End of Sequence) token to mark sequence end
218
+
219
+ Special token handling:
220
+ - BOS token helps model learn to generate text from scratch
221
+ - EOS token signals natural sequence endings
222
+ - These tokens are crucial for proper autoregressive generation
223
+
224
+ Args:
225
+ texts: List of text passages (typically Wikipedia passages from SQUAD)
226
+ Each passage should be a complete, coherent text segment
227
+
228
+ Returns:
229
+ List of token ID sequences, where each sequence is a list of integers
230
+ representing subword tokens from the SentencePiece vocabulary
231
+ """
232
+ tokenized = []
233
+
234
+ for text in texts:
235
+ try:
236
+ # Add BOS (Beginning of Sequence) token at the start
237
+ # BOS token (ID taken from the tokenizer via bos_id()) signals sequence start
238
+ # This helps the model learn proper sequence initialization during generation
239
+ tokens = [self.tokenizer.bos_id()] + self.tokenizer.encode(text)
240
+
241
+ # Truncate sequences that exceed maximum context length
242
+ # Reserve one position for EOS token by using (seq_len - 1)
243
+ # This ensures we never exceed the model's context window during training
244
+ if len(tokens) > self.seq_len - 1:
245
+ tokens = tokens[: self.seq_len - 1]
246
+ # NOTE: Truncation may cut off text mid-sentence, but this is acceptable
247
+ # for language modeling where the model learns from partial contexts
248
+
249
+ # Add EOS (End of Sequence) token at the end
250
+ # EOS token (ID taken from the tokenizer via eos_id()) signals sequence completion
251
+ # This teaches the model when to stop generating text naturally
252
+ tokens.append(self.tokenizer.eos_id())
253
+
254
+ # Validate tokenization result
255
+ if len(tokens) <= 2: # Only BOS + EOS tokens, no actual content
256
+ print(f"⚠️ Skipping very short text: {text[:50]}...")
257
+ continue
258
+
259
+ tokenized.append(tokens)
260
+
261
+ except Exception as e:
262
+ # Handle tokenization errors gracefully to avoid stopping training
263
+ # Common causes: encoding issues, very long texts, special characters
264
+ print(f"⚠️ Failed to tokenize passage: {text[:50]}... Error: {e}")
265
+ continue
266
+
267
+ # Log tokenization statistics for monitoring
268
+ if tokenized:
269
+ avg_length = sum(len(tokens) for tokens in tokenized) / len(tokenized)
270
+ print(f"📊 Tokenized {len(tokenized)} passages, avg length: {avg_length:.1f} tokens")
271
+
272
+ return tokenized
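
For illustration, here is a minimal, self-contained sketch of the BOS/truncate/EOS pipeline described above. The token IDs and `SEQ_LEN` are made-up values for the example, not the project's actual tokenizer configuration:

```python
# Minimal sketch of the tokenization pipeline above.
# BOS_ID, EOS_ID and SEQ_LEN are illustrative assumptions, not values from the repo.
BOS_ID, EOS_ID, SEQ_LEN = 1, 2, 8


def sketch_tokenize(encoded):
    """Mirror the BOS + encode + truncate + EOS steps on a plain list of ints."""
    tokens = [BOS_ID] + list(encoded)     # 1. prepend BOS
    if len(tokens) > SEQ_LEN - 1:         # 2./3. truncate, reserving one slot for EOS
        tokens = tokens[: SEQ_LEN - 1]
    tokens.append(EOS_ID)                 # 4. append EOS
    return tokens


print(sketch_tokenize([10, 11, 12]))        # short text: [1, 10, 11, 12, 2]
print(sketch_tokenize(range(100, 120)))     # long text: truncated to 8 tokens ending in 2
```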
273
+
274
+ def _create_training_examples(
275
+ self, token_sequences: List[List[int]]
276
+ ) -> List[Tuple[List[int], List[int]]]:
277
+ """
278
+ Create training examples with input and target sequences.
279
+
280
+ For autoregressive training, targets are inputs shifted by one position.
281
+
282
+ Args:
283
+ token_sequences: List of tokenized sequences
284
+
285
+ Returns:
286
+ List of (input_ids, target_ids) tuples
287
+ """
288
+ examples = []
289
+
290
+ for tokens in token_sequences:
291
+ if len(tokens) < 2: # Need at least 2 tokens for input/target pair
292
+ continue
293
+
294
+ # For sequences longer than seq_len, create multiple examples with sliding window
295
+ if len(tokens) > self.seq_len:
296
+ # Create overlapping windows (50% overlap for better learning)
297
+ stride = self.seq_len // 2
298
+ for i in range(0, len(tokens) - self.seq_len, stride):
299
+ input_ids = tokens[i : i + self.seq_len]
300
+ target_ids = tokens[i + 1 : i + self.seq_len + 1]
301
+ examples.append((input_ids, target_ids))
302
+ else:
303
+ # Pad shorter sequences
304
+ input_ids = tokens[:-1] # All but last token
305
+ target_ids = tokens[1:] # All but first token
306
+
307
+ # Pad to seq_len if necessary
308
+ while len(input_ids) < self.seq_len:
309
+ input_ids.append(self.tokenizer.pad_id())
310
+ target_ids.append(-1) # Use -1 for padding in targets (ignored in loss)
311
+
312
+ # Truncate if still too long
313
+ input_ids = input_ids[: self.seq_len]
314
+ target_ids = target_ids[: self.seq_len]
315
+
316
+ examples.append((input_ids, target_ids))
317
+
318
+ return examples
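
As a concrete illustration of the shift-by-one targets and 50%-overlap windows implemented above, a small sketch with made-up token values and a tiny `seq_len`:

```python
# Toy illustration of shifted targets and 50%-overlap sliding windows.
# seq_len=4 and the token values are made-up for the example.
seq_len = 4
tokens = list(range(10))                    # pretend tokenized passage: [0, 1, ..., 9]

stride = seq_len // 2                       # 50% overlap
examples = []
for i in range(0, len(tokens) - seq_len, stride):
    input_ids = tokens[i : i + seq_len]
    target_ids = tokens[i + 1 : i + seq_len + 1]   # same window shifted by one position
    examples.append((input_ids, target_ids))

for inp, tgt in examples:
    print(inp, "->", tgt)
# [0, 1, 2, 3] -> [1, 2, 3, 4]
# [2, 3, 4, 5] -> [3, 4, 5, 6]
# [4, 5, 6, 7] -> [5, 6, 7, 8]
```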
319
+
320
+ def _create_batch(
321
+ self, examples: List[Tuple[List[int], List[int]]]
322
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
323
+ """
324
+ Create a batch tensor from training examples.
325
+
326
+ Args:
327
+ examples: List of (input_ids, target_ids) tuples
328
+
329
+ Returns:
330
+ Tuple of (input_tensor, target_tensor)
331
+ """
332
+ if not examples:
333
+ raise ValueError("Cannot create batch from empty examples")
334
+
335
+ batch_size = len(examples)
336
+
337
+ # Initialize tensors
338
+ input_ids = torch.zeros((batch_size, self.seq_len), dtype=torch.long)
339
+ target_ids = torch.full((batch_size, self.seq_len), -1, dtype=torch.long)
340
+
341
+ # Fill tensors
342
+ for i, (inp, tgt) in enumerate(examples):
343
+ input_ids[i, : len(inp)] = torch.tensor(inp, dtype=torch.long)
344
+ target_ids[i, : len(tgt)] = torch.tensor(tgt, dtype=torch.long)
345
+
346
+ return input_ids, target_ids
347
+
348
+ def __iter__(self) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]:
349
+ """
350
+ Iterate over training batches.
351
+
352
+ Yields:
353
+ Tuple of (input_ids, target_ids) tensors
354
+ """
355
+ self.current_line = 0
356
+
357
+ while self.current_line < self.total_lines:
358
+ # Read chunk of text
359
+ texts = self._read_chunk(self.current_line)
360
+ if not texts:
361
+ break
362
+
363
+ # Tokenize texts
364
+ token_sequences = self._tokenize_texts(texts)
365
+
366
+ # Create training examples
367
+ examples = self._create_training_examples(token_sequences)
368
+
369
+ # Shuffle examples if requested
370
+ if self.shuffle:
371
+ random.shuffle(examples)
372
+
373
+ # Create batches
374
+ for i in range(0, len(examples), self.batch_size):
375
+ batch_examples = examples[i : i + self.batch_size]
376
+
377
+ if len(batch_examples) == self.batch_size: # Only yield full batches
378
+ try:
379
+ input_ids, target_ids = self._create_batch(batch_examples)
380
+ yield input_ids, target_ids
381
+ except Exception as e:
382
+ print(f"⚠️ Failed to create batch: {e}")
383
+ continue
384
+
385
+ # Update progress
386
+ self.current_line += len(texts)
387
+
388
+ # Clean up memory
389
+ del texts, token_sequences, examples
390
+ gc.collect()
391
+
392
+ def get_data_stats(self) -> dict:
393
+ """
394
+ Get statistics about the training data.
395
+
396
+ Returns:
397
+ Dictionary with data statistics
398
+ """
399
+ print("📊 Analyzing training data...")
400
+
401
+ # Sample some data to get statistics
402
+ sample_texts = self._read_chunk(0)[:100] # Sample first 100 passages
403
+ token_sequences = self._tokenize_texts(sample_texts)
404
+
405
+ if token_sequences:
406
+ sequence_lengths = [len(seq) for seq in token_sequences]
407
+ avg_length = sum(sequence_lengths) / len(sequence_lengths)
408
+ max_length = max(sequence_lengths)
409
+ min_length = min(sequence_lengths)
410
+ else:
411
+ avg_length = max_length = min_length = 0
412
+
413
+ # Estimate total tokens
414
+ estimated_total_tokens = int(avg_length * self.total_lines)
415
+
416
+ # Estimate number of batches per epoch
417
+ examples_per_passage = max(1, avg_length // self.seq_len)
418
+ total_examples = int(self.total_lines * examples_per_passage)
419
+ batches_per_epoch = total_examples // self.batch_size
420
+
421
+ stats = {
422
+ "total_passages": self.total_lines,
423
+ "avg_tokens_per_passage": avg_length,
424
+ "min_tokens_per_passage": min_length,
425
+ "max_tokens_per_passage": max_length,
426
+ "estimated_total_tokens": estimated_total_tokens,
427
+ "estimated_examples_per_epoch": total_examples,
428
+ "estimated_batches_per_epoch": batches_per_epoch,
429
+ "sequence_length": self.seq_len,
430
+ "batch_size": self.batch_size,
431
+ "vocabulary_size": self.tokenizer.vocab_size(),
432
+ }
433
+
434
+ print("✓ Data analysis complete:")
435
+ print(f" Total passages: {stats['total_passages']:,}")
436
+ print(f" Avg tokens per passage: {stats['avg_tokens_per_passage']:.1f}")
437
+ print(f" Estimated total tokens: {stats['estimated_total_tokens']:,}")
438
+ print(f" Estimated batches per epoch: {stats['estimated_batches_per_epoch']:,}")
439
+
440
+ return stats
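
A quick worked example of the epoch-size estimate above, using hypothetical numbers (not measurements from the real dataset):

```python
# Hypothetical numbers illustrating the estimate computed in get_data_stats().
total_passages = 150_000
avg_tokens_per_passage = 120.0
seq_len, batch_size = 512, 4

examples_per_passage = max(1, avg_tokens_per_passage // seq_len)   # short passages -> 1
total_examples = int(total_passages * examples_per_passage)        # 150,000
batches_per_epoch = total_examples // batch_size                   # 37,500
print(batches_per_epoch)
```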
441
+
442
+
443
+ def test_data_loader():
444
+ """Test function for the data loader."""
445
+ print("🧪 Testing TextDataLoader...")
446
+
447
+ # Test with small parameters
448
+ try:
449
+ loader = TextDataLoader(
450
+ data_file="data/clean/training_data.txt",
451
+ tokenizer_path="data/tokenizer/tokenizer.model",
452
+ seq_len=128,
453
+ batch_size=2,
454
+ chunk_size=10, # Small for testing
455
+ )
456
+
457
+ # Get data statistics
458
+ _ = loader.get_data_stats()
459
+
460
+ # Test iteration
461
+ print("\n🔄 Testing batch iteration...")
462
+ start_time = time.time()
463
+ batch_count = 0
464
+
465
+ for batch_idx, (input_ids, target_ids) in enumerate(loader):
466
+ batch_count += 1
467
+
468
+ print(f"Batch {batch_idx + 1}:")
469
+ print(f" Input shape: {input_ids.shape}")
470
+ print(f" Target shape: {target_ids.shape}")
471
+ print(f" Sample input tokens: {input_ids[0][:10].tolist()}")
472
+ print(f" Sample target tokens: {target_ids[0][:10].tolist()}")
473
+
474
+ if batch_idx >= 2: # Only test first few batches
475
+ break
476
+
477
+ test_time = time.time() - start_time
478
+ print("\n✓ Data loader test completed successfully!")
479
+ print(f" Processed {batch_count} batches in {test_time:.2f}s")
480
+ print(f" Average time per batch: {test_time/max(1, batch_count):.2f}s")
481
+
482
+ return True
483
+
484
+ except Exception as e:
485
+ print(f"❌ Data loader test failed: {e}")
486
+ import traceback
487
+
488
+ traceback.print_exc()
489
+ return False
490
+
491
+
492
+ if __name__ == "__main__":
493
+ test_data_loader()
core/src/download_and_prepare.py ADDED
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ r"""
13
+ Download and prepare training data from the SQUAD dataset.
14
+
15
+ OVERVIEW:
16
+ This script downloads the SQUAD (Stanford Question Answering Dataset) from its official source,
17
+ extracts the Wikipedia context passages from the JSON format, and saves the cleaned text to disk.
18
+ The SQUAD dataset contains high-quality Wikipedia articles that are perfect for training language models.
19
+
20
+ DATA FLOW:
21
+ 1. Downloads 4 JSON files from Stanford (SQUAD v1.1 & v2.0, train & dev splits)
22
+ 2. Parses JSON structure: data -> articles -> paragraphs -> context
23
+ 3. Extracts only the 'context' fields (Wikipedia passages, not questions/answers)
24
+ 4. Cleans text: normalizes whitespace, filters by minimum word count
25
+ 5. Outputs one passage per line in a single text file
26
+
27
+ The output is a single text file containing ~150k-200k Wikipedia article passages,
28
+ suitable for training tokenizers and language models.
29
+
30
+ DATASET INFO:
31
+ - SQUAD v1.1: 87k train + 10k dev examples
32
+ - SQUAD v2.0: 130k train + 11k dev examples
33
+ - Source: High-quality Wikipedia articles across diverse topics
34
+ - Total download size: ~200MB
35
+ - Final processed size: ~100-150MB of clean text
36
+
37
+ Usage:
38
+ python core/src/download_and_prepare.py
39
+
40
+ Output:
41
+ data/clean/training_data.txt - Cleaned Wikipedia passages from SQUAD dataset
42
+
43
+ Requirements:
44
+ pip install requests tqdm
45
+
46
+ Example setup:
47
+
48
+ Windows PowerShell:
49
+ ```powershell
50
+ python -m venv venv
51
+ .\venv\Scripts\Activate.ps1
52
+ pip install requests tqdm
53
+ python core/src/download_and_prepare.py
54
+ ```
55
+
56
+ Linux/macOS:
57
+ ```bash
58
+ python -m venv venv
59
+ source venv/bin/activate
60
+ pip install requests tqdm
61
+ python core/src/download_and_prepare.py
62
+ ```
63
+
64
+ """
65
+
66
+ import json
67
+ import os
68
+
69
+ import requests
70
+ from tqdm import tqdm
71
+
72
+
73
+ def download_file(url, filename):
74
+ """
75
+ Download a file from URL with progress bar.
76
+
77
+ Args:
78
+ url (str): URL to download from
79
+ filename (str): Local path where file should be saved
80
+ """
81
+ # Stream the download to handle large files efficiently
82
+ response = requests.get(url, stream=True, timeout=30)
+ response.raise_for_status()  # Fail fast on HTTP errors instead of saving an error page
83
+ total_size = int(response.headers.get("content-length", 0))
84
+
85
+ # Use tqdm progress bar to show download progress
86
+ with open(filename, "wb") as file, tqdm(
87
+ desc=filename,
88
+ total=total_size,
89
+ unit="iB",
90
+ unit_scale=True,
91
+ unit_divisor=1024,
92
+ ) as pbar:
93
+ # Download in 1KB chunks
94
+ for data in response.iter_content(chunk_size=1024):
95
+ size = file.write(data)
96
+ pbar.update(size)
97
+
98
+
99
+ def prepare_training_data(output_path="data/clean/training_data.txt", min_words=10):
100
+ """
101
+ Downloads the SQUAD dataset and extracts Wikipedia context passages for training.
102
+
103
+ SQUAD Dataset Structure:
104
+ - Each JSON file contains a 'data' array of Wikipedia articles
105
+ - Each article has 'paragraphs' containing 'context' (Wikipedia text) and 'qas' (questions/answers)
106
+ - We extract only the 'context' fields which contain high-quality Wikipedia passages
107
+
108
+ Args:
109
+ output_path (str): Path to save the cleaned text data.
110
+ min_words (int): Minimum number of words required for a passage to be included.
111
+ """
112
+ print("Downloading SQUAD dataset...")
113
+
114
+ # Official SQUAD dataset URLs from Stanford
115
+ # Using both v1.1 and v2.0 for maximum training data
116
+ # v1.1: ~87k training + 10k dev examples
117
+ # v2.0: ~130k training + 11k dev examples (includes unanswerable questions)
118
+ squad_urls = [
119
+ "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json",
120
+ "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json",
121
+ "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json",
122
+ "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json",
123
+ ]
124
+
125
+ # Create directory structure for temporary files
126
+ os.makedirs("data/raw", exist_ok=True)
127
+
128
+ downloaded_files = []
129
+
130
+ # Download each SQUAD dataset file
131
+ print("Step 1: Downloading SQUAD JSON files...")
132
+ for i, url in enumerate(squad_urls):
133
+ filename = f"data/raw/squad_{i+1}.json"
134
+ try:
135
+ print(f"Downloading {url}...")
136
+ download_file(url, filename)
137
+ downloaded_files.append(filename)
138
+ print(f"Successfully downloaded {filename}")
139
+ except Exception as e:
140
+ print(f"Failed to download {url}: {e}")
141
+ continue
142
+
143
+ # Verify we have at least one successful download
144
+ if not downloaded_files:
145
+ print("ERROR: No files were downloaded successfully.")
146
+ return
147
+
148
+ # Ensure output directory exists
149
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
150
+ print(f"\nStep 2: Processing SQUAD files and saving to {output_path}...")
151
+
152
+ # Process each downloaded SQUAD JSON file and extract contexts
153
+ with open(output_path, "w", encoding="utf-8") as f:
154
+ total_contexts = 0
155
+
156
+ for file_path in downloaded_files:
157
+ print(f"Processing {file_path}...")
158
+
159
+ try:
160
+ # Load and parse the JSON file
161
+ with open(file_path, "r", encoding="utf-8") as json_file:
162
+ squad_data = json.load(json_file)
163
+
164
+ # Navigate the SQUAD JSON structure to extract context passages
165
+ # Structure: data -> articles -> paragraphs -> context
166
+ contexts = []
167
+ for article in squad_data.get("data", []):
168
+ # Each article represents a Wikipedia page
169
+ for paragraph in article.get("paragraphs", []):
170
+ # Each paragraph contains a 'context' (Wikipedia passage) and 'qas' (Q&A pairs)
171
+ context = paragraph.get("context", "").strip()
172
+ if context:
173
+ contexts.append(context)
174
+
175
+ print(f"Found {len(contexts)} Wikipedia passages in {os.path.basename(file_path)}")
176
+
177
+ # Clean and filter each context passage for high-quality training data
178
+ # This preprocessing is crucial for effective language model training
179
+ for context in tqdm(contexts, desc=f"Processing {os.path.basename(file_path)}"):
180
+ # Text normalization and cleaning pipeline
181
+ # Step 1: Normalize whitespace to ensure consistent formatting
182
+ # - Collapse multiple spaces/tabs into single spaces
183
+ # - Remove excessive newlines that break sentence flow
184
+ # - Strip leading/trailing whitespace
185
+ # This preserves natural sentence structure while cleaning artifacts
186
+ cleaned_text = " ".join(context.split())
187
+
188
+ # Step 2: Skip empty passages after cleaning
189
+ # Empty passages can occur from malformed JSON or pure whitespace
190
+ if not cleaned_text:
191
+ continue
192
+
193
+ # Step 3: Quality filtering based on content length
194
+ # Apply minimum word count filter to ensure substantial content
195
+ # Short passages (< min_words) provide insufficient context for language modeling
196
+ # Wikipedia passages are typically well-formed, so this mainly catches truncated text
197
+ word_count = len(cleaned_text.split())
198
+ if word_count >= min_words:
199
+ # Write each passage on a new line for easy processing by data loaders
200
+ # The line-based format enables efficient streaming during training
201
+ # Each line represents one coherent Wikipedia passage
202
+ f.write(cleaned_text + "\n")
203
+ total_contexts += 1
204
+
205
+ # Optional: Log extremely short passages for monitoring data quality
206
+ elif word_count > 0: # Non-empty but too short
207
+ if total_contexts % 1000 == 0: # Log occasionally to avoid spam
208
+ print(
209
+ f"⚠️ Skipped short passage ({word_count} words): {cleaned_text[:50]}..."
210
+ )
211
+
212
+ except Exception as e:
213
+ print(f"Error processing {file_path}: {e}")
214
+ continue
215
+
216
+ print(f"\nStep 3: Successfully saved {total_contexts} Wikipedia passages from SQUAD dataset.")
217
+ print(f"Output file: {output_path}")
218
+
219
+ # Clean up temporary downloaded files to save disk space
220
+ print("Step 4: Cleaning up temporary files...")
221
+ for file in downloaded_files:
222
+ try:
223
+ os.remove(file)
224
+ print(f"Removed {file}")
225
+ except Exception as e:
226
+ print(f"Warning: Could not remove {file}: {e}")
227
+
228
+
229
+ if __name__ == "__main__":
230
+ """
231
+ Main execution block - runs when script is called directly.
232
+
233
+ This will:
234
+ 1. Download SQUAD v1.1 and v2.0 datasets (~200MB total)
235
+ 2. Extract ~240k Wikipedia passages from the JSON files
236
+ 3. Clean and filter the text (remove passages < 10 words)
237
+ 4. Save all passages to data/clean/training_data.txt (one per line)
238
+ 5. Clean up temporary files
239
+
240
+ Expected output: ~150k-200k high-quality Wikipedia passages suitable for LM training
241
+ """
242
+ # Run the data preparation function with default parameters
243
+ prepare_training_data()
core/src/enterprise_integration.py ADDED
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ Enterprise Integration Layer for OpenLLM
14
+
15
+ This module provides an optional plugin mechanism to load enterprise-only
16
+ modules without coupling the open source core to proprietary code. It follows
17
+ the project rule: core functionality must work without proprietary
18
+ dependencies; enterprise features are optional extensions.
19
+
20
+ How it works:
21
+ - Attempts to locate a Python module that exposes enterprise commands
22
+ - Supports two discovery methods:
23
+ 1) Python package on sys.path: `openllm_enterprise`
24
+ 2) Filesystem path via env var `OPENLLM_ENTERPRISE_PATH` that contains
25
+ a module with `register_cli(subparsers)` function
26
+ - If found, calls `register_cli(subparsers)` to register additional CLI commands
27
+
28
+ Security and Licensing:
29
+ - No proprietary code is included in the open repository
30
+ - This module only performs optional dynamic imports if the user provides
31
+ an enterprise package or path
32
+ - All core code remains GPLv3 compliant
33
+
34
+ Usage (enterprise side expected contract):
35
+ # In the enterprise package/module
36
+ def register_cli(subparsers):
37
+ parser = subparsers.add_parser(
38
+ "enterprise-train",
39
+ help="Enterprise: RLHF training",
40
+ description="Run RLHF training using enterprise-only components."
41
+ )
42
+ parser.add_argument("--config", required=True)
43
+ parser.set_defaults(func=enterprise_train_entry)
44
+
45
+ def enterprise_train_entry(args):
46
+ ...
47
+
48
+ Author: Louis Chua Bean Chong
49
+ License: GPLv3 (core); enterprise modules remain out-of-tree
50
+ """
51
+
52
+ from __future__ import annotations
53
+
54
+ import importlib
55
+ import os
56
+ import sys
57
+ from pathlib import Path
58
+ from typing import Any
59
+
60
+
61
+ def _try_import_by_name(module_name: str):
62
+ """Attempt to import a module by name. Returns module or None on failure."""
63
+ try:
64
+ return importlib.import_module(module_name)
65
+ except Exception:
66
+ return None
67
+
68
+
69
+ def _try_import_from_path(module_path: str):
70
+ """
71
+ Attempt to import a module from a filesystem path.
72
+
73
+ The path may point either to a package directory (containing __init__.py)
74
+ or to a .py file. This function prepends the parent directory to sys.path
75
+ and imports the module by stem name.
76
+ """
77
+ try:
78
+ path = Path(module_path)
79
+ if not path.exists():
80
+ return None
81
+
82
+ if path.is_file():
83
+ parent = str(path.parent)
84
+ mod_name = path.stem
85
+ else:
86
+ parent = str(path.parent)
87
+ mod_name = path.name
88
+
89
+ if parent not in sys.path:
90
+ sys.path.insert(0, parent)
91
+ return importlib.import_module(mod_name)
92
+ except Exception:
93
+ return None
94
+
95
+
96
+ def load_enterprise_cli(subparsers: Any) -> bool:
97
+ """
98
+ Try to load enterprise-only CLI commands.
99
+
100
+ Discovery order:
101
+ 1) Python package/module named `openllm_enterprise`
102
+ 2) Env var `OPENLLM_ENTERPRISE_PATH` pointing to a package dir or .py file
103
+
104
+ If a discovered module exposes `register_cli(subparsers)`, it will be called
105
+ to register enterprise commands. Returns True if any enterprise module was
106
+ loaded successfully; otherwise False.
107
+ """
108
+ # 1) Try well-known package name
109
+ enterprise_mod = _try_import_by_name("openllm_enterprise")
110
+ if enterprise_mod and hasattr(enterprise_mod, "register_cli"):
111
+ try:
112
+ enterprise_mod.register_cli(subparsers)
113
+ print("πŸ”Œ Loaded enterprise commands from openllm_enterprise package")
114
+ return True
115
+ except Exception as e:
116
+ # Fail gracefully; core must continue to work
117
+ print(f"Warning: Enterprise module registration failed: {e}")
118
+
119
+ # 2) Try explicit path via environment variable
120
+ enterprise_path = os.environ.get("OPENLLM_ENTERPRISE_PATH")
121
+ if enterprise_path:
122
+ enterprise_mod = _try_import_from_path(enterprise_path)
123
+ if enterprise_mod and hasattr(enterprise_mod, "register_cli"):
124
+ try:
125
+ enterprise_mod.register_cli(subparsers)
126
+ print(
127
+ "πŸ”Œ Loaded enterprise commands from OPENLLM_ENTERPRISE_PATH="
128
+ f"{enterprise_path}"
129
+ )
130
+ return True
131
+ except Exception as e:
132
+ # Fail gracefully
133
+ print(f"Warning: Enterprise module registration failed: {e}")
134
+
135
+ # Not found (by design this is optional)
136
+ return False
137
+
138
+
139
+ __all__ = ["load_enterprise_cli"]
core/src/evaluate_model.py ADDED
@@ -0,0 +1,767 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ OpenLLM Model Evaluation Script
14
+
15
+ This script implements comprehensive evaluation for trained OpenLLM models,
16
+ including intrinsic evaluation (perplexity, loss) and text generation quality
17
+ assessment as specified in Step 5 of the training pipeline.
18
+
19
+ Usage:
20
+ python core/src/evaluate_model.py \
21
+ --model_dir models/openllm-medium \
22
+ --eval_data data/clean/validation_data.txt \
23
+ --metrics perplexity,loss
24
+
25
+ Features:
26
+ - Perplexity calculation on held-out data
27
+ - Text generation quality assessment
28
+ - Multiple evaluation metrics
29
+ - Comprehensive quality benchmarks
30
+ - JSON output for downstream analysis
31
+
32
+ Author: Louis Chua Bean Chong
33
+ License: GPLv3
34
+ """
35
+
36
+ import argparse
37
+ import json
38
+ import math
39
+ import os
40
+ import sys
41
+ import time
42
+ from pathlib import Path
43
+ from typing import Any, Dict, List, Optional, Tuple
44
+
45
+ import sentencepiece as smp
46
+ import torch
47
+
48
+ # Add current directory to path for imports
49
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
50
+
51
+ from model import GPTModel, create_model
52
+
53
+
54
+ class ModelEvaluator:
55
+ """
56
+ Comprehensive evaluator for OpenLLM models.
57
+
58
+ Implements intrinsic evaluation metrics and text generation quality
59
+ assessment following the training pipeline specifications.
60
+ """
61
+
62
+ def __init__(self, model: GPTModel, tokenizer_path: str, device: str = "cpu"):
63
+ """
64
+ Initialize the model evaluator.
65
+
66
+ Args:
67
+ model: Trained GPT model
68
+ tokenizer_path: Path to tokenizer model file
69
+ device: Device to run evaluation on
70
+ """
71
+ self.model = model.to(device)
72
+ self.device = device
73
+
74
+ # Load tokenizer
75
+ self.tokenizer = smp.SentencePieceProcessor()
76
+ self.tokenizer.load(tokenizer_path)
77
+
78
+ print("🔧 ModelEvaluator initialized")
79
+ print(f" Device: {device}")
80
+ print(f" Model parameters: {model.get_num_params():,}")
81
+ print(f" Vocabulary size: {self.tokenizer.vocab_size():,}")
82
+
83
+ def evaluate_perplexity(
84
+ self, eval_data: List[str], max_seq_len: int = 512, batch_size: int = 1
85
+ ) -> Dict[str, float]:
86
+ """
87
+ Calculate perplexity on evaluation data.
88
+
89
+ Args:
90
+ eval_data: List of text passages for evaluation
91
+ max_seq_len: Maximum sequence length for evaluation
92
+ batch_size: Batch size for evaluation
93
+
94
+ Returns:
95
+ Dictionary with loss and perplexity metrics
96
+ """
97
+ self.model.eval()
98
+ total_loss = 0.0
99
+ total_tokens = 0
100
+ num_sequences = 0
101
+
102
+ print(f"📊 Calculating perplexity on {len(eval_data)} passages...")
103
+
104
+ with torch.no_grad():
105
+ for i, text in enumerate(eval_data):
106
+ if i % 100 == 0:
107
+ print(f" Progress: {i}/{len(eval_data)} passages")
108
+
109
+ # Tokenize text
110
+ tokens = self.tokenizer.encode(text)
111
+ if len(tokens) < 2:
112
+ continue
113
+
114
+ # Truncate if too long
115
+ if len(tokens) > max_seq_len:
116
+ tokens = tokens[:max_seq_len]
117
+
118
+ # Create input and target tensors
119
+ input_ids = torch.tensor([tokens[:-1]], dtype=torch.long, device=self.device)
120
+ target_ids = torch.tensor([tokens[1:]], dtype=torch.long, device=self.device)
121
+
122
+ # Forward pass
123
+ logits, loss = self.model(input_ids, target_ids)
124
+
125
+ # Accumulate loss
126
+ seq_length = len(tokens) - 1
127
+ total_loss += loss.item() * seq_length
128
+ total_tokens += seq_length
129
+ num_sequences += 1
130
+
131
+ # Calculate metrics
132
+ avg_loss = total_loss / total_tokens if total_tokens > 0 else float("inf")
133
+ perplexity = math.exp(min(avg_loss, 10)) # Cap to prevent overflow
134
+
135
+ return {
136
+ "loss": avg_loss,
137
+ "perplexity": perplexity,
138
+ "total_tokens": total_tokens,
139
+ "num_sequences": num_sequences,
140
+ }
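
The reported perplexity is simply the exponential of the token-weighted average loss computed above; a toy example with made-up per-sequence losses:

```python
# Toy illustration of token-weighted loss averaging and perplexity.
# The (loss, token count) pairs are made-up numbers, not real evaluation output.
import math

per_sequence = [(2.90, 120), (3.10, 80), (2.75, 200)]   # (mean loss, tokens in sequence)

total_loss = sum(loss * n for loss, n in per_sequence)
total_tokens = sum(n for _, n in per_sequence)
avg_loss = total_loss / total_tokens
perplexity = math.exp(min(avg_loss, 10))                # same overflow cap as above

print(f"avg loss {avg_loss:.3f}, perplexity {perplexity:.1f}")
```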
141
+
142
+ def evaluate_text_generation(
143
+ self,
144
+ prompts: List[str],
145
+ max_length: int = 256,
146
+ temperature: float = 0.7,
147
+ top_k: Optional[int] = 40,
148
+ num_samples: int = 1,
149
+ ) -> List[Dict[str, Any]]:
150
+ """
151
+ Evaluate text generation quality.
152
+
153
+ Args:
154
+ prompts: List of input prompts
155
+ max_length: Maximum generation length
156
+ temperature: Sampling temperature
157
+ top_k: Top-k sampling parameter
158
+ num_samples: Number of samples per prompt
159
+
160
+ Returns:
161
+ List of generation results with quality metrics
162
+ """
163
+ self.model.eval()
164
+ results = []
165
+
166
+ print(f"✍️ Evaluating text generation on {len(prompts)} prompts...")
167
+
168
+ with torch.no_grad():
169
+ for prompt in prompts:
170
+ prompt_results = []
171
+
172
+ for sample_idx in range(num_samples):
173
+ # Tokenize prompt
174
+ input_ids = self.tokenizer.encode(prompt)
175
+ input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
176
+
177
+ start_time = time.time()
178
+
179
+ # Generate text
180
+ output = self.model.generate(
181
+ input_tensor,
182
+ max_new_tokens=max_length,
183
+ temperature=temperature,
184
+ top_k=top_k,
185
+ )
186
+
187
+ generation_time = time.time() - start_time
188
+
189
+ # Decode output
190
+ generated_ids = output[0].tolist()
191
+ full_text = self.tokenizer.decode(generated_ids)
192
+ generated_text = self.tokenizer.decode(generated_ids[len(input_ids) :])
193
+
194
+ # Calculate quality metrics
195
+ quality_metrics = self._assess_generation_quality(generated_text)
196
+
197
+ prompt_results.append(
198
+ {
199
+ "prompt": prompt,
200
+ "generated_text": generated_text,
201
+ "full_text": full_text,
202
+ "generation_time": generation_time,
203
+ "tokens_generated": len(generated_ids) - len(input_ids),
204
+ "tokens_per_second": (len(generated_ids) - len(input_ids))
205
+ / generation_time,
206
+ "quality_metrics": quality_metrics,
207
+ }
208
+ )
209
+
210
+ results.extend(prompt_results)
211
+
212
+ return results
213
+
214
+ def _assess_generation_quality(self, text: str) -> Dict[str, float]:
215
+ """
216
+ Assess basic quality metrics for generated text.
217
+
218
+ Args:
219
+ text: Generated text to assess
220
+
221
+ Returns:
222
+ Dictionary of quality metrics
223
+ """
224
+ if not text.strip():
225
+ return {
226
+ "length": 0,
227
+ "avg_word_length": 0,
228
+ "repetition_rate": 1.0,
229
+ "coherence_score": 0.0,
230
+ }
231
+
232
+ words = text.split()
233
+
234
+ # Basic metrics
235
+ length = len(words)
236
+ avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
237
+
238
+ # Repetition rate (simple n-gram repetition)
239
+ bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words) - 1)]
240
+ unique_bigrams = len(set(bigrams))
241
+ repetition_rate = 1 - (unique_bigrams / len(bigrams) if bigrams else 0)
242
+
243
+ # Simple coherence score (based on sentence structure)
244
+ sentences = text.split(".")
245
+ valid_sentences = [s for s in sentences if len(s.strip().split()) > 3]
246
+ coherence_score = len(valid_sentences) / len(sentences) if sentences else 0
247
+
248
+ return {
249
+ "length": length,
250
+ "avg_word_length": avg_word_length,
251
+ "repetition_rate": repetition_rate,
252
+ "coherence_score": coherence_score,
253
+ }
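
To make the bigram repetition metric concrete, a tiny example with an arbitrary sentence (not model output):

```python
# Toy illustration of the bigram repetition rate computed above.
text = "the cat sat on the mat the cat sat again"
words = text.split()

bigrams = [f"{words[i]} {words[i + 1]}" for i in range(len(words) - 1)]
repetition_rate = 1 - len(set(bigrams)) / len(bigrams)   # 0.0 = no repeats, 1.0 = all repeats

print(f"{len(bigrams)} bigrams, {len(set(bigrams))} unique, repetition rate {repetition_rate:.2f}")
```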
254
+
255
+ def evaluate_downstream_tasks(self) -> Dict[str, Any]:
256
+ """
257
+ Evaluate model performance on downstream tasks.
258
+
259
+ This function implements basic downstream task evaluation including:
260
+ - Reading comprehension (simplified SQUAD-style)
261
+ - Sentiment analysis (few-shot)
262
+ - Common sense reasoning
263
+
264
+ Returns:
265
+ Dictionary of downstream task results
266
+ """
267
+ results = {}
268
+
269
+ # 1. Reading Comprehension (Simplified SQUAD-style)
270
+ results["reading_comprehension"] = self._evaluate_reading_comprehension()
271
+
272
+ # 2. Sentiment Analysis (Few-shot learning)
273
+ results["sentiment_analysis"] = self._evaluate_sentiment_analysis()
274
+
275
+ # 3. Common Sense Reasoning
276
+ results["reasoning"] = self._evaluate_reasoning()
277
+
278
+ # 4. Text Completion Quality
279
+ results["text_completion"] = self._evaluate_text_completion()
280
+
281
+ return results
282
+
283
+ def _evaluate_reading_comprehension(self) -> Dict[str, Any]:
284
+ """Simplified reading comprehension evaluation."""
285
+ # Sample reading comprehension tasks
286
+ tasks = [
287
+ {
288
+ "context": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower.",
289
+ "question": "Who is the Eiffel Tower named after?",
290
+ "expected": "Gustave Eiffel",
291
+ },
292
+ {
293
+ "context": "Python is a high-level programming language. It was created by Guido van Rossum and first released in 1991.",
294
+ "question": "When was Python first released?",
295
+ "expected": "1991",
296
+ },
297
+ {
298
+ "context": "Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
299
+ "question": "What is machine learning a subset of?",
300
+ "expected": "artificial intelligence",
301
+ },
302
+ ]
303
+
304
+ correct = 0
305
+ total = len(tasks)
306
+
307
+ for task in tasks:
308
+ prompt = f"Context: {task['context']}\nQuestion: {task['question']}\nAnswer:"
309
+
310
+ # Generate answer
311
+ input_ids = self.tokenizer.encode(prompt)
312
+ input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
313
+
314
+ with torch.no_grad():
315
+ output = self.model.generate(input_tensor, max_new_tokens=20, temperature=0.1)
316
+
317
+ generated_ids = output[0].tolist()
318
+ answer = self.tokenizer.decode(generated_ids[len(input_ids) :]).strip().lower()
319
+
320
+ # Simple substring matching
321
+ if task["expected"].lower() in answer:
322
+ correct += 1
323
+
324
+ return {
325
+ "accuracy": correct / total,
326
+ "correct": correct,
327
+ "total": total,
328
+ "score": correct / total,
329
+ }
330
+
331
+ def _evaluate_sentiment_analysis(self) -> Dict[str, Any]:
332
+ """Few-shot sentiment analysis evaluation."""
333
+ # Few-shot examples
334
+ examples = "Examples:\nText: 'I love this movie!' Sentiment: Positive\nText: 'This is terrible.' Sentiment: Negative\nText: 'It was okay.' Sentiment: Neutral\n\n"
335
+
336
+ # Test cases
337
+ test_cases = [
338
+ {"text": "This is amazing!", "expected": "positive"},
339
+ {"text": "I hate this.", "expected": "negative"},
340
+ {"text": "This is wonderful.", "expected": "positive"},
341
+ {"text": "This is awful.", "expected": "negative"},
342
+ {"text": "It was fine.", "expected": "neutral"},
343
+ ]
344
+
345
+ correct = 0
346
+ total = len(test_cases)
347
+
348
+ for case in test_cases:
349
+ prompt = f"{examples}Text: '{case['text']}' Sentiment:"
350
+
351
+ # Generate sentiment
352
+ input_ids = self.tokenizer.encode(prompt)
353
+ input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
354
+
355
+ with torch.no_grad():
356
+ output = self.model.generate(input_tensor, max_new_tokens=5, temperature=0.1)
357
+
358
+ generated_ids = output[0].tolist()
359
+ sentiment = self.tokenizer.decode(generated_ids[len(input_ids) :]).strip().lower()
360
+
361
+ # Check if expected sentiment is in the generated response
362
+ if case["expected"] in sentiment:
363
+ correct += 1
364
+
365
+ return {
366
+ "accuracy": correct / total,
367
+ "correct": correct,
368
+ "total": total,
369
+ "score": correct / total,
370
+ }
371
+
372
+ def _evaluate_reasoning(self) -> Dict[str, Any]:
373
+ """Simple reasoning evaluation."""
374
+ # Basic reasoning tasks
375
+ tasks = [
376
+ {
377
+ "question": "If all birds can fly and a penguin is a bird, can a penguin fly?",
378
+ "expected": "no", # This tests if model knows real-world facts
379
+ },
380
+ {
381
+ "question": "If it is raining outside, should you take an umbrella?",
382
+ "expected": "yes",
383
+ },
384
+ {"question": "What comes after Monday?", "expected": "tuesday"},
385
+ {"question": "Is the sun larger than the earth?", "expected": "yes"},
386
+ ]
387
+
388
+ correct = 0
389
+ total = len(tasks)
390
+
391
+ for task in tasks:
392
+ prompt = f"Question: {task['question']}\nAnswer:"
393
+
394
+ # Generate answer
395
+ input_ids = self.tokenizer.encode(prompt)
396
+ input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
397
+
398
+ with torch.no_grad():
399
+ output = self.model.generate(input_tensor, max_new_tokens=10, temperature=0.1)
400
+
401
+ generated_ids = output[0].tolist()
402
+ answer = self.tokenizer.decode(generated_ids[len(input_ids) :]).strip().lower()
403
+
404
+ # Check if expected answer is in the response
405
+ if task["expected"] in answer:
406
+ correct += 1
407
+
408
+ return {
409
+ "accuracy": correct / total,
410
+ "correct": correct,
411
+ "total": total,
412
+ "score": correct / total,
413
+ }
414
+
415
+ def _evaluate_text_completion(self) -> Dict[str, Any]:
416
+ """Evaluate text completion quality."""
417
+ # Common phrases that should be completed predictably
418
+ completions = [
419
+ {"prompt": "The capital of France is", "expected_word": "paris"},
420
+ {"prompt": "Two plus two equals", "expected_word": "four"},
421
+ {"prompt": "The largest planet in our solar system is", "expected_word": "jupiter"},
422
+ {"prompt": "Water boils at", "expected_word": "100"},
423
+ ]
424
+
425
+ correct = 0
426
+ total = len(completions)
427
+
428
+ for completion in completions:
429
+ # Generate completion
430
+ input_ids = self.tokenizer.encode(completion["prompt"])
431
+ input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
432
+
433
+ with torch.no_grad():
434
+ output = self.model.generate(input_tensor, max_new_tokens=5, temperature=0.1)
435
+
436
+ generated_ids = output[0].tolist()
437
+ generated_text = self.tokenizer.decode(generated_ids[len(input_ids) :]).strip().lower()
438
+
439
+ # Check if expected word appears in completion
440
+ if completion["expected_word"] in generated_text:
441
+ correct += 1
442
+
443
+ return {
444
+ "accuracy": correct / total,
445
+ "correct": correct,
446
+ "total": total,
447
+ "score": correct / total,
448
+ }
449
+
450
+ def run_comprehensive_evaluation(
451
+ self, eval_data_path: str, metrics: Optional[List[str]] = None, generation_prompts: Optional[List[str]] = None
452
+ ) -> Dict[str, Any]:
453
+ """
454
+ Run comprehensive model evaluation.
455
+
456
+ Args:
457
+ eval_data_path: Path to evaluation text file
458
+ metrics: List of metrics to compute
459
+ generation_prompts: Prompts for text generation evaluation
460
+
461
+ Returns:
462
+ Complete evaluation results
463
+ """
464
+ if metrics is None:
465
+ metrics = ["perplexity", "loss", "generation"]
466
+
467
+ if generation_prompts is None:
468
+ generation_prompts = [
469
+ "The history of artificial intelligence",
470
+ "Machine learning algorithms",
471
+ "The future of technology",
472
+ "In a world where",
473
+ "Scientists have discovered",
474
+ ]
475
+
476
+ results = {
477
+ "model_info": {
478
+ "parameters": self.model.get_num_params(),
479
+ "device": self.device,
480
+ "vocab_size": self.tokenizer.vocab_size(),
481
+ },
482
+ "evaluation_timestamp": time.time(),
483
+ }
484
+
485
+ # Load evaluation data
486
+ print(f"📂 Loading evaluation data from {eval_data_path}")
487
+ if os.path.exists(eval_data_path):
488
+ with open(eval_data_path, "r", encoding="utf-8") as f:
489
+ eval_texts = [line.strip() for line in f if line.strip()]
490
+ else:
491
+ print("⚠️ Evaluation file not found, using sample texts")
492
+ eval_texts = [
493
+ "Artificial intelligence is a rapidly growing field of computer science.",
494
+ "Machine learning algorithms can learn patterns from data automatically.",
495
+ "Natural language processing helps computers understand human language.",
496
+ "Deep learning uses neural networks with multiple layers for complex tasks.",
497
+ "The development of large language models has transformed AI applications.",
498
+ ]
499
+
500
+ # Intrinsic evaluation
501
+ if "perplexity" in metrics or "loss" in metrics:
502
+ perplexity_results = self.evaluate_perplexity(eval_texts)
503
+ results["intrinsic_evaluation"] = perplexity_results
504
+
505
+ # Text generation evaluation
506
+ if "generation" in metrics:
507
+ generation_results = self.evaluate_text_generation(generation_prompts)
508
+ results["generation_evaluation"] = {
509
+ "results": generation_results,
510
+ "summary": self._summarize_generation_results(generation_results),
511
+ }
512
+
513
+ # Downstream task evaluation
514
+ results["downstream_evaluation"] = self.evaluate_downstream_tasks()
515
+
516
+ # Overall quality assessment
517
+ results["quality_assessment"] = self._assess_overall_quality(results)
518
+
519
+ return results
520
+
521
+ def _summarize_generation_results(self, results: List[Dict[str, Any]]) -> Dict[str, float]:
522
+ """Summarize text generation results."""
523
+ if not results:
524
+ return {}
525
+
526
+ total_time = sum(r["generation_time"] for r in results)
527
+ total_tokens = sum(r["tokens_generated"] for r in results)
528
+
529
+ quality_metrics = [r["quality_metrics"] for r in results]
530
+
531
+ return {
532
+ "avg_generation_time": total_time / len(results),
533
+ "avg_tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
534
+ "avg_length": sum(q["length"] for q in quality_metrics) / len(quality_metrics),
535
+ "avg_repetition_rate": sum(q["repetition_rate"] for q in quality_metrics)
536
+ / len(quality_metrics),
537
+ "avg_coherence_score": sum(q["coherence_score"] for q in quality_metrics)
538
+ / len(quality_metrics),
539
+ }
540
+
541
+ def _assess_overall_quality(self, results: Dict[str, Any]) -> Dict[str, Any]:
542
+ """Assess overall model quality based on evaluation results."""
543
+ assessment = {"quality_level": "unknown", "recommendations": []}
544
+
545
+ # Check intrinsic metrics
546
+ if "intrinsic_evaluation" in results:
547
+ perplexity = results["intrinsic_evaluation"].get("perplexity", float("inf"))
548
+
549
+ if perplexity < 12:
550
+ assessment["quality_level"] = "good"
551
+ assessment["recommendations"].append("Model shows good perplexity scores")
552
+ elif perplexity < 50:
553
+ assessment["quality_level"] = "fair"
554
+ assessment["recommendations"].append(
555
+ "Model shows fair performance, could benefit from more training"
556
+ )
557
+ else:
558
+ assessment["quality_level"] = "poor"
559
+ assessment["recommendations"].append(
560
+ "Model needs significant more training or data improvements"
561
+ )
562
+
563
+ # Check generation quality
564
+ if "generation_evaluation" in results:
565
+ summary = results["generation_evaluation"].get("summary", {})
566
+ repetition_rate = summary.get("avg_repetition_rate", 1.0)
567
+ coherence_score = summary.get("avg_coherence_score", 0.0)
568
+
569
+ if repetition_rate > 0.7:
570
+ assessment["recommendations"].append(
571
+ "High repetition rate - consider training longer or adjusting data"
572
+ )
573
+ if coherence_score < 0.3:
574
+ assessment["recommendations"].append(
575
+ "Low coherence - model may need more training steps"
576
+ )
577
+
578
+ return assessment
579
+
580
+
581
+ def load_model_from_directory(model_dir: str, device: str = "cpu") -> Tuple[GPTModel, str]:
582
+ """
583
+ Load model from directory containing checkpoints.
584
+
585
+ Args:
586
+ model_dir: Directory containing model files
587
+ device: Device to load model on
588
+
589
+ Returns:
590
+ Tuple of (model, tokenizer_path)
591
+ """
592
+ model_dir = Path(model_dir)
593
+
594
+ # Find best model checkpoint
595
+ best_model_path = model_dir / "best_model.pt"
596
+ if not best_model_path.exists():
597
+ # Look for latest checkpoint
598
+ checkpoints = list(model_dir.glob("checkpoint_step_*.pt"))
599
+ if not checkpoints:
600
+ raise FileNotFoundError(f"No model checkpoints found in {model_dir}")
601
+
602
+ # Get latest checkpoint
603
+ latest_checkpoint = max(checkpoints, key=lambda p: int(p.stem.split("_")[-1]))
604
+ best_model_path = latest_checkpoint
605
+
606
+ print(f"📂 Loading model from {best_model_path}")
607
+
608
+ # Load checkpoint
609
+ checkpoint = torch.load(best_model_path, map_location=device)
610
+
611
+ # Determine model size from config
612
+ config = checkpoint.get("config", {})
613
+ n_layer = config.get("n_layer", 12)
614
+
615
+ if n_layer <= 6:
616
+ model_size = "small"
617
+ elif n_layer <= 12:
618
+ model_size = "medium"
619
+ else:
620
+ model_size = "large"
621
+
622
+ # Create and load model
623
+ model = create_model(model_size)
624
+ model.load_state_dict(checkpoint["model_state_dict"])
625
+
626
+ print(f"✅ Model loaded successfully ({model_size}, {model.get_num_params():,} parameters)")
627
+
628
+ # Find tokenizer
629
+ tokenizer_path = model_dir.parent / "tokenizer" / "tokenizer.model"
630
+ if not tokenizer_path.exists():
631
+ tokenizer_path = Path("data/tokenizer/tokenizer.model")
632
+
633
+ if not tokenizer_path.exists():
634
+ raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
635
+
636
+ return model, str(tokenizer_path)
637
+
638
+
639
+ def main():
640
+ """Main evaluation function."""
641
+ parser = argparse.ArgumentParser(
642
+ description="Evaluate OpenLLM model performance",
643
+ formatter_class=argparse.RawDescriptionHelpFormatter,
644
+ epilog="""
645
+ Examples:
646
+ # Basic evaluation
647
+ python core/src/evaluate_model.py \\
648
+ --model_dir models/small-extended-4k \\
649
+ --eval_data data/clean/training_data.txt
650
+
651
+ # Specific metrics
652
+ python core/src/evaluate_model.py \\
653
+ --model_dir models/small-extended-4k \\
654
+ --metrics perplexity,generation \\
655
+ --output results.json
656
+ """,
657
+ )
658
+
659
+ parser.add_argument("--model_dir", required=True, help="Directory containing trained model")
660
+
661
+ parser.add_argument(
662
+ "--eval_data", help="Path to evaluation text file (default: use sample texts)"
663
+ )
664
+
665
+ parser.add_argument(
666
+ "--metrics",
667
+ default="perplexity,loss,generation",
668
+ help="Comma-separated list of metrics to evaluate (default: perplexity,loss,generation)",
669
+ )
670
+
671
+ parser.add_argument("--output", help="Output JSON file for results (default: print to console)")
672
+
673
+ parser.add_argument(
674
+ "--device",
675
+ choices=["cpu", "cuda", "auto"],
676
+ default="auto",
677
+ help="Device for evaluation (default: auto)",
678
+ )
679
+
680
+ parser.add_argument(
681
+ "--generation_prompts", help="File containing prompts for text generation evaluation"
682
+ )
683
+
684
+ args = parser.parse_args()
685
+
686
+ print("πŸ“Š OpenLLM Model Evaluation")
687
+ print("=" * 50)
688
+
689
+ # Determine device
690
+ if args.device == "auto":
691
+ device = "cuda" if torch.cuda.is_available() else "cpu"
692
+ else:
693
+ device = args.device
694
+
695
+ print(f"Using device: {device}")
696
+
697
+ try:
698
+ # Load model
699
+ model, tokenizer_path = load_model_from_directory(args.model_dir, device)
700
+
701
+ # Create evaluator
702
+ evaluator = ModelEvaluator(model, tokenizer_path, device)
703
+
704
+ # Parse metrics
705
+ metrics = [m.strip() for m in args.metrics.split(",")]
706
+
707
+ # Load generation prompts if specified
708
+ generation_prompts = None
709
+ if args.generation_prompts and os.path.exists(args.generation_prompts):
710
+ with open(args.generation_prompts, "r", encoding="utf-8") as f:
711
+ generation_prompts = [line.strip() for line in f if line.strip()]
712
+
713
+ # Run evaluation
714
+ eval_data_path = args.eval_data or "data/clean/training_data.txt"
715
+ results = evaluator.run_comprehensive_evaluation(
716
+ eval_data_path, metrics, generation_prompts
717
+ )
718
+
719
+ # Output results
720
+ if args.output:
721
+ with open(args.output, "w", encoding="utf-8") as f:
722
+ json.dump(results, f, indent=2)
723
+ print(f"\nπŸ’Ύ Results saved to {args.output}")
724
+ else:
725
+ print("\nπŸ“Š Evaluation Results:")
726
+ print("=" * 50)
727
+
728
+ # Print key metrics
729
+ if "intrinsic_evaluation" in results:
730
+ intrinsic = results["intrinsic_evaluation"]
731
+ print("πŸ“ˆ Intrinsic Metrics:")
732
+ print(f" Loss: {intrinsic['loss']:.4f}")
733
+ print(f" Perplexity: {intrinsic['perplexity']:.2f}")
734
+ print(f" Sequences evaluated: {intrinsic['num_sequences']:,}")
735
+
736
+ if "generation_evaluation" in results:
737
+ gen_summary = results["generation_evaluation"]["summary"]
738
+ print("\n✍️ Generation Quality:")
739
+ print(
740
+ f" Avg generation speed: {gen_summary['avg_tokens_per_second']:.1f} tokens/sec"
741
+ )
742
+ print(f" Avg text length: {gen_summary['avg_length']:.1f} words")
743
+ print(f" Repetition rate: {gen_summary['avg_repetition_rate']:.3f}")
744
+ print(f" Coherence score: {gen_summary['avg_coherence_score']:.3f}")
745
+
746
+ # Quality assessment
747
+ if "quality_assessment" in results:
748
+ assessment = results["quality_assessment"]
749
+ print("\n🎯 Overall Assessment:")
750
+ print(f" Quality Level: {assessment['quality_level'].upper()}")
751
+ for rec in assessment["recommendations"]:
752
+ print(f" β€’ {rec}")
753
+
754
+ print("\nπŸŽ‰ Evaluation completed successfully!")
755
+
756
+ except Exception as e:
757
+ print(f"\n❌ Evaluation failed: {e}")
758
+ import traceback
759
+
760
+ traceback.print_exc()
761
+ return False
762
+
763
+ return True
764
+
765
+
766
+ if __name__ == "__main__":
767
+ main()
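A short aside on the perplexity thresholds used in the quality assessment above: perplexity is just the exponential of the mean cross-entropy loss, so the "good" cutoff of 12 corresponds to a loss of roughly 2.48. A minimal sketch of the conversion (illustrative only, not part of the committed module):

    import math

    def loss_to_perplexity(loss: float) -> float:
        # Perplexity is the exponential of the mean cross-entropy loss.
        return math.exp(loss)

    print(loss_to_perplexity(2.48))  # ~11.9, just under the "good" threshold of 12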
core/src/export_model.py ADDED
@@ -0,0 +1,727 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ OpenLLM Model Export Script
14
+
15
+ This script implements Step 6 of the training pipeline: Model Export & Deployment.
16
+ It exports trained OpenLLM models to various formats for production inference.
17
+
18
+ Supported Formats:
19
+ - PyTorch native format (for Python inference)
20
+ - Hugging Face format (for ecosystem compatibility)
21
+ - ONNX format (for optimized cross-platform inference)
22
+
23
+ Usage:
24
+ # PyTorch format
25
+ python core/src/export_model.py \
26
+ --model_dir models/small-extended-4k \
27
+ --format pytorch \
28
+ --output_dir exports/pytorch/
29
+
30
+ # Hugging Face format
31
+ python core/src/export_model.py \
32
+ --model_dir models/small-extended-4k \
33
+ --format huggingface \
34
+ --output_dir exports/huggingface/
35
+
36
+ # ONNX format
37
+ python core/src/export_model.py \
38
+ --model_dir models/small-extended-4k \
39
+ --format onnx \
40
+ --output_dir exports/onnx/ \
41
+ --optimize_for_inference
42
+
43
+ Author: Louis Chua Bean Chong
44
+ License: GPLv3
45
+ """
46
+
47
+ import argparse
48
+ import json
49
+ import os
50
+ import shutil
51
+ import sys
52
+ from datetime import datetime
+ from pathlib import Path
53
+ from typing import Dict
54
+
55
+ import torch
56
+
57
+ # Add current directory to path for imports
58
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
59
+
60
+ from model import create_model
61
+
62
+
63
+ class ModelExporter:
64
+ """
65
+ Comprehensive model exporter for OpenLLM models.
66
+
67
+ Handles export to multiple formats including PyTorch, Hugging Face,
68
+ and ONNX for different deployment scenarios.
69
+ """
70
+
71
+ def __init__(self, model_dir: str, output_dir: str):
72
+ """
73
+ Initialize the model exporter.
74
+
75
+ Args:
76
+ model_dir: Directory containing trained model checkpoints
77
+ output_dir: Base directory for exported models
78
+ """
79
+ self.model_dir = Path(model_dir)
80
+ self.output_dir = Path(output_dir)
81
+ self.output_dir.mkdir(parents=True, exist_ok=True)
82
+
83
+ # Load model and metadata
84
+ self.model, self.config, self.training_info = self._load_model()
85
+ self.tokenizer_path = self._find_tokenizer()
86
+
87
+ print("πŸ”§ ModelExporter initialized")
88
+ print(f" Model: {self.config.model_name}")
89
+ print(f" Parameters: {self.model.get_num_params():,}")
90
+ print(f" Output directory: {output_dir}")
91
+
92
+ def _load_model(self):
93
+ """Load model from checkpoint directory."""
94
+ # Find best model checkpoint
95
+ best_model_path = self.model_dir / "best_model.pt"
96
+ if not best_model_path.exists():
97
+ # Look for latest checkpoint
98
+ checkpoints = list(self.model_dir.glob("checkpoint_step_*.pt"))
99
+ if not checkpoints:
100
+ raise FileNotFoundError(f"No model checkpoints found in {self.model_dir}")
101
+
102
+ # Get latest checkpoint
103
+ latest_checkpoint = max(checkpoints, key=lambda p: int(p.stem.split("_")[-1]))
104
+ best_model_path = latest_checkpoint
105
+
106
+ print(f"πŸ“‚ Loading model from {best_model_path}")
107
+
108
+ # Load checkpoint
109
+ checkpoint = torch.load(best_model_path, map_location="cpu")
110
+
111
+ # Determine model size from config
112
+ config_dict = checkpoint.get("config", {})
113
+ n_layer = config_dict.get("n_layer", 12)
114
+
115
+ if n_layer <= 6:
116
+ model_size = "small"
117
+ elif n_layer <= 12:
118
+ model_size = "medium"
119
+ else:
120
+ model_size = "large"
121
+
122
+ # Create and load model
123
+ model = create_model(model_size)
124
+ model.load_state_dict(checkpoint["model_state_dict"])
125
+ model.eval() # Set to evaluation mode
126
+
127
+ # Extract training info
128
+ training_info = {
129
+ "step": checkpoint.get("step", 0),
130
+ "best_loss": checkpoint.get("best_loss", 0.0),
131
+ "model_size": model_size,
132
+ }
133
+
134
+ return model, model.config, training_info
135
+
136
+ def _find_tokenizer(self):
137
+ """Find tokenizer path."""
138
+ # Try multiple possible locations
139
+ possible_paths = [
140
+ self.model_dir.parent / "tokenizer" / "tokenizer.model",
141
+ Path("data/tokenizer/tokenizer.model"),
142
+ self.model_dir / "tokenizer.model",
143
+ ]
144
+
145
+ for path in possible_paths:
146
+ if path.exists():
147
+ return str(path)
148
+
149
+ raise FileNotFoundError("Tokenizer not found in expected locations")
150
+
151
+ def export_pytorch(self) -> str:
152
+ """
153
+ Export model in PyTorch native format.
154
+
155
+ Returns:
156
+ Path to exported model directory
157
+ """
158
+ output_path = self.output_dir / "pytorch"
159
+ output_path.mkdir(parents=True, exist_ok=True)
160
+
161
+ print("πŸ”„ Exporting to PyTorch format...")
162
+
163
+ # Save model state dict
164
+ model_path = output_path / "model.pt"
165
+ torch.save(
166
+ {
167
+ "model_state_dict": self.model.state_dict(),
168
+ "config": self.config.__dict__,
169
+ "training_info": self.training_info,
170
+ },
171
+ model_path,
172
+ )
173
+
174
+ # Save configuration
175
+ config_path = output_path / "config.json"
176
+ with open(config_path, "w") as f:
177
+ json.dump(
178
+ {
179
+ "model_config": self.config.__dict__,
180
+ "training_info": self.training_info,
181
+ "export_format": "pytorch",
182
+ },
183
+ f,
184
+ indent=2,
185
+ )
186
+
187
+ # Copy tokenizer
188
+ tokenizer_out = output_path / "tokenizer.model"
189
+ shutil.copy2(self.tokenizer_path, tokenizer_out)
190
+
191
+ # Create loading script
192
+ self._create_pytorch_loader(output_path)
193
+
194
+ print(f"βœ… PyTorch export completed: {output_path}")
195
+ return str(output_path)
196
+
197
+ def export_huggingface(self) -> str:
198
+ """
199
+ Export model in Hugging Face compatible format.
200
+
201
+ Returns:
202
+ Path to exported model directory
203
+ """
204
+ output_path = self.output_dir / "huggingface"
205
+ output_path.mkdir(parents=True, exist_ok=True)
206
+
207
+ print("πŸ”„ Exporting to Hugging Face format...")
208
+
209
+ # Save model weights in HF format
210
+ model_path = output_path / "pytorch_model.bin"
211
+ torch.save(self.model.state_dict(), model_path)
212
+
213
+ # Create HF-compatible config
214
+ hf_config = {
215
+ "architectures": ["GPTModel"],
216
+ "model_type": "gpt",
217
+ "vocab_size": self.config.vocab_size,
218
+ "n_layer": self.config.n_layer,
219
+ "n_head": self.config.n_head,
220
+ "n_embd": self.config.n_embd,
221
+ "block_size": self.config.block_size,
222
+ "dropout": self.config.dropout,
223
+ "bias": self.config.bias,
224
+ "torch_dtype": "float32",
225
+ "transformers_version": "4.0.0",
226
+ "openllm_version": "0.1.0",
227
+ "training_steps": self.training_info["step"],
228
+ "model_size": self.training_info["model_size"],
229
+ }
230
+
231
+ config_path = output_path / "config.json"
232
+ with open(config_path, "w") as f:
233
+ json.dump(hf_config, f, indent=2)
234
+
235
+ # Copy tokenizer with HF naming
236
+ shutil.copy2(self.tokenizer_path, output_path / "tokenizer.model")
237
+
238
+ # Create tokenizer config
239
+ tokenizer_config = {
240
+ "tokenizer_class": "SentencePieceTokenizer",
241
+ "model_max_length": self.config.block_size,
242
+ "vocab_size": self.config.vocab_size,
243
+ "unk_token": "<unk>",
244
+ "bos_token": "<s>",
245
+ "eos_token": "</s>",
246
+ "pad_token": "<pad>",
247
+ }
248
+
249
+ with open(output_path / "tokenizer_config.json", "w") as f:
250
+ json.dump(tokenizer_config, f, indent=2)
251
+
252
+ # Create generation config
253
+ generation_config = {
254
+ "max_length": 512,
255
+ "max_new_tokens": 256,
256
+ "temperature": 0.7,
257
+ "top_k": 40,
258
+ "top_p": 0.9,
259
+ "do_sample": True,
260
+ "pad_token_id": 0,
261
+ "eos_token_id": 1,
262
+ "bos_token_id": 2,
263
+ }
264
+
265
+ with open(output_path / "generation_config.json", "w") as f:
266
+ json.dump(generation_config, f, indent=2)
267
+
268
+ # Create HF loading script
269
+ self._create_hf_loader(output_path)
270
+
271
+ print(f"βœ… Hugging Face export completed: {output_path}")
272
+ return str(output_path)
273
+
274
+ def export_onnx(self, optimize_for_inference: bool = False) -> str:
275
+ """
276
+ Export model to ONNX format for optimized inference.
277
+
278
+ Args:
279
+ optimize_for_inference: Whether to apply ONNX optimizations
280
+
281
+ Returns:
282
+ Path to exported ONNX model
283
+ """
284
+ try:
285
+ import onnx
286
+ import onnxruntime
287
+ except ImportError:
288
+ raise ImportError("ONNX export requires: pip install onnx onnxruntime")
289
+
290
+ output_path = self.output_dir / "onnx"
291
+ output_path.mkdir(parents=True, exist_ok=True)
292
+
293
+ print("πŸ”„ Exporting to ONNX format...")
294
+
295
+ # Prepare model for export
296
+ self.model.eval()
297
+
298
+ # Create dummy input for tracing
299
+ batch_size = 1
300
+ seq_len = 64 # Use shorter sequence for compatibility
301
+ dummy_input = torch.randint(0, self.config.vocab_size, (batch_size, seq_len))
302
+
303
+ # Export to ONNX
304
+ onnx_path = output_path / "model.onnx"
305
+
306
+ torch.onnx.export(
307
+ self.model,
308
+ dummy_input,
309
+ onnx_path,
310
+ export_params=True,
311
+ opset_version=11,
312
+ do_constant_folding=True,
313
+ input_names=["input_ids"],
314
+ output_names=["logits"],
315
+ dynamic_axes={
316
+ "input_ids": {0: "batch_size", 1: "sequence_length"},
317
+ "logits": {0: "batch_size", 1: "sequence_length"},
318
+ },
319
+ )
320
+
321
+ # Verify ONNX model
322
+ onnx_model = onnx.load(str(onnx_path))
323
+ onnx.checker.check_model(onnx_model)
324
+
325
+ # Apply optimizations if requested
326
+ if optimize_for_inference:
327
+ self._optimize_onnx_model(onnx_path)
328
+
329
+ # Save metadata
330
+ metadata = {
331
+ "model_config": self.config.__dict__,
332
+ "training_info": self.training_info,
333
+ "export_format": "onnx",
334
+ "input_shape": [batch_size, seq_len],
335
+ "input_names": ["input_ids"],
336
+ "output_names": ["logits"],
337
+ "optimized": optimize_for_inference,
338
+ }
339
+
340
+ with open(output_path / "metadata.json", "w") as f:
341
+ json.dump(metadata, f, indent=2)
342
+
343
+ # Copy tokenizer
344
+ shutil.copy2(self.tokenizer_path, output_path / "tokenizer.model")
345
+
346
+ # Create ONNX inference script
347
+ self._create_onnx_inference(output_path)
348
+
349
+ print(f"βœ… ONNX export completed: {onnx_path}")
350
+ return str(onnx_path)
351
+
352
+ def _optimize_onnx_model(self, onnx_path: Path):
353
+ """Apply ONNX optimizations for inference."""
354
+ try:
355
+ import onnxruntime
356
+ from onnxruntime.tools import optimizer
357
+
358
+ print("πŸ”§ Applying ONNX optimizations...")
359
+
360
+ # Create optimized model
361
+ optimized_path = onnx_path.parent / "model_optimized.onnx"
362
+
363
+ # Apply graph optimizations
364
+ optimizer.optimize_model(
365
+ str(onnx_path),
366
+ str(optimized_path),
367
+ optimization_level=optimizer.OptimizationLevel.ORT_ENABLE_ALL,
368
+ )
369
+
370
+ # Replace original with optimized
371
+ shutil.move(str(optimized_path), str(onnx_path))
372
+
373
+ print("βœ… ONNX optimizations applied")
374
+
375
+ except ImportError:
376
+ print("⚠️ ONNX optimization requires onnxruntime-tools")
377
+ except Exception as e:
378
+ print(f"⚠️ ONNX optimization failed: {e}")
379
+
380
+ def _create_pytorch_loader(self, output_path: Path):
381
+ """Create PyTorch model loader script."""
382
+ loader_script = '''#!/usr/bin/env python3
383
+ """
384
+ PyTorch Model Loader for OpenLLM
385
+
386
+ Usage:
387
+ from load_model import load_model, generate_text
388
+
389
+ model, tokenizer, config = load_model(".")
390
+ text = generate_text(model, tokenizer, "Hello world", max_length=50)
391
+ print(text)
392
+ """
393
+
394
+ import torch
395
+ import json
396
+ import sentencepiece as spm
397
+ from pathlib import Path
398
+
399
+ def load_model(model_dir="."):
400
+ """Load OpenLLM model from PyTorch export."""
401
+ model_dir = Path(model_dir)
402
+
403
+ # Load config
404
+ with open(model_dir / "config.json", 'r') as f:
405
+ config_data = json.load(f)
406
+
407
+ model_config = config_data['model_config']
408
+
409
+ # Recreate model architecture (you'll need to have the model.py file)
410
+ # This is a simplified loader - in practice you'd import your GPTModel class
411
+ print(f"Model config: {model_config}")
412
+ print("Note: You need to import and create the actual model class")
413
+
414
+ # Load model state
415
+ checkpoint = torch.load(model_dir / "model.pt", map_location='cpu')
416
+
417
+ # Load tokenizer
418
+ tokenizer = spm.SentencePieceProcessor()
419
+ tokenizer.load(str(model_dir / "tokenizer.model"))
420
+
421
+ return None, tokenizer, model_config # Placeholder
422
+
423
+ def generate_text(model, tokenizer, prompt, max_length=100):
424
+ """Generate text using the loaded model."""
425
+ # Implement text generation
426
+ return f"Generated text for: {prompt}"
427
+
428
+ if __name__ == "__main__":
429
+ model, tokenizer, config = load_model()
430
+ print(f"Model loaded with {config.get('vocab_size', 'unknown')} vocabulary size")
431
+ '''
432
+
433
+ with open(output_path / "load_model.py", "w") as f:
434
+ f.write(loader_script)
435
+
436
+ def _create_hf_loader(self, output_path: Path):
437
+ """Create Hugging Face model loader script."""
438
+ loader_script = '''#!/usr/bin/env python3
439
+ """
440
+ Hugging Face Compatible Loader for OpenLLM
441
+
442
+ Usage:
443
+ # Using transformers library (if you implement custom model class)
444
+ # from transformers import AutoModel, AutoTokenizer
445
+ # model = AutoModel.from_pretrained(".")
446
+ # tokenizer = AutoTokenizer.from_pretrained(".")
447
+
448
+ # Manual loading
449
+ from load_hf_model import load_model_manual
450
+ model, tokenizer = load_model_manual(".")
451
+ """
452
+
453
+ import torch
454
+ import json
455
+ import sentencepiece as smp
456
+ from pathlib import Path
457
+
458
+ def load_model_manual(model_dir="."):
459
+ """Manually load model in HF format."""
460
+ model_dir = Path(model_dir)
461
+
462
+ # Load config
463
+ with open(model_dir / "config.json", 'r') as f:
464
+ config = json.load(f)
465
+
466
+ # Load model weights
467
+ state_dict = torch.load(model_dir / "pytorch_model.bin", map_location='cpu')
468
+
469
+ # Load tokenizer
470
+ tokenizer = smp.SentencePieceProcessor()
471
+ tokenizer.load(str(model_dir / "tokenizer.model"))
472
+
473
+ print(f"Loaded model: {config['model_type']} with {config['n_layer']} layers")
474
+ print(f"Vocabulary size: {config['vocab_size']}")
475
+
476
+ return state_dict, tokenizer
477
+
478
+ if __name__ == "__main__":
479
+ state_dict, tokenizer = load_model_manual()
480
+ print(f"Model weights loaded: {len(state_dict)} parameters")
481
+ print(f"Tokenizer vocabulary: {tokenizer.vocab_size()}")
482
+ '''
483
+
484
+ with open(output_path / "load_hf_model.py", "w") as f:
485
+ f.write(loader_script)
486
+
487
+ def _create_onnx_inference(self, output_path: Path):
488
+ """Create ONNX inference script."""
489
+ inference_script = '''#!/usr/bin/env python3
490
+ """
491
+ ONNX Inference for OpenLLM
492
+
493
+ Usage:
494
+ from onnx_inference import ONNXInference
495
+
496
+ inference = ONNXInference(".")
497
+ output = inference.generate("Hello world", max_length=50)
498
+ print(output)
499
+ """
500
+
501
+ import numpy as np
502
+ import json
503
+ import sentencepiece as smp
504
+ from pathlib import Path
505
+
506
+ try:
507
+ import onnxruntime as ort
508
+ except ImportError:
509
+ print("Install onnxruntime: pip install onnxruntime")
510
+ ort = None
511
+
512
+ class ONNXInference:
513
+ def __init__(self, model_dir="."):
514
+ if ort is None:
515
+ raise ImportError("onnxruntime not available")
516
+
517
+ model_dir = Path(model_dir)
518
+
519
+ # Load ONNX model
520
+ self.session = ort.InferenceSession(str(model_dir / "model.onnx"))
521
+
522
+ # Load metadata
523
+ with open(model_dir / "metadata.json", 'r') as f:
524
+ self.metadata = json.load(f)
525
+
526
+ # Load tokenizer
527
+ self.tokenizer = smp.SentencePieceProcessor()
528
+ self.tokenizer.load(str(model_dir / "tokenizer.model"))
529
+
530
+ print(f"ONNX model loaded: {self.metadata['model_config']['model_name']}")
531
+
532
+ def predict(self, input_ids):
533
+ """Run inference on input token IDs."""
534
+ # Prepare input
535
+ input_data = {"input_ids": input_ids.astype(np.int64)}
536
+
537
+ # Run inference
538
+ outputs = self.session.run(None, input_data)
539
+ return outputs[0] # logits
540
+
541
+ def generate(self, prompt, max_length=50, temperature=0.7):
542
+ """Generate text from prompt."""
543
+ # Tokenize prompt
544
+ tokens = self.tokenizer.encode(prompt)
545
+ input_ids = np.array([tokens], dtype=np.int64)
546
+
547
+ # Simple greedy generation (can be improved)
548
+ generated = tokens.copy()
549
+
550
+ for _ in range(max_length):
551
+ if len(generated) >= 512: # Max sequence length
552
+ break
553
+
554
+ # Get current input (last 64 tokens to fit ONNX model)
555
+ current_input = np.array([generated[-64:]], dtype=np.int64)
556
+
557
+ # Predict next token
558
+ logits = self.predict(current_input)
559
+ next_token_logits = logits[0, -1, :] # Last position
560
+
561
+ # Apply temperature and sample
562
+ if temperature > 0:
563
+ next_token_logits = next_token_logits / temperature
564
+ probs = np.exp(next_token_logits) / np.sum(np.exp(next_token_logits))
565
+ next_token = np.random.choice(len(probs), p=probs)
566
+ else:
567
+ next_token = np.argmax(next_token_logits)
568
+
569
+ generated.append(int(next_token))
570
+
571
+ # Decode generated text
572
+ generated_text = self.tokenizer.decode(generated[len(tokens):])
573
+ return generated_text
574
+
575
+ if __name__ == "__main__":
576
+ inference = ONNXInference()
577
+ result = inference.generate("The future of AI is", max_length=30)
578
+ print(f"Generated: {result}")
579
+ '''
580
+
581
+ with open(output_path / "onnx_inference.py", "w") as f:
582
+ f.write(inference_script)
583
+
584
+ def export_all_formats(self, optimize_onnx: bool = False) -> Dict[str, str]:
585
+ """
586
+ Export model to all supported formats.
587
+
588
+ Args:
589
+ optimize_onnx: Whether to optimize ONNX model
590
+
591
+ Returns:
592
+ Dictionary mapping format names to export paths
593
+ """
594
+ results = {}
595
+
596
+ print("πŸš€ Exporting to all formats...")
597
+
598
+ try:
599
+ results["pytorch"] = self.export_pytorch()
600
+ except Exception as e:
601
+ print(f"❌ PyTorch export failed: {e}")
602
+
603
+ try:
604
+ results["huggingface"] = self.export_huggingface()
605
+ except Exception as e:
606
+ print(f"❌ Hugging Face export failed: {e}")
607
+
608
+ try:
609
+ results["onnx"] = self.export_onnx(optimize_onnx)
610
+ except Exception as e:
611
+ print(f"❌ ONNX export failed: {e}")
612
+
613
+ # Create summary
614
+ summary = {
615
+ "export_timestamp": torch.datetime.now().isoformat(),
616
+ "model_info": {
617
+ "name": self.config.model_name,
618
+ "parameters": self.model.get_num_params(),
619
+ "training_steps": self.training_info["step"],
620
+ "best_loss": self.training_info["best_loss"],
621
+ },
622
+ "exports": results,
623
+ }
624
+
625
+ with open(self.output_dir / "export_summary.json", "w") as f:
626
+ json.dump(summary, f, indent=2)
627
+
628
+ print(f"βœ… Export summary saved: {self.output_dir / 'export_summary.json'}")
629
+
630
+ return results
631
+
632
+
633
+ def main():
634
+ """Main export function."""
635
+ parser = argparse.ArgumentParser(
636
+ description="Export OpenLLM models to various formats",
637
+ formatter_class=argparse.RawDescriptionHelpFormatter,
638
+ epilog="""
639
+ Examples:
640
+ # Export to PyTorch format
641
+ python core/src/export_model.py \\
642
+ --model_dir models/small-extended-4k \\
643
+ --format pytorch \\
644
+ --output_dir exports/pytorch/
645
+
646
+ # Export to Hugging Face format
647
+ python core/src/export_model.py \\
648
+ --model_dir models/small-extended-4k \\
649
+ --format huggingface \\
650
+ --output_dir exports/huggingface/
651
+
652
+ # Export to ONNX with optimizations
653
+ python core/src/export_model.py \\
654
+ --model_dir models/small-extended-4k \\
655
+ --format onnx \\
656
+ --output_dir exports/onnx/ \\
657
+ --optimize_for_inference
658
+
659
+ # Export to all formats
660
+ python core/src/export_model.py \\
661
+ --model_dir models/small-extended-4k \\
662
+ --format all \\
663
+ --output_dir exports/
664
+ """,
665
+ )
666
+
667
+ parser.add_argument(
668
+ "--model_dir", required=True, help="Directory containing trained model checkpoints"
669
+ )
670
+
671
+ parser.add_argument(
672
+ "--format",
673
+ choices=["pytorch", "huggingface", "onnx", "all"],
674
+ required=True,
675
+ help="Export format",
676
+ )
677
+
678
+ parser.add_argument("--output_dir", required=True, help="Output directory for exported models")
679
+
680
+ parser.add_argument(
681
+ "--optimize_for_inference",
682
+ action="store_true",
683
+ help="Apply optimizations for inference (ONNX only)",
684
+ )
685
+
686
+ args = parser.parse_args()
687
+
688
+ print("πŸ“¦ OpenLLM Model Export")
689
+ print("=" * 50)
690
+
691
+ try:
692
+ # Create exporter
693
+ exporter = ModelExporter(args.model_dir, args.output_dir)
694
+
695
+ # Export based on format
696
+ if args.format == "pytorch":
697
+ result = exporter.export_pytorch()
698
+ print(f"\nβœ… PyTorch export completed: {result}")
699
+
700
+ elif args.format == "huggingface":
701
+ result = exporter.export_huggingface()
702
+ print(f"\nβœ… Hugging Face export completed: {result}")
703
+
704
+ elif args.format == "onnx":
705
+ result = exporter.export_onnx(args.optimize_for_inference)
706
+ print(f"\nβœ… ONNX export completed: {result}")
707
+
708
+ elif args.format == "all":
709
+ results = exporter.export_all_formats(args.optimize_for_inference)
710
+ print("\nβœ… All formats exported:")
711
+ for fmt, path in results.items():
712
+ print(f" {fmt}: {path}")
713
+
714
+ print("\nπŸŽ‰ Export completed successfully!")
715
+
716
+ except Exception as e:
717
+ print(f"\n❌ Export failed: {e}")
718
+ import traceback
719
+
720
+ traceback.print_exc()
721
+ return False
722
+
723
+ return True
724
+
725
+
726
+ if __name__ == "__main__":
727
+ main()
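The load_model.py stub written by _create_pytorch_loader above deliberately returns a placeholder. A minimal sketch of a working loader for the PyTorch export, assuming core/src is on sys.path so create_model is importable and the export directory contains the model.pt and tokenizer.model files produced above:

    from pathlib import Path

    import sentencepiece as spm
    import torch

    from model import create_model  # assumes core/src is importable

    def load_pytorch_export(export_dir: str):
        export_dir = Path(export_dir)
        checkpoint = torch.load(export_dir / "model.pt", map_location="cpu")
        # Rebuild the architecture from the saved size, then restore the trained weights.
        model = create_model(checkpoint["training_info"]["model_size"])
        model.load_state_dict(checkpoint["model_state_dict"])
        model.eval()
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load(str(export_dir / "tokenizer.model"))
        return model, tokenizer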
core/src/generate_text.py ADDED
@@ -0,0 +1,866 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ OpenLLM Text Generation Script
14
+
15
+ This script implements standalone text generation for OpenLLM models
16
+ as specified in Step 5 of the training pipeline (Text Generation Quality assessment).
17
+
18
+ Features:
19
+ - Load trained OpenLLM models from checkpoint directories
20
+ - Generate text with configurable parameters (temperature, length, etc.)
21
+ - Support multiple model formats (auto-detection)
22
+ - Quality assessment and metrics
23
+ - Batch generation capabilities
24
+ - Output formatting and saving
25
+
26
+ Usage:
27
+ # Basic text generation
28
+ python core/src/generate_text.py \
29
+ --model_dir models/small-extended-4k \
30
+ --prompt "The history of artificial intelligence" \
31
+ --max_length 256 \
32
+ --temperature 0.7
33
+
34
+ # Multiple prompts with custom settings
35
+ python core/src/generate_text.py \
36
+ --model_dir models/small-extended-4k \
37
+ --prompts_file prompts.txt \
38
+ --max_length 100 \
39
+ --temperature 0.8 \
40
+ --top_k 40 \
41
+ --num_samples 3
42
+
43
+ # Save results to file
44
+ python core/src/generate_text.py \
45
+ --model_dir models/small-extended-4k \
46
+ --prompt "Once upon a time" \
47
+ --output_file generated_samples.txt
48
+
49
+ Author: Louis Chua Bean Chong
50
+ License: GPLv3
51
+ """
52
+
53
+ import argparse
54
+ import os
55
+ import sys
56
+ import time
57
+ from pathlib import Path
58
+ from typing import Any, Dict, List, Optional
59
+
60
+ import sentencepiece as spm
61
+ import torch
62
+
63
+ # Add current directory to path for imports
64
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
65
+
66
+ from model import create_model
67
+
68
+
69
+ class TextGenerator:
70
+ """
71
+ Comprehensive text generation engine for OpenLLM models.
72
+
73
+ This class handles loading trained models and generating high-quality text
74
+ with configurable sampling parameters and quality assessment.
75
+ """
76
+
77
+ def __init__(self, model_dir: str, device: str = "auto"):
78
+ """
79
+ Initialize the text generator.
80
+
81
+ Args:
82
+ model_dir: Directory containing trained model checkpoints
83
+ device: Device to use ("auto", "cpu", "cuda")
84
+
85
+ Implementation Details:
86
+ - Auto-detects best available device if device="auto"
87
+ - Loads model architecture based on checkpoint configuration
88
+ - Sets up tokenizer for text processing
89
+ - Validates model and tokenizer compatibility
90
+ """
91
+ self.model_dir = Path(model_dir)
92
+
93
+ # Determine device to use
94
+ # Auto-detection prioritizes CUDA if available for better performance
95
+ if device == "auto":
96
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
97
+ else:
98
+ self.device = device
99
+
100
+ print("πŸš€ OpenLLM Text Generator")
101
+ print(f"πŸ“‚ Model directory: {model_dir}")
102
+ print(f"πŸ–₯️ Device: {self.device}")
103
+
104
+ # Load model and tokenizer
105
+ # This handles the complete setup process
106
+ self._load_model()
107
+ self._load_tokenizer()
108
+
109
+ # Validate setup
110
+ # Ensure model and tokenizer are compatible
111
+ self._validate_setup()
112
+
113
+ print("βœ… Text generator initialized successfully!")
114
+
115
+ def _load_model(self):
116
+ """
117
+ Load the trained model from checkpoint.
118
+
119
+ Implementation Details:
120
+ - Searches for best_model.pt or latest checkpoint
121
+ - Auto-detects model size from configuration
122
+ - Handles different checkpoint formats gracefully
123
+ - Sets model to evaluation mode for inference
124
+ """
125
+ # Find the best model checkpoint
126
+ # Priority: best_model.pt > latest checkpoint by step number
127
+ best_model_path = self.model_dir / "best_model.pt"
128
+
129
+ if best_model_path.exists():
130
+ checkpoint_path = best_model_path
131
+ print(f"πŸ“₯ Loading best model: {checkpoint_path}")
132
+ else:
133
+ # Look for step-based checkpoints
134
+ checkpoints = list(self.model_dir.glob("checkpoint_step_*.pt"))
135
+ if not checkpoints:
136
+ raise FileNotFoundError(f"No model checkpoints found in {self.model_dir}")
137
+
138
+ # Get the latest checkpoint by step number
139
+ latest_checkpoint = max(checkpoints, key=lambda p: int(p.stem.split("_")[-1]))
140
+ checkpoint_path = latest_checkpoint
141
+ print(f"πŸ“₯ Loading latest checkpoint: {checkpoint_path}")
142
+
143
+ # Load checkpoint data
144
+ # This contains model weights, configuration, and training metadata
145
+ try:
146
+ checkpoint = torch.load(checkpoint_path, map_location=self.device)
147
+ print("βœ… Checkpoint loaded successfully")
148
+ except Exception as e:
149
+ raise RuntimeError(f"Failed to load checkpoint: {e}")
150
+
151
+ # Extract model configuration
152
+ # This tells us what architecture to create
153
+ if "config" in checkpoint:
154
+ config_dict = checkpoint["config"]
155
+ else:
156
+ # Fallback: try to infer from model state dict
157
+ print("⚠️ No config found in checkpoint, inferring from model structure...")
158
+ config_dict = self._infer_config_from_state_dict(
159
+ checkpoint.get("model_state_dict", checkpoint)
160
+ )
161
+
162
+ # Determine model size category
163
+ # This maps checkpoint config to our predefined model sizes
164
+ n_layer = config_dict.get("n_layer", 12)
165
+ n_embd = config_dict.get("n_embd", 768)
166
+
167
+ if n_layer <= 6:
168
+ model_size = "small"
169
+ elif n_layer <= 12:
170
+ model_size = "medium"
171
+ else:
172
+ model_size = "large"
173
+
174
+ print(f"🎯 Detected model size: {model_size}")
175
+ print(f"πŸ“Š Architecture: {n_layer} layers, {n_embd} embedding dim")
176
+
177
+ # Create model architecture
178
+ # This recreates the exact same model used during training
179
+ try:
180
+ self.model = create_model(model_size)
181
+ print(f"πŸ—οΈ Model architecture created: {self.model.get_num_params():,} parameters")
182
+ except Exception as e:
183
+ raise RuntimeError(f"Failed to create model architecture: {e}")
184
+
185
+ # Load trained weights
186
+ # This restores the model to its trained state
187
+ try:
188
+ if "model_state_dict" in checkpoint:
189
+ self.model.load_state_dict(checkpoint["model_state_dict"])
190
+ else:
191
+ # Fallback for different checkpoint formats
192
+ self.model.load_state_dict(checkpoint)
193
+
194
+ print("βœ… Model weights loaded successfully")
195
+ except Exception as e:
196
+ raise RuntimeError(f"Failed to load model weights: {e}")
197
+
198
+ # Move model to device and set to evaluation mode
199
+ # Evaluation mode disables dropout and other training-specific behaviors
200
+ self.model = self.model.to(self.device)
201
+ self.model.eval()
202
+
203
+ # Store model configuration for later use
204
+ # This is useful for generation parameters and limits
205
+ self.config = self.model.config
206
+
207
+ # Extract training metadata if available
208
+ # This provides context about model quality and training progress
209
+ self.training_info = {
210
+ "step": checkpoint.get("step", "Unknown"),
211
+ "best_loss": checkpoint.get("best_loss", "Unknown"),
212
+ "model_size": model_size,
213
+ }
214
+
215
+ print(
216
+ f"πŸ“ˆ Training info: step {self.training_info['step']}, "
217
+ f"best loss {self.training_info['best_loss']}"
218
+ )
219
+
220
+ def _infer_config_from_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, Any]:
221
+ """
222
+ Infer model configuration from state dict when config is missing.
223
+
224
+ Args:
225
+ state_dict: Model parameter dictionary
226
+
227
+ Returns:
228
+ Inferred configuration dictionary
229
+
230
+ Implementation Details:
231
+ - Analyzes parameter shapes to determine architecture
232
+ - Makes reasonable assumptions about standard GPT architecture
233
+ - Provides fallback values for missing parameters
234
+ """
235
+ # Extract key dimensions from parameter shapes
236
+ # This reverse-engineers the model architecture
237
+
238
+ # Embedding layer tells us vocab size and embedding dimension
239
+ if "transformer.wte.weight" in state_dict:
240
+ vocab_size, n_embd = state_dict["transformer.wte.weight"].shape
241
+ else:
242
+ # Fallback defaults
243
+ vocab_size, n_embd = 32000, 512
244
+
245
+ # Count transformer blocks to get number of layers
246
+ # Look for attention weight patterns
247
+ n_layer = 0
248
+ for key in state_dict.keys():
249
+ if "attn.c_attn.weight" in key:
250
+ # Extract layer number from key like 'transformer.h.0.attn.c_attn.weight'
251
+ layer_num = int(key.split(".")[2])
252
+ n_layer = max(n_layer, layer_num + 1)
253
+
254
+ # Infer number of attention heads from attention weights
255
+ # The c_attn weight combines query, key, value projections
256
+ if "transformer.h.0.attn.c_attn.weight" in state_dict:
257
+ _ = state_dict["transformer.h.0.attn.c_attn.weight"].shape
258
+ # Shape is [n_embd, 3 * n_embd] for combined Q,K,V
259
+ # So n_head = n_embd / head_dim, assuming head_dim = 64
260
+ n_head = n_embd // 64 # Standard head dimension
261
+ else:
262
+ n_head = 8 # Fallback
263
+
264
+ # Construct configuration dictionary
265
+ # Use reasonable defaults for missing values
266
+ config = {
267
+ "vocab_size": vocab_size,
268
+ "n_layer": n_layer,
269
+ "n_head": n_head,
270
+ "n_embd": n_embd,
271
+ "block_size": 1024, # Standard context length
272
+ "dropout": 0.1, # Standard dropout rate
273
+ "bias": True, # Most models use bias
274
+ "model_name": f"gpt-inferred-{n_layer}L",
275
+ }
276
+
277
+ print(f"πŸ” Inferred config: {config}")
278
+ return config
279
+
280
+ def _load_tokenizer(self):
281
+ """
282
+ Load the SentencePiece tokenizer.
283
+
284
+ Implementation Details:
285
+ - Searches multiple possible tokenizer locations
286
+ - Validates tokenizer vocabulary size against model
287
+ - Sets up special tokens if available
288
+ """
289
+ # Try multiple possible tokenizer locations
290
+ # Different training setups may store tokenizer in different places
291
+ possible_paths = [
292
+ self.model_dir / "tokenizer.model",
293
+ self.model_dir.parent / "tokenizer" / "tokenizer.model",
294
+ Path("data/tokenizer/tokenizer.model"),
295
+ self.model_dir / ".." / "tokenizer" / "tokenizer.model",
296
+ ]
297
+
298
+ tokenizer_path = None
299
+ for path in possible_paths:
300
+ if path.exists():
301
+ tokenizer_path = path
302
+ break
303
+
304
+ if tokenizer_path is None:
305
+ raise FileNotFoundError(f"Tokenizer not found in any of: {possible_paths}")
306
+
307
+ print(f"πŸ“ Loading tokenizer from: {tokenizer_path}")
308
+
309
+ # Load SentencePiece tokenizer
310
+ # This handles all text-to-token and token-to-text conversion
311
+ try:
312
+ self.tokenizer = spm.SentencePieceProcessor()
313
+ self.tokenizer.load(str(tokenizer_path))
314
+ print(f"βœ… Tokenizer loaded: {self.tokenizer.vocab_size()} vocabulary")
315
+ except Exception as e:
316
+ raise RuntimeError(f"Failed to load tokenizer: {e}")
317
+
318
+ def _validate_setup(self):
319
+ """
320
+ Validate that model and tokenizer are compatible.
321
+
322
+ Implementation Details:
323
+ - Checks vocabulary size consistency
324
+ - Tests basic tokenization and model forward pass
325
+ - Warns about potential compatibility issues
326
+ """
327
+ # Check vocabulary size consistency
328
+ # Model and tokenizer should have matching vocabulary
329
+ model_vocab_size = self.config.vocab_size
330
+ tokenizer_vocab_size = self.tokenizer.vocab_size()
331
+
332
+ if model_vocab_size != tokenizer_vocab_size:
333
+ print("⚠️ Warning: Vocabulary size mismatch!")
334
+ print(f" Model expects: {model_vocab_size}")
335
+ print(f" Tokenizer has: {tokenizer_vocab_size}")
336
+ print(" This may cause generation issues.")
337
+
338
+ # Test basic functionality
339
+ # Quick validation that everything works together
340
+ try:
341
+ # Test tokenization
342
+ test_text = "Hello world"
343
+ tokens = self.tokenizer.encode(test_text)
344
+ _ = self.tokenizer.decode(tokens)
345
+
346
+ # Test model forward pass
347
+ input_ids = torch.tensor([tokens[:5]], dtype=torch.long, device=self.device)
348
+ with torch.no_grad():
349
+ _ = self.model(input_ids)
350
+
351
+ print("βœ… Validation passed: tokenization and model forward pass work")
352
+
353
+ except Exception as e:
354
+ print(f"⚠️ Validation warning: {e}")
355
+ print(" Generation may still work, but there might be issues.")
356
+
357
+ def generate(
358
+ self,
359
+ prompt: str,
360
+ max_length: int = 100,
361
+ temperature: float = 0.7,
362
+ top_k: Optional[int] = 40,
363
+ top_p: Optional[float] = 0.9,
364
+ num_return_sequences: int = 1,
365
+ do_sample: bool = True,
366
+ repetition_penalty: float = 1.0,
367
+ ) -> List[str]:
368
+ """
369
+ Generate text from a prompt using the loaded model.
370
+
371
+ Args:
372
+ prompt: Input text to continue
373
+ max_length: Maximum number of tokens to generate
374
+ temperature: Sampling temperature (0.1-2.0, higher = more random)
375
+ top_k: Limit to top-k most likely tokens (None = no limit)
376
+ top_p: Nucleus sampling threshold (None = no nucleus sampling)
377
+ num_return_sequences: Number of sequences to generate
378
+ do_sample: Whether to use sampling (False = greedy)
379
+ repetition_penalty: Penalty for repeating tokens (1.0 = no penalty)
380
+
381
+ Returns:
382
+ List of generated text strings
383
+
384
+ Implementation Details:
385
+ - Uses autoregressive generation (one token at a time)
386
+ - Supports multiple sampling strategies (greedy, top-k, nucleus)
387
+ - Handles context length limits gracefully
388
+ - Applies repetition penalty to improve quality
389
+ - Returns only the generated portion (excludes input prompt)
390
+ """
391
+ print(f"🎯 Generating text for: '{prompt[:50]}{'...' if len(prompt) > 50 else ''}'")
392
+ print(
393
+ f"βš™οΈ Parameters: max_length={max_length}, temperature={temperature}, "
394
+ f"top_k={top_k}, top_p={top_p}"
395
+ )
396
+
397
+ # Tokenize input prompt
398
+ # Convert text to token IDs for model processing
399
+ try:
400
+ input_tokens = self.tokenizer.encode(prompt)
401
+ if len(input_tokens) == 0:
402
+ raise ValueError("Empty tokenization result")
403
+ except Exception as e:
404
+ raise RuntimeError(f"Failed to tokenize prompt: {e}")
405
+
406
+ # Check prompt length against model context
407
+ # Ensure we don't exceed model's maximum sequence length
408
+ max_context = self.config.block_size
409
+ if len(input_tokens) >= max_context:
410
+ print(
411
+ f"⚠️ Warning: Prompt length ({len(input_tokens)}) approaches "
412
+ f"context limit ({max_context})"
413
+ )
414
+ # Truncate prompt if necessary
415
+ input_tokens = input_tokens[-(max_context - max_length) :]
416
+ print(f" Truncated prompt to {len(input_tokens)} tokens")
417
+
418
+ # Generate multiple sequences
419
+ # Each sequence is generated independently
420
+ generated_texts = []
421
+
422
+ for seq_idx in range(num_return_sequences):
423
+ if num_return_sequences > 1:
424
+ print(f"πŸ”„ Generating sequence {seq_idx + 1}/{num_return_sequences}")
425
+
426
+ try:
427
+ generated_text = self._generate_single_sequence(
428
+ input_tokens=input_tokens,
429
+ max_length=max_length,
430
+ temperature=temperature,
431
+ top_k=top_k,
432
+ top_p=top_p,
433
+ do_sample=do_sample,
434
+ repetition_penalty=repetition_penalty,
435
+ )
436
+ generated_texts.append(generated_text)
437
+
438
+ except Exception as e:
439
+ print(f"⚠️ Generation failed for sequence {seq_idx + 1}: {e}")
440
+ generated_texts.append(f"Generation error: {e}")
441
+
442
+ return generated_texts
443
+
444
+ def _generate_single_sequence(
445
+ self,
446
+ input_tokens: List[int],
447
+ max_length: int,
448
+ temperature: float,
449
+ top_k: Optional[int],
450
+ top_p: Optional[float],
451
+ do_sample: bool,
452
+ repetition_penalty: float,
453
+ ) -> str:
454
+ """
455
+ Generate a single text sequence using autoregressive sampling.
456
+
457
+ Args:
458
+ input_tokens: Tokenized input prompt
459
+ max_length: Maximum tokens to generate
460
+ temperature: Sampling temperature
461
+ top_k: Top-k sampling limit
462
+ top_p: Nucleus sampling threshold
463
+ do_sample: Whether to use sampling vs greedy
464
+ repetition_penalty: Repetition penalty factor
465
+
466
+ Returns:
467
+ Generated text string (excluding input prompt)
468
+
469
+ Implementation Details:
470
+ - Implements autoregressive generation loop
471
+ - Applies all specified sampling strategies
472
+ - Handles special tokens (EOS, padding)
473
+ - Tracks token frequencies for repetition penalty
474
+ """
475
+ # Initialize generation state
476
+ # Keep track of all generated tokens and their frequencies
477
+ generated_tokens = input_tokens.copy()
478
+ token_frequencies = {} # For repetition penalty
479
+
480
+ # Count initial token frequencies
481
+ # This helps apply repetition penalty from the start
482
+ for token in input_tokens:
483
+ token_frequencies[token] = token_frequencies.get(token, 0) + 1
484
+
485
+ # Set model to evaluation mode and disable gradients
486
+ # This ensures consistent inference behavior and saves memory
487
+ self.model.eval()
488
+
489
+ with torch.no_grad():
490
+ # Main generation loop
491
+ # Generate one token at a time until stopping condition
492
+ for step in range(max_length):
493
+ # Check context length limits
494
+ # Prevent exceeding model's maximum sequence length
495
+ if len(generated_tokens) >= self.config.block_size:
496
+ print(f"⚠️ Reached maximum context length ({self.config.block_size})")
497
+ break
498
+
499
+ # Prepare model input
500
+ # Use all generated tokens as context for next prediction
501
+ input_ids = torch.tensor([generated_tokens], dtype=torch.long, device=self.device)
502
+
503
+ try:
504
+ # Forward pass through model
505
+ # Get logits (raw predictions) for all vocabulary tokens
506
+ outputs = self.model(input_ids)
507
+
508
+ # Handle different model output formats
509
+ # Some models return tuples, others return tensors directly
510
+ if isinstance(outputs, tuple):
511
+ logits = outputs[0] # First element is usually logits
512
+ else:
513
+ logits = outputs
514
+
515
+ # Get predictions for next token (last position in sequence)
516
+ next_token_logits = logits[0, -1, :].float()
517
+
518
+ except Exception as e:
519
+ raise RuntimeError(f"Model forward pass failed at step {step}: {e}")
520
+
521
+ # Apply repetition penalty
522
+ # Reduce probability of recently used tokens
523
+ if repetition_penalty != 1.0:
524
+ for token, freq in token_frequencies.items():
525
+ if token < len(next_token_logits):
526
+ penalty = repetition_penalty**freq
527
+ if next_token_logits[token] > 0:
528
+ next_token_logits[token] /= penalty
529
+ else:
530
+ next_token_logits[token] *= penalty
531
+
532
+ # Apply sampling strategy to select next token
533
+ # This determines the randomness and quality of generation
534
+ if do_sample:
535
+ next_token = self._sample_next_token(
536
+ next_token_logits, temperature, top_k, top_p
537
+ )
538
+ else:
539
+ # Greedy decoding: always pick most likely token
540
+ next_token = torch.argmax(next_token_logits).item()
541
+
542
+ # Add generated token to sequence
543
+ generated_tokens.append(next_token)
544
+
545
+ # Update token frequency for repetition penalty
546
+ token_frequencies[next_token] = token_frequencies.get(next_token, 0) + 1
547
+
548
+ # Check for end-of-sequence token
549
+ # Some models/tokenizers have special EOS tokens
550
+ if hasattr(self.tokenizer, "eos_id") and next_token == self.tokenizer.eos_id():
551
+ print(f"πŸ”š Reached end-of-sequence token at step {step}")
552
+ break
553
+
554
+ # Optional: Check for other stopping conditions
555
+ # Could add custom stop words or patterns here
556
+
557
+ # Decode generated tokens to text
558
+ # Convert token IDs back to readable text, excluding input prompt
559
+ try:
560
+ # Extract only newly generated tokens (exclude input prompt)
561
+ new_tokens = generated_tokens[len(input_tokens) :]
562
+
563
+ if len(new_tokens) == 0:
564
+ return "⚠️ No tokens generated"
565
+
566
+ # Decode to text using tokenizer
567
+ generated_text = self.tokenizer.decode(new_tokens)
568
+
569
+ print(f"βœ… Generated {len(new_tokens)} tokens")
570
+ return generated_text
571
+
572
+ except Exception as e:
573
+ raise RuntimeError(f"Failed to decode generated tokens: {e}")
574
+
575
+ def _sample_next_token(
576
+ self, logits: torch.Tensor, temperature: float, top_k: Optional[int], top_p: Optional[float]
577
+ ) -> int:
578
+ """
579
+ Sample next token using specified sampling strategy.
580
+
581
+ Args:
582
+ logits: Raw model predictions for next token
583
+ temperature: Sampling temperature
584
+ top_k: Top-k sampling limit
585
+ top_p: Nucleus sampling threshold
586
+
587
+ Returns:
588
+ Selected token ID
589
+
590
+ Implementation Details:
591
+ - Applies temperature scaling for randomness control
592
+ - Implements top-k sampling to limit choices
593
+ - Implements nucleus (top-p) sampling for quality
594
+ - Uses multinomial sampling for final selection
595
+ """
596
+ # Apply temperature scaling
597
+ # Higher temperature = more random, lower = more deterministic
598
+ if temperature != 1.0:
599
+ logits = logits / temperature
600
+
601
+ # Apply top-k filtering
602
+ # Only consider the k most likely tokens
603
+ if top_k is not None and top_k > 0:
604
+ # Get indices of top-k tokens
605
+ top_k_tokens = min(top_k, logits.size(-1))
606
+ top_k_values, top_k_indices = torch.topk(logits, top_k_tokens)
607
+
608
+ # Zero out non-top-k logits
609
+ filtered_logits = torch.full_like(logits, float("-inf"))
610
+ filtered_logits[top_k_indices] = top_k_values
611
+ logits = filtered_logits
612
+
613
+ # Apply nucleus (top-p) sampling
614
+ # Dynamically adjust vocabulary based on cumulative probability
615
+ if top_p is not None and top_p < 1.0:
616
+ # Sort logits in descending order
617
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
618
+
619
+ # Calculate cumulative probabilities
620
+ sorted_probs = torch.softmax(sorted_logits, dim=-1)
621
+ cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
622
+
623
+ # Find cutoff point where cumulative probability exceeds top_p
624
+ sorted_indices_to_remove = cumulative_probs > top_p
625
+
626
+ # Keep at least the top token
627
+ sorted_indices_to_remove[0] = False
628
+
629
+ # Zero out tokens beyond nucleus
630
+ indices_to_remove = sorted_indices[sorted_indices_to_remove]
631
+ logits[indices_to_remove] = float("-inf")
632
+
633
+ # Convert logits to probabilities and sample
634
+ # Use multinomial sampling for final token selection
635
+ probs = torch.softmax(logits, dim=-1)
636
+ next_token = torch.multinomial(probs, num_samples=1).item()
637
+
638
+ return next_token
639
+
640
+ def generate_batch(self, prompts: List[str], **generation_kwargs) -> List[List[str]]:
641
+ """
642
+ Generate text for multiple prompts.
643
+
644
+ Args:
645
+ prompts: List of input prompts
646
+ **generation_kwargs: Arguments passed to generate()
647
+
648
+ Returns:
649
+ List of lists, where each inner list contains generated texts for one prompt
650
+
651
+ Implementation Details:
652
+ - Processes prompts sequentially (could be parallelized)
653
+ - Applies same generation parameters to all prompts
654
+ - Handles errors gracefully for individual prompts
655
+ """
656
+ print(f"πŸ”„ Generating text for {len(prompts)} prompts...")
657
+
658
+ all_results = []
659
+
660
+ for i, prompt in enumerate(prompts):
661
+ print(f"\n--- Prompt {i + 1}/{len(prompts)} ---")
662
+
663
+ try:
664
+ results = self.generate(prompt, **generation_kwargs)
665
+ all_results.append(results)
666
+
667
+ except Exception as e:
668
+ print(f"❌ Failed to generate for prompt {i + 1}: {e}")
669
+ all_results.append([f"Generation failed: {e}"])
670
+
671
+ return all_results
672
+
673
+
674
+ def load_prompts_from_file(file_path: str) -> List[str]:
675
+ """
676
+ Load prompts from a text file.
677
+
678
+ Args:
679
+ file_path: Path to file containing prompts (one per line)
680
+
681
+ Returns:
682
+ List of prompt strings
683
+
684
+ Implementation Details:
685
+ - Reads file line by line
686
+ - Strips whitespace and filters empty lines
687
+ - Handles different text encodings gracefully
688
+ """
689
+ try:
690
+ with open(file_path, "r", encoding="utf-8") as f:
691
+ prompts = [line.strip() for line in f if line.strip()]
692
+
693
+ print(f"πŸ“„ Loaded {len(prompts)} prompts from {file_path}")
694
+ return prompts
695
+
696
+ except Exception as e:
697
+ raise RuntimeError(f"Failed to load prompts from {file_path}: {e}")
698
+
699
+
700
+ def save_results_to_file(results: List[str], output_path: str, prompts: List[str] = None):
701
+ """
702
+ Save generation results to a text file.
703
+
704
+ Args:
705
+ results: Generated text results
706
+ output_path: Path to output file
707
+ prompts: Original prompts (optional, for context)
708
+
709
+ Implementation Details:
710
+ - Formats output with clear separators
711
+ - Includes prompts and metadata when available
712
+ - Handles file creation and error reporting
713
+ """
714
+ try:
715
+ with open(output_path, "w", encoding="utf-8") as f:
716
+ f.write("# OpenLLM Text Generation Results\n")
717
+ f.write(f"# Generated at: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
718
+ f.write(f"# Total samples: {len(results)}\n\n")
719
+
720
+ for i, result in enumerate(results):
721
+ f.write(f"--- Sample {i + 1} ---\n")
722
+
723
+ if prompts and i < len(prompts):
724
+ f.write(f"Prompt: {prompts[i]}\n\n")
725
+
726
+ if isinstance(result, list):
727
+ for j, text in enumerate(result):
728
+ f.write(f"Generated {j + 1}: {text}\n\n")
729
+ else:
730
+ f.write(f"Generated: {result}\n\n")
731
+
732
+ f.write("-" * 50 + "\n\n")
733
+
734
+ print(f"πŸ’Ύ Results saved to: {output_path}")
735
+
736
+ except Exception as e:
737
+ raise RuntimeError(f"Failed to save results to {output_path}: {e}")
738
+
739
+
740
+ def main():
741
+ """Main function for command-line text generation."""
742
+ parser = argparse.ArgumentParser(
743
+ description="OpenLLM Text Generation",
744
+ formatter_class=argparse.RawDescriptionHelpFormatter,
745
+ epilog="""
746
+ Examples:
747
+ # Basic text generation
748
+ python core/src/generate_text.py \\
749
+ --model_dir ./openllm-trained \\
750
+ --prompt "Hello, how are you?" \\
751
+ --max_length 100
752
+
753
+ # Advanced generation with parameters
754
+ python core/src/generate_text.py \\
755
+ --model_dir ./openllm-trained \\
756
+ --prompt "The future of AI is" \\
757
+ --max_length 200 \\
758
+ --temperature 0.8 \\
759
+ --top_k 50 \\
760
+ --top_p 0.9
761
+ """,
762
+ )
763
+
764
+ parser.add_argument(
765
+ "--model_dir",
766
+ required=True,
767
+ help="Directory containing trained model checkpoints",
768
+ )
769
+
770
+ parser.add_argument(
771
+ "--prompt",
772
+ required=True,
773
+ help="Input text prompt for generation",
774
+ )
775
+
776
+ parser.add_argument(
777
+ "--max_length",
778
+ type=int,
779
+ default=100,
780
+ help="Maximum number of tokens to generate (default: 100)",
781
+ )
782
+
783
+ parser.add_argument(
784
+ "--temperature",
785
+ type=float,
786
+ default=0.7,
787
+ help="Sampling temperature (default: 0.7)",
788
+ )
789
+
790
+ parser.add_argument(
791
+ "--top_k",
792
+ type=int,
793
+ default=40,
794
+ help="Top-k sampling parameter (default: 40)",
795
+ )
796
+
797
+ parser.add_argument(
798
+ "--top_p",
799
+ type=float,
800
+ default=0.9,
801
+ help="Nucleus sampling parameter (default: 0.9)",
802
+ )
803
+
804
+ parser.add_argument(
805
+ "--device",
806
+ default="auto",
807
+ choices=["auto", "cpu", "cuda"],
808
+ help="Device to use for generation (default: auto)",
809
+ )
810
+
811
+ args = parser.parse_args()
812
+
813
+ print("πŸš€ OpenLLM Text Generation")
814
+ print("=" * 50)
815
+
816
+ try:
817
+ # Initialize text generator
818
+ generator = TextGenerator(args.model_dir, args.device)
819
+
820
+ # Generate text
821
+ print(f"πŸ“ Prompt: {args.prompt}")
822
+ print(f"βš™οΈ Parameters: max_length={args.max_length}, temperature={args.temperature}")
823
+
824
+ generated_text = generator.generate(
825
+ prompt=args.prompt,
826
+ max_length=args.max_length,
827
+ temperature=args.temperature,
828
+ top_k=args.top_k,
829
+ top_p=args.top_p,
830
+ )
831
+
832
+ print("\n🎯 Generated text:")
833
+ print(f"{generated_text}")
834
+
835
+ except Exception as e:
836
+ print(f"\n❌ Error: {e}")
837
+ import traceback
838
+
839
+ traceback.print_exc()
840
+ return False
841
+
842
+ return True
843
+
844
+
845
+ def load_tokenizer(tokenizer_path: str):
846
+ """
847
+ Load tokenizer for testing purposes.
848
+
849
+ This function is used by tests to load tokenizers without initializing the full generator.
850
+
851
+ Args:
852
+ tokenizer_path: Path to tokenizer model file
853
+
854
+ Returns:
855
+ SentencePieceProcessor: Loaded tokenizer
856
+ """
857
+ import sentencepiece as spm
858
+
859
+ tokenizer = spm.SentencePieceProcessor()
860
+ tokenizer.load(tokenizer_path)
861
+ return tokenizer
862
+
863
+
864
+ if __name__ == "__main__":
865
+ success = main()
866
+ exit(0 if success else 1)
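A minimal sketch of how the helpers above compose for batch generation, assuming a trained model in ./openllm-trained and an illustrative prompts.txt; the file names are examples, not part of this commit:

    generator = TextGenerator("./openllm-trained", "cpu")        # class defined earlier in this file
    prompts = load_prompts_from_file("prompts.txt")              # one prompt per line
    results = [generator.generate(prompt=p, max_length=50) for p in prompts]
    save_results_to_file(results, "generation_results.txt", prompts=prompts)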
core/src/inference_server.py ADDED
@@ -0,0 +1,907 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ OpenLLM Inference Server
14
+
15
+ This script implements the REST API server for OpenLLM model inference
16
+ as specified in Step 6 of the training pipeline.
17
+
18
+ Features:
19
+ - FastAPI-based REST API
20
+ - Support for multiple model formats (PyTorch, Hugging Face, ONNX)
21
+ - Text generation with configurable parameters
22
+ - Health checks and metrics
23
+ - Production-ready deployment
24
+
25
+ Usage:
26
+ python core/src/inference_server.py \
27
+ --model_path exports/huggingface/ \
28
+ --host 0.0.0.0 \
29
+ --port 8000 \
30
+ --max_length 512
31
+
32
+ API Endpoints:
33
+ POST /generate - Generate text from prompt
34
+ GET /health - Health check
35
+ GET /info - Model information
36
+
37
+ Author: Louis Chua Bean Chong
38
+ License: GPLv3
39
+ """
40
+
41
+ import argparse
42
+ import json
43
+ import time
44
+ from pathlib import Path
45
+ from typing import Any, Dict, List, Optional
46
+
47
+ import uvicorn
48
+
49
+ # FastAPI imports (open source)
50
+ try:
51
+ from fastapi import BackgroundTasks, FastAPI, HTTPException
52
+ from fastapi.middleware.cors import CORSMiddleware
53
+ from pydantic import BaseModel, Field
54
+ except ImportError:
55
+ raise ImportError("Install FastAPI: pip install fastapi uvicorn[standard]")
56
+
57
+ import os
58
+
59
+ # Import our modules
60
+ import sys
61
+
62
+ import numpy as np
63
+ import sentencepiece as smp
64
+ import torch
65
+
66
+ # Add current directory to path for imports
67
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
68
+
69
+ from model import create_model
70
+
71
+
72
+ class TextGenerationConfig(BaseModel):
73
+ """Configuration for text generation parameters."""
74
+
75
+ max_new_tokens: int = Field(
76
+ 256, description="Maximum number of tokens to generate", ge=1, le=2048
77
+ )
78
+ temperature: float = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
79
+ top_k: Optional[int] = Field(40, description="Top-k sampling parameter", ge=1, le=1000)
80
+ top_p: Optional[float] = Field(0.9, description="Nucleus sampling parameter", ge=0.1, le=1.0)
81
+ num_return_sequences: int = Field(1, description="Number of sequences to generate", ge=1, le=5)
82
+ stop_sequences: Optional[List[str]] = Field(
83
+ None, description="Stop generation at these sequences"
84
+ )
85
+
86
+
87
+ class GenerationRequest(BaseModel):
88
+ """Request model for text generation."""
89
+
90
+ prompt: str = Field(..., description="Input text prompt")
91
+ max_length: int = Field(256, description="Maximum generation length", ge=1, le=2048)
92
+ temperature: float = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
93
+ top_k: Optional[int] = Field(40, description="Top-k sampling parameter", ge=1, le=1000)
94
+ top_p: Optional[float] = Field(0.9, description="Nucleus sampling parameter", ge=0.1, le=1.0)
95
+ num_return_sequences: int = Field(1, description="Number of sequences to generate", ge=1, le=5)
96
+ stop_sequences: Optional[List[str]] = Field(
97
+ None, description="Stop generation at these sequences"
98
+ )
99
+
100
+
101
+ class GenerationResponse(BaseModel):
102
+ """Response model for text generation."""
103
+
104
+ generated_text: List[str] = Field(..., description="Generated text sequences")
105
+ prompt: str = Field(..., description="Original prompt")
106
+ generation_time: float = Field(..., description="Generation time in seconds")
107
+ parameters: Dict[str, Any] = Field(..., description="Generation parameters used")
108
+
109
+
110
+ class ModelInfo(BaseModel):
111
+ """Model information response."""
112
+
113
+ model_name: str
114
+ model_size: str
115
+ parameters: int
116
+ vocab_size: int
117
+ max_length: int
118
+ format: str
119
+ loaded_at: str
120
+
121
+
122
+ class HealthResponse(BaseModel):
123
+ """Health check response."""
124
+
125
+ status: str
126
+ model_loaded: bool
127
+ uptime_seconds: float
128
+ total_requests: int
129
+
130
+
131
+ class OpenLLMInference:
132
+ """
133
+ OpenLLM model inference engine.
134
+
135
+ Supports multiple model formats and provides text generation capabilities.
136
+ """
137
+
138
+ def __init__(self, model_path: str, model_format: str = "auto"):
139
+ """
140
+ Initialize inference engine.
141
+
142
+ Args:
143
+ model_path: Path to exported model directory
144
+ model_format: Model format (pytorch, huggingface, onnx, auto)
145
+ """
146
+ self.model_path = Path(model_path)
147
+ self.model_format = model_format
148
+ self.model = None
149
+ self.tokenizer = None
150
+ self.config = None
151
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
152
+
153
+ # Load model
154
+ self._load_model()
155
+
156
+ # Statistics
157
+ self.loaded_at = time.time()
158
+ self.total_requests = 0
159
+
160
+ print("πŸš€ OpenLLM Inference Engine initialized")
161
+ print(f" Model: {self.config.get('model_name', 'Unknown')}")
162
+ print(f" Format: {self.detected_format}")
163
+ print(f" Device: {self.device}")
164
+
165
+ def _detect_format(self) -> str:
166
+ """Auto-detect model format from directory contents."""
167
+ if (self.model_path / "model.pt").exists():
168
+ return "pytorch"
169
+ elif (self.model_path / "pytorch_model.bin").exists():
170
+ return "huggingface"
171
+ elif (self.model_path / "model.onnx").exists():
172
+ return "onnx"
173
+ else:
174
+ raise ValueError(f"Could not detect model format in {self.model_path}")
175
+
176
+ def _load_model(self):
177
+ """Load model based on detected format."""
178
+ if self.model_format == "auto":
179
+ self.detected_format = self._detect_format()
180
+ else:
181
+ self.detected_format = self.model_format
182
+
183
+ print(f"πŸ“‚ Loading {self.detected_format} model from {self.model_path}")
184
+
185
+ if self.detected_format == "pytorch":
186
+ self._load_pytorch_model()
187
+ elif self.detected_format == "huggingface":
188
+ self._load_huggingface_model()
189
+ elif self.detected_format == "onnx":
190
+ self._load_onnx_model()
191
+ else:
192
+ raise ValueError(f"Unsupported format: {self.detected_format}")
193
+
194
+ # Load tokenizer
195
+ self._load_tokenizer()
196
+
197
+ print("βœ… Model loaded successfully")
198
+
199
+ def _load_pytorch_model(self):
200
+ """Load PyTorch format model."""
201
+ # Load config
202
+ with open(self.model_path / "config.json", "r") as f:
203
+ config_data = json.load(f)
204
+
205
+ self.config = config_data["model_config"]
206
+
207
+ # Load model
208
+ checkpoint = torch.load(self.model_path / "model.pt", map_location=self.device)
209
+
210
+ # Determine model size
211
+ n_layer = self.config.get("n_layer", 12)
212
+ if n_layer <= 6:
213
+ model_size = "small"
214
+ elif n_layer <= 12:
215
+ model_size = "medium"
216
+ else:
217
+ model_size = "large"
218
+
219
+ # Create model
220
+ self.model = create_model(model_size)
221
+ self.model.load_state_dict(checkpoint["model_state_dict"])
222
+ self.model.to(self.device)
223
+ self.model.eval()
224
+
225
+ def _load_huggingface_model(self):
226
+ """Load Hugging Face format model."""
227
+ # Load config
228
+ with open(self.model_path / "config.json", "r") as f:
229
+ self.config = json.load(f)
230
+
231
+ # Load model weights
232
+ state_dict = torch.load(self.model_path / "pytorch_model.bin", map_location=self.device)
233
+
234
+ # Determine model size
235
+ n_layer = self.config.get("n_layer", 12)
236
+ if n_layer <= 6:
237
+ model_size = "small"
238
+ elif n_layer <= 12:
239
+ model_size = "medium"
240
+ else:
241
+ model_size = "large"
242
+
243
+ # Create model
244
+ self.model = create_model(model_size)
245
+ self.model.load_state_dict(state_dict)
246
+ self.model.to(self.device)
247
+ self.model.eval()
248
+
249
+ def _load_onnx_model(self):
250
+ """Load ONNX format model."""
251
+ try:
252
+ import onnxruntime as ort
253
+ except ImportError:
254
+ raise ImportError("ONNX inference requires: pip install onnxruntime")
255
+
256
+ # Security mitigation: Validate model path to prevent arbitrary file access
257
+ model_file = self.model_path / "model.onnx"
258
+ if not model_file.exists():
259
+ raise FileNotFoundError(f"ONNX model not found: {model_file}")
260
+
261
+ # Security mitigation: Validate file is within expected directory
262
+ if not str(model_file).startswith(str(self.model_path)):
263
+ raise ValueError(f"Invalid model path: {model_file}")
264
+
265
+ # Load metadata with path validation
266
+ metadata_file = self.model_path / "metadata.json"
267
+ if not metadata_file.exists():
268
+ raise FileNotFoundError(f"ONNX metadata not found: {metadata_file}")
269
+
270
+ with open(metadata_file, "r") as f:
271
+ metadata = json.load(f)
272
+
273
+ self.config = metadata["model_config"]
274
+
275
+ # Create ONNX session with security options
276
+ providers = (
277
+ ["CUDAExecutionProvider", "CPUExecutionProvider"]
278
+ if torch.cuda.is_available()
279
+ else ["CPUExecutionProvider"]
280
+ )
281
+
282
+ # Security mitigation: Use session options to restrict capabilities
283
+ session_options = ort.SessionOptions()
284
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
285
+ session_options.enable_mem_pattern = False # Disable memory optimization
286
+ session_options.enable_cpu_mem_arena = False # Disable CPU memory arena
287
+
288
+ self.onnx_session = ort.InferenceSession(
289
+ str(model_file), providers=providers, sess_options=session_options
290
+ )
291
+
292
+ # ONNX models don't need device management
293
+ self.device = "onnx"
294
+
295
+ def _load_tokenizer(self):
296
+ """Load tokenizer."""
297
+ tokenizer_path = self.model_path / "tokenizer.model"
298
+ if not tokenizer_path.exists():
299
+ raise FileNotFoundError(f"Tokenizer not found: {tokenizer_path}")
300
+
301
+ self.tokenizer = smp.SentencePieceProcessor()
302
+ self.tokenizer.load(str(tokenizer_path))
303
+
304
+ def generate(
305
+ self,
306
+ prompt: str,
307
+ max_length: int = 256,
308
+ temperature: float = 0.7,
309
+ top_k: Optional[int] = 40,
310
+ top_p: Optional[float] = 0.9,
311
+ num_return_sequences: int = 1,
312
+ stop_sequences: Optional[List[str]] = None,
313
+ ) -> List[str]:
314
+ """
315
+ Generate text from prompt.
316
+
317
+ Args:
318
+ prompt: Input text prompt
319
+ max_length: Maximum generation length
320
+ temperature: Sampling temperature
321
+ top_k: Top-k sampling parameter
322
+ top_p: Nucleus sampling parameter
323
+ num_return_sequences: Number of sequences to generate
324
+ stop_sequences: Stop generation at these sequences
325
+
326
+ Returns:
327
+ List of generated text sequences
328
+ """
329
+ self.total_requests += 1
330
+
331
+ if self.detected_format == "onnx":
332
+ return self._generate_onnx(
333
+ prompt, max_length, temperature, top_k, num_return_sequences, stop_sequences
334
+ )
335
+ else:
336
+ return self._generate_pytorch(
337
+ prompt, max_length, temperature, top_k, top_p, num_return_sequences, stop_sequences
338
+ )
339
+
340
+ def _generate_pytorch(
341
+ self,
342
+ prompt: str,
343
+ max_length: int,
344
+ temperature: float,
345
+ top_k: Optional[int],
346
+ top_p: Optional[float],
347
+ num_return_sequences: int,
348
+ stop_sequences: Optional[List[str]],
349
+ ) -> List[str]:
350
+ """Generate using PyTorch model."""
351
+ # Tokenize prompt
352
+ input_ids = self.tokenizer.encode(prompt)
353
+ input_tensor = torch.tensor(
354
+ [input_ids] * num_return_sequences, dtype=torch.long, device=self.device
355
+ )
356
+
357
+ # Generate
358
+ with torch.no_grad():
359
+ outputs = []
360
+ for _ in range(num_return_sequences):
361
+ # Use model's generate method if available
362
+ if hasattr(self.model, "generate"):
363
+ output = self.model.generate(
364
+ input_tensor[:1], # Single sequence
365
+ max_new_tokens=max_length,
366
+ temperature=temperature,
367
+ top_k=top_k,
368
+ )
369
+ generated_ids = output[0].tolist()
370
+ generated_text = self.tokenizer.decode(generated_ids[len(input_ids) :])
371
+ else:
372
+ # Fallback simple generation
373
+ generated_text = self._simple_generate(
374
+ input_tensor[:1], max_length, temperature
375
+ )
376
+
377
+ # Apply stop sequences
378
+ if stop_sequences:
379
+ for stop_seq in stop_sequences:
380
+ if stop_seq in generated_text:
381
+ generated_text = generated_text.split(stop_seq)[0]
382
+ break
383
+
384
+ outputs.append(generated_text)
385
+
386
+ return outputs
387
+
388
+ def _generate_onnx(
389
+ self,
390
+ prompt: str,
391
+ max_length: int,
392
+ temperature: float,
393
+ top_k: Optional[int],
394
+ num_return_sequences: int,
395
+ stop_sequences: Optional[List[str]],
396
+ ) -> List[str]:
397
+ """Generate using ONNX model."""
398
+ outputs = []
399
+
400
+ for _ in range(num_return_sequences):
401
+ # Tokenize prompt
402
+ tokens = self.tokenizer.encode(prompt)
403
+ generated = tokens.copy()
404
+
405
+ # Simple autoregressive generation
406
+ for _ in range(max_length):
407
+ if len(generated) >= 512: # Max sequence length for ONNX
408
+ break
409
+
410
+ # Prepare input (last 64 tokens to fit ONNX model)
411
+ current_input = np.array([generated[-64:]], dtype=np.int64)
412
+
413
+ # Run inference
414
+ logits = self.onnx_session.run(None, {"input_ids": current_input})[0]
415
+ next_token_logits = logits[0, -1, :]
416
+
417
+ # Apply temperature
418
+ if temperature > 0:
419
+ next_token_logits = next_token_logits / temperature
420
+ next_token_logits = next_token_logits - np.max(next_token_logits)  # shift logits for numerical stability
+ probs = np.exp(next_token_logits) / np.sum(np.exp(next_token_logits))
421
+
422
+ # Apply top-k if specified
423
+ if top_k:
424
+ top_indices = np.argpartition(probs, -top_k)[-top_k:]
425
+ probs_filtered = np.zeros_like(probs)
426
+ probs_filtered[top_indices] = probs[top_indices]
427
+ probs = probs_filtered / np.sum(probs_filtered)
428
+
429
+ next_token = np.random.choice(len(probs), p=probs)
430
+ else:
431
+ next_token = np.argmax(next_token_logits)
432
+
433
+ generated.append(int(next_token))
434
+
435
+ # Decode generated text
436
+ generated_text = self.tokenizer.decode(generated[len(tokens) :])
437
+
438
+ # Apply stop sequences
439
+ if stop_sequences:
440
+ for stop_seq in stop_sequences:
441
+ if stop_seq in generated_text:
442
+ generated_text = generated_text.split(stop_seq)[0]
443
+ break
444
+
445
+ outputs.append(generated_text)
446
+
447
+ return outputs
448
+
449
+ def _simple_generate(
450
+ self, input_tensor: torch.Tensor, max_length: int, temperature: float
451
+ ) -> str:
452
+ """Simple fallback generation method."""
453
+ generated = input_tensor[0].tolist()
454
+
455
+ for _ in range(max_length):
456
+ if len(generated) >= self.config.get("block_size", 1024):
457
+ break
458
+
459
+ # Forward pass
460
+ current_input = torch.tensor([generated], dtype=torch.long, device=self.device)
461
+ with torch.no_grad():
462
+ logits, _ = self.model(current_input)
463
+
464
+ # Get next token logits and apply temperature
465
+ next_token_logits = logits[0, -1, :] / temperature
466
+ probs = torch.softmax(next_token_logits, dim=-1)
467
+ next_token = torch.multinomial(probs, num_samples=1).item()
468
+
469
+ generated.append(next_token)
470
+
471
+ # Decode only the generated part
472
+ original_length = input_tensor.size(1)
473
+ generated_tokens = generated[original_length:]
474
+ return self.tokenizer.decode(generated_tokens)
475
+
476
+ def get_info(self) -> Dict[str, Any]:
477
+ """Get model information."""
478
+ return {
479
+ "model_name": self.config.get("model_name", "OpenLLM"),
480
+ "model_size": self.config.get("model_size", "unknown"),
481
+ "parameters": self.config.get("n_embd", 0)
482
+ * self.config.get("n_layer", 0), # Approximate
483
+ "vocab_size": self.config.get("vocab_size", self.tokenizer.vocab_size()),
484
+ "max_length": self.config.get("block_size", 1024),
485
+ "format": self.detected_format,
486
+ "loaded_at": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.loaded_at)),
487
+ }
488
+
489
+ def get_health(self) -> Dict[str, Any]:
490
+ """Get health status."""
491
+ return {
492
+ "status": "healthy",
493
+ "model_loaded": self.model is not None,
494
+ "uptime_seconds": time.time() - self.loaded_at,
495
+ "total_requests": self.total_requests,
496
+ }
497
+
498
+
499
+ # Global inference engine
500
+ inference_engine: Optional[OpenLLMInference] = None
501
+
502
+ # FastAPI app
503
+ app = FastAPI(
504
+ title="OpenLLM Inference API",
505
+ description="REST API for OpenLLM text generation",
506
+ version="0.1.0",
507
+ docs_url="/docs",
508
+ redoc_url="/redoc",
509
+ )
510
+
511
+ # CORS middleware
512
+ app.add_middleware(
513
+ CORSMiddleware,
514
+ allow_origins=["*"], # Configure appropriately for production
515
+ allow_credentials=True,
516
+ allow_methods=["*"],
517
+ allow_headers=["*"],
518
+ )
519
+
520
+
521
+ @app.on_event("startup")
522
+ async def startup_event():
523
+ """Initialize inference engine on startup."""
524
+ print("πŸš€ Starting OpenLLM Inference Server...")
525
+ # Note: Model loading is handled in main() function
526
+ # For testing, we'll create a mock model if none exists
527
+ global inference_engine
528
+ if inference_engine is None:
529
+ print("⚠️ No model loaded - server will return 503 for generation requests")
530
+ print(" Use main() function to load a real model")
531
+ print(" For testing, use load_model_for_testing() function")
532
+
533
+
534
+ @app.post("/generate", response_model=GenerationResponse)
535
+ async def generate_text(request: GenerationRequest, background_tasks: BackgroundTasks):
536
+ """Generate text from prompt."""
537
+ if inference_engine is None:
538
+ raise HTTPException(status_code=503, detail="Model not loaded")
539
+
540
+ start_time = time.time()
541
+
542
+ try:
543
+ # Generate text
544
+ generated_texts = inference_engine.generate(
545
+ prompt=request.prompt,
546
+ max_length=request.max_length,
547
+ temperature=request.temperature,
548
+ top_k=request.top_k,
549
+ top_p=request.top_p,
550
+ num_return_sequences=request.num_return_sequences,
551
+ stop_sequences=request.stop_sequences,
552
+ )
553
+
554
+ generation_time = time.time() - start_time
555
+
556
+ return GenerationResponse(
557
+ generated_text=generated_texts,
558
+ prompt=request.prompt,
559
+ generation_time=generation_time,
560
+ parameters={
561
+ "max_length": request.max_length,
562
+ "temperature": request.temperature,
563
+ "top_k": request.top_k,
564
+ "top_p": request.top_p,
565
+ "num_return_sequences": request.num_return_sequences,
566
+ },
567
+ )
568
+
569
+ except Exception as e:
570
+ raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
571
+
572
+
573
+ @app.post("/generate/stream")
574
+ async def generate_text_stream(request: GenerationRequest):
575
+ """Generate text with streaming response."""
576
+ if inference_engine is None:
577
+ raise HTTPException(status_code=503, detail="Model not loaded")
578
+
579
+ try:
580
+ # For now, return a simple streaming response
581
+ # In a real implementation, this would stream tokens as they're generated
582
+ generated_texts = inference_engine.generate(
583
+ prompt=request.prompt,
584
+ max_length=request.max_length,
585
+ temperature=request.temperature,
586
+ top_k=request.top_k,
587
+ top_p=request.top_p,
588
+ num_return_sequences=request.num_return_sequences,
589
+ stop_sequences=request.stop_sequences,
590
+ )
591
+
592
+ # Return as streaming response
593
+ return {
594
+ "generated_text": generated_texts,
595
+ "prompt": request.prompt,
596
+ "streaming": True,
597
+ }
598
+
599
+ except Exception as e:
600
+ raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
601
+
602
+
603
+ @app.get("/info", response_model=ModelInfo)
604
+ async def get_model_info():
605
+ """Get model information."""
606
+ if inference_engine is None:
607
+ raise HTTPException(status_code=503, detail="Model not loaded")
608
+
609
+ info = inference_engine.get_info()
610
+ return ModelInfo(**info)
611
+
612
+
613
+ @app.get("/health", response_model=HealthResponse)
614
+ async def health_check():
615
+ """Health check endpoint."""
616
+ if inference_engine is None:
617
+ return HealthResponse(
618
+ status="unhealthy", model_loaded=False, uptime_seconds=0.0, total_requests=0
619
+ )
620
+
621
+ health = inference_engine.get_health()
622
+ return HealthResponse(**health)
623
+
624
+
625
+ @app.get("/")
626
+ async def root():
627
+ """Root endpoint."""
628
+ return {
629
+ "message": "OpenLLM Inference API",
630
+ "version": "0.1.0",
631
+ "docs": "/docs",
632
+ "health": "/health",
633
+ "info": "/info",
634
+ "endpoints": ["/generate", "/generate/stream", "/health", "/info"],
635
+ }
636
+
637
+
638
+ def main():
639
+ """Main server function."""
640
+ parser = argparse.ArgumentParser(
641
+ description="OpenLLM Inference Server",
642
+ formatter_class=argparse.RawDescriptionHelpFormatter,
643
+ epilog="""
644
+ Examples:
645
+ # Start server with Hugging Face model
646
+ python core/src/inference_server.py \\
647
+ --model_path exports/huggingface/ \\
648
+ --host 0.0.0.0 \\
649
+ --port 8000
650
+
651
+ # Start server with ONNX model
652
+ python core/src/inference_server.py \\
653
+ --model_path exports/onnx/ \\
654
+ --format onnx \\
655
+ --port 8001
656
+ """,
657
+ )
658
+
659
+ parser.add_argument(
660
+ "--model_path",
661
+ required=True,
662
+ help="Path to exported model directory",
663
+ )
664
+
665
+ parser.add_argument(
666
+ "--format",
667
+ choices=["pytorch", "huggingface", "onnx", "auto"],
668
+ default="auto",
669
+ help="Model format (default: auto-detect)",
670
+ )
671
+
672
+ parser.add_argument(
673
+ "--host",
674
+ default="127.0.0.1",
675
+ help="Host to bind to (default: 127.0.0.1)",
676
+ )
677
+
678
+ parser.add_argument(
679
+ "--port",
680
+ type=int,
681
+ default=8000,
682
+ help="Port to bind to (default: 8000)",
683
+ )
684
+
685
+ parser.add_argument(
686
+ "--max_length",
687
+ type=int,
688
+ default=512,
689
+ help="Maximum generation length (default: 512)",
690
+ )
691
+
692
+ args = parser.parse_args()
693
+
694
+ # Initialize inference engine
695
+ global inference_engine
696
+ inference_engine = OpenLLMInference(args.model_path, args.format)
697
+
698
+ # Start server
699
+ print(f"πŸš€ Starting server on {args.host}:{args.port}")
700
+ uvicorn.run(
701
+ app,
702
+ host=args.host,
703
+ port=args.port,
704
+ log_level="info",
705
+ )
706
+
707
+
708
+ def load_model(model_path: str, model_format: str = "auto"):
709
+ """
710
+ Load model for testing purposes.
711
+
712
+ This function is used by tests to load models without starting the full server.
713
+
714
+ Args:
715
+ model_path: Path to exported model directory
716
+ model_format: Model format (pytorch, huggingface, onnx, auto)
717
+
718
+ Returns:
719
+ OpenLLMInference: Initialized inference engine
720
+ """
721
+ return OpenLLMInference(model_path, model_format)
722
+
723
+
724
+ def load_model_for_testing(
725
+ model_path: str = "exports/huggingface", model_format: str = "huggingface"
726
+ ):
727
+ """
728
+ Load a real model for testing purposes.
729
+
730
+ This function loads the actual trained model for testing.
731
+
732
+ Args:
733
+ model_path: Path to the model directory (default: exports/huggingface)
734
+ model_format: Model format (default: huggingface)
735
+
736
+ Returns:
737
+ OpenLLMInference: Real inference engine with loaded model
738
+ """
739
+ global inference_engine
740
+ try:
741
+ inference_engine = OpenLLMInference(model_path, model_format)
742
+ print(f"βœ… Real model loaded for testing from {model_path}")
743
+ return inference_engine
744
+ except Exception as e:
745
+ print(f"❌ Failed to load real model: {e}")
746
+ # Fallback to mock model for testing
747
+ return create_test_model()
748
+
749
+
750
+ def create_test_model():
751
+ """
752
+ Create a real lightweight test model for testing purposes.
753
+
754
+ This creates a real model with minimal parameters for testing,
755
+ without requiring large model files to be downloaded.
756
+
757
+ Returns:
758
+ OpenLLMInference: Real lightweight inference engine
759
+ """
760
+ try:
761
+ # Create a real model with minimal parameters
762
+ import sentencepiece as smp
763
+ from model import GPTConfig, GPTModel
764
+
765
+ # Create minimal config for testing
766
+ config = GPTConfig.small()
767
+ config.n_embd = 128 # Very small for testing
768
+ config.n_layer = 2 # Very small for testing
769
+ config.vocab_size = 1000 # Small vocabulary
770
+ config.block_size = 64 # Small context
771
+
772
+ # Create real model
773
+ model = GPTModel(config)
774
+ model.eval()
775
+
776
+ # Create minimal tokenizer
777
+ class MinimalTokenizer:
778
+ def __init__(self):
779
+ self._vocab_size = 1000  # stored privately so it does not shadow the vocab_size() method below
780
+
781
+ def encode(self, text):
782
+ # Simple character-based encoding for testing
783
+ return [ord(c) % 1000 for c in text[:50]] # Limit to 50 chars
784
+
785
+ def decode(self, tokens):
786
+ # Simple character-based decoding for testing
787
+ return "".join([chr(t % 256) for t in tokens if t < 256])
788
+
789
+ def vocab_size(self):
790
+ return self._vocab_size
791
+
792
+ # Create real inference engine with lightweight model
793
+ class LightweightInferenceEngine:
794
+ def __init__(self):
795
+ self.model = model
796
+ self.tokenizer = MinimalTokenizer()
797
+ self.config = {
798
+ "model_name": "openllm-small-test",
799
+ "model_size": "small",
800
+ "n_embd": config.n_embd,
801
+ "n_layer": config.n_layer,
802
+ "vocab_size": config.vocab_size,
803
+ "block_size": config.block_size,
804
+ }
805
+ self.detected_format = "pytorch"
806
+ self.device = "cpu"
807
+ self.loaded_at = time.time()
808
+ self.total_requests = 0
809
+
810
+ def generate(self, prompt, max_length=10, temperature=0.7, **kwargs):
811
+ """Real text generation with lightweight model."""
812
+ self.total_requests += 1
813
+
814
+ # Tokenize input
815
+ input_ids = self.tokenizer.encode(prompt)
816
+ if len(input_ids) == 0:
817
+ input_ids = [1] # Default token
818
+
819
+ # Simple autoregressive generation
820
+ generated = input_ids.copy()
821
+ for _ in range(max_length):
822
+ if len(generated) >= self.config["block_size"]:
823
+ break
824
+
825
+ # Create input tensor
826
+ input_tensor = torch.tensor([generated], dtype=torch.long)
827
+
828
+ # Forward pass
829
+ with torch.no_grad():
830
+ logits, _ = self.model(input_tensor)
831
+
832
+ # Get next token
833
+ next_token_logits = logits[0, -1, :] / temperature
834
+ probs = torch.softmax(next_token_logits, dim=-1)
835
+ next_token = torch.multinomial(probs, num_samples=1).item()
836
+
837
+ generated.append(next_token)
838
+
839
+ # Decode generated text
840
+ generated_text = self.tokenizer.decode(generated[len(input_ids) :])
841
+ return [generated_text]
842
+
843
+ def get_info(self):
844
+ """Get real model information."""
845
+ return {
846
+ "model_name": "openllm-small-test",
847
+ "model_size": "small",
848
+ "parameters": config.n_embd * config.n_layer * 1000,
849
+ "vocab_size": config.vocab_size,
850
+ "max_length": config.block_size,
851
+ "format": "pytorch",
852
+ "loaded_at": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.loaded_at)),
853
+ }
854
+
855
+ def get_health(self):
856
+ """Get real health status."""
857
+ return {
858
+ "status": "healthy",
859
+ "model_loaded": True,
860
+ "uptime_seconds": time.time() - self.loaded_at,
861
+ "total_requests": self.total_requests,
862
+ }
863
+
864
+ return LightweightInferenceEngine()
865
+
866
+ except Exception as e:
867
+ print(f"⚠️ Failed to create lightweight model: {e}")
868
+
869
+ # Fallback to simple mock if real model creation fails
870
+ class SimpleMockInferenceEngine:
871
+ def __init__(self):
872
+ self.model = "simple_mock"
873
+ self.tokenizer = "simple_mock"
874
+ self.config = {"model_name": "fallback-model"}
875
+ self.detected_format = "pytorch"
876
+ self.device = "cpu"
877
+ self.loaded_at = time.time()
878
+ self.total_requests = 0
879
+
880
+ def generate(self, prompt, **kwargs):
881
+ self.total_requests += 1
882
+ return [f"Generated: {prompt[:10]}..."]
883
+
884
+ def get_info(self):
885
+ return {
886
+ "model_name": "fallback-model",
887
+ "model_size": "small",
888
+ "parameters": 1000,
889
+ "vocab_size": 1000,
890
+ "max_length": 100,
891
+ "format": "pytorch",
892
+ "loaded_at": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.loaded_at)),
893
+ }
894
+
895
+ def get_health(self):
896
+ return {
897
+ "status": "healthy",
898
+ "model_loaded": True,
899
+ "uptime_seconds": time.time() - self.loaded_at,
900
+ "total_requests": self.total_requests,
901
+ }
902
+
903
+ return SimpleMockInferenceEngine()
904
+
905
+
906
+ if __name__ == "__main__":
907
+ main()
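A minimal client sketch for the /generate endpoint defined above, assuming the server is running locally on port 8000; the requests library is an assumption of this example, not a dependency added by this commit:

    import requests

    payload = {
        "prompt": "The future of AI is",
        "max_length": 64,
        "temperature": 0.7,
        "top_k": 40,
        "top_p": 0.9,
        "num_return_sequences": 1,
    }
    resp = requests.post("http://127.0.0.1:8000/generate", json=payload, timeout=60)
    resp.raise_for_status()
    print(resp.json()["generated_text"])  # list of generated sequences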
core/src/main.py ADDED
@@ -0,0 +1,842 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ OpenLLM - Main CLI Entry Point
14
+
15
+ This module provides a unified command-line interface for all OpenLLM operations
16
+ including data preparation, tokenizer training, model training, and inference.
17
+
18
+ Usage:
19
+ python core/src/main.py <command> [options]
20
+
21
+ Available Commands:
22
+ prepare-data Download and prepare training data from SQUAD dataset
23
+ train-tokenizer Train a SentencePiece tokenizer on the prepared data
24
+ test-model Test and validate model architecture
25
+ train-model Train the language model
26
+ inference Run model inference (coming soon)
27
+ evaluate Evaluate model performance (coming soon)
28
+
29
+ Examples:
30
+ # Full pipeline
31
+ python core/src/main.py prepare-data
32
+ python core/src/main.py train-tokenizer --vocab-size 32000
33
+ python core/src/main.py test-model --model-size small
34
+ python core/src/main.py train-model --model-size small --output-dir models/my-model
35
+
36
+ # Help for specific commands
37
+ python core/src/main.py train-model --help
38
+ """
39
+
40
+ import argparse
41
+ import os
42
+ import sys
43
+ from pathlib import Path
44
+
45
+ # Set console encoding for Windows compatibility
46
+ if sys.platform == "win32":
47
+ import codecs
48
+ sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
49
+ sys.stderr = codecs.getwriter("utf-8")(sys.stderr.detach())
50
+
51
+ # Add the current directory to Python path for imports
52
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
53
+
54
+ try:
55
+ from download_and_prepare import prepare_training_data
56
+ from model_test import ModelTester
57
+ from train_tokenizer import (
58
+ count_training_sentences,
59
+ save_huggingface_config,
60
+ test_tokenizer,
61
+ train_sentencepiece_tokenizer,
62
+ validate_input_file,
63
+ )
64
+ except ImportError as e:
65
+ print(f"Error importing modules: {e}")
66
+ print("Make sure you're running this from the correct directory.")
67
+ sys.exit(1)
68
+
69
+
70
+ def cmd_prepare_data(args):
71
+ """Execute data preparation command."""
72
+ print("πŸ—‚οΈ Starting data preparation...")
73
+ print(f"Output path: {args.output}")
74
+ print(f"Minimum words per passage: {args.min_words}")
75
+
76
+ try:
77
+ prepare_training_data(output_path=args.output, min_words=args.min_words)
78
+ print("βœ… Data preparation completed successfully!")
79
+ return True
80
+ except Exception as e:
81
+ print(f"❌ Data preparation failed: {e}")
82
+ return False
83
+
84
+
85
+ def cmd_train_tokenizer(args):
86
+ """Execute tokenizer training command."""
87
+ print("πŸ”€ Starting tokenizer training...")
88
+ print(f"Input: {args.input}")
89
+ print(f"Output directory: {args.output_dir}")
90
+ print(f"Vocabulary size: {args.vocab_size:,}")
91
+ print(f"Model type: {args.model_type}")
92
+
93
+ try:
94
+ # Step 1: Validate input
95
+ validate_input_file(args.input)
96
+
97
+ # Step 2: Count training data
98
+ sentence_count = count_training_sentences(args.input)
99
+
100
+ # Step 3: Train tokenizer
101
+ config = train_sentencepiece_tokenizer(
102
+ input_path=args.input,
103
+ output_dir=args.output_dir,
104
+ vocab_size=args.vocab_size,
105
+ model_type=args.model_type,
106
+ character_coverage=args.character_coverage,
107
+ max_sentence_length=args.max_sentence_length,
108
+ )
109
+
110
+ # Step 4: Save Hugging Face config
111
+ save_huggingface_config(args.output_dir, config)
112
+
113
+ # Step 5: Test tokenizer (unless skipped)
114
+ if not args.no_test:
115
+ model_path = os.path.join(args.output_dir, "tokenizer.model")
116
+ test_tokenizer(model_path)
117
+
118
+ print("βœ… Tokenizer training completed successfully!")
119
+ print(f"πŸ“ Output: {args.output_dir}")
120
+ print(f"πŸ“Š Vocabulary size: {config['vocab_size']:,}")
121
+ print(f"πŸ“„ Training sentences: {sentence_count:,}")
122
+ return True
123
+
124
+ except Exception as e:
125
+ print(f"❌ Tokenizer training failed: {e}")
126
+ return False
127
+
128
+
129
+ def cmd_train_model(args):
130
+ """Execute model training command."""
131
+ print("πŸ—οΈ Starting model training...")
132
+
133
+ try:
134
+ import os
135
+
136
+ import torch
137
+ from data_loader import TextDataLoader
138
+ from train_model import ModelTrainer, create_model
139
+
140
+ # Determine device
141
+ if args.device == "auto":
142
+ device = "cuda" if torch.cuda.is_available() else "cpu"
143
+ else:
144
+ device = args.device
145
+
146
+ print(f"Device: {device}")
147
+
148
+ # Create model
149
+ print(f"Creating {args.model_size} model...")
150
+ model = create_model(args.model_size)
151
+
152
+ # Create data loader
153
+ print("Setting up data loader...")
154
+ tokenizer_path = os.path.join(args.tokenizer_dir, "tokenizer.model")
155
+
156
+ if not os.path.exists(tokenizer_path):
157
+ print(f"❌ Tokenizer not found at {tokenizer_path}")
158
+ print(
159
+ "Please run: python core/src/main.py train-tokenizer --input data/clean/training_data.txt"
160
+ )
161
+ return False
162
+
163
+ data_loader = TextDataLoader(
164
+ data_file=args.data_file,
165
+ tokenizer_path=tokenizer_path,
166
+ seq_len=args.seq_len,
167
+ batch_size=args.batch_size,
168
+ shuffle=True,
169
+ )
170
+
171
+ # Get data statistics
172
+ _ = data_loader.get_data_stats()
173
+
174
+ # Create trainer
175
+ print("Setting up trainer...")
176
+ trainer = ModelTrainer(
177
+ model=model,
178
+ data_loader=data_loader,
179
+ output_dir=args.output_dir,
180
+ device=device,
181
+ learning_rate=args.learning_rate,
182
+ max_steps=args.max_steps,
183
+ warmup_steps=args.warmup_steps,
184
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
185
+ save_every=args.save_every,
186
+ )
187
+
188
+ # Resume from checkpoint if specified
189
+ if args.resume:
190
+ trainer._load_checkpoint(args.resume)
191
+
192
+ # Start training
193
+ trainer.train()
194
+
195
+ return True
196
+
197
+ except Exception as e:
198
+ print(f"❌ Training failed: {e}")
199
+ import traceback
200
+
201
+ traceback.print_exc()
202
+ return False
203
+
204
+
205
+ def cmd_inference(args):
206
+ """
207
+ Execute model inference command.
208
+
209
+ This function implements text generation using trained OpenLLM models.
210
+ It supports multiple model formats and provides flexible generation options.
211
+
212
+ Args:
213
+ args: Namespace containing CLI arguments including:
214
+ - model_path: Path to trained model directory
215
+ - prompt: Input text prompt for generation
216
+ - max_length: Maximum number of tokens to generate
217
+ - temperature: Sampling temperature (0.1-2.0)
218
+ - format: Model format (auto-detect by default)
219
+
220
+ Returns:
221
+ bool: True if inference succeeded, False otherwise
222
+
223
+ Implementation Details:
224
+ - Auto-detects model format (PyTorch, Hugging Face, ONNX)
225
+ - Uses inference_server.py's OpenLLMInference class for generation
226
+ - Supports configurable generation parameters
227
+ - Handles errors gracefully with informative messages
228
+ """
229
+ print("πŸš€ OpenLLM Model Inference")
230
+ print("=" * 40)
231
+
232
+ try:
233
+ # Import inference functionality
234
+ # We import here to avoid circular imports and handle missing dependencies
235
+ import os
236
+ import sys
237
+
238
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
239
+
240
+ from inference_server import OpenLLMInference
241
+
242
+ # Validate model path exists
243
+ # Early validation prevents confusing error messages later
244
+ model_path = Path(args.model_path)
245
+ if not model_path.exists():
246
+ print(f"❌ Model path not found: {args.model_path}")
247
+ print(" Please check the path and try again.")
248
+ return False
249
+
250
+ # Initialize inference engine
251
+ # This handles model loading and format detection automatically
252
+ print(f"πŸ“‚ Loading model from: {args.model_path}")
253
+ inference_engine = OpenLLMInference(
254
+ model_path=str(model_path),
255
+ model_format=getattr(args, "format", "auto"), # Default to auto-detection
256
+ )
257
+
258
+ # Prepare generation parameters
259
+ # These parameters control the quality and style of generated text
260
+ generation_params = {
261
+ "max_length": args.max_length,
262
+ "temperature": getattr(args, "temperature", 0.7), # Default temperature
263
+ "top_k": getattr(args, "top_k", 40), # Default top-k
264
+ "top_p": getattr(args, "top_p", 0.9), # Default nucleus sampling
265
+ "num_return_sequences": getattr(args, "num_sequences", 1), # Default single sequence
266
+ }
267
+
268
+ print(f"πŸ’­ Generating text for prompt: '{args.prompt}'")
269
+ print(
270
+ f"βš™οΈ Parameters: max_length={generation_params['max_length']}, "
271
+ f"temperature={generation_params['temperature']}"
272
+ )
273
+
274
+ # Generate text using the inference engine
275
+ # This is the core functionality that produces the output
276
+ import time
277
+
278
+ start_time = time.time()
279
+
280
+ generated_texts = inference_engine.generate(prompt=args.prompt, **generation_params)
281
+
282
+ generation_time = time.time() - start_time
283
+
284
+ # Display results with formatting
285
+ # Clear presentation helps users understand the output
286
+ print("\n✨ Generated Text:")
287
+ print("-" * 50)
288
+
289
+ for i, text in enumerate(generated_texts, 1):
290
+ if len(generated_texts) > 1:
291
+ print(f"\n[Sequence {i}]")
292
+ print(text)
293
+
294
+ print("-" * 50)
295
+ print(f"⏱️ Generation time: {generation_time:.2f} seconds")
296
+ print(f"πŸ“Š Tokens generated: ~{len(generated_texts[0].split())}")
297
+ print(f"🎯 Model: {inference_engine.config.get('model_name', 'OpenLLM')}")
298
+
299
+ return True
300
+
301
+ except ImportError as e:
302
+ print(f"❌ Missing dependencies for inference: {e}")
303
+ print(" Please install: pip install fastapi uvicorn")
304
+ return False
305
+
306
+ except Exception as e:
307
+ print(f"❌ Inference failed: {e}")
308
+ import traceback
309
+
310
+ traceback.print_exc()
311
+ return False
312
+
313
+
314
+ def cmd_evaluate(args):
315
+ """
316
+ Execute model evaluation command.
317
+
318
+ This function implements comprehensive model evaluation including intrinsic
319
+ metrics (perplexity) and downstream task performance assessment.
320
+
321
+ Args:
322
+ args: Namespace containing CLI arguments including:
323
+ - model_path: Path to trained model directory
324
+ - eval_data: Path to evaluation dataset (optional)
325
+ - metrics: Comma-separated list of metrics to compute
326
+ - output_dir: Directory to save evaluation results
327
+ - format: Model format (auto-detect by default)
328
+
329
+ Returns:
330
+ bool: True if evaluation succeeded, False otherwise
331
+
332
+ Implementation Details:
333
+ - Uses evaluate_model.py's ModelEvaluator class for comprehensive testing
334
+ - Computes perplexity on held-out data if provided
335
+ - Runs downstream task evaluation (reading comprehension, sentiment, etc.)
336
+ - Generates detailed evaluation report with metrics and examples
337
+ - Saves results to JSON file for further analysis
338
+ """
339
+ print("πŸ“Š OpenLLM Model Evaluation")
340
+ print("=" * 40)
341
+
342
+ try:
343
+ # Import evaluation functionality
344
+ # We import here to avoid circular imports and handle missing dependencies
345
+ import json
346
+ import os
347
+ import sys
348
+ from pathlib import Path
349
+
350
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
351
+
352
+ from evaluate_model import ModelEvaluator
353
+
354
+ # Validate model path exists
355
+ # Early validation prevents confusing error messages later
356
+ model_path = Path(args.model_path)
357
+ if not model_path.exists():
358
+ print(f"❌ Model path not found: {args.model_path}")
359
+ print(" Please check the path and try again.")
360
+ return False
361
+
362
+ # Determine output directory for results
363
+ # Create output directory if it doesn't exist
364
+ output_dir = Path(getattr(args, "output_dir", "evaluation_results"))
365
+ output_dir.mkdir(parents=True, exist_ok=True)
366
+
367
+ # Parse requested metrics
368
+ # Default to comprehensive evaluation if not specified
369
+ requested_metrics = (getattr(args, "metrics", None) or "perplexity,generation,downstream").split(",")
370
+ requested_metrics = [m.strip() for m in requested_metrics]
371
+
372
+ print(f"πŸ“‚ Loading model from: {args.model_path}")
373
+ print(f"πŸ“‹ Requested metrics: {', '.join(requested_metrics)}")
374
+ print(f"πŸ’Ύ Results will be saved to: {output_dir}")
375
+
376
+ # Initialize model evaluator
377
+ # This handles model loading and tokenizer setup
378
+ evaluator = ModelEvaluator(
379
+ model_dir=str(model_path),
380
+ tokenizer_path=getattr(args, "tokenizer_path", None), # Auto-detect if not provided
381
+ )
382
+
383
+ # Prepare evaluation results container
384
+ # This will store all evaluation metrics and examples
385
+ evaluation_results = {
386
+ "model_info": {
387
+ "model_path": str(model_path),
388
+ "model_name": evaluator.config.get("model_name", "OpenLLM"),
389
+ "parameters": evaluator.model.get_num_params(),
390
+ "evaluation_time": None,
391
+ },
392
+ "metrics": {},
393
+ "examples": {},
394
+ "summary": {},
395
+ }
396
+
397
+ import time
398
+
399
+ start_time = time.time()
400
+
401
+ # 1. Perplexity Evaluation
402
+ # This measures how well the model predicts the next token
403
+ if "perplexity" in requested_metrics:
404
+ print("\nπŸ” Computing perplexity...")
405
+
406
+ eval_data_path = getattr(args, "eval_data", None)
407
+ if eval_data_path and Path(eval_data_path).exists():
408
+ # Use provided evaluation data
409
+ perplexity_result = evaluator.evaluate_perplexity(eval_data_path)
410
+ else:
411
+ # Use a subset of training data for perplexity calculation
412
+ print(" No eval data provided, using default test set")
413
+ perplexity_result = evaluator.evaluate_perplexity()
414
+
415
+ evaluation_results["metrics"]["perplexity"] = perplexity_result
416
+
417
+ print(f" βœ… Perplexity: {perplexity_result.get('perplexity', 'N/A'):.2f}")
418
+ print(f" πŸ“Š Loss: {perplexity_result.get('loss', 'N/A'):.4f}")
419
+
420
+ # 2. Text Generation Quality Assessment
421
+ # This evaluates the coherence and quality of generated text
422
+ if "generation" in requested_metrics:
423
+ print("\n✍️ Evaluating text generation quality...")
424
+
425
+ generation_result = evaluator.evaluate_text_generation()
426
+ evaluation_results["metrics"]["generation"] = generation_result
427
+ evaluation_results["examples"]["generation"] = generation_result.get("examples", [])
428
+
429
+ print(
430
+ f" βœ… Average quality score: {generation_result.get('average_quality', 'N/A'):.2f}"
431
+ )
432
+ print(f" πŸ“ Generated {len(generation_result.get('examples', []))} examples")
433
+
434
+ # 3. Downstream Task Evaluation
435
+ # This tests specific capabilities like reading comprehension
436
+ if "downstream" in requested_metrics:
437
+ print("\n🎯 Evaluating downstream tasks...")
438
+
439
+ downstream_result = evaluator.evaluate_downstream_tasks()
440
+ evaluation_results["metrics"]["downstream"] = downstream_result
441
+ evaluation_results["examples"]["downstream"] = {
442
+ task: result.get("examples", []) for task, result in downstream_result.items()
443
+ }
444
+
445
+ # Display summary of downstream results
446
+ for task_name, task_result in downstream_result.items():
447
+ accuracy = task_result.get("accuracy", 0) * 100
448
+ print(f" βœ… {task_name.replace('_', ' ').title()}: {accuracy:.1f}%")
449
+
450
+ # Calculate total evaluation time
451
+ evaluation_time = time.time() - start_time
452
+ evaluation_results["model_info"]["evaluation_time"] = evaluation_time
453
+
454
+ # Generate evaluation summary
455
+ # This provides a high-level overview of model performance
456
+ summary = {
457
+ "overall_score": 0.0, # Will be calculated based on available metrics
458
+ "strengths": [],
459
+ "weaknesses": [],
460
+ "recommendations": [],
461
+ }
462
+
463
+ # Calculate overall score based on available metrics
464
+ scores = []
465
+
466
+ if "perplexity" in evaluation_results["metrics"]:
467
+ ppl = evaluation_results["metrics"]["perplexity"].get("perplexity", float("inf"))
468
+ # Convert perplexity to 0-100 score (lower perplexity is better)
469
+ ppl_score = max(0, 100 - (ppl - 10) * 5) # Rough conversion
470
+ scores.append(ppl_score)
471
+
472
+ if ppl < 15:
473
+ summary["strengths"].append("Good language modeling (low perplexity)")
474
+ else:
475
+ summary["weaknesses"].append("High perplexity indicates poor language modeling")
476
+
477
+ if "generation" in evaluation_results["metrics"]:
478
+ gen_score = evaluation_results["metrics"]["generation"].get("average_quality", 0) * 100
479
+ scores.append(gen_score)
480
+
481
+ if gen_score > 70:
482
+ summary["strengths"].append("High-quality text generation")
483
+ else:
484
+ summary["weaknesses"].append("Text generation needs improvement")
485
+
486
+ if "downstream" in evaluation_results["metrics"]:
487
+ downstream_scores = []
488
+ for task_result in evaluation_results["metrics"]["downstream"].values():
489
+ downstream_scores.append(task_result.get("accuracy", 0) * 100)
490
+
491
+ if downstream_scores:
492
+ avg_downstream = sum(downstream_scores) / len(downstream_scores)
493
+ scores.append(avg_downstream)
494
+
495
+ if avg_downstream > 50:
496
+ summary["strengths"].append("Good performance on downstream tasks")
497
+ else:
498
+ summary["weaknesses"].append("Poor downstream task performance")
499
+
500
+ # Calculate overall score
501
+ if scores:
502
+ summary["overall_score"] = sum(scores) / len(scores)
503
+
504
+ # Add recommendations based on performance
505
+ if summary["overall_score"] < 40:
506
+ summary["recommendations"].extend(
507
+ [
508
+ "Consider training for more steps",
509
+ "Verify training data quality",
510
+ "Check model architecture and hyperparameters",
511
+ ]
512
+ )
513
+ elif summary["overall_score"] < 70:
514
+ summary["recommendations"].extend(
515
+ [
516
+ "Model shows promise - consider extended training",
517
+ "Fine-tune on specific downstream tasks",
518
+ ]
519
+ )
520
+ else:
521
+ summary["recommendations"].append("Model performs well - ready for deployment")
522
+
523
+ evaluation_results["summary"] = summary
524
+
525
+ # Save detailed results to file
526
+ # This allows for further analysis and comparison between models
527
+ results_file = output_dir / f"evaluation_results_{int(time.time())}.json"
528
+ with open(results_file, "w") as f:
529
+ json.dump(evaluation_results, f, indent=2, default=str)
530
+
531
+ # Display comprehensive results summary
532
+ print("\n" + "=" * 60)
533
+ print("πŸ“Š EVALUATION SUMMARY")
534
+ print("=" * 60)
535
+ print(f"🎯 Overall Score: {summary['overall_score']:.1f}/100")
536
+ print(f"⏱️ Evaluation Time: {evaluation_time:.1f} seconds")
537
+
538
+ if summary["strengths"]:
539
+ print("\nβœ… Strengths:")
540
+ for strength in summary["strengths"]:
541
+ print(f" β€’ {strength}")
542
+
543
+ if summary["weaknesses"]:
544
+ print("\n⚠️ Areas for Improvement:")
545
+ for weakness in summary["weaknesses"]:
546
+ print(f" β€’ {weakness}")
547
+
548
+ if summary["recommendations"]:
549
+ print("\nπŸ’‘ Recommendations:")
550
+ for rec in summary["recommendations"]:
551
+ print(f" β€’ {rec}")
552
+
553
+ print(f"\nπŸ’Ύ Detailed results saved to: {results_file}")
554
+ print("πŸŽ‰ Evaluation completed successfully!")
555
+
556
+ return True
557
+
558
+ except ImportError as e:
559
+ print(f"❌ Missing dependencies for evaluation: {e}")
560
+ print(" Please check that all required packages are installed.")
561
+ return False
562
+
563
+ except Exception as e:
564
+ print(f"❌ Evaluation failed: {e}")
565
+ import traceback
566
+
567
+ traceback.print_exc()
568
+ return False
569
+
570
+
571
+ def cmd_test_model(args):
572
+ """Execute model testing command."""
573
+ print("πŸ§ͺ Testing model architecture...")
574
+
575
+ try:
576
+ # Initialize model tester
577
+ tester = ModelTester(device=args.device)
578
+
579
+ if args.all_sizes:
580
+ # Test all model sizes
581
+ test_sizes = ["small", "medium", "large"]
582
+ all_success = True
583
+
584
+ for size in test_sizes:
585
+ print(f"\n{'='*20} Testing {size.upper()} Model {'='*20}")
586
+ results = tester.run_comprehensive_test(size)
587
+
588
+ if not results["initialization"]["success"]:
589
+ all_success = False
590
+ print(f"❌ {size.upper()} model failed initialization")
591
+ else:
592
+ print(f"βœ“ {size.upper()} model passed all tests")
593
+
594
+ return all_success
595
+ else:
596
+ # Test single model size
597
+ results = tester.run_comprehensive_test(args.model_size)
598
+
599
+ if args.save_results:
600
+ import json
601
+
602
+ with open(args.save_results, "w") as f:
603
+ json.dump(results, f, indent=2)
604
+ print(f"\nπŸ’Ύ Results saved to {args.save_results}")
605
+
606
+ return results["initialization"]["success"]
607
+
608
+ except Exception as e:
609
+ print(f"❌ Model testing failed: {e}")
610
+ return False
611
+
612
+
613
+ def create_parser():
614
+ """Create the main argument parser with subcommands."""
615
+ parser = argparse.ArgumentParser(
616
+ description="OpenLLM - Open Source Large Language Model Training Pipeline",
617
+ formatter_class=argparse.RawDescriptionHelpFormatter,
618
+ epilog="""
619
+ Examples:
620
+ # Prepare training data from SQUAD dataset
621
+ python core/src/main.py prepare-data --output data/clean/training_data.txt
622
+
623
+ # Train tokenizer with custom settings
624
+ python core/src/main.py train-tokenizer \\
625
+ --input data/clean/training_data.txt \\
626
+ --vocab-size 32000 \\
627
+ --output-dir data/tokenizer/
628
+
629
+ # Get help for specific commands
630
+ python core/src/main.py train-tokenizer --help
631
+ """,
632
+ )
633
+
634
+ parser.add_argument("--version", action="version", version="OpenLLM v0.1.0")
635
+
636
+ # Create subparsers for different commands
637
+ subparsers = parser.add_subparsers(dest="command", help="Available commands", required=True)
638
+
639
+ # Data preparation command
640
+ parser_data = subparsers.add_parser(
641
+ "prepare-data",
642
+ help="Download and prepare training data from SQUAD dataset",
643
+ description="Downloads SQUAD v1.1 and v2.0 datasets, extracts Wikipedia passages, and prepares clean training text.",
644
+ )
645
+ parser_data.add_argument(
646
+ "--output",
647
+ default="data/clean/training_data.txt",
648
+ help="Output path for cleaned training data (default: data/clean/training_data.txt)",
649
+ )
650
+ parser_data.add_argument(
651
+ "--min-words",
652
+ type=int,
653
+ default=10,
654
+ help="Minimum number of words per passage (default: 10)",
655
+ )
656
+ parser_data.set_defaults(func=cmd_prepare_data)
657
+
658
+ # Tokenizer training command
659
+ parser_tokenizer = subparsers.add_parser(
660
+ "train-tokenizer",
661
+ help="Train a SentencePiece tokenizer on prepared data",
662
+ description="Trains a BPE or Unigram tokenizer using SentencePiece on the prepared training text.",
663
+ )
664
+ parser_tokenizer.add_argument("--input", required=True, help="Path to training text file")
665
+ parser_tokenizer.add_argument(
666
+ "--vocab-size", type=int, default=32000, help="Vocabulary size (default: 32000)"
667
+ )
668
+ parser_tokenizer.add_argument(
669
+ "--model-type",
670
+ choices=["bpe", "unigram"],
671
+ default="bpe",
672
+ help="Tokenization algorithm (default: bpe)",
673
+ )
674
+ parser_tokenizer.add_argument(
675
+ "--output-dir",
676
+ default="data/tokenizer/",
677
+ help="Output directory for tokenizer files (default: data/tokenizer/)",
678
+ )
679
+ parser_tokenizer.add_argument(
680
+ "--character-coverage",
681
+ type=float,
682
+ default=0.9995,
683
+ help="Character coverage (default: 0.9995)",
684
+ )
685
+ parser_tokenizer.add_argument(
686
+ "--max-sentence-length",
687
+ type=int,
688
+ default=4192,
689
+ help="Maximum sentence length (default: 4192)",
690
+ )
691
+ parser_tokenizer.add_argument(
692
+ "--no-test", action="store_true", help="Skip tokenizer testing after training"
693
+ )
694
+ parser_tokenizer.set_defaults(func=cmd_train_tokenizer)
695
+
696
+ # Model testing command
697
+ parser_test = subparsers.add_parser(
698
+ "test-model",
699
+ help="Test and validate model architecture",
700
+ description="Test model initialization, forward pass, memory usage, and tokenizer integration.",
701
+ )
702
+ parser_test.add_argument(
703
+ "--model-size",
704
+ choices=["small", "medium", "large"],
705
+ default="medium",
706
+ help="Model size to test (default: medium)",
707
+ )
708
+ parser_test.add_argument("--all-sizes", action="store_true", help="Test all model sizes")
709
+ parser_test.add_argument(
710
+ "--device",
711
+ choices=["cpu", "cuda", "auto"],
712
+ default="auto",
713
+ help="Device to use for testing (default: auto)",
714
+ )
715
+ parser_test.add_argument("--save-results", help="Save test results to JSON file")
716
+ parser_test.set_defaults(func=cmd_test_model)
717
+
718
+ # Model training command
719
+ parser_model = subparsers.add_parser(
720
+ "train-model",
721
+ help="Train the language model",
722
+ description="Train a GPT-style transformer language model on tokenized text.",
723
+ )
724
+ parser_model.add_argument(
725
+ "--model-size",
726
+ choices=["small", "medium", "large"],
727
+ default="small",
728
+ help="Model size to train (default: small)",
729
+ )
730
+ parser_model.add_argument(
731
+ "--tokenizer-dir",
732
+ default="data/tokenizer/",
733
+ help="Path to trained tokenizer directory (default: data/tokenizer/)",
734
+ )
735
+ parser_model.add_argument(
736
+ "--data-file",
737
+ default="data/clean/training_data.txt",
738
+ help="Path to training text file (default: data/clean/training_data.txt)",
739
+ )
740
+ parser_model.add_argument(
741
+ "--output-dir", required=True, help="Output directory for model checkpoints"
742
+ )
743
+ parser_model.add_argument(
744
+ "--seq-len", type=int, default=512, help="Sequence length for training (default: 512)"
745
+ )
746
+ parser_model.add_argument(
747
+ "--batch-size", type=int, default=4, help="Batch size (default: 4, reduce for low memory)"
748
+ )
749
+ parser_model.add_argument(
750
+ "--learning-rate", type=float, default=3e-4, help="Learning rate (default: 3e-4)"
751
+ )
752
+ parser_model.add_argument(
753
+ "--max-steps", type=int, default=10000, help="Maximum training steps (default: 10000)"
754
+ )
755
+ parser_model.add_argument(
756
+ "--warmup-steps", type=int, default=1000, help="Warmup steps (default: 1000)"
757
+ )
758
+ parser_model.add_argument(
759
+ "--gradient-accumulation-steps",
760
+ type=int,
761
+ default=4,
762
+ help="Gradient accumulation steps (default: 4)",
763
+ )
764
+ parser_model.add_argument(
765
+ "--device",
766
+ choices=["cpu", "cuda", "auto"],
767
+ default="auto",
768
+ help="Training device (default: auto)",
769
+ )
770
+ parser_model.add_argument("--resume", help="Path to checkpoint to resume training from")
771
+ parser_model.add_argument(
772
+ "--save-every", type=int, default=1000, help="Save checkpoint every N steps (default: 1000)"
773
+ )
774
+ parser_model.set_defaults(func=cmd_train_model)
775
+
776
+ # Inference command (placeholder)
777
+ parser_inference = subparsers.add_parser(
778
+ "inference",
779
+ help="Run model inference (coming soon)",
780
+ description="Generate text using a trained model.",
781
+ )
782
+ parser_inference.add_argument("--model-path", required=True, help="Path to trained model")
783
+ parser_inference.add_argument("--prompt", required=True, help="Input text prompt")
784
+ parser_inference.add_argument(
785
+ "--max-length", type=int, default=256, help="Maximum generation length"
786
+ )
787
+ parser_inference.set_defaults(func=cmd_inference)
788
+
789
+ # Evaluation command (placeholder)
790
+ parser_eval = subparsers.add_parser(
791
+ "evaluate",
792
+ help="Evaluate model performance (coming soon)",
793
+ description="Evaluate model on various benchmarks and metrics.",
794
+ )
795
+ parser_eval.add_argument("--model-path", required=True, help="Path to trained model")
796
+ parser_eval.add_argument("--eval-data", help="Path to evaluation dataset")
797
+ parser_eval.add_argument(
798
+ "--metrics", nargs="+", default=["perplexity"], help="Metrics to compute"
799
+ )
800
+ parser_eval.set_defaults(func=cmd_evaluate)
801
+
802
+ # --- Optional: Enterprise module integration ---
803
+ # Load enterprise-only CLI commands if an external module is available.
804
+ # This preserves the core's open-source nature while allowing private
805
+ # extensions to register additional commands without modifying core code.
806
+ try:
807
+ from enterprise_integration import load_enterprise_cli
808
+
809
+ if load_enterprise_cli(subparsers):
810
+ print("🧩 Enterprise extensions detected and loaded")
811
+ else:
812
+ # No enterprise plugin found (normal for open-source-only usage)
813
+ pass
814
+ except Exception as e:
815
+ # Never fail core CLI due to enterprise integration issues
816
+ print(f"Warning: Enterprise integration failed: {e}")
817
+
818
+ return parser
819
+
820
+
821
+ def main():
822
+ """Main entry point for the OpenLLM CLI."""
823
+ parser = create_parser()
824
+ args = parser.parse_args()
825
+
826
+ print("πŸš€ OpenLLM - Open Source Large Language Model")
827
+ print("=" * 60)
828
+
829
+ # Execute the selected command
830
+ success = args.func(args)
831
+
832
+ # Exit with appropriate code
833
+ if success:
834
+ print("\nπŸŽ‰ Command completed successfully!")
835
+ sys.exit(0)
836
+ else:
837
+ print("\n❌ Command failed or not implemented yet.")
838
+ sys.exit(1)
839
+
840
+
841
+ if __name__ == "__main__":
842
+ main()
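The enterprise hook in create_parser() only assumes that an importable `enterprise_integration` module exposes `load_enterprise_cli(subparsers)` and returns a truthy value when it registers commands. A minimal sketch of such a plugin; the `audit` command and its handler are hypothetical and not part of the core CLI:

# enterprise_integration.py -- hypothetical plugin module
def _cmd_audit(args):
    """Illustrative enterprise-only handler; not part of the open-source core."""
    print(f"Auditing checkpoints under {args.checkpoint_dir} ...")
    return True

def load_enterprise_cli(subparsers) -> bool:
    """Register extra subcommands on the core CLI's subparsers.

    Returns True if any command was added, False otherwise.
    """
    parser_audit = subparsers.add_parser(
        "audit", help="Audit model checkpoints (enterprise)"
    )
    parser_audit.add_argument("--checkpoint-dir", required=True)
    parser_audit.set_defaults(func=_cmd_audit)
    return True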
core/src/mixed_precision.py ADDED
@@ -0,0 +1,220 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Mixed Precision Training Utilities
4
+
5
+ This module provides utilities for mixed precision training using PyTorch's
6
+ automatic mixed precision (AMP) to improve training speed and reduce memory usage.
7
+
8
+ Author: Louis Chua Bean Chong
9
+ License: GPLv3
10
+ """
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ from torch.cuda.amp import autocast, GradScaler
15
+ from typing import Optional, Callable
16
+
17
+
18
+ class MixedPrecisionTrainer:
19
+ """
20
+ Mixed precision training wrapper for improved performance.
21
+
22
+ This class provides automatic mixed precision training capabilities
23
+ that can significantly improve training speed and reduce memory usage
24
+ on compatible hardware (especially NVIDIA GPUs with Tensor Cores).
25
+ """
26
+
27
+ def __init__(self,
28
+ model: nn.Module,
29
+ optimizer: torch.optim.Optimizer,
30
+ device: str = "auto",
31
+ dtype: torch.dtype = torch.float16,
32
+ enabled: bool = True):
33
+ """
34
+ Initialize mixed precision trainer.
35
+
36
+ Args:
37
+ model: The model to train
38
+ optimizer: The optimizer to use
39
+ device: Device to use ("auto", "cpu", "cuda")
40
+ dtype: Precision dtype (float16, bfloat16)
41
+ enabled: Whether to enable mixed precision
42
+ """
43
+ self.model = model
44
+ self.optimizer = optimizer
45
+ self.device = self._get_device(device)
46
+ self.dtype = dtype
47
+ self.enabled = enabled and self.device.type == "cuda"
48
+
49
+ # Initialize gradient scaler for mixed precision
50
+ self.scaler = GradScaler() if self.enabled else None
51
+
52
+ # Move model to device
53
+ self.model.to(self.device)
54
+
55
+ print(f"Mixed Precision Training: {'Enabled' if self.enabled else 'Disabled'}")
56
+ print(f"Device: {self.device}")
57
+ print(f"Precision: {self.dtype}")
58
+
59
+ def _get_device(self, device: str) -> torch.device:
60
+ """Get the appropriate device."""
61
+ if device == "auto":
62
+ if torch.cuda.is_available():
63
+ return torch.device("cuda")
64
+ else:
65
+ return torch.device("cpu")
66
+ else:
67
+ return torch.device(device)
68
+
69
+ def train_step(self,
70
+ batch: torch.Tensor,
71
+ targets: torch.Tensor,
72
+ loss_fn: Optional[Callable] = None) -> dict:
73
+ """
74
+ Perform a single training step with mixed precision.
75
+
76
+ Args:
77
+ batch: Input batch
78
+ targets: Target batch
79
+ loss_fn: Optional custom loss function
80
+
81
+ Returns:
82
+ dict: Training metrics
83
+ """
84
+ self.model.train()
85
+ self.optimizer.zero_grad()
86
+
87
+ # Move data to device
88
+ batch = batch.to(self.device)
89
+ targets = targets.to(self.device)
90
+
91
+ if self.enabled:
92
+ # Mixed precision forward pass
93
+ with autocast(dtype=self.dtype):
94
+ if loss_fn is None:
95
+ # Use model's built-in loss computation
96
+ logits, loss = self.model(batch, targets)
97
+ else:
98
+ # Use custom loss function
99
+ logits = self.model(batch)
100
+ loss = loss_fn(logits, targets)
101
+
102
+ # Scaled backward pass
103
+ self.scaler.scale(loss).backward()
104
+ self.scaler.step(self.optimizer)
105
+ self.scaler.update()
106
+ else:
107
+ # Standard precision training
108
+ if loss_fn is None:
109
+ logits, loss = self.model(batch, targets)
110
+ else:
111
+ logits = self.model(batch)
112
+ loss = loss_fn(logits, targets)
113
+
114
+ loss.backward()
115
+ self.optimizer.step()
116
+
117
+ return {
118
+ "loss": loss.item(),
119
+ "logits": logits,
120
+ "scaler_scale": self.scaler.get_scale() if self.scaler else 1.0
121
+ }
122
+
123
+ def eval_step(self,
124
+ batch: torch.Tensor,
125
+ targets: torch.Tensor,
126
+ loss_fn: Optional[Callable] = None) -> dict:
127
+ """
128
+ Perform a single evaluation step.
129
+
130
+ Args:
131
+ batch: Input batch
132
+ targets: Target batch
133
+ loss_fn: Optional custom loss function
134
+
135
+ Returns:
136
+ dict: Evaluation metrics
137
+ """
138
+ self.model.eval()
139
+
140
+ # Move data to device
141
+ batch = batch.to(self.device)
142
+ targets = targets.to(self.device)
143
+
144
+ with torch.no_grad():
145
+ if self.enabled:
146
+ with autocast(dtype=self.dtype):
147
+ if loss_fn is None:
148
+ logits, loss = self.model(batch, targets)
149
+ else:
150
+ logits = self.model(batch)
151
+ loss = loss_fn(logits, targets)
152
+ else:
153
+ if loss_fn is None:
154
+ logits, loss = self.model(batch, targets)
155
+ else:
156
+ logits = self.model(batch)
157
+ loss = loss_fn(logits, targets)
158
+
159
+ return {
160
+ "loss": loss.item(),
161
+ "logits": logits
162
+ }
163
+
164
+ def save_checkpoint(self, path: str, **kwargs):
165
+ """Save model checkpoint with mixed precision state."""
166
+ checkpoint = {
167
+ "model_state_dict": self.model.state_dict(),
168
+ "optimizer_state_dict": self.optimizer.state_dict(),
169
+ "scaler_state_dict": self.scaler.state_dict() if self.scaler else None,
170
+ "dtype": self.dtype,
171
+ "enabled": self.enabled,
172
+ **kwargs
173
+ }
174
+ torch.save(checkpoint, path)
175
+
176
+ def load_checkpoint(self, path: str):
177
+ """Load model checkpoint with mixed precision state."""
178
+ checkpoint = torch.load(path, map_location=self.device)
179
+
180
+ self.model.load_state_dict(checkpoint["model_state_dict"])
181
+ self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
182
+
183
+ if self.scaler and checkpoint.get("scaler_state_dict"):
184
+ self.scaler.load_state_dict(checkpoint["scaler_state_dict"])
185
+
186
+ return checkpoint
187
+
188
+
189
+ def enable_mixed_precision(model: nn.Module,
190
+ optimizer: torch.optim.Optimizer,
191
+ **kwargs) -> MixedPrecisionTrainer:
192
+ """
193
+ Convenience function to enable mixed precision training.
194
+
195
+ Args:
196
+ model: The model to train
197
+ optimizer: The optimizer to use
198
+ **kwargs: Additional arguments for MixedPrecisionTrainer
199
+
200
+ Returns:
201
+ MixedPrecisionTrainer: Configured trainer
202
+ """
203
+ return MixedPrecisionTrainer(model, optimizer, **kwargs)
204
+
205
+
206
+ def get_optimal_dtype() -> torch.dtype:
207
+ """
208
+ Get the optimal dtype for mixed precision training.
209
+
210
+ Returns:
211
+ torch.dtype: Optimal dtype (bfloat16 for newer GPUs, float16 for older)
212
+ """
213
+ if torch.cuda.is_available():
214
+ # Check if bfloat16 is supported
215
+ if torch.cuda.is_bf16_supported():
216
+ return torch.bfloat16
217
+ else:
218
+ return torch.float16
219
+ else:
220
+ return torch.float32
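A minimal usage sketch for MixedPrecisionTrainer, assuming it is run from core/src/ alongside model.py; the batch is synthetic, a real run would iterate over batches from the data loader:

import torch
from model import create_model
from mixed_precision import MixedPrecisionTrainer, get_optimal_dtype

model = create_model("small")
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
trainer = MixedPrecisionTrainer(
    model, optimizer, device="auto", dtype=get_optimal_dtype()
)

# One step on a synthetic batch (vocab size 32000, sequence length 128)
input_ids = torch.randint(0, 32000, (4, 128))
targets = torch.randint(0, 32000, (4, 128))
metrics = trainer.train_step(input_ids, targets)
print(f"loss={metrics['loss']:.4f} scale={metrics['scaler_scale']}")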
core/src/model.py ADDED
@@ -0,0 +1,665 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ GPT-style Language Model Architecture
14
+
15
+ This module implements a standard GPT (Generative Pre-trained Transformer) architecture
16
+ using pure PyTorch. The model is a decoder-only transformer designed for autoregressive
17
+ language modeling (next-token prediction).
18
+
19
+ ARCHITECTURE OVERVIEW:
20
+ - Token Embedding: Maps token IDs to dense vectors
21
+ - Positional Embedding: Adds position information to token embeddings
22
+ - Transformer Blocks: Stack of multi-head attention + feed-forward layers
23
+ - Layer Normalization: Pre-norm placement for training stability
24
+ - Output Head: Linear projection to vocabulary for next-token prediction
25
+
26
+ FEATURES:
27
+ - Configurable model size (small/medium/large)
28
+ - Dropout for regularization
29
+ - Causal (autoregressive) attention masking
30
+ - Compatible with our SentencePiece tokenizer
31
+ - Memory-efficient implementation for training on limited hardware
32
+
33
+ Usage:
34
+ from model import GPTConfig, GPTModel
35
+
36
+ config = GPTConfig(vocab_size=32000, n_layer=12, n_head=12, n_embd=768)
37
+ model = GPTModel(config)
38
+
39
+ # Forward pass
40
+ logits = model(input_ids) # Shape: (batch_size, seq_len, vocab_size)
41
+
42
+ Hardware Requirements:
43
+ - Small Model (25M params): 4-8GB RAM, CPU/integrated GPU
44
+ - Medium Model (117M params): 8-16GB RAM, dedicated GPU recommended
45
+ - Large Model (350M params): 16GB+ RAM, high-end GPU required
46
+
47
+ Author: Louis Chua Bean Chong
48
+ License: GPLv3
49
+ """
50
+
51
+ import math
52
+ from dataclasses import dataclass
53
+ from typing import Optional, Tuple
54
+
55
+ import torch
56
+ import torch.nn as nn
57
+ import torch.nn.functional as F
+ import torch.utils.checkpoint  # explicit import for the gradient checkpointing call in GPTModel.forward
58
+
59
+
60
+ @dataclass
61
+ class GPTConfig:
62
+ """
63
+ Configuration class for GPT model hyperparameters.
64
+
65
+ This class defines all the architectural parameters needed to instantiate
66
+ a GPT model. Use the provided class methods to get pre-configured setups
67
+ for different model sizes.
68
+ """
69
+
70
+ # Model architecture
71
+ vocab_size: int = 32000 # Vocabulary size (from tokenizer)
72
+ n_layer: int = 12 # Number of transformer layers
73
+ n_head: int = 12 # Number of attention heads
74
+ n_embd: int = 768 # Embedding dimension
75
+
76
+ # Sequence and context
77
+ block_size: int = 1024 # Maximum sequence length
78
+
79
+ # Training hyperparameters
80
+ dropout: float = 0.1 # Dropout probability
81
+ bias: bool = True # Use bias in linear layers
82
+
83
+ # Model size identifier
84
+ model_name: str = "gpt-medium" # Human-readable model identifier
85
+
86
+ @classmethod
87
+ def small(cls) -> "GPTConfig":
88
+ """Small model configuration (~25M parameters) - Good for CPU training"""
89
+ return cls(
90
+ vocab_size=32000,
91
+ n_layer=6,
92
+ n_head=8,
93
+ n_embd=512,
94
+ block_size=1024,
95
+ dropout=0.1,
96
+ model_name="gpt-small",
97
+ )
98
+
99
+ @classmethod
100
+ def medium(cls) -> "GPTConfig":
101
+ """Medium model configuration (~117M parameters) - Balanced performance"""
102
+ return cls(
103
+ vocab_size=32000,
104
+ n_layer=12,
105
+ n_head=12,
106
+ n_embd=768,
107
+ block_size=2048,
108
+ dropout=0.1,
109
+ model_name="gpt-medium",
110
+ )
111
+
112
+ @classmethod
113
+ def large(cls) -> "GPTConfig":
114
+ """Large model configuration (~350M parameters) - High performance"""
115
+ return cls(
116
+ vocab_size=32000,
117
+ n_layer=24,
118
+ n_head=16,
119
+ n_embd=1024,
120
+ block_size=2048,
121
+ dropout=0.1,
122
+ model_name="gpt-large",
123
+ )
124
+
125
+ def estimate_parameters(self) -> int:
126
+ """
127
+ Estimate the total number of trainable parameters.
128
+
129
+ Returns:
130
+ int: Estimated parameter count
131
+ """
132
+ # Token embeddings
133
+ token_emb = self.vocab_size * self.n_embd
134
+
135
+ # Position embeddings
136
+ pos_emb = self.block_size * self.n_embd
137
+
138
+ # Transformer layers
139
+ # Each layer: attention (4 * n_embd^2) + mlp (8 * n_embd^2) + layer_norms
140
+ layer_params = self.n_layer * (12 * self.n_embd**2 + 4 * self.n_embd)
141
+
142
+ # Output head
143
+ output_head = self.vocab_size * self.n_embd
144
+
145
+ total = token_emb + pos_emb + layer_params + output_head
146
+ return total
147
+
148
+
149
+ class CausalSelfAttention(nn.Module):
150
+ """
151
+ Multi-head causal self-attention mechanism.
152
+
153
+ This implements the core attention mechanism of the transformer, with causal
154
+ masking to ensure autoregressive behavior (tokens can only attend to previous
155
+ tokens, not future ones).
156
+ """
157
+
158
+ def __init__(self, config: GPTConfig):
159
+ super().__init__()
160
+ assert (
161
+ config.n_embd % config.n_head == 0
162
+ ), "Embedding dim must be divisible by number of heads"
163
+
164
+ self.config = config
165
+ self.n_head = config.n_head
166
+ self.n_embd = config.n_embd
167
+ self.head_dim = self.n_embd // self.n_head
168
+
169
+ # Key, query, value projections for all heads (batched)
170
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
171
+
172
+ # Output projection
173
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
174
+
175
+ # Dropout
176
+ self.attn_dropout = nn.Dropout(config.dropout)
177
+ self.resid_dropout = nn.Dropout(config.dropout)
178
+
179
+ # Causal mask - lower triangular matrix
180
+ self.register_buffer(
181
+ "bias",
182
+ torch.tril(torch.ones(config.block_size, config.block_size)).view(
183
+ 1, 1, config.block_size, config.block_size
184
+ ),
185
+ )
186
+
187
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
188
+ """
189
+ Forward pass of causal self-attention.
190
+
191
+ This method implements the scaled dot-product attention mechanism with causal masking.
192
+ The attention mechanism allows each token to attend to all previous tokens in the sequence,
193
+ but not to future tokens, maintaining the autoregressive property essential for language modeling.
194
+
195
+ Mathematical formulation:
196
+ Attention(Q, K, V) = softmax(QK^T / sqrt(d_k))V
197
+ where Q, K, V are query, key, value matrices derived from input x
198
+
199
+ Implementation details:
200
+ - Uses batch matrix multiplication for efficiency
201
+ - Applies causal mask to prevent future token attention
202
+ - Implements multi-head attention by reshaping and parallel processing
203
+ - Applies dropout for regularization during training
204
+
205
+ Args:
206
+ x: Input tensor of shape (batch_size, seq_len, n_embd)
207
+ Contains embedded token representations from previous layer
208
+
209
+ Returns:
210
+ torch.Tensor: Output tensor of shape (batch_size, seq_len, n_embd)
211
+ """
212
+ # Extract tensor dimensions for clear variable naming and validation
213
+ # B = batch size (number of sequences processed in parallel)
214
+ # T = sequence length (number of tokens in each sequence)
215
+ # C = embedding dimensionality (n_embd from config)
216
+ B, T, C = x.size()
217
+
218
+ # Generate query, key, and value projections for all attention heads
219
+ # The c_attn linear layer outputs 3 * n_embd features, which we split into Q, K, V
220
+ # This batched approach is more efficient than separate linear layers
221
+ # Input shape: (B, T, C) -> Output shape: (B, T, 3*C) -> Split to 3x (B, T, C)
222
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
223
+
224
+ # Reshape tensors for multi-head attention computation
225
+ # Transform from (B, T, C) to (B, nh, T, hs) where:
226
+ # - nh = number of heads (self.n_head)
227
+ # - hs = head size (self.head_dim = C // nh)
228
+ # The transpose(1, 2) moves the head dimension before sequence dimension for efficient computation
229
+ q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B, nh, T, hs)
230
+ k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B, nh, T, hs)
231
+ v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B, nh, T, hs)
232
+
233
+ # Compute scaled dot-product attention scores
234
+ # Matrix multiplication: Q @ K^T gives attention affinities between all token pairs
235
+ # Scaling by 1/sqrt(head_dim) prevents softmax saturation for large embedding dimensions
236
+ # Shape: (B, nh, T, hs) @ (B, nh, hs, T) -> (B, nh, T, T)
237
+ # The resulting (T, T) matrix represents attention weights from each token to every other token
238
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
239
+
240
+ # Apply causal masking to enforce autoregressive property
241
+ # The causal mask ensures that token i can only attend to tokens j where j <= i
242
+ # This prevents the model from "cheating" by looking at future tokens during training
243
+ # We use -inf for masked positions so they become 0 after softmax
244
+ att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
245
+
246
+ # Convert attention scores to probabilities using softmax
247
+ # Each row of the attention matrix now sums to 1, representing a probability distribution
248
+ # over which tokens to attend to for each query position
249
+ att = F.softmax(att, dim=-1)
250
+
251
+ # Apply dropout to attention weights for regularization
252
+ # This randomly zeros some attention connections during training to prevent overfitting
253
+ att = self.attn_dropout(att)
254
+
255
+ # Apply attention weights to value vectors
256
+ # This weighted combination produces the actual output of the attention mechanism
257
+ # Shape: (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs)
258
+ # Each output position is a weighted sum of all value vectors, with weights from attention
259
+ y = att @ v
260
+
261
+ # Concatenate multi-head outputs back to original embedding dimension
262
+ # Transform from (B, nh, T, hs) back to (B, T, C) where C = nh * hs
263
+ # The transpose moves head dimension back, and contiguous() ensures memory layout efficiency
264
+ # This combines information from all attention heads into a single representation
265
+ y = y.transpose(1, 2).contiguous().view(B, T, C)
266
+
267
+ # Apply final output projection and residual dropout
268
+ # The output projection allows the model to learn how to best combine multi-head information
269
+ # Residual dropout provides additional regularization before the residual connection
270
+ y = self.resid_dropout(self.c_proj(y))
271
+ return y
272
+
273
+
274
+ class MLP(nn.Module):
275
+ """
276
+ Multi-Layer Perceptron (Feed-Forward Network) for Transformer.
277
+
278
+ This implements the position-wise feed-forward network that appears in each transformer layer.
279
+ The MLP provides additional non-linear transformation capacity beyond what attention provides.
280
+
281
+ Architecture:
282
+ Input -> Linear(n_embd -> 4*n_embd) -> GELU -> Linear(4*n_embd -> n_embd) -> Dropout -> Output
283
+
284
+ Design rationale:
285
+ - 4x expansion is standard in transformers (from "Attention Is All You Need")
286
+ - GELU activation provides smoother gradients than ReLU for language modeling
287
+ - Dropout prevents overfitting in the feed-forward layers
288
+ - Two linear layers allow complex non-linear transformations of attention outputs
289
+
290
+ Parameters:
291
+ - First linear layer: n_embd * 4*n_embd parameters (expansion)
292
+ - Second linear layer: 4*n_embd * n_embd parameters (projection back)
293
+ - Total: 8 * n_embd^2 parameters (significant portion of model size)
294
+ """
295
+
296
+ def __init__(self, config: GPTConfig):
297
+ super().__init__()
298
+
299
+ # First linear layer: expand embedding dimension by 4x
300
+ # This expansion gives the network more representational capacity
301
+ # The 4x factor is a standard choice that balances capacity vs efficiency
302
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
303
+
304
+ # GELU (Gaussian Error Linear Unit) activation function
305
+ # GELU provides smoother gradients compared to ReLU and works better for language modeling
306
+ # It's approximately: GELU(x) = x * Ξ¦(x) where Ξ¦ is the CDF of standard normal distribution
307
+ self.gelu = nn.GELU()
308
+
309
+ # Second linear layer: project back to original embedding dimension
310
+ # This projection allows the network to combine information from the expanded representation
311
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
312
+
313
+ # Dropout for regularization in the feed-forward network
314
+ # Applied after the final projection to prevent overfitting
315
+ self.dropout = nn.Dropout(config.dropout)
316
+
317
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
318
+ """
319
+ Forward pass of the feed-forward network.
320
+
321
+ This method applies a two-layer MLP with GELU activation to transform
322
+ the attention outputs. The MLP operates independently on each position
323
+ in the sequence, providing position-wise non-linear transformations.
324
+
325
+ Mathematical operation:
326
+ MLP(x) = Dropout(Linearβ‚‚(GELU(Linear₁(x))))
327
+ where Linear₁: R^n_embd -> R^4*n_embd and Linearβ‚‚: R^4*n_embd -> R^n_embd
328
+
329
+ Args:
330
+ x: Input tensor of shape (batch_size, seq_len, n_embd)
331
+ Contains attended representations from the attention layer
332
+
333
+ Returns:
334
+ torch.Tensor: Output tensor of shape (batch_size, seq_len, n_embd)
335
+ Contains transformed representations ready for residual connection
336
+ """
337
+ # First linear transformation: expand from n_embd to 4*n_embd dimensions
338
+ # This expansion provides the network with a higher-dimensional space for computation
339
+ # Shape: (batch_size, seq_len, n_embd) -> (batch_size, seq_len, 4*n_embd)
340
+ x = self.c_fc(x)
341
+
342
+ # Apply GELU activation function for non-linearity
343
+ # GELU is smoother than ReLU and provides better gradients for language modeling
344
+ # It introduces non-linearity while maintaining differentiability everywhere
345
+ x = self.gelu(x)
346
+
347
+ # Second linear transformation: project back to original n_embd dimensions
348
+ # This projection combines information from the expanded representation
349
+ # Shape: (batch_size, seq_len, 4*n_embd) -> (batch_size, seq_len, n_embd)
350
+ x = self.c_proj(x)
351
+
352
+ # Apply dropout for regularization before residual connection
353
+ # Dropout randomly zeros some neurons during training to prevent overfitting
354
+ # This is particularly important in the feed-forward layers which have many parameters
355
+ x = self.dropout(x)
356
+
357
+ return x
358
+
359
+
360
+ class Block(nn.Module):
361
+ """
362
+ Single Transformer block.
363
+
364
+ Consists of:
365
+ 1. Layer normalization
366
+ 2. Multi-head causal self-attention
367
+ 3. Residual connection
368
+ 4. Layer normalization
369
+ 5. MLP (feed-forward network)
370
+ 6. Residual connection
371
+
372
+ Uses pre-norm architecture for better training stability.
373
+ """
374
+
375
+ def __init__(self, config: GPTConfig):
376
+ super().__init__()
377
+ self.ln_1 = nn.LayerNorm(config.n_embd)
378
+ self.attn = CausalSelfAttention(config)
379
+ self.ln_2 = nn.LayerNorm(config.n_embd)
380
+ self.mlp = MLP(config)
381
+
382
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
383
+ """
384
+ Forward pass of transformer block.
385
+
386
+ Args:
387
+ x: Input tensor of shape (batch_size, seq_len, n_embd)
388
+
389
+ Returns:
390
+ torch.Tensor: Output tensor of shape (batch_size, seq_len, n_embd)
391
+ """
392
+ # Pre-norm attention with residual connection
393
+ x = x + self.attn(self.ln_1(x))
394
+
395
+ # Pre-norm MLP with residual connection
396
+ x = x + self.mlp(self.ln_2(x))
397
+
398
+ return x
399
+
400
+
401
+ class GPTModel(nn.Module):
402
+ """
403
+ Complete GPT Language Model.
404
+
405
+ This is the main model class that combines all components:
406
+ - Token and positional embeddings
407
+ - Stack of transformer blocks
408
+ - Final layer normalization
409
+ - Language modeling head
410
+
411
+ The model can be used for:
412
+ - Training from scratch on text data
413
+ - Fine-tuning on downstream tasks
414
+ - Text generation (inference)
415
+ """
416
+
417
+ def __init__(self, config: GPTConfig, use_checkpoint=True):
418
+ super().__init__()
419
+ assert config.vocab_size is not None, "vocab_size must be specified"
420
+ assert config.block_size is not None, "block_size must be specified"
421
+
422
+ self.config = config
423
+ self.use_checkpoint = use_checkpoint
424
+
425
+ # Embeddings
426
+ self.transformer = nn.ModuleDict(
427
+ dict(
428
+ wte=nn.Embedding(config.vocab_size, config.n_embd), # Token embeddings
429
+ wpe=nn.Embedding(config.block_size, config.n_embd), # Position embeddings
430
+ drop=nn.Dropout(config.dropout),
431
+ h=nn.ModuleList(
432
+ [Block(config) for _ in range(config.n_layer)]
433
+ ), # Transformer blocks
434
+ ln_f=nn.LayerNorm(config.n_embd), # Final layer norm
435
+ )
436
+ )
437
+
438
+ # Language modeling head (maps hidden states to vocabulary)
439
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
440
+
441
+ # Tie weights between token embeddings and output head (common practice)
442
+ self.transformer.wte.weight = self.lm_head.weight
443
+
444
+ # Initialize weights
445
+ self.apply(self._init_weights)
446
+
447
+ # Report parameter count
448
+ print(f"Model initialized: {self.config.model_name}")
449
+ print(f"Parameters: {self.get_num_params():,}")
450
+ print(f"Estimated: {self.config.estimate_parameters():,}")
451
+
452
+ def _init_weights(self, module):
453
+ """Initialize model weights using standard practices."""
454
+ if isinstance(module, nn.Linear):
455
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
456
+ if module.bias is not None:
457
+ torch.nn.init.zeros_(module.bias)
458
+ elif isinstance(module, nn.Embedding):
459
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
460
+
461
+ def get_num_params(self, non_embedding: bool = False) -> int:
462
+ """
463
+ Count the number of parameters in the model.
464
+
465
+ Args:
466
+ non_embedding: If True, subtract embedding parameters
467
+
468
+ Returns:
469
+ int: Number of parameters
470
+ """
471
+ n_params = sum(p.numel() for p in self.parameters())
472
+ if non_embedding:
473
+ n_params -= self.transformer.wpe.weight.numel()
474
+ n_params -= self.transformer.wte.weight.numel()
475
+ return n_params
476
+
477
+ def forward(
478
+ self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None
479
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
480
+ """
481
+ Forward pass of the GPT model.
482
+
483
+ Args:
484
+ idx: Input token indices of shape (batch_size, seq_len)
485
+ targets: Optional target tokens for loss calculation (batch_size, seq_len)
486
+
487
+ Returns:
488
+ Tuple containing:
489
+ - logits: Output logits of shape (batch_size, seq_len, vocab_size)
490
+ - loss: Cross-entropy loss if targets provided, None otherwise
491
+ """
492
+ device = idx.device
493
+ b, t = idx.size()
494
+ assert (
495
+ t <= self.config.block_size
496
+ ), f"Sequence length {t} exceeds block size {self.config.block_size}"
497
+
498
+ # Token embeddings
499
+ tok_emb = self.transformer.wte(idx) # (b, t, n_embd)
500
+
501
+ # Position embeddings
502
+ pos = torch.arange(0, t, dtype=torch.long, device=device) # (t,)
503
+ pos_emb = self.transformer.wpe(pos) # (t, n_embd)
504
+
505
+ # Combine embeddings and apply dropout
506
+ x = self.transformer.drop(tok_emb + pos_emb)
507
+
508
+ # Pass through transformer blocks with optional gradient checkpointing
509
+ if self.use_checkpoint and self.training:
510
+ # Use gradient checkpointing to save memory during training
511
+ for block in self.transformer.h:
512
+ x = torch.utils.checkpoint.checkpoint(block, x)
513
+ else:
514
+ # Standard forward pass
515
+ for block in self.transformer.h:
516
+ x = block(x)
517
+
518
+ # Final layer normalization
519
+ x = self.transformer.ln_f(x)
520
+
521
+ # Language modeling head
522
+ # Always compute full logits for training and evaluation
523
+ logits = self.lm_head(x)
524
+
525
+ if targets is not None:
526
+ # If we have targets, compute loss
527
+ loss = F.cross_entropy(
528
+ logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
529
+ )
530
+ else:
531
+ # If no targets, no loss computation
532
+ loss = None
533
+
534
+ return logits, loss
535
+
536
+ def generate(
537
+ self,
538
+ idx: torch.Tensor,
539
+ max_new_tokens: int = 100,
540
+ temperature: float = 1.0,
541
+ top_k: Optional[int] = None,
542
+ ) -> torch.Tensor:
543
+ """
544
+ Generate new tokens autoregressively.
545
+
546
+ Args:
547
+ idx: Starting token indices (batch_size, seq_len)
548
+ max_new_tokens: Maximum number of new tokens to generate
549
+ temperature: Sampling temperature (higher = more random)
550
+ top_k: If set, only sample from top-k most likely tokens
551
+
552
+ Returns:
553
+ torch.Tensor: Generated sequence (batch_size, seq_len + max_new_tokens)
554
+ """
555
+ self.eval()
556
+ with torch.no_grad():
557
+ for _ in range(max_new_tokens):
558
+ # Crop sequence if it exceeds block size
559
+ idx_cond = (
560
+ idx
561
+ if idx.size(1) <= self.config.block_size
562
+ else idx[:, -self.config.block_size :]
563
+ )
564
+
565
+ # Forward pass
566
+ logits, _ = self(idx_cond)
567
+
568
+ # Get logits for the last token and apply temperature
569
+ logits = logits[:, -1, :] / temperature
570
+
571
+ # Optionally crop to top-k most likely tokens
572
+ if top_k is not None:
573
+ v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
574
+ logits[logits < v[:, [-1]]] = -float("inf")
575
+
576
+ # Apply softmax and sample
577
+ probs = F.softmax(logits, dim=-1)
578
+ idx_next = torch.multinomial(probs, num_samples=1)
579
+
580
+ # Append to sequence
581
+ idx = torch.cat((idx, idx_next), dim=1)
582
+
583
+ self.train() # Return to training mode
584
+ return idx
585
+
586
+ def estimate_memory_usage(self, batch_size: int = 1, seq_len: int = None) -> dict:
587
+ """
588
+ Estimate memory usage for training and inference.
589
+
590
+ Args:
591
+ batch_size: Batch size for estimation
592
+ seq_len: Sequence length (defaults to block_size)
593
+
594
+ Returns:
595
+ dict: Memory usage estimates in MB
596
+ """
597
+ if seq_len is None:
598
+ seq_len = self.config.block_size
599
+
600
+ # Model parameters (weights)
601
+ param_memory = self.get_num_params() * 4 / (1024**2) # 4 bytes per float32
602
+
603
+ # Activations (rough estimate)
604
+ activation_memory = (
605
+ batch_size * seq_len * self.config.n_embd * self.config.n_layer * 8 # Rough estimate
606
+ ) / (1024**2)
607
+
608
+ # Gradients (same size as parameters during training)
609
+ gradient_memory = param_memory
610
+
611
+ return {
612
+ "parameters_mb": param_memory,
613
+ "activations_mb": activation_memory,
614
+ "gradients_mb": gradient_memory,
615
+ "total_training_mb": param_memory + activation_memory + gradient_memory,
616
+ "total_inference_mb": param_memory + activation_memory * 0.5, # No gradients needed
617
+ }
618
+
619
+
620
+ def create_model(model_size: str = "medium") -> GPTModel:
621
+ """
622
+ Factory function to create a GPT model with predefined configurations.
623
+
624
+ Args:
625
+ model_size: Size of model to create ("small", "medium", "large")
626
+
627
+ Returns:
628
+ GPTModel: Initialized model
629
+ """
630
+ configs = {
631
+ "small": GPTConfig.small(),
632
+ "medium": GPTConfig.medium(),
633
+ "large": GPTConfig.large(),
634
+ }
635
+
636
+ if model_size not in configs:
637
+ raise ValueError(f"Unknown model size: {model_size}. Choose from {list(configs.keys())}")
638
+
639
+ config = configs[model_size]
640
+ model = GPTModel(config)
641
+
642
+ return model
643
+
644
+
645
+ if __name__ == "__main__":
646
+ # Example usage
647
+ print("🧠 GPT Model Architecture")
648
+ print("=" * 50)
649
+
650
+ # Create models of different sizes
651
+ for size in ["small", "medium", "large"]:
652
+ print(f"\n{size.upper()} MODEL:")
653
+ model = create_model(size)
654
+
655
+ # Show memory estimates
656
+ memory = model.estimate_memory_usage(batch_size=4, seq_len=512)
657
+ print(
658
+ f"Memory (4 batch, 512 seq): {memory['total_training_mb']:.1f}MB training, {memory['total_inference_mb']:.1f}MB inference"
659
+ )
660
+
661
+ # Test forward pass
662
+ x = torch.randint(0, 32000, (2, 64)) # Batch size 2, sequence length 64
663
+ with torch.no_grad():
664
+ logits, _ = model(x)
665
+ print(f"Test forward pass: {x.shape} -> {logits.shape} βœ“")
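A quick sanity check of estimate_parameters() against the exact count reported by get_num_params(); a minimal sketch, assuming it is run from core/src/. The estimate sits above the true count mainly because the tied embedding/output weights are counted twice:

from model import GPTConfig, GPTModel

for name, cfg in [("small", GPTConfig.small()),
                  ("medium", GPTConfig.medium()),
                  ("large", GPTConfig.large())]:
    model = GPTModel(cfg, use_checkpoint=False)
    actual = model.get_num_params()
    estimated = cfg.estimate_parameters()
    # estimate_parameters() counts the tied embedding/output weights twice,
    # so it runs roughly vocab_size * n_embd above the exact count.
    print(f"{name}: actual={actual:,} estimated={estimated:,} "
          f"delta={estimated - actual:,}")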
core/src/model_test.py ADDED
@@ -0,0 +1,564 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ Model Architecture Testing and Validation Script
14
+
15
+ This script provides comprehensive testing and validation for the GPT model architecture.
16
+ It helps verify that the model is correctly implemented and can run on your hardware.
17
+
18
+ FEATURES:
19
+ - Model initialization testing
20
+ - Forward pass validation
21
+ - Memory usage analysis
22
+ - Tokenizer integration testing
23
+ - Performance benchmarking
24
+ - Hardware compatibility checks
25
+
26
+ Usage:
27
+ python core/src/model_test.py --model_size medium
28
+ python core/src/model_test.py --model_size small --test_generation
29
+ python core/src/model_test.py --all_sizes
30
+
31
+ Requirements:
32
+ - torch
33
+ - sentencepiece (for tokenizer integration)
34
+ - Our trained tokenizer in data/tokenizer/
35
+
36
+ Author: Louis Chua Bean Chong
37
+ License: GPLv3
38
+ """
39
+
40
+ import argparse
41
+ import json
42
+ import os
43
+ import time
44
+ import traceback
45
+ from typing import Dict, List
46
+
47
+ import torch
48
+
49
+ # Import our model architecture
50
+ try:
51
+ from model import GPTModel, create_model
52
+ except ImportError:
53
+ import sys
54
+
55
+ sys.path.append(os.path.dirname(__file__))
56
+ from model import GPTModel, create_model
57
+
58
+ # Import tokenizer if available
59
+ try:
60
+ import sentencepiece as spm
61
+
62
+ TOKENIZER_AVAILABLE = True
63
+ except ImportError:
64
+ TOKENIZER_AVAILABLE = False
65
+ print("Warning: SentencePiece not available. Tokenizer tests will be skipped.")
66
+
67
+
68
+ class ModelTester:
69
+ """
70
+ Comprehensive model testing class.
71
+
72
+ Provides methods to test model initialization, forward passes, memory usage,
73
+ and integration with the tokenizer.
74
+ """
75
+
76
+ def __init__(self, device: str = "auto"):
77
+ """
78
+ Initialize the model tester.
79
+
80
+ Args:
81
+ device: Device to use ("cpu", "cuda", or "auto")
82
+ """
83
+ if device == "auto":
84
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
85
+ else:
86
+ self.device = device
87
+
88
+ print("πŸ”§ Model Tester initialized")
89
+ print(f"Device: {self.device}")
90
+ print(f"PyTorch version: {torch.__version__}")
91
+
92
+ # Try to load tokenizer
93
+ self.tokenizer = None
94
+ self.load_tokenizer()
95
+
96
+ def load_tokenizer(self) -> None:
97
+ """Load the trained SentencePiece tokenizer if available."""
98
+ if not TOKENIZER_AVAILABLE:
99
+ return
100
+
101
+ tokenizer_path = "data/tokenizer/tokenizer.model"
102
+ if os.path.exists(tokenizer_path):
103
+ try:
104
+ self.tokenizer = spm.SentencePieceProcessor()
105
+ self.tokenizer.load(tokenizer_path)
106
+ print(f"βœ“ Tokenizer loaded: {tokenizer_path}")
107
+ print(f" Vocabulary size: {self.tokenizer.vocab_size():,}")
108
+ except Exception as e:
109
+ print(f"⚠️ Failed to load tokenizer: {e}")
110
+ else:
111
+ print(f"⚠️ Tokenizer not found at {tokenizer_path}")
112
+
113
+ def test_model_initialization(self, model_size: str = "medium") -> Dict:
114
+ """
115
+ Test model initialization and basic properties.
116
+
117
+ Args:
118
+ model_size: Size of model to test
119
+
120
+ Returns:
121
+ dict: Test results
122
+ """
123
+ print(f"\n🧠 Testing {model_size.upper()} model initialization...")
124
+
125
+ try:
126
+ # Create model
127
+ start_time = time.time()
128
+ model = create_model(model_size)
129
+ init_time = time.time() - start_time
130
+
131
+ # Move to device
132
+ model = model.to(self.device)
133
+
134
+ # Basic checks
135
+ param_count = model.get_num_params()
136
+ config = model.config
137
+
138
+ print("βœ“ Model created successfully")
139
+ print(f" Parameters: {param_count:,}")
140
+ print(f" Layers: {config.n_layer}")
141
+ print(f" Heads: {config.n_head}")
142
+ print(f" Embedding dim: {config.n_embd}")
143
+ print(f" Block size: {config.block_size}")
144
+ print(f" Initialization time: {init_time:.2f}s")
145
+
146
+ return {
147
+ "success": True,
148
+ "model_size": model_size,
149
+ "parameters": param_count,
150
+ "config": config.__dict__,
151
+ "init_time": init_time,
152
+ "device": str(next(model.parameters()).device),
153
+ }
154
+
155
+ except Exception as e:
156
+ print(f"❌ Model initialization failed: {e}")
157
+ traceback.print_exc()
158
+ return {"success": False, "error": str(e)}
159
+
160
+ def test_forward_pass(self, model: GPTModel, batch_size: int = 2, seq_len: int = 64) -> Dict:
161
+ """
162
+ Test model forward pass with synthetic data.
163
+
164
+ Args:
165
+ model: Model to test
166
+ batch_size: Batch size for test
167
+ seq_len: Sequence length for test
168
+
169
+ Returns:
170
+ dict: Test results
171
+ """
172
+ print(f"\nπŸ”„ Testing forward pass (batch={batch_size}, seq_len={seq_len})...")
173
+
174
+ try:
175
+ model.eval()
176
+
177
+ # Create synthetic input
178
+ x = torch.randint(0, model.config.vocab_size, (batch_size, seq_len))
179
+ x = x.to(self.device)
180
+
181
+ # Test inference mode
182
+ start_time = time.time()
183
+ with torch.no_grad():
184
+ logits, _ = model(x)
185
+ inference_time = time.time() - start_time
186
+
187
+ # Test training mode with targets
188
+ model.train()
189
+ targets = torch.randint(0, model.config.vocab_size, (batch_size, seq_len))
190
+ targets = targets.to(self.device)
191
+
192
+ start_time = time.time()
193
+ logits_train, loss = model(x, targets)
194
+ train_time = time.time() - start_time
195
+
196
+ print("βœ“ Forward pass successful")
197
+ print(f" Input shape: {x.shape}")
198
+ print(f" Output shape: {logits.shape}")
199
+ print(f" Loss: {loss.item():.4f}")
200
+ print(f" Inference time: {inference_time:.4f}s")
201
+ print(f" Training time: {train_time:.4f}s")
202
+
203
+ return {
204
+ "success": True,
205
+ "input_shape": list(x.shape),
206
+ "output_shape": list(logits.shape),
207
+ "loss": loss.item(),
208
+ "inference_time": inference_time,
209
+ "training_time": train_time,
210
+ }
211
+
212
+ except Exception as e:
213
+ print(f"❌ Forward pass failed: {e}")
214
+ traceback.print_exc()
215
+ return {"success": False, "error": str(e)}
216
+
217
+ def test_memory_usage(self, model: GPTModel, batch_sizes: List[int] = [1, 2, 4]) -> Dict:
218
+ """
219
+ Test memory usage for different batch sizes.
220
+
221
+ Args:
222
+ model: Model to test
223
+ batch_sizes: List of batch sizes to test
224
+
225
+ Returns:
226
+ dict: Memory usage results
227
+ """
228
+ print("\nπŸ’Ύ Testing memory usage...")
229
+
230
+ results = {}
231
+
232
+ for batch_size in batch_sizes:
233
+ try:
234
+ # Clear cache
235
+ if torch.cuda.is_available():
236
+ torch.cuda.empty_cache()
+ torch.cuda.reset_peak_memory_stats()  # reset peak tracker so each batch size is measured independently
237
+
238
+ # Get initial memory
239
+ if torch.cuda.is_available():
240
+ initial_memory = torch.cuda.memory_allocated() / (1024**2)
241
+ else:
242
+ initial_memory = 0
243
+
244
+ # Forward pass
245
+ seq_len = min(512, model.config.block_size)
246
+ x = torch.randint(0, model.config.vocab_size, (batch_size, seq_len))
247
+ x = x.to(self.device)
248
+
249
+ with torch.no_grad():
250
+ logits, _ = model(x)
251
+
252
+ # Get peak memory
253
+ if torch.cuda.is_available():
254
+ peak_memory = torch.cuda.max_memory_allocated() / (1024**2)
255
+ memory_used = peak_memory - initial_memory
256
+ else:
257
+ memory_used = model.estimate_memory_usage(batch_size, seq_len)[
258
+ "total_inference_mb"
259
+ ]
260
+
261
+ results[f"batch_{batch_size}"] = {
262
+ "memory_mb": memory_used,
263
+ "memory_per_sample": memory_used / batch_size,
264
+ }
265
+
266
+ print(
267
+ f" Batch size {batch_size}: {memory_used:.1f}MB ({memory_used/batch_size:.1f}MB per sample)"
268
+ )
269
+
270
+ except Exception as e:
271
+ print(f" Batch size {batch_size}: Failed - {e}")
272
+ results[f"batch_{batch_size}"] = {"error": str(e)}
273
+
274
+ return results
275
+
276
+ def test_tokenizer_integration(self, model: GPTModel) -> Dict:
277
+ """
278
+ Test integration with the trained tokenizer.
279
+
280
+ Args:
281
+ model: Model to test
282
+
283
+ Returns:
284
+ dict: Integration test results
285
+ """
286
+ print("\nπŸ”€ Testing tokenizer integration...")
287
+
288
+ if self.tokenizer is None:
289
+ print("⚠️ No tokenizer available, skipping integration test")
290
+ return {"success": False, "reason": "No tokenizer available"}
291
+
292
+ try:
293
+ # Test sentences
294
+ test_sentences = [
295
+ "The quick brown fox jumps over the lazy dog.",
296
+ "Machine learning is transforming technology.",
297
+ "GPT models use transformer architecture for language modeling.",
298
+ ]
299
+
300
+ results = []
301
+
302
+ for sentence in test_sentences:
303
+ # Tokenize
304
+ tokens = self.tokenizer.encode(sentence)
305
+ token_tensor = torch.tensor([tokens]).to(self.device)
306
+
307
+ # Forward pass
308
+ with torch.no_grad():
309
+ logits, _ = model(token_tensor)
310
+
311
+ # Get predictions for next token
312
+ next_token_logits = logits[0, -1, :]
313
+ next_token_probs = torch.softmax(next_token_logits, dim=0)
314
+ top5_tokens = torch.topk(next_token_probs, 5)
315
+
316
+ # Decode top predictions
317
+ top5_decoded = []
318
+ for token_id in top5_tokens.indices:
319
+ try:
320
+ decoded = self.tokenizer.decode([token_id.item()])
321
+ prob = top5_tokens.values[len(top5_decoded)].item()
322
+ top5_decoded.append((decoded, prob))
323
+ except Exception:
324
+ top5_decoded.append(("<??>", 0.0))
325
+
326
+ results.append(
327
+ {"input": sentence, "tokens": len(tokens), "top_predictions": top5_decoded}
328
+ )
329
+
330
+ print(f"βœ“ '{sentence[:30]}...' -> {len(tokens)} tokens")
331
+ print(f" Top prediction: '{top5_decoded[0][0]}' ({top5_decoded[0][1]:.3f})")
332
+
333
+ return {
334
+ "success": True,
335
+ "vocab_size_match": self.tokenizer.vocab_size() == model.config.vocab_size,
336
+ "test_results": results,
337
+ }
338
+
339
+ except Exception as e:
340
+ print(f"❌ Tokenizer integration failed: {e}")
341
+ traceback.print_exc()
342
+ return {"success": False, "error": str(e)}
343
+
344
+ def test_generation(self, model: GPTModel, prompt: str = "The future of AI") -> Dict:
345
+ """
346
+ Test text generation capabilities.
347
+
348
+ Args:
349
+ model: Model to test
350
+ prompt: Starting prompt for generation
351
+
352
+ Returns:
353
+ dict: Generation test results
354
+ """
355
+ print("\n✍️ Testing text generation...")
356
+
357
+ if self.tokenizer is None:
358
+ print("⚠️ No tokenizer available, skipping generation test")
359
+ return {"success": False, "reason": "No tokenizer available"}
360
+
361
+ try:
362
+ # Tokenize prompt
363
+ tokens = self.tokenizer.encode(prompt)
364
+ input_tensor = torch.tensor([tokens]).to(self.device)
365
+
366
+ print(f"Prompt: '{prompt}'")
367
+ print("Generating...")
368
+
369
+ # Generate
370
+ start_time = time.time()
371
+ output = model.generate(input_tensor, max_new_tokens=50, temperature=0.8, top_k=50)
372
+ generation_time = time.time() - start_time
373
+
374
+ # Decode output
375
+ generated_tokens = output[0].tolist()
376
+ generated_text = self.tokenizer.decode(generated_tokens)
377
+
378
+ print(f"βœ“ Generated text: '{generated_text}'")
379
+ print(f" Generation time: {generation_time:.2f}s")
380
+ print(f" Tokens per second: {50/generation_time:.1f}")
381
+
382
+ return {
383
+ "success": True,
384
+ "prompt": prompt,
385
+ "generated_text": generated_text,
386
+ "generation_time": generation_time,
387
+ "tokens_per_second": 50 / generation_time,
388
+ }
389
+
390
+ except Exception as e:
391
+ print(f"❌ Text generation failed: {e}")
392
+ traceback.print_exc()
393
+ return {"success": False, "error": str(e)}
394
+
395
+ def run_comprehensive_test(self, model_size: str = "medium") -> Dict:
396
+ """
397
+ Run all tests for a given model size.
398
+
399
+ Args:
400
+ model_size: Size of model to test
401
+
402
+ Returns:
403
+ dict: Complete test results
404
+ """
405
+ print(f"\nπŸ” Running comprehensive test for {model_size.upper()} model")
406
+ print("=" * 60)
407
+
408
+ results = {"model_size": model_size, "device": self.device}
409
+
410
+ # Test 1: Model initialization
411
+ init_result = self.test_model_initialization(model_size)
412
+ results["initialization"] = init_result
413
+
414
+ if not init_result["success"]:
415
+ return results
416
+
417
+ # Create model for remaining tests
418
+ model = create_model(model_size).to(self.device)
419
+
420
+ # Test 2: Forward pass
421
+ results["forward_pass"] = self.test_forward_pass(model)
422
+
423
+ # Test 3: Memory usage
424
+ results["memory_usage"] = self.test_memory_usage(model)
425
+
426
+ # Test 4: Tokenizer integration
427
+ results["tokenizer_integration"] = self.test_tokenizer_integration(model)
428
+
429
+ # Test 5: Text generation
430
+ results["generation"] = self.test_generation(model)
431
+
432
+ return results
433
+
434
+
435
+ def load_model_config(model_size: str) -> Dict:
436
+ """Load model configuration from JSON file."""
437
+ config_path = f"configs/{model_size}_model.json"
438
+ if os.path.exists(config_path):
439
+ with open(config_path, "r") as f:
440
+ return json.load(f)
441
+ return {}
442
+
443
+
444
+ def print_hardware_recommendations(model_size: str) -> None:
445
+ """Print hardware recommendations for the given model size."""
446
+ config = load_model_config(model_size)
447
+
448
+ if config:
449
+ print(f"\nπŸ’» Hardware Recommendations for {model_size.upper()} model:")
450
+ print(f" Parameters: {config.get('parameters', 'Unknown')}")
451
+ print(f" Recommended: {config.get('recommended_hardware', 'Unknown')}")
452
+
453
+ if "memory_estimates" in config:
454
+ mem = config["memory_estimates"]
455
+ print(f" Memory usage: ~{mem.get('parameters_mb', '?')}MB parameters")
456
+ print(f" Training: ~{mem.get('training_mb_per_sample', '?')}MB per sample")
457
+ print(f" Inference: ~{mem.get('inference_mb_per_sample', '?')}MB per sample")
458
+
459
+ if "cpu_training_notes" in config:
460
+ cpu_notes = config["cpu_training_notes"]
461
+ if cpu_notes.get("feasible"):
462
+ print(
463
+ f" CPU Training: Feasible but slow ({cpu_notes.get('expected_training_time', '?')})"
464
+ )
465
+ else:
466
+ print(f" CPU Training: Not recommended - {cpu_notes.get('reason', 'Too large')}")
467
+
468
+
469
+ def main():
470
+ """Main function to handle command line testing."""
471
+ parser = argparse.ArgumentParser(
472
+ description="Test and validate GPT model architecture",
473
+ formatter_class=argparse.RawDescriptionHelpFormatter,
474
+ epilog="""
475
+ Examples:
476
+ # Test medium model
477
+ python core/src/model_test.py --model_size medium
478
+
479
+ # Test all model sizes
480
+ python core/src/test_model.py --all_sizes
481
+
482
+ # Test with text generation
483
+ python core/src/test_model.py --model_size small --test_generation
484
+
485
+ # Show hardware recommendations
486
+ python core/src/test_model.py --recommendations
487
+ """,
488
+ )
489
+
490
+ parser.add_argument(
491
+ "--model_size",
492
+ choices=["small", "medium", "large"],
493
+ default="medium",
494
+ help="Model size to test (default: medium)",
495
+ )
496
+
497
+ parser.add_argument("--all_sizes", action="store_true", help="Test all model sizes")
498
+
499
+ parser.add_argument(
500
+ "--test_generation", action="store_true", help="Include text generation test"
501
+ )
502
+
503
+ parser.add_argument(
504
+ "--device",
505
+ choices=["cpu", "cuda", "auto"],
506
+ default="auto",
507
+ help="Device to use for testing (default: auto)",
508
+ )
509
+
510
+ parser.add_argument(
511
+ "--recommendations",
512
+ action="store_true",
513
+ help="Show hardware recommendations for all model sizes",
514
+ )
515
+
516
+ parser.add_argument("--save_results", help="Save test results to JSON file")
517
+
518
+ args = parser.parse_args()
519
+
520
+ print("πŸ§ͺ GPT Model Architecture Tester")
521
+ print("=" * 50)
522
+
523
+ # Show hardware recommendations
524
+ if args.recommendations:
525
+ for size in ["small", "medium", "large"]:
526
+ print_hardware_recommendations(size)
527
+ return
528
+
529
+ # Initialize tester
530
+ tester = ModelTester(device=args.device)
531
+
532
+ # Run tests
533
+ all_results = {}
534
+
535
+ if args.all_sizes:
536
+ test_sizes = ["small", "medium", "large"]
537
+ else:
538
+ test_sizes = [args.model_size]
539
+
540
+ for size in test_sizes:
541
+ results = tester.run_comprehensive_test(size)
542
+ all_results[size] = results
543
+
544
+ # Print summary
545
+ print(f"\nπŸ“Š {size.upper()} Model Test Summary:")
546
+ print(f" Initialization: {'βœ“' if results['initialization']['success'] else '❌'}")
547
+ print(f" Forward Pass: {'βœ“' if results.get('forward_pass', {}).get('success') else '❌'}")
548
+ print(f" Memory Test: {'βœ“' if 'memory_usage' in results else '❌'}")
549
+ print(
550
+ f" Tokenizer: {'βœ“' if results.get('tokenizer_integration', {}).get('success') else '❌'}"
551
+ )
552
+ print(f" Generation: {'βœ“' if results.get('generation', {}).get('success') else '❌'}")
553
+
554
+ # Save results if requested
555
+ if args.save_results:
556
+ with open(args.save_results, "w") as f:
557
+ json.dump(all_results, f, indent=2)
558
+ print(f"\nπŸ’Ύ Results saved to {args.save_results}")
559
+
560
+ print("\nπŸŽ‰ Testing completed!")
561
+
562
+
563
+ if __name__ == "__main__":
564
+ main()
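When --save_results is used, the JSON file mirrors the per-size result dictionaries assembled in main(). A minimal sketch for summarising such a file afterwards (the file name is whatever was passed to --save_results):

import json

with open("test_results.json") as f:  # path given to --save_results
    all_results = json.load(f)

for size, results in all_results.items():
    init_ok = results.get("initialization", {}).get("success", False)
    gen_ok = results.get("generation", {}).get("success", False)
    print(f"{size}: device={results.get('device')}, "
          f"init={'ok' if init_ok else 'failed'}, "
          f"generation={'ok' if gen_ok else 'skipped/failed'}")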
core/src/optimized_data_loader.py ADDED
@@ -0,0 +1,437 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Optimized Data Loader for Training
4
+
5
+ This module provides an optimized data loader with prefetching, caching,
6
+ and efficient batch processing to improve training performance.
7
+
8
+ Author: Louis Chua Bean Chong
9
+ License: GPLv3
10
+ """
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ from torch.utils.data import DataLoader, Dataset, Sampler
15
+ from typing import Optional, List, Tuple, Dict, Any
16
+ import numpy as np
17
+ import threading
18
+ import queue
19
+ import time
20
+ from collections import deque
21
+ import psutil
22
+ import os
23
+
24
+
25
+ class OptimizedDataset(Dataset):
26
+ """
27
+ Optimized dataset with caching and memory management.
28
+
29
+ This dataset provides efficient data loading with optional caching
30
+ and memory management to improve training performance.
31
+ """
32
+
33
+ def __init__(self,
34
+ data: torch.Tensor,
35
+ targets: torch.Tensor,
36
+ cache_size: Optional[int] = None,
37
+ pin_memory: bool = True):
38
+ """
39
+ Initialize optimized dataset.
40
+
41
+ Args:
42
+ data: Input data tensor
43
+ targets: Target tensor
44
+ cache_size: Number of samples to cache in memory
45
+ pin_memory: Whether to pin memory for faster GPU transfer
46
+ """
47
+ self.data = data
48
+ self.targets = targets
49
+ self.cache_size = cache_size
50
+ self.pin_memory = pin_memory
51
+
52
+ # Initialize cache
53
+ self.cache = {}
54
+ self.cache_hits = 0
55
+ self.cache_misses = 0
56
+
57
+ if cache_size and cache_size > 0:
58
+ print(f"Initializing cache with {cache_size} samples")
59
+
60
+ def __len__(self):
61
+ return len(self.data)
62
+
63
+ def __getitem__(self, idx):
64
+ # Check cache first
65
+ if self.cache_size and idx in self.cache:
66
+ self.cache_hits += 1
67
+ return self.cache[idx]
68
+
69
+ self.cache_misses += 1
70
+
71
+ # Get data
72
+ sample_data = self.data[idx]
73
+ sample_target = self.targets[idx]
74
+
75
+ # Pin memory if requested
76
+ if self.pin_memory and torch.cuda.is_available():
77
+ sample_data = sample_data.pin_memory()
78
+ sample_target = sample_target.pin_memory()
79
+
80
+ # Cache if enabled
81
+ if self.cache_size and len(self.cache) < self.cache_size:
82
+ self.cache[idx] = (sample_data, sample_target)
83
+
84
+ return sample_data, sample_target
85
+
86
+ def get_cache_stats(self) -> Dict[str, Any]:
87
+ """Get cache statistics."""
88
+ total_requests = self.cache_hits + self.cache_misses
89
+ hit_rate = self.cache_hits / total_requests if total_requests > 0 else 0
90
+
91
+ return {
92
+ "cache_hits": self.cache_hits,
93
+ "cache_misses": self.cache_misses,
94
+ "hit_rate": hit_rate,
95
+ "cache_size": len(self.cache),
96
+ "max_cache_size": self.cache_size
97
+ }
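A small sketch of the caching behaviour described above (tensor shapes are illustrative):

import torch

data = torch.randn(100, 16)
targets = torch.randint(0, 2, (100,))
ds = OptimizedDataset(data, targets, cache_size=10, pin_memory=False)

_ = [ds[i] for i in range(5)]   # first pass: 5 cache misses, samples 0-4 get cached
_ = [ds[i] for i in range(5)]   # second pass: 5 cache hits
print(ds.get_cache_stats())     # hit_rate is 0.5 at this point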
98
+
99
+
100
+ class PrefetchDataLoader:
101
+ """
102
+ Data loader with prefetching for improved performance.
103
+
104
+ This data loader uses background threads to prefetch data,
105
+ reducing the time spent waiting for data during training.
106
+ """
107
+
108
+ def __init__(self,
109
+ dataset: Dataset,
110
+ batch_size: int = 32,
111
+ num_workers: int = 4,
112
+ prefetch_factor: int = 2,
113
+ pin_memory: bool = True,
114
+ shuffle: bool = True,
115
+ drop_last: bool = False):
116
+ """
117
+ Initialize prefetch data loader.
118
+
119
+ Args:
120
+ dataset: Dataset to load
121
+ batch_size: Batch size
122
+ num_workers: Number of worker processes
123
+ prefetch_factor: Number of batches to prefetch
124
+ pin_memory: Whether to pin memory
125
+ shuffle: Whether to shuffle data
126
+ drop_last: Whether to drop incomplete batches
127
+ """
128
+ self.dataset = dataset
129
+ self.batch_size = batch_size
130
+ self.num_workers = num_workers
131
+ self.prefetch_factor = prefetch_factor
132
+ self.pin_memory = pin_memory
133
+ self.shuffle = shuffle
134
+ self.drop_last = drop_last
135
+
136
+ # Initialize data loader
137
+ self.data_loader = DataLoader(
138
+ dataset=dataset,
139
+ batch_size=batch_size,
140
+ shuffle=shuffle,
141
+ num_workers=num_workers,
142
+ pin_memory=pin_memory,
143
+ drop_last=drop_last,
144
+ persistent_workers=True if num_workers > 0 else False
145
+ )
146
+
147
+ # Prefetch queue
148
+ self.prefetch_queue = queue.Queue(maxsize=prefetch_factor)
149
+ self.prefetch_thread = None
150
+ self.stop_prefetch = False
151
+
152
+ # Start prefetching
153
+ self._start_prefetch()
154
+
155
+ print(f"PrefetchDataLoader initialized with {num_workers} workers")
156
+
157
+ def _start_prefetch(self):
158
+ """Start prefetching thread."""
159
+ if self.prefetch_factor > 0:
160
+ self.prefetch_thread = threading.Thread(target=self._prefetch_worker)
161
+ self.prefetch_thread.daemon = True
162
+ self.prefetch_thread.start()
163
+
164
+ def _prefetch_worker(self):
165
+ """Worker thread for prefetching data."""
166
+ try:
167
+ for batch in self.data_loader:
168
+ if self.stop_prefetch:
169
+ break
170
+
171
+ # Put batch in queue (block if full)
172
+ self.prefetch_queue.put(batch, block=True)
173
+ except Exception as e:
174
+ print(f"Prefetch worker error: {e}")
175
+
176
+ def __iter__(self):
177
+ """Iterate over prefetched batches."""
178
+ return self
179
+
180
+ def __next__(self):
181
+ """Get next batch from prefetch queue."""
182
+ if self.stop_prefetch:
183
+ raise StopIteration
184
+
185
+ while True:
+     try:
+         # Get the next prefetched batch
+         return self.prefetch_queue.get(timeout=1.0)
+     except queue.Empty:
+         # Queue is empty: if the prefetch thread has exited, the epoch is over.
+         # (Restarting a fresh DataLoader iterator here would return the first
+         # batch again and never raise StopIteration.)
+         if self.prefetch_thread is None or not self.prefetch_thread.is_alive():
+             raise StopIteration
192
+
193
+ def __len__(self):
194
+ return len(self.data_loader)
195
+
196
+ def stop(self):
197
+ """Stop prefetching."""
198
+ self.stop_prefetch = True
199
+ if self.prefetch_thread:
200
+ self.prefetch_thread.join()
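A usage sketch for the prefetching loader above, wrapping a plain TensorDataset (sizes are illustrative). Note that the background worker blocks on queue.put while batches are pending, so stop() is best called after the epoch has been consumed:

import torch
from torch.utils.data import TensorDataset

ds = TensorDataset(torch.randn(256, 32), torch.randint(0, 10, (256,)))
loader = PrefetchDataLoader(ds, batch_size=16, num_workers=0, pin_memory=False)

for xb, yb in loader:   # batches are handed over from the prefetch queue
    pass
loader.stop()           # joins the (now finished) prefetch thread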
201
+
202
+
203
+ class DynamicBatchSampler(Sampler):
204
+ """
205
+ Dynamic batch sampler that adjusts batch size based on memory availability.
206
+
207
+ This sampler monitors system memory and adjusts batch sizes dynamically
208
+ to optimize memory usage and training performance.
209
+ """
210
+
211
+ def __init__(self,
212
+ dataset_size: int,
213
+ base_batch_size: int = 32,
214
+ max_batch_size: int = 128,
215
+ memory_threshold: float = 0.8,
216
+ adjustment_factor: float = 1.2):
217
+ """
218
+ Initialize dynamic batch sampler.
219
+
220
+ Args:
221
+ dataset_size: Size of the dataset
222
+ base_batch_size: Base batch size
223
+ max_batch_size: Maximum batch size
224
+ memory_threshold: Memory usage threshold for adjustment
225
+ adjustment_factor: Factor for batch size adjustment
226
+ """
227
+ self.dataset_size = dataset_size
228
+ self.base_batch_size = base_batch_size
229
+ self.max_batch_size = max_batch_size
230
+ self.memory_threshold = memory_threshold
231
+ self.adjustment_factor = adjustment_factor
232
+
233
+ self.current_batch_size = base_batch_size
234
+ self.batch_history = deque(maxlen=10)
235
+
236
+ print(f"DynamicBatchSampler initialized with base batch size: {base_batch_size}")
237
+
238
+ def _get_memory_usage(self) -> float:
239
+ """Get current memory usage as a fraction."""
240
+ memory = psutil.virtual_memory()
241
+ return memory.percent / 100.0
242
+
243
+ def _adjust_batch_size(self):
244
+ """Adjust batch size based on memory usage."""
245
+ memory_usage = self._get_memory_usage()
246
+
247
+ if memory_usage > self.memory_threshold:
248
+ # Reduce batch size if memory usage is high
249
+ self.current_batch_size = max(
250
+ self.base_batch_size,
251
+ int(self.current_batch_size / self.adjustment_factor)
252
+ )
253
+ else:
254
+ # Increase batch size if memory usage is low
255
+ self.current_batch_size = min(
256
+ self.max_batch_size,
257
+ int(self.current_batch_size * self.adjustment_factor)
258
+ )
259
+
260
+ self.batch_history.append(self.current_batch_size)
261
+
262
+ def __iter__(self):
263
+ """Generate batch indices."""
264
+ indices = list(range(self.dataset_size))
265
+
266
+ # Shuffle indices
267
+ np.random.shuffle(indices)
268
+
269
+ # Generate batches with an explicit cursor: the batch size can change between
+ # iterations, so a fixed-step range would skip or repeat samples.
+ i = 0
+ while i < len(indices):
+     batch_indices = indices[i:i + self.current_batch_size]
+     i += len(batch_indices)
+
+     # Adjust batch size for the next batch based on current memory usage
+     self._adjust_batch_size()
+
+     yield batch_indices
277
+
278
+ def __len__(self):
279
+ return (self.dataset_size + self.current_batch_size - 1) // self.current_batch_size
280
+
281
+ def get_stats(self) -> Dict[str, Any]:
282
+ """Get sampler statistics."""
283
+ return {
284
+ "current_batch_size": self.current_batch_size,
285
+ "base_batch_size": self.base_batch_size,
286
+ "max_batch_size": self.max_batch_size,
287
+ "memory_usage": self._get_memory_usage(),
288
+ "batch_history": list(self.batch_history)
289
+ }
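The sampler can also be driven directly, since iterating it yields whole batches of indices; this is why it is wired into the DataLoader as a batch_sampler in the constructor below. A short sketch with illustrative sizes:

sampler = DynamicBatchSampler(dataset_size=1000, base_batch_size=16, max_batch_size=64)
for batch_indices in sampler:
    print(len(batch_indices))   # grows or shrinks with measured memory pressure
    break
print(sampler.get_stats()["current_batch_size"])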
290
+
291
+
292
+ class OptimizedDataLoader:
293
+ """
294
+ High-performance data loader with multiple optimizations.
295
+
296
+ This data loader combines multiple optimization techniques:
297
+ - Prefetching with background threads
298
+ - Dynamic batch sizing
299
+ - Memory pinning
300
+ - Caching
301
+ - Efficient memory management
302
+ """
303
+
304
+ def __init__(self,
305
+ dataset: Dataset,
306
+ batch_size: int = 32,
307
+ num_workers: int = 4,
308
+ prefetch_factor: int = 2,
309
+ pin_memory: bool = True,
310
+ shuffle: bool = True,
311
+ drop_last: bool = False,
312
+ use_dynamic_batching: bool = True,
313
+ cache_size: Optional[int] = None):
314
+ """
315
+ Initialize optimized data loader.
316
+
317
+ Args:
318
+ dataset: Dataset to load
319
+ batch_size: Base batch size
320
+ num_workers: Number of worker processes
321
+ prefetch_factor: Number of batches to prefetch
322
+ pin_memory: Whether to pin memory
323
+ shuffle: Whether to shuffle data
324
+ drop_last: Whether to drop incomplete batches
325
+ use_dynamic_batching: Whether to use dynamic batch sizing
326
+ cache_size: Number of samples to cache
327
+ """
328
+ self.dataset = dataset
329
+ self.batch_size = batch_size
330
+ self.num_workers = num_workers
331
+ self.prefetch_factor = prefetch_factor
332
+ self.pin_memory = pin_memory
333
+ self.shuffle = shuffle
334
+ self.drop_last = drop_last
335
+ self.use_dynamic_batching = use_dynamic_batching
336
+ self.cache_size = cache_size
337
+
338
+ # Create optimized dataset if caching is enabled
339
+ if cache_size and cache_size > 0:
340
+ self.dataset = OptimizedDataset(
341
+ dataset.data if hasattr(dataset, 'data') else dataset,
342
+ dataset.targets if hasattr(dataset, 'targets') else None,
343
+ cache_size=cache_size,
344
+ pin_memory=pin_memory
345
+ )
346
+
347
+ # Create sampler
348
+ if use_dynamic_batching:
349
+ self.sampler = DynamicBatchSampler(
350
+ dataset_size=len(self.dataset),
351
+ base_batch_size=batch_size,
352
+ max_batch_size=batch_size * 4
353
+ )
354
+ else:
355
+ self.sampler = None
356
+
357
+ # Create data loader
358
+ # DynamicBatchSampler yields whole batches of indices, so it is passed as
+ # batch_sampler (mutually exclusive with batch_size/shuffle/drop_last).
+ if self.sampler is not None:
+     self.data_loader = DataLoader(
+         dataset=self.dataset,
+         batch_sampler=self.sampler,
+         num_workers=num_workers,
+         pin_memory=pin_memory,
+         persistent_workers=True if num_workers > 0 else False,
+     )
+ else:
+     self.data_loader = DataLoader(
+         dataset=self.dataset,
+         batch_size=batch_size,
+         shuffle=shuffle,
+         num_workers=num_workers,
+         pin_memory=pin_memory,
+         drop_last=drop_last,
+         persistent_workers=True if num_workers > 0 else False,
+     )
368
+
369
+ # Create prefetch loader
370
+ self.prefetch_loader = PrefetchDataLoader(
371
+ dataset=self.dataset,
372
+ batch_size=batch_size,
373
+ num_workers=num_workers,
374
+ prefetch_factor=prefetch_factor,
375
+ pin_memory=pin_memory,
376
+ shuffle=shuffle,
377
+ drop_last=drop_last
378
+ )
379
+
380
+ print(f"OptimizedDataLoader initialized with {num_workers} workers")
381
+
382
+ def __iter__(self):
383
+ """Iterate over batches."""
384
+ return iter(self.prefetch_loader)
385
+
386
+ def __len__(self):
387
+ return len(self.data_loader)
388
+
389
+ def get_stats(self) -> Dict[str, Any]:
390
+ """Get loader statistics."""
391
+ stats = {
392
+ "batch_size": self.batch_size,
393
+ "num_workers": self.num_workers,
394
+ "prefetch_factor": self.prefetch_factor,
395
+ "cache_enabled": self.cache_size is not None,
396
+ "dynamic_batching": self.use_dynamic_batching
397
+ }
398
+
399
+ if hasattr(self.dataset, 'get_cache_stats'):
400
+ stats.update(self.dataset.get_cache_stats())
401
+
402
+ if self.sampler:
403
+ stats.update(self.sampler.get_stats())
404
+
405
+ return stats
406
+
407
+ def stop(self):
408
+ """Stop the data loader."""
409
+ self.prefetch_loader.stop()
410
+
411
+
412
+ def create_optimized_loader(dataset: Dataset,
413
+ batch_size: int = 32,
414
+ num_workers: Optional[int] = None,
415
+ **kwargs) -> OptimizedDataLoader:
416
+ """
417
+ Create an optimized data loader with automatic configuration.
418
+
419
+ Args:
420
+ dataset: Dataset to load
421
+ batch_size: Batch size
422
+ num_workers: Number of workers (auto-detect if None)
423
+ **kwargs: Additional arguments
424
+
425
+ Returns:
426
+ OptimizedDataLoader: Configured data loader
427
+ """
428
+ if num_workers is None:
429
+ # Auto-detect optimal number of workers
430
+ num_workers = min(4, os.cpu_count() or 1)
431
+
432
+ return OptimizedDataLoader(
433
+ dataset=dataset,
434
+ batch_size=batch_size,
435
+ num_workers=num_workers,
436
+ **kwargs
437
+ )
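A minimal end-to-end sketch of the factory above with an in-memory token dataset (shapes and vocabulary size are illustrative; any torch Dataset works):

import torch

data = torch.randint(0, 32000, (1024, 128))    # 1024 samples, seq_len 128
targets = torch.roll(data, shifts=-1, dims=1)  # next-token targets
dataset = OptimizedDataset(data, targets, cache_size=256, pin_memory=False)

loader = create_optimized_loader(dataset, batch_size=8, use_dynamic_batching=False)
for input_ids, target_ids in loader:
    pass                                       # one pass over the prefetched batches
print(loader.get_stats())
loader.stop()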
core/src/optimized_inference_server.py ADDED
@@ -0,0 +1,739 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Optimized OpenLLM Inference Server
4
+
5
+ This module provides an optimized inference server with:
6
+ - Model caching and memory management
7
+ - Request batching for improved throughput
8
+ - Response streaming for real-time generation
9
+ - Performance monitoring and metrics
10
+ - Load balancing and concurrent processing
11
+
12
+ Author: Louis Chua Bean Chong
13
+ License: GPLv3
14
+ """
15
+
16
+ import asyncio
17
+ import json
18
+ import time
19
+ import threading
20
+ from concurrent.futures import ThreadPoolExecutor, as_completed
21
+ from typing import Optional, List, Dict, Any, AsyncGenerator
22
+ from collections import deque
23
+ import torch
24
+ import torch.nn.functional as F
25
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
26
+ from fastapi.responses import StreamingResponse
27
+ from fastapi.middleware.cors import CORSMiddleware
28
+ from pydantic import BaseModel, Field
29
+ import uvicorn
30
+ import logging
31
+ import psutil
32
+ import os
33
+ import sys
34
+ from pathlib import Path
35
+
36
+ # Add current directory to path for imports
37
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
38
+
39
+ from model import GPTConfig, GPTModel
40
+ from quantization import QuantizedModel, quantize_model_dynamic
41
+
42
+
43
+ # Configure logging
44
+ logging.basicConfig(level=logging.INFO)
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
+ class OptimizedInferenceEngine:
49
+ """
50
+ Optimized inference engine with caching and batching.
51
+
52
+ This engine provides high-performance inference with:
53
+ - Model caching and memory management
54
+ - Request batching for improved throughput
55
+ - Quantization support for reduced memory usage
56
+ - Performance monitoring and metrics
57
+ """
58
+
59
+ def __init__(self,
60
+ model_path: str,
61
+ device: str = "auto",
62
+ use_quantization: bool = True,
63
+ cache_size: int = 1000,
64
+ max_batch_size: int = 32,
65
+ num_workers: int = 4):
66
+ """
67
+ Initialize optimized inference engine.
68
+
69
+ Args:
70
+ model_path: Path to the model
71
+ device: Device to use ("auto", "cpu", "cuda")
72
+ use_quantization: Whether to use quantization
73
+ cache_size: Size of response cache
74
+ max_batch_size: Maximum batch size for processing
75
+ num_workers: Number of worker threads
76
+ """
77
+ self.model_path = model_path
78
+ self.device = self._get_device(device)
79
+ self.use_quantization = use_quantization
80
+ self.cache_size = cache_size
81
+ self.max_batch_size = max_batch_size
82
+ self.num_workers = num_workers
83
+
84
+ # Initialize components
85
+ self.model = None
86
+ self.tokenizer = None
87
+ self.quantized_model = None
88
+ self.response_cache = {}
89
+ self.request_queue = deque()
90
+ self.processing_lock = threading.Lock()
91
+
92
+ # Performance metrics
93
+ self.metrics = {
94
+ "total_requests": 0,
95
+ "cache_hits": 0,
96
+ "cache_misses": 0,
97
+ "avg_generation_time": 0.0,
98
+ "total_generation_time": 0.0,
99
+ "requests_per_second": 0.0
100
+ }
101
+
102
+ # Thread pool for concurrent processing
103
+ self.executor = ThreadPoolExecutor(max_workers=num_workers)
104
+
105
+ # Load model
106
+ self._load_model()
107
+
108
+ logger.info(f"OptimizedInferenceEngine initialized on {self.device}")
109
+
110
+ def _get_device(self, device: str) -> torch.device:
111
+ """Get the appropriate device."""
112
+ if device == "auto":
113
+ if torch.cuda.is_available():
114
+ return torch.device("cuda")
115
+ else:
116
+ return torch.device("cpu")
117
+ else:
118
+ return torch.device(device)
119
+
120
+ def _load_model(self):
121
+ """Load and optimize the model."""
122
+ try:
123
+ logger.info(f"Loading model from {self.model_path}")
124
+
125
+ # Load model configuration
126
+ config_path = Path(self.model_path) / "config.json"
127
+ if config_path.exists():
128
+ with open(config_path, 'r') as f:
129
+ config_data = json.load(f)
130
+ config = GPTConfig(**config_data)
131
+ else:
132
+ # Use default config
133
+ config = GPTConfig.small()
134
+
135
+ # Create model
136
+ self.model = GPTModel(config, use_checkpoint=False) # No checkpointing for inference
137
+
138
+ # Load model weights
139
+ model_path = Path(self.model_path) / "pytorch_model.bin"
140
+ if model_path.exists():
141
+ self.model.load_state_dict(torch.load(model_path, map_location=self.device))
142
+ logger.info("Model weights loaded successfully")
143
+ else:
144
+ logger.warning("No model weights found, using initialized weights")
145
+
146
+ # Move model to device
147
+ self.model.to(self.device)
148
+ self.model.eval()
149
+
150
+ # Apply quantization if requested
151
+ if self.use_quantization and self.device.type == "cpu":
152
+ logger.info("Applying dynamic quantization")
153
+ self.quantized_model = QuantizedModel(self.model)
154
+ self.quantized_model.quantize_dynamic()
155
+ logger.info("Quantization completed")
156
+
157
+ # Load tokenizer
158
+ tokenizer_path = Path(self.model_path) / "tokenizer.model"
159
+ if tokenizer_path.exists():
160
+ import sentencepiece as spm
161
+ self.tokenizer = spm.SentencePieceProcessor()
162
+ self.tokenizer.load(str(tokenizer_path))
163
+ logger.info("Tokenizer loaded successfully")
164
+ else:
165
+ logger.warning("No tokenizer found")
166
+
167
+ logger.info("Model loading completed")
168
+
169
+ except Exception as e:
170
+ logger.error(f"Failed to load model: {e}")
171
+ raise
172
+
173
+ def _get_cache_key(self, prompt: str, **kwargs) -> str:
174
+ """Generate cache key for request."""
175
+ # Create a hash of the prompt and parameters
176
+ import hashlib
177
+ key_data = f"{prompt}_{kwargs}"
178
+ return hashlib.md5(key_data.encode()).hexdigest()
179
+
180
+ def _check_cache(self, cache_key: str) -> Optional[List[str]]:
181
+ """Check if response is cached."""
182
+ if cache_key in self.response_cache:
183
+ self.metrics["cache_hits"] += 1
184
+ return self.response_cache[cache_key]
185
+ else:
186
+ self.metrics["cache_misses"] += 1
187
+ return None
188
+
189
+ def _update_cache(self, cache_key: str, response: List[str]):
190
+ """Update response cache."""
191
+ if len(self.response_cache) >= self.cache_size:
192
+ # Remove oldest entry
193
+ oldest_key = next(iter(self.response_cache))
194
+ del self.response_cache[oldest_key]
195
+
196
+ self.response_cache[cache_key] = response
197
+
198
+ def _tokenize(self, text: str) -> torch.Tensor:
199
+ """Tokenize text using the loaded tokenizer."""
200
+ if self.tokenizer is None:
201
+ # Fallback to simple tokenization
202
+ return torch.tensor([ord(c) % 1000 for c in text], dtype=torch.long)
203
+
204
+ tokens = self.tokenizer.encode_as_ids(text)
205
+ return torch.tensor(tokens, dtype=torch.long)
206
+
207
+ def _detokenize(self, tokens: torch.Tensor) -> str:
208
+ """Detokenize tokens to text."""
209
+ if self.tokenizer is None:
210
+ # Fallback to simple detokenization
211
+ return ''.join([chr(t % 1000) for t in tokens.tolist()])
212
+
213
+ return self.tokenizer.decode(tokens.tolist())
214
+
215
+ def generate(self,
216
+ prompt: str,
217
+ max_length: int = 256,
218
+ temperature: float = 0.7,
219
+ top_k: Optional[int] = 40,
220
+ top_p: Optional[float] = 0.9,
221
+ num_return_sequences: int = 1,
222
+ stop_sequences: Optional[List[str]] = None) -> List[str]:
223
+ """
224
+ Generate text with optimizations.
225
+
226
+ Args:
227
+ prompt: Input prompt
228
+ max_length: Maximum generation length
229
+ temperature: Sampling temperature
230
+ top_k: Top-k sampling parameter
231
+ top_p: Nucleus sampling parameter
232
+ num_return_sequences: Number of sequences to generate
233
+ stop_sequences: Stop generation at these sequences
234
+
235
+ Returns:
236
+ List of generated texts
237
+ """
238
+ start_time = time.time()
239
+
240
+ # Check cache first
241
+ cache_key = self._get_cache_key(prompt, max_length=max_length,
242
+ temperature=temperature, top_k=top_k, top_p=top_p)
243
+ cached_response = self._check_cache(cache_key)
244
+ if cached_response:
245
+ return cached_response
246
+
247
+ # Tokenize input
248
+ input_tokens = self._tokenize(prompt)
249
+ input_tokens = input_tokens.unsqueeze(0).to(self.device) # Add batch dimension
250
+
251
+ # Generate text
252
+ with torch.no_grad():
253
+ if self.quantized_model and self.quantized_model.is_quantized:
254
+ # Use quantized model
255
+ generated_tokens = self.quantized_model.quantized_model.generate(
256
+ input_tokens,
257
+ max_new_tokens=max_length,
258
+ temperature=temperature,
259
+ top_k=top_k,
260
+ do_sample=True
261
+ )
262
+ else:
263
+ # Use regular model
264
+ generated_tokens = self.model.generate(
265
+ input_tokens,
266
+ max_new_tokens=max_length,
267
+ temperature=temperature,
268
+ top_k=top_k,
269
+ do_sample=True
270
+ )
271
+
272
+ # Detokenize. Only one sequence is sampled above, so when num_return_sequences > 1
+ # the same text is returned for each slot.
273
+ generated_texts = []
274
+ for i in range(num_return_sequences):
275
+ # Extract generated part (remove input)
276
+ generated_part = generated_tokens[0, len(input_tokens[0]):]
277
+ text = self._detokenize(generated_part)
278
+
279
+ # Apply stop sequences
280
+ if stop_sequences:
281
+ for stop_seq in stop_sequences:
282
+ if stop_seq in text:
283
+ text = text[:text.find(stop_seq)]
284
+ break
285
+
286
+ generated_texts.append(text)
287
+
288
+ # Update cache
289
+ self._update_cache(cache_key, generated_texts)
290
+
291
+ # Update metrics
292
+ generation_time = time.time() - start_time
293
+ self.metrics["total_requests"] += 1
294
+ self.metrics["total_generation_time"] += generation_time
295
+ self.metrics["avg_generation_time"] = (
296
+ self.metrics["total_generation_time"] / self.metrics["total_requests"]
297
+ )
298
+
299
+ return generated_texts
300
+
301
+ async def generate_async(self,
302
+ prompt: str,
303
+ max_length: int = 256,
304
+ temperature: float = 0.7,
305
+ top_k: Optional[int] = 40,
306
+ top_p: Optional[float] = 0.9,
307
+ num_return_sequences: int = 1,
308
+ stop_sequences: Optional[List[str]] = None) -> List[str]:
309
+ """
310
+ Asynchronous text generation.
311
+
312
+ Args:
313
+ Same as generate()
314
+
315
+ Returns:
316
+ List of generated texts
317
+ """
318
+ # Run generation in thread pool
319
+ loop = asyncio.get_event_loop()
320
+ return await loop.run_in_executor(
321
+ self.executor,
322
+ self.generate,
323
+ prompt, max_length, temperature, top_k, top_p,
324
+ num_return_sequences, stop_sequences
325
+ )
326
+
327
+ async def generate_stream(self,
328
+ prompt: str,
329
+ max_length: int = 256,
330
+ temperature: float = 0.7,
331
+ top_k: Optional[int] = 40,
332
+ top_p: Optional[float] = 0.9,
333
+ stop_sequences: Optional[List[str]] = None) -> AsyncGenerator[str, None]:
334
+ """
335
+ Stream generated text token by token.
336
+
337
+ Args:
338
+ Same as generate()
339
+
340
+ Yields:
341
+ Generated text tokens
342
+ """
343
+ # Tokenize input
344
+ input_tokens = self._tokenize(prompt)
345
+ input_tokens = input_tokens.unsqueeze(0).to(self.device)
346
+
347
+ # Generate tokens one by one
348
+ current_tokens = input_tokens.clone()
349
+
350
+ with torch.no_grad():
351
+ for _ in range(max_length):
352
+ # Get next token
353
+ if self.quantized_model and self.quantized_model.is_quantized:
354
+ logits = self.quantized_model.quantized_model(current_tokens)
355
+ else:
356
+ logits = self.model(current_tokens)
357
+
358
+ # Sample next token
359
+ logits = logits[:, -1, :] / temperature
360
+ if top_k is not None:
361
+ v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
362
+ logits[logits < v[:, [-1]]] = -float("inf")
363
+
364
+ probs = F.softmax(logits, dim=-1)
365
+ next_token = torch.multinomial(probs, num_samples=1)
366
+
367
+ # Add to sequence
368
+ current_tokens = torch.cat([current_tokens, next_token], dim=1)
369
+
370
+ # Convert token to text
371
+ token_text = self._detokenize(next_token[0])
372
+ yield token_text
373
+
374
+ # Check for stop sequences
375
+ if stop_sequences:
376
+ full_text = self._detokenize(current_tokens[0, len(input_tokens[0]):])
377
+ for stop_seq in stop_sequences:
378
+ if stop_seq in full_text:
379
+ return
380
+
381
+ def get_metrics(self) -> Dict[str, Any]:
382
+ """Get performance metrics."""
383
+ memory_usage = psutil.virtual_memory().percent
384
+
385
+ return {
386
+ **self.metrics,
387
+ "memory_usage_percent": memory_usage,
388
+ "cache_size": len(self.response_cache),
389
+ "max_cache_size": self.cache_size,
390
+ "cache_hit_rate": (
391
+ self.metrics["cache_hits"] /
392
+ (self.metrics["cache_hits"] + self.metrics["cache_misses"])
393
+ if (self.metrics["cache_hits"] + self.metrics["cache_misses"]) > 0 else 0
394
+ ),
395
+ "device": str(self.device),
396
+ "quantization_enabled": self.quantized_model is not None
397
+ }
398
+
399
+ def cleanup(self):
400
+ """Clean up resources."""
401
+ if self.executor:
402
+ self.executor.shutdown(wait=True)
403
+
404
+ # Clear cache
405
+ self.response_cache.clear()
406
+
407
+ logger.info("Inference engine cleaned up")
408
+
409
+
410
+ # Request/Response models
411
+ class GenerationRequest(BaseModel):
412
+ """Request model for text generation."""
413
+ prompt: str = Field(..., description="Input text prompt")
414
+ max_length: int = Field(256, description="Maximum generation length", ge=1, le=2048)
415
+ temperature: float = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
416
+ top_k: Optional[int] = Field(40, description="Top-k sampling parameter", ge=1, le=1000)
417
+ top_p: Optional[float] = Field(0.9, description="Nucleus sampling parameter", ge=0.1, le=1.0)
418
+ num_return_sequences: int = Field(1, description="Number of sequences to generate", ge=1, le=5)
419
+ stop_sequences: Optional[List[str]] = Field(None, description="Stop generation at these sequences")
420
+
421
+
422
+ class GenerationResponse(BaseModel):
423
+ """Response model for text generation."""
424
+ generated_text: List[str]
425
+ prompt: str
426
+ generation_time: float
427
+ parameters: Dict[str, Any]
428
+
429
+
430
+ class BatchGenerationRequest(BaseModel):
431
+ """Request model for batch text generation."""
432
+ prompts: List[str] = Field(..., description="List of input prompts")
433
+ max_length: int = Field(256, description="Maximum generation length", ge=1, le=2048)
434
+ temperature: float = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
435
+ top_k: Optional[int] = Field(40, description="Top-k sampling parameter", ge=1, le=1000)
436
+ top_p: Optional[float] = Field(0.9, description="Nucleus sampling parameter", ge=0.1, le=1.0)
437
+ stop_sequences: Optional[List[str]] = Field(None, description="Stop generation at these sequences")
438
+
439
+
440
+ class BatchGenerationResponse(BaseModel):
441
+ """Response model for batch text generation."""
442
+ generated_texts: List[List[str]]
443
+ prompts: List[str]
444
+ generation_time: float
445
+ parameters: Dict[str, Any]
446
+
447
+
448
+ # Global inference engine
449
+ inference_engine: Optional[OptimizedInferenceEngine] = None
450
+
451
+ # FastAPI app
452
+ app = FastAPI(
453
+ title="Optimized OpenLLM Inference API",
454
+ description="High-performance REST API for OpenLLM text generation",
455
+ version="0.1.0",
456
+ docs_url="/docs",
457
+ redoc_url="/redoc",
458
+ )
459
+
460
+ # CORS middleware
461
+ app.add_middleware(
462
+ CORSMiddleware,
463
+ allow_origins=["*"],
464
+ allow_credentials=True,
465
+ allow_methods=["*"],
466
+ allow_headers=["*"],
467
+ )
468
+
469
+
470
+ @app.on_event("startup")
471
+ async def startup_event():
472
+ """Initialize inference engine on startup."""
473
+ logger.info("πŸš€ Starting Optimized OpenLLM Inference Server...")
474
+ global inference_engine
475
+ if inference_engine is None:
476
+ logger.warning("No model loaded - server will return 503 for generation requests")
477
+
478
+
479
+ @app.on_event("shutdown")
480
+ async def shutdown_event():
481
+ """Clean up resources on shutdown."""
482
+ global inference_engine
483
+ if inference_engine:
484
+ inference_engine.cleanup()
485
+ logger.info("Server shutdown complete")
486
+
487
+
488
+ @app.post("/generate", response_model=GenerationResponse)
489
+ async def generate_text(request: GenerationRequest):
490
+ """Generate text from prompt with optimizations."""
491
+ if inference_engine is None:
492
+ raise HTTPException(status_code=503, detail="Model not loaded")
493
+
494
+ start_time = time.time()
495
+
496
+ try:
497
+ # Generate text asynchronously
498
+ generated_texts = await inference_engine.generate_async(
499
+ prompt=request.prompt,
500
+ max_length=request.max_length,
501
+ temperature=request.temperature,
502
+ top_k=request.top_k,
503
+ top_p=request.top_p,
504
+ num_return_sequences=request.num_return_sequences,
505
+ stop_sequences=request.stop_sequences,
506
+ )
507
+
508
+ generation_time = time.time() - start_time
509
+
510
+ return GenerationResponse(
511
+ generated_text=generated_texts,
512
+ prompt=request.prompt,
513
+ generation_time=generation_time,
514
+ parameters={
515
+ "max_length": request.max_length,
516
+ "temperature": request.temperature,
517
+ "top_k": request.top_k,
518
+ "top_p": request.top_p,
519
+ "num_return_sequences": request.num_return_sequences,
520
+ },
521
+ )
522
+
523
+ except Exception as e:
524
+ logger.error(f"Generation failed: {e}")
525
+ raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
526
+
527
+
528
+ @app.post("/generate/stream")
529
+ async def generate_text_stream(request: GenerationRequest):
530
+ """Generate text with streaming response."""
531
+ if inference_engine is None:
532
+ raise HTTPException(status_code=503, detail="Model not loaded")
533
+
534
+ async def generate_stream():
535
+ try:
536
+ async for token in inference_engine.generate_stream(
537
+ prompt=request.prompt,
538
+ max_length=request.max_length,
539
+ temperature=request.temperature,
540
+ top_k=request.top_k,
541
+ top_p=request.top_p,
542
+ stop_sequences=request.stop_sequences,
543
+ ):
544
+ yield f"data: {json.dumps({'token': token})}\n\n"
545
+
546
+ yield f"data: {json.dumps({'done': True})}\n\n"
547
+
548
+ except Exception as e:
549
+ logger.error(f"Streaming generation failed: {e}")
550
+ yield f"data: {json.dumps({'error': str(e)})}\n\n"
551
+
552
+ return StreamingResponse(
553
+ generate_stream(),
554
+ media_type="text/plain",
555
+ headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
556
+ )
557
+
558
+
559
+ @app.post("/generate/batch", response_model=BatchGenerationResponse)
560
+ async def generate_text_batch(request: BatchGenerationRequest):
561
+ """Generate text for multiple prompts in batch."""
562
+ if inference_engine is None:
563
+ raise HTTPException(status_code=503, detail="Model not loaded")
564
+
565
+ start_time = time.time()
566
+
567
+ try:
568
+ # Process prompts in parallel
569
+ tasks = []
570
+ for prompt in request.prompts:
571
+ task = inference_engine.generate_async(
572
+ prompt=prompt,
573
+ max_length=request.max_length,
574
+ temperature=request.temperature,
575
+ top_k=request.top_k,
576
+ top_p=request.top_p,
577
+ num_return_sequences=1,
578
+ stop_sequences=request.stop_sequences,
579
+ )
580
+ tasks.append(task)
581
+
582
+ # Wait for all tasks to complete
583
+ generated_texts = await asyncio.gather(*tasks)
584
+
585
+ generation_time = time.time() - start_time
586
+
587
+ return BatchGenerationResponse(
588
+ generated_texts=generated_texts,
589
+ prompts=request.prompts,
590
+ generation_time=generation_time,
591
+ parameters={
592
+ "max_length": request.max_length,
593
+ "temperature": request.temperature,
594
+ "top_k": request.top_k,
595
+ "top_p": request.top_p,
596
+ "num_prompts": len(request.prompts),
597
+ },
598
+ )
599
+
600
+ except Exception as e:
601
+ logger.error(f"Batch generation failed: {e}")
602
+ raise HTTPException(status_code=500, detail=f"Batch generation failed: {str(e)}")
603
+
604
+
605
+ @app.get("/health")
606
+ async def health_check():
607
+ """Health check endpoint."""
608
+ global inference_engine
609
+
610
+ if inference_engine is None:
611
+ return {"status": "unhealthy", "message": "Model not loaded"}
612
+
613
+ try:
614
+ # Quick generation test
615
+ test_result = await inference_engine.generate_async(
616
+ prompt="Hello",
617
+ max_length=5,
618
+ temperature=0.7
619
+ )
620
+
621
+ return {
622
+ "status": "healthy",
623
+ "model_loaded": True,
624
+ "test_generation": len(test_result) > 0
625
+ }
626
+
627
+ except Exception as e:
628
+ return {
629
+ "status": "unhealthy",
630
+ "message": f"Generation test failed: {str(e)}"
631
+ }
632
+
633
+
634
+ @app.get("/metrics")
635
+ async def get_metrics():
636
+ """Get performance metrics."""
637
+ global inference_engine
638
+
639
+ if inference_engine is None:
640
+ return {"error": "Model not loaded"}
641
+
642
+ return inference_engine.get_metrics()
643
+
644
+
645
+ @app.get("/info")
646
+ async def get_model_info():
647
+ """Get model information."""
648
+ global inference_engine
649
+
650
+ if inference_engine is None:
651
+ return {"error": "Model not loaded"}
652
+
653
+ model = inference_engine.model
654
+ if model is None:
655
+ return {"error": "Model not available"}
656
+
657
+ return {
658
+ "model_name": model.config.model_name,
659
+ "vocab_size": model.config.vocab_size,
660
+ "n_layer": model.config.n_layer,
661
+ "n_head": model.config.n_head,
662
+ "n_embd": model.config.n_embd,
663
+ "block_size": model.config.block_size,
664
+ "parameters": model.get_num_params(),
665
+ "device": str(inference_engine.device),
666
+ "quantization_enabled": inference_engine.quantized_model is not None,
667
+ "cache_size": len(inference_engine.response_cache),
668
+ "max_cache_size": inference_engine.cache_size,
669
+ }
670
+
671
+
672
+ def create_optimized_server(model_path: str,
673
+ host: str = "0.0.0.0",
674
+ port: int = 8000,
675
+ device: str = "auto",
676
+ use_quantization: bool = True,
677
+ cache_size: int = 1000,
678
+ max_batch_size: int = 32,
679
+ num_workers: int = 4) -> FastAPI:
680
+ """
681
+ Create an optimized inference server.
682
+
683
+ Args:
684
+ model_path: Path to the model
685
+ host: Server host
686
+ port: Server port
687
+ device: Device to use
688
+ use_quantization: Whether to use quantization
689
+ cache_size: Size of response cache
690
+ max_batch_size: Maximum batch size
691
+ num_workers: Number of worker threads
692
+
693
+ Returns:
694
+ FastAPI app instance
695
+ """
696
+ global inference_engine
697
+
698
+ # Initialize inference engine
699
+ inference_engine = OptimizedInferenceEngine(
700
+ model_path=model_path,
701
+ device=device,
702
+ use_quantization=use_quantization,
703
+ cache_size=cache_size,
704
+ max_batch_size=max_batch_size,
705
+ num_workers=num_workers
706
+ )
707
+
708
+ return app
709
+
710
+
711
+ if __name__ == "__main__":
712
+ import argparse
713
+
714
+ parser = argparse.ArgumentParser(description="Optimized OpenLLM Inference Server")
715
+ parser.add_argument("--model_path", type=str, required=True, help="Path to model")
716
+ parser.add_argument("--host", type=str, default="0.0.0.0", help="Server host")
717
+ parser.add_argument("--port", type=int, default=8000, help="Server port")
718
+ parser.add_argument("--device", type=str, default="auto", help="Device to use")
719
+ parser.add_argument("--use_quantization", action="store_true", help="Use quantization")
720
+ parser.add_argument("--cache_size", type=int, default=1000, help="Cache size")
721
+ parser.add_argument("--max_batch_size", type=int, default=32, help="Max batch size")
722
+ parser.add_argument("--num_workers", type=int, default=4, help="Number of workers")
723
+
724
+ args = parser.parse_args()
725
+
726
+ # Create server
727
+ app = create_optimized_server(
728
+ model_path=args.model_path,
729
+ host=args.host,
730
+ port=args.port,
731
+ device=args.device,
732
+ use_quantization=args.use_quantization,
733
+ cache_size=args.cache_size,
734
+ max_batch_size=args.max_batch_size,
735
+ num_workers=args.num_workers
736
+ )
737
+
738
+ # Run server
739
+ uvicorn.run(app, host=args.host, port=args.port)
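For local testing the server can be launched with, for example, python core/src/optimized_inference_server.py --model_path exports/my_model --port 8000 --use_quantization, where the model path (an illustration here) points at a directory containing the config, weights and tokenizer files loaded above.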
core/src/performance_monitor.py ADDED
@@ -0,0 +1,543 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Performance Monitoring and Profiling
4
+
5
+ This module provides comprehensive performance monitoring and profiling
6
+ capabilities for the OpenLLM project, including system resources,
7
+ model performance, and optimization recommendations.
8
+
9
+ Author: Louis Chua Bean Chong
10
+ License: GPLv3
11
+ """
12
+
13
+ import time
14
+ import psutil
15
+ import torch
16
+ import threading
17
+ from typing import Dict, List, Any, Optional, Callable
18
+ from dataclasses import dataclass, field
19
+ from collections import deque
20
+ import json
21
+ import logging
22
+ from pathlib import Path
23
+ import numpy as np
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ @dataclass
29
+ class SystemMetrics:
30
+ """System resource metrics."""
31
+ cpu_percent: float
32
+ memory_percent: float
33
+ memory_available_gb: float
34
+ disk_usage_percent: float
35
+ network_io: Dict[str, float]
36
+ gpu_utilization: Optional[float] = None
37
+ gpu_memory_percent: Optional[float] = None
38
+ timestamp: float = field(default_factory=time.time)
39
+
40
+
41
+ @dataclass
42
+ class ModelMetrics:
43
+ """Model performance metrics."""
44
+ inference_time_ms: float
45
+ tokens_per_second: float
46
+ memory_usage_mb: float
47
+ batch_size: int
48
+ sequence_length: int
49
+ model_parameters: int
50
+ timestamp: float = field(default_factory=time.time)
51
+
52
+
53
+ @dataclass
54
+ class TrainingMetrics:
55
+ """Training performance metrics."""
56
+ loss: float
57
+ learning_rate: float
58
+ gradient_norm: float
59
+ training_time_ms: float
60
+ samples_per_second: float
61
+ memory_usage_mb: float
62
+ epoch: int
63
+ step: int
64
+ timestamp: float = field(default_factory=time.time)
65
+
66
+
67
+ class PerformanceProfiler:
68
+ """
69
+ Performance profiler for monitoring and optimizing system performance.
70
+
71
+ This profiler tracks system resources, model performance, and training metrics
72
+ to provide insights and optimization recommendations.
73
+ """
74
+
75
+ def __init__(self,
76
+ history_size: int = 1000,
77
+ monitoring_interval: float = 1.0,
78
+ enable_gpu_monitoring: bool = True):
79
+ """
80
+ Initialize performance profiler.
81
+
82
+ Args:
83
+ history_size: Number of metrics to keep in history
84
+ monitoring_interval: Interval between system checks (seconds)
85
+ enable_gpu_monitoring: Whether to monitor GPU usage
86
+ """
87
+ self.history_size = history_size
88
+ self.monitoring_interval = monitoring_interval
89
+ self.enable_gpu_monitoring = enable_gpu_monitoring
90
+
91
+ # Metrics storage
92
+ self.system_metrics = deque(maxlen=history_size)
93
+ self.model_metrics = deque(maxlen=history_size)
94
+ self.training_metrics = deque(maxlen=history_size)
95
+
96
+ # Monitoring state
97
+ self.monitoring_active = False
98
+ self.monitoring_thread = None
99
+
100
+ # Performance counters
101
+ self.total_inference_requests = 0
102
+ self.total_training_steps = 0
103
+ self.start_time = time.time()
104
+
105
+ # Optimization recommendations
106
+ self.recommendations = []
107
+
108
+ logger.info("PerformanceProfiler initialized")
109
+
110
+ def start_monitoring(self):
111
+ """Start continuous system monitoring."""
112
+ if self.monitoring_active:
113
+ logger.warning("Monitoring already active")
114
+ return
115
+
116
+ self.monitoring_active = True
117
+ self.monitoring_thread = threading.Thread(target=self._monitoring_loop, daemon=True)
118
+ self.monitoring_thread.start()
119
+ logger.info("System monitoring started")
120
+
121
+ def stop_monitoring(self):
122
+ """Stop continuous system monitoring."""
123
+ self.monitoring_active = False
124
+ if self.monitoring_thread:
125
+ self.monitoring_thread.join()
126
+ logger.info("System monitoring stopped")
127
+
128
+ def _monitoring_loop(self):
129
+ """Main monitoring loop."""
130
+ while self.monitoring_active:
131
+ try:
132
+ metrics = self._collect_system_metrics()
133
+ self.system_metrics.append(metrics)
134
+
135
+ # Check for performance issues
136
+ self._check_performance_issues(metrics)
137
+
138
+ time.sleep(self.monitoring_interval)
139
+ except Exception as e:
140
+ logger.error(f"Monitoring error: {e}")
141
+ time.sleep(self.monitoring_interval)
142
+
143
+ def _collect_system_metrics(self) -> SystemMetrics:
144
+ """Collect current system metrics."""
145
+ # CPU and memory
146
+ cpu_percent = psutil.cpu_percent(interval=0.1)
147
+ memory = psutil.virtual_memory()
148
+ memory_percent = memory.percent
149
+ memory_available_gb = memory.available / (1024**3)
150
+
151
+ # Disk usage
152
+ disk_usage = psutil.disk_usage('/')
153
+ disk_usage_percent = disk_usage.percent
154
+
155
+ # Network I/O
156
+ network_io = psutil.net_io_counters()
157
+ network_metrics = {
158
+ 'bytes_sent': network_io.bytes_sent,
159
+ 'bytes_recv': network_io.bytes_recv,
160
+ 'packets_sent': network_io.packets_sent,
161
+ 'packets_recv': network_io.packets_recv
162
+ }
163
+
164
+ # GPU metrics (if available)
165
+ gpu_utilization = None
166
+ gpu_memory_percent = None
167
+
168
+ if self.enable_gpu_monitoring and torch.cuda.is_available():
169
+ try:
170
+ gpu_utilization = torch.cuda.utilization()
171
+ gpu_memory = torch.cuda.memory_stats()
172
+ # Share of PyTorch's reserved (cached) pool that is currently allocated,
+ # not a fraction of total device memory.
+ gpu_memory_percent = (
173
+ gpu_memory['allocated_bytes.all.current'] /
174
+ gpu_memory['reserved_bytes.all.current']
175
+ ) * 100 if gpu_memory['reserved_bytes.all.current'] > 0 else 0
176
+ except Exception as e:
177
+ logger.debug(f"GPU monitoring error: {e}")
178
+
179
+ return SystemMetrics(
180
+ cpu_percent=cpu_percent,
181
+ memory_percent=memory_percent,
182
+ memory_available_gb=memory_available_gb,
183
+ disk_usage_percent=disk_usage_percent,
184
+ network_io=network_metrics,
185
+ gpu_utilization=gpu_utilization,
186
+ gpu_memory_percent=gpu_memory_percent
187
+ )
188
+
189
+ def _check_performance_issues(self, metrics: SystemMetrics):
190
+ """Check for performance issues and generate recommendations."""
191
+ recommendations = []
192
+
193
+ # Memory usage check
194
+ if metrics.memory_percent > 90:
195
+ recommendations.append({
196
+ 'type': 'memory_high',
197
+ 'severity': 'high',
198
+ 'message': f'Memory usage is very high ({metrics.memory_percent:.1f}%)',
199
+ 'suggestion': 'Consider reducing batch size or using gradient checkpointing'
200
+ })
201
+ elif metrics.memory_percent > 80:
202
+ recommendations.append({
203
+ 'type': 'memory_high',
204
+ 'severity': 'medium',
205
+ 'message': f'Memory usage is high ({metrics.memory_percent:.1f}%)',
206
+ 'suggestion': 'Monitor memory usage and consider optimization'
207
+ })
208
+
209
+ # CPU usage check
210
+ if metrics.cpu_percent > 95:
211
+ recommendations.append({
212
+ 'type': 'cpu_high',
213
+ 'severity': 'high',
214
+ 'message': f'CPU usage is very high ({metrics.cpu_percent:.1f}%)',
215
+ 'suggestion': 'Consider reducing number of workers or using GPU'
216
+ })
217
+
218
+ # GPU usage check
219
+ if metrics.gpu_utilization is not None:
220
+ if metrics.gpu_utilization < 50:
221
+ recommendations.append({
222
+ 'type': 'gpu_underutilized',
223
+ 'severity': 'low',
224
+ 'message': f'GPU utilization is low ({metrics.gpu_utilization:.1f}%)',
225
+ 'suggestion': 'Consider increasing batch size or using mixed precision'
226
+ })
227
+ elif metrics.gpu_memory_percent and metrics.gpu_memory_percent > 90:
228
+ recommendations.append({
229
+ 'type': 'gpu_memory_high',
230
+ 'severity': 'high',
231
+ 'message': f'GPU memory usage is very high ({metrics.gpu_memory_percent:.1f}%)',
232
+ 'suggestion': 'Consider reducing batch size or using gradient checkpointing'
233
+ })
234
+
235
+ # Add recommendations to history
236
+ for rec in recommendations:
237
+ rec['timestamp'] = time.time()
238
+ self.recommendations.append(rec)
239
+
240
+ # Keep only recent recommendations
241
+ if len(self.recommendations) > 100:
242
+ self.recommendations = self.recommendations[-100:]
243
+
244
+ def record_inference(self,
245
+ inference_time_ms: float,
246
+ tokens_generated: int,
247
+ memory_usage_mb: float,
248
+ batch_size: int,
249
+ sequence_length: int,
250
+ model_parameters: int):
251
+ """Record inference performance metrics."""
252
+ tokens_per_second = (tokens_generated / (inference_time_ms / 1000)) if inference_time_ms > 0 else 0
253
+
254
+ metrics = ModelMetrics(
255
+ inference_time_ms=inference_time_ms,
256
+ tokens_per_second=tokens_per_second,
257
+ memory_usage_mb=memory_usage_mb,
258
+ batch_size=batch_size,
259
+ sequence_length=sequence_length,
260
+ model_parameters=model_parameters
261
+ )
262
+
263
+ self.model_metrics.append(metrics)
264
+ self.total_inference_requests += 1
265
+
266
+ def record_training(self,
267
+ loss: float,
268
+ learning_rate: float,
269
+ gradient_norm: float,
270
+ training_time_ms: float,
271
+ samples_processed: int,
272
+ memory_usage_mb: float,
273
+ epoch: int,
274
+ step: int):
275
+ """Record training performance metrics."""
276
+ samples_per_second = (samples_processed / (training_time_ms / 1000)) if training_time_ms > 0 else 0
277
+
278
+ metrics = TrainingMetrics(
279
+ loss=loss,
280
+ learning_rate=learning_rate,
281
+ gradient_norm=gradient_norm,
282
+ training_time_ms=training_time_ms,
283
+ samples_per_second=samples_per_second,
284
+ memory_usage_mb=memory_usage_mb,
285
+ epoch=epoch,
286
+ step=step
287
+ )
288
+
289
+ self.training_metrics.append(metrics)
290
+ self.total_training_steps += 1
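A minimal usage sketch of the profiler (the recorded numbers are illustrative):

profiler = PerformanceProfiler(monitoring_interval=2.0)
profiler.start_monitoring()

profiler.record_inference(
    inference_time_ms=120.0, tokens_generated=32, memory_usage_mb=850.0,
    batch_size=1, sequence_length=128, model_parameters=35_000_000,
)

print(profiler.get_model_summary()["inference"]["avg_tokens_per_second"])
profiler.stop_monitoring()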
291
+
292
+ def get_system_summary(self) -> Dict[str, Any]:
293
+ """Get system performance summary."""
294
+ if not self.system_metrics:
295
+ return {"error": "No system metrics available"}
296
+
297
+ recent_metrics = list(self.system_metrics)[-100:] # Last 100 measurements
298
+
299
+ cpu_values = [m.cpu_percent for m in recent_metrics]
300
+ memory_values = [m.memory_percent for m in recent_metrics]
301
+
302
+ return {
303
+ "cpu": {
304
+ "current": cpu_values[-1] if cpu_values else 0,
305
+ "average": np.mean(cpu_values) if cpu_values else 0,
306
+ "max": np.max(cpu_values) if cpu_values else 0,
307
+ "min": np.min(cpu_values) if cpu_values else 0
308
+ },
309
+ "memory": {
310
+ "current_percent": memory_values[-1] if memory_values else 0,
311
+ "average_percent": np.mean(memory_values) if memory_values else 0,
312
+ "available_gb": recent_metrics[-1].memory_available_gb if recent_metrics else 0
313
+ },
314
+ "gpu": {
315
+ "utilization": recent_metrics[-1].gpu_utilization if recent_metrics else None,
316
+ "memory_percent": recent_metrics[-1].gpu_memory_percent if recent_metrics else None
317
+ },
318
+ "uptime_hours": (time.time() - self.start_time) / 3600
319
+ }
320
+
321
+ def get_model_summary(self) -> Dict[str, Any]:
322
+ """Get model performance summary."""
323
+ if not self.model_metrics:
324
+ return {"error": "No model metrics available"}
325
+
326
+ recent_metrics = list(self.model_metrics)[-100:] # Last 100 measurements
327
+
328
+ inference_times = [m.inference_time_ms for m in recent_metrics]
329
+ tokens_per_sec = [m.tokens_per_second for m in recent_metrics]
330
+ memory_usage = [m.memory_usage_mb for m in recent_metrics]
331
+
332
+ return {
333
+ "inference": {
334
+ "avg_time_ms": np.mean(inference_times) if inference_times else 0,
335
+ "min_time_ms": np.min(inference_times) if inference_times else 0,
336
+ "max_time_ms": np.max(inference_times) if inference_times else 0,
337
+ "avg_tokens_per_second": np.mean(tokens_per_sec) if tokens_per_sec else 0
338
+ },
339
+ "memory": {
340
+ "avg_usage_mb": np.mean(memory_usage) if memory_usage else 0,
341
+ "max_usage_mb": np.max(memory_usage) if memory_usage else 0
342
+ },
343
+ "total_requests": self.total_inference_requests,
344
+ "recent_requests": len(recent_metrics)
345
+ }
346
+
347
+ def get_training_summary(self) -> Dict[str, Any]:
348
+ """Get training performance summary."""
349
+ if not self.training_metrics:
350
+ return {"error": "No training metrics available"}
351
+
352
+ recent_metrics = list(self.training_metrics)[-100:] # Last 100 measurements
353
+
354
+ losses = [m.loss for m in recent_metrics]
355
+ samples_per_sec = [m.samples_per_second for m in recent_metrics]
356
+ memory_usage = [m.memory_usage_mb for m in recent_metrics]
357
+
358
+ return {
359
+ "loss": {
360
+ "current": losses[-1] if losses else 0,
361
+ "average": np.mean(losses) if losses else 0,
362
+ "min": np.min(losses) if losses else 0,
363
+ "trend": "decreasing" if len(losses) > 1 and losses[-1] < losses[0] else "increasing"
364
+ },
365
+ "performance": {
366
+ "avg_samples_per_second": np.mean(samples_per_sec) if samples_per_sec else 0,
367
+ "avg_memory_usage_mb": np.mean(memory_usage) if memory_usage else 0
368
+ },
369
+ "total_steps": self.total_training_steps,
370
+ "recent_steps": len(recent_metrics),
371
+ "current_epoch": recent_metrics[-1].epoch if recent_metrics else 0
372
+ }
373
+
374
+ def get_recommendations(self) -> List[Dict[str, Any]]:
375
+ """Get current optimization recommendations."""
376
+ return self.recommendations[-10:] # Return last 10 recommendations
377
+
378
+ def generate_optimization_report(self) -> Dict[str, Any]:
379
+ """Generate comprehensive optimization report."""
380
+ system_summary = self.get_system_summary()
381
+ model_summary = self.get_model_summary()
382
+ training_summary = self.get_training_summary()
383
+ recommendations = self.get_recommendations()
384
+
385
+ # Calculate overall performance score
386
+ performance_score = self._calculate_performance_score(
387
+ system_summary, model_summary, training_summary
388
+ )
389
+
390
+ return {
391
+ "timestamp": time.time(),
392
+ "performance_score": performance_score,
393
+ "system_summary": system_summary,
394
+ "model_summary": model_summary,
395
+ "training_summary": training_summary,
396
+ "recommendations": recommendations,
397
+ "optimization_priority": self._get_optimization_priority(recommendations)
398
+ }
399
+
400
+ def _calculate_performance_score(self,
401
+ system_summary: Dict,
402
+ model_summary: Dict,
403
+ training_summary: Dict) -> float:
404
+ """Calculate overall performance score (0-100)."""
405
+ score = 100.0
406
+
407
+ # Deduct points for system issues
408
+ if "cpu" in system_summary:
409
+ cpu_avg = system_summary["cpu"]["average"]
410
+ if cpu_avg > 90:
411
+ score -= 20
412
+ elif cpu_avg > 80:
413
+ score -= 10
414
+ elif cpu_avg > 70:
415
+ score -= 5
416
+
417
+ if "memory" in system_summary:
418
+ memory_avg = system_summary["memory"]["average_percent"]
419
+ if memory_avg > 90:
420
+ score -= 20
421
+ elif memory_avg > 80:
422
+ score -= 10
423
+ elif memory_avg > 70:
424
+ score -= 5
425
+
426
+ # Deduct points for model performance issues
427
+ if "inference" in model_summary:
428
+ avg_time = model_summary["inference"]["avg_time_ms"]
429
+ if avg_time > 1000: # More than 1 second
430
+ score -= 15
431
+ elif avg_time > 500: # More than 500ms
432
+ score -= 10
433
+ elif avg_time > 100: # More than 100ms
434
+ score -= 5
435
+
436
+ return max(0, score)
437
+
438
+ def _get_optimization_priority(self, recommendations: List[Dict]) -> str:
439
+ """Get optimization priority based on recommendations."""
440
+ high_priority = sum(1 for r in recommendations if r.get('severity') == 'high')
441
+ medium_priority = sum(1 for r in recommendations if r.get('severity') == 'medium')
442
+
443
+ if high_priority > 0:
444
+ return "high"
445
+ elif medium_priority > 2:
446
+ return "medium"
447
+ else:
448
+ return "low"
449
+
450
+ def save_metrics(self, filepath: str):
451
+ """Save metrics to file."""
452
+ try:
453
+ data = {
454
+ "system_metrics": [self._metric_to_dict(m) for m in self.system_metrics],
455
+ "model_metrics": [self._metric_to_dict(m) for m in self.model_metrics],
456
+ "training_metrics": [self._metric_to_dict(m) for m in self.training_metrics],
457
+ "recommendations": self.recommendations,
458
+ "summary": {
459
+ "total_inference_requests": self.total_inference_requests,
460
+ "total_training_steps": self.total_training_steps,
461
+ "uptime_hours": (time.time() - self.start_time) / 3600
462
+ }
463
+ }
464
+
465
+ with open(filepath, 'w') as f:
466
+ json.dump(data, f, indent=2, default=str)
467
+
468
+ logger.info(f"Metrics saved to {filepath}")
469
+
470
+ except Exception as e:
471
+ logger.error(f"Failed to save metrics: {e}")
472
+
473
+ def _metric_to_dict(self, metric) -> Dict:
474
+ """Convert metric object to dictionary."""
475
+ return {k: v for k, v in metric.__dict__.items() if not k.startswith('_')}
476
+
477
+ def load_metrics(self, filepath: str):
478
+ """Load metrics from file."""
479
+ try:
480
+ with open(filepath, 'r') as f:
481
+ data = json.load(f)
482
+
483
+ # Reconstruct metrics objects
484
+ self.system_metrics = deque(
485
+ [SystemMetrics(**m) for m in data.get("system_metrics", [])],
486
+ maxlen=self.history_size
487
+ )
488
+ self.model_metrics = deque(
489
+ [ModelMetrics(**m) for m in data.get("model_metrics", [])],
490
+ maxlen=self.history_size
491
+ )
492
+ self.training_metrics = deque(
493
+ [TrainingMetrics(**m) for m in data.get("training_metrics", [])],
494
+ maxlen=self.history_size
495
+ )
496
+ self.recommendations = data.get("recommendations", [])
497
+
498
+ logger.info(f"Metrics loaded from {filepath}")
499
+
500
+ except Exception as e:
501
+ logger.error(f"Failed to load metrics: {e}")
502
+
503
+
504
+ # Global profiler instance
505
+ _global_profiler: Optional[PerformanceProfiler] = None
506
+
507
+
508
+ def get_profiler() -> PerformanceProfiler:
509
+ """Get global profiler instance."""
510
+ global _global_profiler
511
+ if _global_profiler is None:
512
+ _global_profiler = PerformanceProfiler()
513
+ return _global_profiler
514
+
515
+
516
+ def start_monitoring():
517
+ """Start global performance monitoring."""
518
+ profiler = get_profiler()
519
+ profiler.start_monitoring()
520
+
521
+
522
+ def stop_monitoring():
523
+ """Stop global performance monitoring."""
524
+ profiler = get_profiler()
525
+ profiler.stop_monitoring()
526
+
527
+
528
+ def record_inference(**kwargs):
529
+ """Record inference metrics using global profiler."""
530
+ profiler = get_profiler()
531
+ profiler.record_inference(**kwargs)
532
+
533
+
534
+ def record_training(**kwargs):
535
+ """Record training metrics using global profiler."""
536
+ profiler = get_profiler()
537
+ profiler.record_training(**kwargs)
538
+
539
+
540
+ def get_performance_report() -> Dict[str, Any]:
541
+ """Get performance report using global profiler."""
542
+ profiler = get_profiler()
543
+ return profiler.generate_optimization_report()
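Taken together, the module-level helpers above give a one-call monitoring workflow. Below is a minimal usage sketch, assuming the file is importable as `performance_monitor` (the module's file name is not shown in this hunk) and that `record_inference` forwards keyword arguments matching the `ModelMetrics` fields referenced in `get_model_summary`; both names are assumptions, not confirmed API.

```python
# Minimal monitoring sketch. Assumed: module name "performance_monitor" and that
# record_inference(**kwargs) accepts the ModelMetrics field names used in
# get_model_summary (inference_time_ms, tokens_per_second, memory_usage_mb).
import time

from performance_monitor import (  # hypothetical import path
    start_monitoring,
    stop_monitoring,
    record_inference,
    get_performance_report,
)

start_monitoring()

t0 = time.time()
generated_tokens = 32                      # stand-in for a real generation call
elapsed_ms = (time.time() - t0) * 1000

record_inference(
    inference_time_ms=elapsed_ms,
    tokens_per_second=generated_tokens / max(elapsed_ms / 1000, 1e-6),
    memory_usage_mb=0.0,
)

report = get_performance_report()
print(report["performance_score"], report["optimization_priority"])

stop_monitoring()
```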
core/src/quantization.py ADDED
@@ -0,0 +1,286 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Model Quantization Utilities
4
+
5
+ This module provides utilities for model quantization to reduce memory usage
6
+ and improve inference speed while maintaining reasonable accuracy.
7
+
8
+ Author: Louis Chua Bean Chong
9
+ License: GPLv3
10
+ """
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.quantization as quantization
15
+ from typing import Optional, Dict, Any
16
+ import copy
17
+
18
+
19
+ class QuantizedModel:
20
+ """
21
+ Wrapper for quantized models with easy conversion and inference.
22
+
23
+ This class provides utilities for converting models to quantized versions
24
+ and performing efficient inference with reduced memory usage.
25
+ """
26
+
27
+ def __init__(self, model: nn.Module, quantized_model: Optional[nn.Module] = None):
28
+ """
29
+ Initialize quantized model wrapper.
30
+
31
+ Args:
32
+ model: Original model
33
+ quantized_model: Pre-quantized model (optional)
34
+ """
35
+ self.original_model = model
36
+ self.quantized_model = quantized_model
37
+ self.is_quantized = quantized_model is not None
38
+
39
+ def quantize_dynamic(self,
40
+ qconfig_spec: Optional[Dict] = None,
41
+ dtype: torch.dtype = torch.qint8) -> 'QuantizedModel':
42
+ """
43
+ Perform dynamic quantization on the model.
44
+
45
+ Args:
46
+ qconfig_spec: Quantization configuration
47
+ dtype: Quantization dtype (qint8, quint8)
48
+
49
+ Returns:
50
+ QuantizedModel: Self with quantized model
51
+ """
52
+ if qconfig_spec is None:
53
+ qconfig_spec = {
54
+ nn.Linear: quantization.default_dynamic_qconfig,
55
+ nn.LSTM: quantization.default_dynamic_qconfig,
56
+ nn.LSTMCell: quantization.default_dynamic_qconfig,
57
+ nn.RNNCell: quantization.default_dynamic_qconfig,
58
+ nn.GRUCell: quantization.default_dynamic_qconfig,
59
+ }
60
+
61
+ # Create a copy of the model for quantization
62
+ model_copy = copy.deepcopy(self.original_model)
63
+ model_copy.eval()
64
+
65
+ # Quantize in a single step; quantize_dynamic swaps the listed module
66
+ # types for dynamically quantized versions using the requested dtype
67
+
68
+ # Convert to quantized model
69
+ self.quantized_model = quantization.quantize_dynamic(model_copy, qconfig_spec, dtype=dtype)
70
+ self.is_quantized = True
71
+
72
+ print(f"Dynamic quantization completed with dtype: {dtype}")
73
+ return self
74
+
75
+ def quantize_static(self,
76
+ calibration_data: torch.utils.data.DataLoader,
77
+ qconfig: Optional[quantization.QConfig] = None) -> 'QuantizedModel':
78
+ """
79
+ Perform static quantization on the model.
80
+
81
+ Args:
82
+ calibration_data: DataLoader for calibration
83
+ qconfig: Quantization configuration
84
+
85
+ Returns:
86
+ QuantizedModel: Self with quantized model
87
+ """
88
+ if qconfig is None:
89
+ qconfig = quantization.get_default_qconfig('fbgemm')
90
+
91
+ # Create a copy of the model for quantization
92
+ model_copy = copy.deepcopy(self.original_model)
93
+ model_copy.eval()
94
+
95
+ # Attach the qconfig, then prepare the model (inserts observers for calibration)
+ model_copy.qconfig = qconfig
96
+ model_prepared = quantization.prepare(model_copy)
97
+
98
+ # Calibrate the model
99
+ print("Calibrating model...")
100
+ with torch.no_grad():
101
+ for batch_idx, (data, _) in enumerate(calibration_data):
102
+ if batch_idx >= 100: # Limit calibration samples
103
+ break
104
+ model_prepared(data)
105
+
106
+ # Convert to quantized model
107
+ self.quantized_model = quantization.convert(model_prepared)
108
+ self.is_quantized = True
109
+
110
+ print("Static quantization completed")
111
+ return self
112
+
113
+ def forward(self, *args, **kwargs):
114
+ """Forward pass using quantized model if available."""
115
+ if self.is_quantized and self.quantized_model is not None:
116
+ return self.quantized_model(*args, **kwargs)
117
+ else:
118
+ return self.original_model(*args, **kwargs)
119
+
120
+ def get_memory_usage(self) -> Dict[str, float]:
121
+ """
122
+ Get memory usage comparison between original and quantized models.
123
+
124
+ Returns:
125
+ dict: Memory usage in MB
126
+ """
127
+ def get_model_size(model):
128
+ param_size = 0
129
+ buffer_size = 0
130
+
131
+ for param in model.parameters():
132
+ param_size += param.nelement() * param.element_size()
133
+
134
+ for buffer in model.buffers():
135
+ buffer_size += buffer.nelement() * buffer.element_size()
136
+
137
+ return (param_size + buffer_size) / (1024 * 1024) # Convert to MB
138
+
139
+ original_size = get_model_size(self.original_model)
140
+ quantized_size = get_model_size(self.quantized_model) if self.quantized_model else original_size
141
+
142
+ return {
143
+ "original_mb": original_size,
144
+ "quantized_mb": quantized_size,
145
+ "compression_ratio": original_size / quantized_size if quantized_size > 0 else 1.0
146
+ }
147
+
148
+ def save_quantized(self, path: str):
149
+ """Save quantized model."""
150
+ if self.quantized_model is not None:
151
+ torch.save(self.quantized_model.state_dict(), path)
152
+ print(f"Quantized model saved to: {path}")
153
+ else:
154
+ raise ValueError("No quantized model available")
155
+
156
+ def load_quantized(self, path: str):
157
+ """Load quantized model."""
158
+ if self.quantized_model is None:
+ raise ValueError("Quantize the model first before loading quantized weights")
+ self.quantized_model.load_state_dict(torch.load(path))
159
+ self.is_quantized = True
160
+ print(f"Quantized model loaded from: {path}")
161
+
162
+
163
+ def quantize_model_dynamic(model: nn.Module,
164
+ dtype: torch.dtype = torch.qint8) -> QuantizedModel:
165
+ """
166
+ Convenience function for dynamic quantization.
167
+
168
+ Args:
169
+ model: Model to quantize
170
+ dtype: Quantization dtype
171
+
172
+ Returns:
173
+ QuantizedModel: Quantized model wrapper
174
+ """
175
+ quantized = QuantizedModel(model)
176
+ return quantized.quantize_dynamic(dtype=dtype)
177
+
178
+
179
+ def quantize_model_static(model: nn.Module,
180
+ calibration_data: torch.utils.data.DataLoader,
181
+ qconfig: Optional[quantization.QConfig] = None) -> QuantizedModel:
182
+ """
183
+ Convenience function for static quantization.
184
+
185
+ Args:
186
+ model: Model to quantize
187
+ calibration_data: Data for calibration
188
+ qconfig: Quantization configuration
189
+
190
+ Returns:
191
+ QuantizedModel: Quantized model wrapper
192
+ """
193
+ quantized = QuantizedModel(model)
194
+ return quantized.quantize_static(calibration_data, qconfig)
195
+
196
+
197
+ def create_quantization_config(backend: str = 'fbgemm',
198
+ dtype: torch.dtype = torch.qint8) -> quantization.QConfig:
199
+ """
200
+ Create quantization configuration.
201
+
202
+ Args:
203
+ backend: Quantization backend ('fbgemm', 'qnnpack')
204
+ dtype: Quantization dtype
205
+
206
+ Returns:
207
+ QConfig: Quantization configuration
208
+ """
209
+ if backend == 'fbgemm':
210
+ return quantization.QConfig(
211
+ activation=quantization.default_observer,
212
+ weight=quantization.default_per_channel_weight_observer
213
+ )
214
+ elif backend == 'qnnpack':
215
+ return quantization.QConfig(
216
+ activation=quantization.default_observer,
217
+ weight=quantization.default_weight_observer
218
+ )
219
+ else:
220
+ raise ValueError(f"Unsupported backend: {backend}")
221
+
222
+
223
+ def benchmark_quantization(original_model: nn.Module,
224
+ quantized_model: QuantizedModel,
225
+ test_data: torch.Tensor,
226
+ num_runs: int = 100) -> Dict[str, float]:
227
+ """
228
+ Benchmark original vs quantized model performance.
229
+
230
+ Args:
231
+ original_model: Original model
232
+ quantized_model: Quantized model
233
+ test_data: Test data for benchmarking
234
+ num_runs: Number of runs for averaging
235
+
236
+ Returns:
237
+ dict: Performance metrics
238
+ """
239
+ original_model.eval()
240
+ quantized_model.quantized_model.eval()
241
+
242
+ # Benchmark original model
243
+ start_time = torch.cuda.Event(enable_timing=True) if torch.cuda.is_available() else None
244
+ end_time = torch.cuda.Event(enable_timing=True) if torch.cuda.is_available() else None
245
+
246
+ if start_time:
247
+ start_time.record()
248
+
249
+ with torch.no_grad():
250
+ for _ in range(num_runs):
251
+ _ = original_model(test_data)
252
+
253
+ if end_time:
254
+ end_time.record()
255
+ torch.cuda.synchronize()
256
+ original_time = start_time.elapsed_time(end_time) / num_runs
257
+ else:
258
+ import time
259
+ start = time.time()
260
+ for _ in range(num_runs):
261
+ _ = original_model(test_data)
262
+ original_time = (time.time() - start) * 1000 / num_runs # Convert to ms
263
+
264
+ # Benchmark quantized model
265
+ if start_time:
266
+ start_time.record()
267
+
268
+ with torch.no_grad():
269
+ for _ in range(num_runs):
270
+ _ = quantized_model.quantized_model(test_data)
271
+
272
+ if end_time:
273
+ end_time.record()
274
+ torch.cuda.synchronize()
275
+ quantized_time = start_time.elapsed_time(end_time) / num_runs
276
+ else:
277
+ start = time.time()
278
+ for _ in range(num_runs):
279
+ _ = quantized_model.quantized_model(test_data)
280
+ quantized_time = (time.time() - start) * 1000 / num_runs # Convert to ms
281
+
282
+ return {
283
+ "original_time_ms": original_time,
284
+ "quantized_time_ms": quantized_time,
285
+ "speedup": original_time / quantized_time if quantized_time > 0 else 1.0
286
+ }
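As a quick end-to-end check of the utilities above, the following sketch dynamically quantizes a small linear stack, compares memory, and benchmarks CPU latency. It assumes `core/src` is on `sys.path` so the module imports as `quantization`; the model and batch shapes are illustrative only.

```python
# Usage sketch for the quantization utilities (assumes core/src is on sys.path).
import torch
import torch.nn as nn

from quantization import quantize_model_dynamic, benchmark_quantization

# A small float32 model with Linear layers, the main target of dynamic quantization.
model = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))

quantized = quantize_model_dynamic(model, dtype=torch.qint8)

# Parameter/buffer memory before vs. after quantization.
print(quantized.get_memory_usage())  # original_mb / quantized_mb / compression_ratio

# Average per-forward latency over the same input batch.
x = torch.randn(8, 128)
print(benchmark_quantization(model, quantized, x, num_runs=20))
```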
core/src/train_model.py ADDED
@@ -0,0 +1,668 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ Language Model Training Script
14
+
15
+ This script implements the complete training pipeline for GPT-style language models.
16
+ It includes optimization, checkpointing, progress monitoring, and CPU-optimized training
17
+ for limited hardware environments.
18
+
19
+ FEATURES:
20
+ - CPU-optimized training with memory management
21
+ - Gradient accumulation for effective large batch sizes
22
+ - Learning rate scheduling with warmup
23
+ - Model checkpointing and resume capability
24
+ - Real-time monitoring of loss, perplexity, and speed
25
+ - Memory usage tracking and optimization
26
+ - Automatic mixed precision (if available)
27
+
28
+ HARDWARE OPTIMIZATION:
29
+ - Designed for 8GB RAM systems
30
+ - Efficient CPU training with PyTorch optimizations
31
+ - Gradient accumulation to simulate larger batches
32
+ - Memory cleanup and garbage collection
33
+ - Progress saving for long training runs
34
+
35
+ Usage:
36
+ python core/src/train_model.py \\
37
+ --model-size small \\
38
+ --data-file data/clean/training_data.txt \\
39
+ --tokenizer-dir data/tokenizer/ \\
40
+ --output-dir models/my-model/ \\
41
+ --max-steps 10000
42
+
43
+ Requirements:
44
+ - PyTorch
45
+ - SentencePiece
46
+ - Our model architecture and data loader
47
+
48
+ Author: Louis Chua Bean Chong
49
+ License: GPLv3
50
+ """
51
+
52
+ import argparse
53
+ import gc
54
+ import json
55
+ import math
56
+ import os
57
+ import time
58
+ from pathlib import Path
59
+ from typing import Dict
60
+
61
+ import torch
62
+ import torch.nn as nn
63
+ import torch.optim as optim
64
+ from torch.optim.lr_scheduler import CosineAnnealingLR
65
+
66
+ # Import our modules
67
+ try:
68
+ from data_loader import TextDataLoader
69
+ from model import GPTModel, create_model
70
+ except ImportError:
71
+ import sys
72
+
73
+ sys.path.append(os.path.dirname(__file__))
74
+ from data_loader import TextDataLoader
75
+ from model import GPTModel, create_model
76
+
77
+
78
+ class TrainingConfig:
79
+ """Configuration for model training parameters."""
80
+
81
+ def __init__(
82
+ self,
83
+ learning_rate: float = 1e-4,
84
+ batch_size: int = 32,
85
+ max_steps: int = 100000,
86
+ warmup_steps: int = 10000,
87
+ gradient_clipping: float = 1.0,
88
+ weight_decay: float = 0.01,
89
+ mixed_precision: bool = True,
90
+ gradient_checkpointing: bool = True,
91
+ ):
92
+ self.learning_rate = learning_rate
93
+ self.batch_size = batch_size
94
+ self.max_steps = max_steps
95
+ self.warmup_steps = warmup_steps
96
+ self.gradient_clipping = gradient_clipping
97
+ self.weight_decay = weight_decay
98
+ self.mixed_precision = mixed_precision
99
+ self.gradient_checkpointing = gradient_checkpointing
100
+
101
+
102
+ class ModelTrainer:
103
+ """
104
+ Comprehensive trainer for GPT-style language models.
105
+
106
+ Handles the complete training pipeline including data loading, optimization,
107
+ checkpointing, and progress monitoring.
108
+ """
109
+
110
+ def __init__(
111
+ self,
112
+ model: GPTModel,
113
+ data_loader: TextDataLoader,
114
+ output_dir: str,
115
+ device: str = "cpu",
116
+ learning_rate: float = 3e-4,
117
+ weight_decay: float = 0.01,
118
+ warmup_steps: int = 1000,
119
+ max_steps: int = 10000,
120
+ gradient_accumulation_steps: int = 4,
121
+ gradient_clipping: float = 1.0,
122
+ save_every: int = 1000,
123
+ eval_every: int = 500,
124
+ log_every: int = 100,
125
+ ):
126
+ """
127
+ Initialize the model trainer.
128
+
129
+ Args:
130
+ model: GPT model to train
131
+ data_loader: Data loader for training data
132
+ output_dir: Directory to save checkpoints and logs
133
+ device: Training device ("cpu" or "cuda")
134
+ learning_rate: Peak learning rate
135
+ weight_decay: Weight decay for regularization
136
+ warmup_steps: Number of warmup steps for learning rate
137
+ max_steps: Maximum training steps
138
+ gradient_accumulation_steps: Steps to accumulate gradients
139
+ gradient_clipping: Maximum gradient norm
140
+ save_every: Save checkpoint every N steps
141
+ eval_every: Evaluate model every N steps
142
+ log_every: Log progress every N steps
143
+ """
144
+ self.model = model.to(device)
145
+ self.data_loader = data_loader
146
+ self.output_dir = Path(output_dir)
147
+ self.device = device
148
+
149
+ # Training hyperparameters
150
+ self.learning_rate = learning_rate
151
+ self.weight_decay = weight_decay
152
+ self.warmup_steps = warmup_steps
153
+ self.max_steps = max_steps
154
+ self.gradient_accumulation_steps = gradient_accumulation_steps
155
+ self.gradient_clipping = gradient_clipping
156
+
157
+ # Logging and saving
158
+ self.save_every = save_every
159
+ self.eval_every = eval_every
160
+ self.log_every = log_every
161
+
162
+ # Create output directory
163
+ self.output_dir.mkdir(parents=True, exist_ok=True)
164
+
165
+ # Initialize optimizer and scheduler
166
+ self.optimizer = self._create_optimizer()
167
+ self.scheduler = self._create_scheduler()
168
+
169
+ # Training state
170
+ self.step = 0
171
+ self.epoch = 0
172
+ self.best_loss = float("inf")
173
+ self.training_log = []
174
+
175
+ # Performance tracking
176
+ self.start_time = None
177
+ self.step_times = []
178
+
179
+ print("πŸš€ ModelTrainer initialized")
180
+ print(f" Device: {device}")
181
+ print(f" Model parameters: {model.get_num_params():,}")
182
+ print(f" Learning rate: {learning_rate}")
183
+ print(f" Max steps: {max_steps:,}")
184
+ print(f" Gradient accumulation: {gradient_accumulation_steps}")
185
+ print(f" Output directory: {output_dir}")
186
+
187
+ def _create_optimizer(self) -> optim.Optimizer:
188
+ """Create AdamW optimizer with weight decay."""
189
+ # Separate parameters for weight decay
190
+ decay_params = []
191
+ no_decay_params = []
192
+
193
+ for name, param in self.model.named_parameters():
194
+ if not param.requires_grad:
195
+ continue
196
+
197
+ # Don't apply weight decay to biases and layer norm parameters
198
+ if len(param.shape) == 1 or name.endswith(".bias"):
199
+ no_decay_params.append(param)
200
+ else:
201
+ decay_params.append(param)
202
+
203
+ param_groups = [
204
+ {"params": decay_params, "weight_decay": self.weight_decay},
205
+ {"params": no_decay_params, "weight_decay": 0.0},
206
+ ]
207
+
208
+ # Use AdamW with lower memory usage for CPU
209
+ optimizer = optim.AdamW(
210
+ param_groups,
211
+ lr=self.learning_rate,
212
+ betas=(0.9, 0.95), # Slightly different from default for LLM training
213
+ eps=1e-8,
214
+ )
215
+
216
+ return optimizer
217
+
218
+ def _create_scheduler(self) -> torch.optim.lr_scheduler._LRScheduler:
219
+ """Create learning rate scheduler with warmup and cosine decay."""
220
+ if self.warmup_steps > 0:
221
+ # Use a custom scheduler to avoid deprecation warnings
222
+ # This implements warmup + cosine decay without SequentialLR
223
+ class WarmupCosineScheduler(torch.optim.lr_scheduler._LRScheduler):
224
+ def __init__(self, optimizer, warmup_steps, max_steps, min_lr_factor=0.1):
225
+ self.warmup_steps = warmup_steps
226
+ self.max_steps = max_steps
227
+ self.min_lr_factor = min_lr_factor
228
+ super().__init__(optimizer)
229
+
230
+ def get_lr(self):
231
+ if self.last_epoch < self.warmup_steps:
232
+ # Linear warmup
233
+ factor = self.last_epoch / self.warmup_steps
234
+ return [base_lr * (0.01 + 0.99 * factor) for base_lr in self.base_lrs]
235
+ else:
236
+ # Cosine decay
237
+ progress = (self.last_epoch - self.warmup_steps) / (
238
+ self.max_steps - self.warmup_steps
239
+ )
240
+ progress = min(progress, 1.0) # Clamp to 1.0
241
+ factor = 0.5 * (1 + math.cos(math.pi * progress))
242
+ factor = self.min_lr_factor + (1 - self.min_lr_factor) * factor
243
+ return [base_lr * factor for base_lr in self.base_lrs]
244
+
245
+ scheduler = WarmupCosineScheduler(
246
+ self.optimizer,
247
+ warmup_steps=self.warmup_steps,
248
+ max_steps=self.max_steps,
249
+ min_lr_factor=0.1,
250
+ )
251
+ else:
252
+ # Just cosine decay - this should not trigger warnings
253
+ scheduler = CosineAnnealingLR(
254
+ self.optimizer, T_max=self.max_steps, eta_min=self.learning_rate * 0.1
255
+ )
256
+
257
+ return scheduler
258
+
259
+ def _calculate_loss(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
260
+ """
261
+ Calculate cross-entropy loss for autoregressive language modeling.
262
+
263
+ This method computes the standard cross-entropy loss used in language model training.
264
+ The loss measures how well the model predicts the next token in the sequence.
265
+
266
+ Mathematical formulation:
267
+ Loss = -βˆ‘ log(P(target_token | context))
268
+ where P is the softmax probability distribution over vocabulary
269
+
270
+ Implementation details:
271
+ - Reshapes 3D tensors to 2D for efficient computation
272
+ - Uses PyTorch's optimized cross_entropy function
273
+ - Handles padding tokens by ignoring them in loss calculation
274
+ - Computes mean loss across all valid positions
275
+
276
+ Why cross-entropy for language modeling:
277
+ - Natural choice for multi-class classification (next token prediction)
278
+ - Provides strong gradient signal for correct token probabilities
279
+ - Mathematically equivalent to minimizing negative log-likelihood
280
+ - Well-studied optimization properties for neural language models
281
+
282
+ Args:
283
+ logits: Raw model predictions of shape (batch_size, seq_len, vocab_size)
284
+ Contains unnormalized scores for each token in vocabulary
285
+ These will be converted to probabilities via softmax internally
286
+ targets: Ground truth next tokens of shape (batch_size, seq_len)
287
+ Contains token IDs representing the true next tokens
288
+ Should be input sequence shifted by one position
289
+
290
+ Returns:
291
+ torch.Tensor: Scalar loss value representing prediction error
292
+ Lower values indicate better next-token prediction accuracy
293
+ """
294
+ # Reshape tensors from 3D to 2D for efficient loss computation
295
+ # This converts per-sequence per-position predictions to a flat structure
296
+ # where each row represents one prediction over the entire vocabulary
297
+ logits = logits.view(-1, logits.size(-1)) # (batch_size * seq_len, vocab_size)
298
+ targets = targets.view(-1) # (batch_size * seq_len,)
299
+
300
+ # Calculate cross-entropy loss with proper handling of special tokens
301
+ # ignore_index=-1 excludes padding tokens from loss calculation
302
+ # This prevents the model from learning to predict padding, which would skew training
303
+ # The function internally applies softmax to logits and computes negative log-likelihood
304
+ loss = nn.functional.cross_entropy(logits, targets, ignore_index=-1)
305
+
306
+ # Return scalar loss for backpropagation
307
+ # This loss will be used to compute gradients via automatic differentiation
308
+ return loss
309
+
310
+ def _get_memory_usage(self) -> Dict[str, float]:
311
+ """Get current memory usage statistics."""
312
+ memory_stats = {}
313
+
314
+ if torch.cuda.is_available() and self.device.startswith("cuda"):
315
+ memory_stats["gpu_allocated_mb"] = torch.cuda.memory_allocated() / (1024**2)
316
+ memory_stats["gpu_cached_mb"] = torch.cuda.memory_reserved() / (1024**2)
317
+
318
+ # Estimate CPU memory (approximate)
319
+ import psutil
320
+
321
+ process = psutil.Process()
322
+ memory_stats["cpu_memory_mb"] = process.memory_info().rss / (1024**2)
323
+
324
+ return memory_stats
325
+
326
+ def _log_step(self, step: int, loss: float, lr: float, step_time: float) -> None:
327
+ """Log training progress for a single step."""
328
+ perplexity = math.exp(min(loss, 10)) # Cap at exp(10) to avoid overflow
329
+
330
+ # Calculate tokens per second
331
+ tokens_per_batch = self.data_loader.batch_size * self.data_loader.seq_len
332
+ tokens_per_second = tokens_per_batch / step_time if step_time > 0 else 0
333
+
334
+ # Get memory usage
335
+ memory_stats = self._get_memory_usage()
336
+
337
+ # Create log entry
338
+ log_entry = {
339
+ "step": step,
340
+ "loss": loss,
341
+ "perplexity": perplexity,
342
+ "learning_rate": lr,
343
+ "step_time": step_time,
344
+ "tokens_per_second": tokens_per_second,
345
+ "memory_mb": memory_stats.get("cpu_memory_mb", 0),
346
+ }
347
+
348
+ self.training_log.append(log_entry)
349
+
350
+ # Print progress
351
+ _ = time.time() - self.start_time if self.start_time else 0
352
+ eta_seconds = (self.max_steps - step) * step_time if step_time > 0 else 0
353
+ eta_hours = eta_seconds / 3600
354
+
355
+ print(
356
+ f"Step {step:,}/{self.max_steps:,} | "
357
+ f"Loss: {loss:.4f} | "
358
+ f"PPL: {perplexity:.2f} | "
359
+ f"LR: {lr:.2e} | "
360
+ f"Time: {step_time:.2f}s | "
361
+ f"Tokens/s: {tokens_per_second:.1f} | "
362
+ f"Memory: {memory_stats.get('cpu_memory_mb', 0):.0f}MB | "
363
+ f"ETA: {eta_hours:.1f}h"
364
+ )
365
+
366
+ def _save_checkpoint(self, step: int, is_best: bool = False) -> None:
367
+ """Save model checkpoint."""
368
+ checkpoint = {
369
+ "step": step,
370
+ "epoch": self.epoch,
371
+ "model_state_dict": self.model.state_dict(),
372
+ "optimizer_state_dict": self.optimizer.state_dict(),
373
+ "scheduler_state_dict": self.scheduler.state_dict(),
374
+ "best_loss": self.best_loss,
375
+ "training_log": self.training_log,
376
+ "config": self.model.config.__dict__,
377
+ }
378
+
379
+ # Save latest checkpoint
380
+ checkpoint_path = self.output_dir / f"checkpoint_step_{step}.pt"
381
+ torch.save(checkpoint, checkpoint_path)
382
+
383
+ # Save best checkpoint
384
+ if is_best:
385
+ best_path = self.output_dir / "best_model.pt"
386
+ torch.save(checkpoint, best_path)
387
+ print(f"πŸ’Ύ New best model saved: {best_path}")
388
+
389
+ # Save training log
390
+ log_path = self.output_dir / "training_log.json"
391
+ with open(log_path, "w") as f:
392
+ json.dump(self.training_log, f, indent=2)
393
+
394
+ print(f"πŸ’Ύ Checkpoint saved: {checkpoint_path}")
395
+
396
+ def _load_checkpoint(self, checkpoint_path: str) -> None:
397
+ """Load model checkpoint to resume training."""
398
+ if not os.path.exists(checkpoint_path):
399
+ print(f"⚠️ Checkpoint not found: {checkpoint_path}")
400
+ return
401
+
402
+ print(f"πŸ“‚ Loading checkpoint: {checkpoint_path}")
403
+
404
+ checkpoint = torch.load(checkpoint_path, map_location=self.device)
405
+
406
+ self.model.load_state_dict(checkpoint["model_state_dict"])
407
+ self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
408
+ self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
409
+
410
+ self.step = checkpoint["step"]
411
+ self.epoch = checkpoint["epoch"]
412
+ self.best_loss = checkpoint["best_loss"]
413
+ self.training_log = checkpoint.get("training_log", [])
414
+
415
+ print("βœ“ Checkpoint loaded successfully")
416
+ print(f" Resuming from step: {self.step:,}")
417
+ print(f" Best loss so far: {self.best_loss:.4f}")
418
+
419
+ def train(self) -> None:
420
+ """Main training loop."""
421
+ print("\nπŸš€ Starting training...")
422
+ print(f" Model: {self.model.config.model_name}")
423
+ print(f" Parameters: {self.model.get_num_params():,}")
424
+ print(f" Device: {self.device}")
425
+ print(f" Max steps: {self.max_steps:,}")
426
+ print("=" * 80)
427
+
428
+ self.model.train()
429
+ self.start_time = time.time()
430
+
431
+ # Initialize gradient accumulation
432
+ accumulated_loss = 0.0
433
+ self.optimizer.zero_grad()
434
+
435
+ for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
436
+ if self.step >= self.max_steps:
437
+ break
438
+
439
+ step_start_time = time.time()
440
+
441
+ # Move batch to device
442
+ input_ids = input_ids.to(self.device)
443
+ target_ids = target_ids.to(self.device)
444
+
445
+ # Forward pass (model computes loss internally when targets provided)
446
+ logits, loss = self.model(input_ids, target_ids)
447
+
448
+ # Scale loss for gradient accumulation
449
+ loss = loss / self.gradient_accumulation_steps
450
+ accumulated_loss += loss.item()
451
+
452
+ # Backward pass
453
+ loss.backward()
454
+
455
+ # Update weights every gradient_accumulation_steps
456
+ if (batch_idx + 1) % self.gradient_accumulation_steps == 0:
457
+ # Clip gradients
458
+ if self.gradient_clipping > 0:
459
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.gradient_clipping)
460
+
461
+ # Update parameters
462
+ self.optimizer.step()
463
+ self.scheduler.step()
464
+ self.optimizer.zero_grad()
465
+
466
+ # Update step count
467
+ self.step += 1
468
+ step_time = time.time() - step_start_time
469
+ self.step_times.append(step_time)
470
+
471
+ # Get current learning rate
472
+ current_lr = self.scheduler.get_last_lr()[0]
473
+
474
+ # Log progress
475
+ if self.step % self.log_every == 0:
476
+ avg_loss = accumulated_loss
477
+ self._log_step(self.step, avg_loss, current_lr, step_time)
478
+
479
+ # Save checkpoint
480
+ if self.step % self.save_every == 0:
481
+ is_best = accumulated_loss < self.best_loss
482
+ if is_best:
483
+ self.best_loss = accumulated_loss
484
+
485
+ self._save_checkpoint(self.step, is_best)
486
+
487
+ # Clean up memory periodically
488
+ if self.step % 100 == 0:
489
+ gc.collect()
490
+
491
+ # Reset accumulated loss
492
+ accumulated_loss = 0.0
493
+
494
+ # Check if training complete
495
+ if self.step >= self.max_steps:
496
+ break
497
+
498
+ # Final checkpoint
499
+ print("\nπŸŽ‰ Training completed!")
500
+ self._save_checkpoint(self.step, is_best=True)
501
+
502
+ # Training summary
503
+ total_time = time.time() - self.start_time
504
+ avg_step_time = sum(self.step_times) / len(self.step_times) if self.step_times else 0
505
+
506
+ print("\nπŸ“Š Training Summary:")
507
+ print(f" Steps completed: {self.step:,}")
508
+ print(f" Total time: {total_time/3600:.2f} hours")
509
+ print(f" Average time per step: {avg_step_time:.2f}s")
510
+ print(f" Final loss: {self.best_loss:.4f}")
511
+ print(f" Final perplexity: {math.exp(min(self.best_loss, 10)):.2f}")
512
+ print(f" Model saved to: {self.output_dir}")
513
+
514
+
515
+ def main():
516
+ """Main function to handle command line training."""
517
+ parser = argparse.ArgumentParser(
518
+ description="Train a GPT-style language model",
519
+ formatter_class=argparse.RawDescriptionHelpFormatter,
520
+ epilog="""
521
+ Examples:
522
+ # Train small model for quick experimentation
523
+ python core/src/train_model.py \\
524
+ --model-size small \\
525
+ --max-steps 5000 \\
526
+ --output-dir models/test-small
527
+
528
+ # Train medium model with custom settings
529
+ python core/src/train_model.py \\
530
+ --model-size medium \\
531
+ --learning-rate 1e-4 \\
532
+ --batch-size 2 \\
533
+ --max-steps 50000 \\
534
+ --output-dir models/my-medium-model
535
+ """,
536
+ )
537
+
538
+ # Model and data arguments
539
+ parser.add_argument(
540
+ "--model-size",
541
+ choices=["small", "medium", "large"],
542
+ default="small",
543
+ help="Model size to train (default: small)",
544
+ )
545
+
546
+ parser.add_argument(
547
+ "--data-file",
548
+ default="data/clean/training_data.txt",
549
+ help="Path to training text file (default: data/clean/training_data.txt)",
550
+ )
551
+
552
+ parser.add_argument(
553
+ "--tokenizer-dir",
554
+ default="data/tokenizer/",
555
+ help="Path to tokenizer directory (default: data/tokenizer/)",
556
+ )
557
+
558
+ parser.add_argument(
559
+ "--output-dir", required=True, help="Output directory for model checkpoints"
560
+ )
561
+
562
+ # Training hyperparameters
563
+ parser.add_argument(
564
+ "--seq-len", type=int, default=512, help="Sequence length for training (default: 512)"
565
+ )
566
+
567
+ parser.add_argument("--batch-size", type=int, default=4, help="Batch size (default: 4)")
568
+
569
+ parser.add_argument(
570
+ "--learning-rate", type=float, default=3e-4, help="Learning rate (default: 3e-4)"
571
+ )
572
+
573
+ parser.add_argument(
574
+ "--max-steps", type=int, default=10000, help="Maximum training steps (default: 10000)"
575
+ )
576
+
577
+ parser.add_argument(
578
+ "--warmup-steps", type=int, default=1000, help="Warmup steps (default: 1000)"
579
+ )
580
+
581
+ parser.add_argument(
582
+ "--gradient-accumulation-steps",
583
+ type=int,
584
+ default=4,
585
+ help="Gradient accumulation steps (default: 4)",
586
+ )
587
+
588
+ parser.add_argument(
589
+ "--device",
590
+ choices=["cpu", "cuda", "auto"],
591
+ default="auto",
592
+ help="Training device (default: auto)",
593
+ )
594
+
595
+ parser.add_argument("--resume", help="Path to checkpoint to resume training from")
596
+
597
+ parser.add_argument(
598
+ "--save-every", type=int, default=1000, help="Save checkpoint every N steps (default: 1000)"
599
+ )
600
+
601
+ args = parser.parse_args()
602
+
603
+ print("πŸš€ OpenLLM Model Training")
604
+ print("=" * 60)
605
+
606
+ # Determine device
607
+ if args.device == "auto":
608
+ device = "cuda" if torch.cuda.is_available() else "cpu"
609
+ else:
610
+ device = args.device
611
+
612
+ print(f"Using device: {device}")
613
+
614
+ try:
615
+ # Create model
616
+ print(f"\nπŸ—οΈ Creating {args.model_size} model...")
617
+ model = create_model(args.model_size)
618
+
619
+ # Create data loader
620
+ print("\nπŸ“Š Setting up data loader...")
621
+ tokenizer_path = os.path.join(args.tokenizer_dir, "tokenizer.model")
622
+
623
+ data_loader = TextDataLoader(
624
+ data_file=args.data_file,
625
+ tokenizer_path=tokenizer_path,
626
+ seq_len=args.seq_len,
627
+ batch_size=args.batch_size,
628
+ shuffle=True,
629
+ )
630
+
631
+ # Get data statistics
632
+ _ = data_loader.get_data_stats()
633
+
634
+ # Create trainer
635
+ print("\n🎯 Setting up trainer...")
636
+ trainer = ModelTrainer(
637
+ model=model,
638
+ data_loader=data_loader,
639
+ output_dir=args.output_dir,
640
+ device=device,
641
+ learning_rate=args.learning_rate,
642
+ max_steps=args.max_steps,
643
+ warmup_steps=args.warmup_steps,
644
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
645
+ save_every=args.save_every,
646
+ )
647
+
648
+ # Resume from checkpoint if specified
649
+ if args.resume:
650
+ trainer._load_checkpoint(args.resume)
651
+
652
+ # Start training
653
+ trainer.train()
654
+
655
+ print("\nπŸŽ‰ Training completed successfully!")
656
+
657
+ except Exception as e:
658
+ print(f"\n❌ Training failed: {e}")
659
+ import traceback
660
+
661
+ traceback.print_exc()
662
+ return False
663
+
664
+ return True
665
+
666
+
667
+ if __name__ == "__main__":
668
+ main()
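The learning-rate behavior of the custom `WarmupCosineScheduler` defined above can be checked in isolation. The sketch below restates its `get_lr` factor as a plain function and prints the multiplier applied to the base learning rate at a few steps; the constants mirror the trainer defaults (warmup_steps=1000, max_steps=10000, min_lr_factor=0.1).

```python
# Standalone restatement of the WarmupCosineScheduler factor for inspection.
import math

def lr_factor(step, warmup_steps=1000, max_steps=10000, min_lr_factor=0.1):
    """Multiplier applied to the base LR at a given step."""
    if step < warmup_steps:
        # Linear warmup from 1% of the base LR up to the full base LR.
        return 0.01 + 0.99 * (step / warmup_steps)
    # Cosine decay from the base LR down to min_lr_factor * base LR.
    progress = min((step - warmup_steps) / (max_steps - warmup_steps), 1.0)
    return min_lr_factor + (1 - min_lr_factor) * 0.5 * (1 + math.cos(math.pi * progress))

for s in (0, 500, 1000, 5500, 10000):
    print(s, round(lr_factor(s), 4))
# 0 -> 0.01, 500 -> 0.505, 1000 -> 1.0, 5500 -> 0.55, 10000 -> 0.1
```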
core/src/train_tokenizer.py ADDED
@@ -0,0 +1,428 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024 Louis Chua Bean Chong
3
+ #
4
+ # This file is part of OpenLLM.
5
+ #
6
+ # OpenLLM is dual-licensed:
7
+ # 1. For open source use: GNU General Public License v3.0
8
+ # 2. For commercial use: Commercial License (contact for details)
9
+ #
10
+ # See LICENSE and docs/LICENSES.md for full license information.
11
+
12
+ """
13
+ Train a SentencePiece tokenizer from scratch using the prepared training data.
14
+
15
+ OVERVIEW:
16
+ This script trains a SentencePiece tokenizer on the cleaned text data from the SQUAD dataset
17
+ or any other text corpus. SentencePiece is a subword tokenizer that works well for language
18
+ models and supports multiple languages without requiring pre-tokenization.
19
+
20
+ FEATURES:
21
+ - Supports BPE (Byte Pair Encoding) and Unigram tokenization algorithms
22
+ - Configurable vocabulary size (recommended: 8k-64k for LLMs)
23
+ - Handles special tokens (BOS, EOS, UNK, PAD)
24
+ - Outputs tokenizer model files compatible with Hugging Face
25
+ - Comprehensive statistics and vocabulary analysis
26
+
27
+ TOKENIZER OUTPUT:
28
+ - tokenizer.model: SentencePiece model file
29
+ - tokenizer.vocab: Human-readable vocabulary file
30
+ - tokenizer_config.json: Configuration for Hugging Face integration
31
+
32
+ Usage:
33
+ python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
34
+
35
+ Advanced usage:
36
+ python core/src/train_tokenizer.py \\
37
+ --input data/clean/training_data.txt \\
38
+ --vocab_size 32000 \\
39
+ --model_type bpe \\
40
+ --output_dir data/tokenizer/ \\
41
+ --character_coverage 0.9995
42
+
43
+ Requirements:
44
+ pip install sentencepiece
45
+
46
+ Example setup:
47
+ ```bash
48
+ # If not already in virtual environment
49
+ python -m venv venv
50
+ source venv/bin/activate # Linux/macOS
51
+ # .\venv\Scripts\Activate.ps1 # Windows PowerShell
52
+
53
+ # Install SentencePiece
54
+ pip install sentencepiece
55
+
56
+ # Train tokenizer
57
+ python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
58
+ ```
59
+
60
+ """
61
+
62
+ import argparse
63
+ import json
64
+ import os
65
+ import time
66
+ from typing import Any, Dict
67
+
68
+ try:
69
+ import sentencepiece as spm
70
+ except ImportError:
71
+ print("ERROR: SentencePiece not installed. Run: pip install sentencepiece")
72
+ exit(1)
73
+
74
+
75
+ def validate_input_file(input_path: str) -> None:
76
+ """
77
+ Validate that the input training file exists and is readable.
78
+
79
+ Args:
80
+ input_path (str): Path to the training text file
81
+
82
+ Raises:
83
+ FileNotFoundError: If input file doesn't exist
84
+ ValueError: If input file is empty or unreadable
85
+ """
86
+ if not os.path.exists(input_path):
87
+ raise FileNotFoundError(f"Training data file not found: {input_path}")
88
+
89
+ # Check file size and readability
90
+ file_size = os.path.getsize(input_path)
91
+ if file_size == 0:
92
+ raise ValueError(f"Training data file is empty: {input_path}")
93
+
94
+ # Test that we can read the file
95
+ try:
96
+ with open(input_path, "r", encoding="utf-8") as f:
97
+ first_line = f.readline()
98
+ if not first_line.strip():
99
+ raise ValueError(
100
+ "Training data file appears to be empty or contains only whitespace"
101
+ )
102
+ except UnicodeDecodeError as e:
103
+ raise ValueError(f"Cannot read training data file as UTF-8: {e}")
104
+
105
+ print(f"βœ“ Input file validated: {input_path} ({file_size:,} bytes)")
106
+
107
+
108
+ def count_training_sentences(input_path: str) -> int:
109
+ """
110
+ Count the number of training sentences/lines in the input file.
111
+
112
+ Args:
113
+ input_path (str): Path to the training text file
114
+
115
+ Returns:
116
+ int: Number of lines in the file
117
+ """
118
+ print("Counting training sentences...")
119
+ with open(input_path, "r", encoding="utf-8") as f:
120
+ count = sum(1 for line in f if line.strip())
121
+ print(f"βœ“ Found {count:,} training sentences")
122
+ return count
123
+
124
+
125
+ def train_sentencepiece_tokenizer(
126
+ input_path: str,
127
+ output_dir: str,
128
+ vocab_size: int = 32000,
129
+ model_type: str = "bpe",
130
+ character_coverage: float = 0.9995,
131
+ max_sentence_length: int = 4192,
132
+ input_sentence_size: int = 10000000,
133
+ shuffle_input_sentence: bool = True,
134
+ ) -> Dict[str, Any]:
135
+ """
136
+ Train a SentencePiece tokenizer with the specified parameters.
137
+
138
+ Args:
139
+ input_path (str): Path to training text file
140
+ output_dir (str): Directory to save tokenizer files
141
+ vocab_size (int): Target vocabulary size (recommended: 8k-64k)
142
+ model_type (str): Algorithm type ('bpe' or 'unigram')
143
+ character_coverage (float): Character coverage (0.9995 for English, 1.0 for Japanese)
144
+ max_sentence_length (int): Maximum sentence length in characters
145
+ input_sentence_size (int): Maximum number of sentences to use for training
146
+ shuffle_input_sentence (bool): Whether to shuffle input sentences
147
+
148
+ Returns:
149
+ Dict[str, Any]: Training statistics and configuration
150
+ """
151
+ # Ensure output directory exists
152
+ os.makedirs(output_dir, exist_ok=True)
153
+
154
+ # Define output paths
155
+ model_prefix = os.path.join(output_dir, "tokenizer")
156
+
157
+ # SentencePiece training parameters
158
+ train_params = [
159
+ f"--input={input_path}",
160
+ f"--model_prefix={model_prefix}",
161
+ f"--vocab_size={vocab_size}",
162
+ f"--model_type={model_type}",
163
+ f"--character_coverage={character_coverage}",
164
+ f"--max_sentence_length={max_sentence_length}",
165
+ f"--input_sentence_size={input_sentence_size}",
166
+ f"--shuffle_input_sentence={shuffle_input_sentence}",
167
+ # Special tokens for language modeling
168
+ "--pad_id=0", # Padding token
169
+ "--unk_id=1", # Unknown token
170
+ "--bos_id=2", # Beginning of sequence
171
+ "--eos_id=3", # End of sequence
172
+ # Additional useful parameters
173
+ "--split_by_unicode_script=true", # Better handling of mixed scripts
174
+ "--split_by_whitespace=true", # Split on whitespace
175
+ "--remove_extra_whitespaces=true", # Clean up whitespace
176
+ "--normalization_rule_name=identity", # Keep original text as-is
177
+ ]
178
+
179
+ print("\nTraining SentencePiece tokenizer...")
180
+ print(f" Algorithm: {model_type.upper()}")
181
+ print(f" Vocabulary size: {vocab_size:,}")
182
+ print(f" Character coverage: {character_coverage}")
183
+ print(f" Output directory: {output_dir}")
184
+ print(f" Model files: {model_prefix}.model, {model_prefix}.vocab")
185
+
186
+ # Record training start time
187
+ start_time = time.time()
188
+
189
+ # Train the tokenizer
190
+ try:
191
+ spm.SentencePieceTrainer.train(" ".join(train_params))
192
+ training_time = time.time() - start_time
193
+ print(f"βœ“ Tokenizer training completed in {training_time:.1f} seconds")
194
+ except Exception as e:
195
+ raise RuntimeError(f"SentencePiece training failed: {e}")
196
+
197
+ # Verify output files were created
198
+ model_file = f"{model_prefix}.model"
199
+ vocab_file = f"{model_prefix}.vocab"
200
+
201
+ if not os.path.exists(model_file):
202
+ raise RuntimeError(f"Expected model file not created: {model_file}")
203
+ if not os.path.exists(vocab_file):
204
+ raise RuntimeError(f"Expected vocab file not created: {vocab_file}")
205
+
206
+ print(f"βœ“ Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)")
207
+ print(f"βœ“ Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)")
208
+
209
+ # Return training configuration and statistics
210
+ config = {
211
+ "model_type": model_type,
212
+ "vocab_size": vocab_size,
213
+ "character_coverage": character_coverage,
214
+ "max_sentence_length": max_sentence_length,
215
+ "training_time_seconds": training_time,
216
+ "input_file": input_path,
217
+ "output_directory": output_dir,
218
+ "model_file": model_file,
219
+ "vocab_file": vocab_file,
220
+ }
221
+
222
+ return config
223
+
224
+
225
+ def test_tokenizer(model_path: str, test_sentences: list = None) -> None:
226
+ """
227
+ Test the trained tokenizer on sample sentences to verify it works correctly.
228
+
229
+ Args:
230
+ model_path (str): Path to the trained .model file
231
+ test_sentences (list): Optional list of test sentences
232
+ """
233
+ print("\nTesting trained tokenizer...")
234
+
235
+ # Load the trained tokenizer
236
+ sp = spm.SentencePieceProcessor()
237
+ sp.load(model_path)
238
+
239
+ # Default test sentences if none provided
240
+ if test_sentences is None:
241
+ test_sentences = [
242
+ "Hello, world! This is a test sentence.",
243
+ "The quick brown fox jumps over the lazy dog.",
244
+ "Machine learning and artificial intelligence are transforming technology.",
245
+ "SentencePiece tokenization works well for language models.",
246
+ ]
247
+
248
+ print(f"Vocabulary size: {sp.vocab_size():,}")
249
+ print(
250
+ f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}"
251
+ )
252
+
253
+ print("\nTokenization examples:")
254
+ for i, sentence in enumerate(test_sentences, 1):
255
+ # Encode to token IDs and pieces
256
+ token_ids = sp.encode(sentence)
257
+ token_pieces = sp.encode(sentence, out_type=str)
258
+
259
+ print(f"\n{i}. Input: {sentence}")
260
+ print(f" Tokens ({len(token_pieces)}): {token_pieces}")
261
+ print(f" IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}")
262
+
263
+ # Test decoding
264
+ decoded = sp.decode(token_ids)
265
+ print(f" Decoded: {decoded}")
266
+
267
+ # Verify round-trip encoding/decoding
268
+ if decoded.strip() != sentence.strip():
269
+ print(" ⚠️ Warning: Decode mismatch!")
270
+
271
+ print("βœ“ Tokenizer testing completed")
272
+
273
+
274
+ def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None:
275
+ """
276
+ Save a Hugging Face compatible tokenizer configuration file.
277
+
278
+ Args:
279
+ output_dir (str): Directory containing the tokenizer files
280
+ config (Dict[str, Any]): Tokenizer configuration
281
+ """
282
+ # Create Hugging Face tokenizer config
283
+ hf_config = {
284
+ "tokenizer_class": "SentencePieceTokenizer",
285
+ "model_type": config["model_type"],
286
+ "vocab_size": config["vocab_size"],
287
+ "model_file": "tokenizer.model",
288
+ "special_tokens": {
289
+ "pad_token": "<pad>",
290
+ "unk_token": "<unk>",
291
+ "bos_token": "<s>",
292
+ "eos_token": "</s>",
293
+ },
294
+ "special_token_ids": {
295
+ "pad_token_id": 0,
296
+ "unk_token_id": 1,
297
+ "bos_token_id": 2,
298
+ "eos_token_id": 3,
299
+ },
300
+ }
301
+
302
+ config_path = os.path.join(output_dir, "tokenizer_config.json")
303
+ with open(config_path, "w", encoding="utf-8") as f:
304
+ json.dump(hf_config, f, indent=2, ensure_ascii=False)
305
+
306
+ print(f"βœ“ Hugging Face config saved: {config_path}")
307
+
308
+
309
+ def main():
310
+ """Main function to handle command line arguments and orchestrate tokenizer training."""
311
+ parser = argparse.ArgumentParser(
312
+ description="Train a SentencePiece tokenizer for language model training",
313
+ formatter_class=argparse.RawDescriptionHelpFormatter,
314
+ epilog="""
315
+ Examples:
316
+ # Basic usage with SQUAD data
317
+ python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
318
+
319
+ # Advanced configuration
320
+ python core/src/train_tokenizer.py \\
321
+ --input data/clean/training_data.txt \\
322
+ --vocab_size 32000 \\
323
+ --model_type bpe \\
324
+ --output_dir data/tokenizer/ \\
325
+ --character_coverage 0.9995
326
+ """,
327
+ )
328
+
329
+ # Required arguments
330
+ parser.add_argument(
331
+ "--input",
332
+ required=True,
333
+ help="Path to training text file (e.g., data/clean/training_data.txt)",
334
+ )
335
+
336
+ # Optional arguments with sensible defaults
337
+ parser.add_argument(
338
+ "--vocab_size",
339
+ type=int,
340
+ default=32000,
341
+ help="Vocabulary size (default: 32000, recommended: 8k-64k)",
342
+ )
343
+
344
+ parser.add_argument(
345
+ "--model_type",
346
+ choices=["bpe", "unigram"],
347
+ default="bpe",
348
+ help="Tokenization algorithm (default: bpe)",
349
+ )
350
+
351
+ parser.add_argument(
352
+ "--output_dir",
353
+ default="data/tokenizer/",
354
+ help="Output directory for tokenizer files (default: data/tokenizer/)",
355
+ )
356
+
357
+ parser.add_argument(
358
+ "--character_coverage",
359
+ type=float,
360
+ default=0.9995,
361
+ help="Character coverage (default: 0.9995 for English)",
362
+ )
363
+
364
+ parser.add_argument(
365
+ "--max_sentence_length",
366
+ type=int,
367
+ default=4192,
368
+ help="Maximum sentence length in characters (default: 4192)",
369
+ )
370
+
371
+ parser.add_argument(
372
+ "--no_test", action="store_true", help="Skip tokenizer testing after training"
373
+ )
374
+
375
+ args = parser.parse_args()
376
+
377
+ print("πŸ”€ SentencePiece Tokenizer Training")
378
+ print("=" * 50)
379
+
380
+ try:
381
+ # Step 1: Validate input file
382
+ validate_input_file(args.input)
383
+
384
+ # Step 2: Count training data
385
+ sentence_count = count_training_sentences(args.input)
386
+
387
+ # Step 3: Train tokenizer
388
+ config = train_sentencepiece_tokenizer(
389
+ input_path=args.input,
390
+ output_dir=args.output_dir,
391
+ vocab_size=args.vocab_size,
392
+ model_type=args.model_type,
393
+ character_coverage=args.character_coverage,
394
+ max_sentence_length=args.max_sentence_length,
395
+ )
396
+
397
+ # Step 4: Save Hugging Face compatible config
398
+ save_huggingface_config(args.output_dir, config)
399
+
400
+ # Step 5: Test tokenizer (unless skipped)
401
+ if not args.no_test:
402
+ model_path = os.path.join(args.output_dir, "tokenizer.model")
403
+ test_tokenizer(model_path)
404
+
405
+ # Step 6: Print summary
406
+ print("\nπŸŽ‰ Tokenizer training completed successfully!")
407
+ print(f"πŸ“ Output directory: {args.output_dir}")
408
+ print(f"πŸ“Š Vocabulary size: {config['vocab_size']:,}")
409
+ print(f"⏱️ Training time: {config['training_time_seconds']:.1f}s")
410
+ print(f"πŸ“„ Training sentences: {sentence_count:,}")
411
+
412
+ print("\nFiles created:")
413
+ print(f" β€’ {config['model_file']} - SentencePiece model")
414
+ print(f" β€’ {config['vocab_file']} - Vocabulary file")
415
+ print(f" β€’ {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config")
416
+
417
+ print("\nTo use this tokenizer in your language model:")
418
+ print(" import sentencepiece as spm")
419
+ print(" sp = spm.SentencePieceProcessor()")
420
+ print(f" sp.load('{config['model_file']}')")
421
+
422
+ except Exception as e:
423
+ print(f"\n❌ Error: {e}")
424
+ exit(1)
425
+
426
+
427
+ if __name__ == "__main__":
428
+ main()
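After training, the tokenizer can be sanity-checked outside this script with a short round trip. The sketch below assumes the default output path `data/tokenizer/tokenizer.model`; the special-token IDs match the training flags set above (pad=0, unk=1, bos=2, eos=3).

```python
# Round-trip check for the trained tokenizer (default output path assumed).
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/tokenizer/tokenizer.model")

text = "Machine learning and artificial intelligence are transforming technology."
ids = sp.encode(text)                   # token IDs
pieces = sp.encode(text, out_type=str)  # subword pieces

# Wrap with BOS/EOS when building language-model training sequences.
lm_ids = [sp.bos_id()] + ids + [sp.eos_id()]

print(len(pieces), pieces[:8])
print(sp.decode(ids))                   # should reproduce the input text
```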