lemms committed on
Commit
6c43565
·
verified ·
1 Parent(s): 483881c

feat: Sync training infrastructure from main repository

app.py CHANGED
@@ -1,223 +1,1024 @@
  #!/usr/bin/env python3
  """
- OpenLLM Training Space - Main Application

- This is the main entry point for the Hugging Face Space.
- It provides a web interface for running OpenLLM training with authentication.

  Author: Louis Chua Bean Chong
- License: GPLv3
  """

- import os
- import sys
  import gradio as gr
  from pathlib import Path

- # Import our authentication and training modules
  try:
-     from space_auth_test import test_space_authentication
-     from openllm_training_with_auth import OpenLLMTrainingManager
-     MODULES_AVAILABLE = True
  except ImportError as e:
-     MODULES_AVAILABLE = False
-     print(f"❌ Required modules not available: {e}")


- def create_space_interface():
-     """Create the Gradio interface for the Space."""

-     def run_authentication_test():
-         """Run the authentication test and return results."""
          try:
-             if not MODULES_AVAILABLE:
-                 return "❌ Required modules not available. Please check deployment."

-             # Capture output from authentication test
-             import io
-             import contextlib

-             output = io.StringIO()
-             with contextlib.redirect_stdout(output):
-                 success = test_space_authentication()

-             result = output.getvalue()

-             if success:
-                 return f"✅ Authentication Test Results:\n\n{result}"
-             else:
-                 return f"❌ Authentication Test Failed:\n\n{result}"

          except Exception as e:
-             return f"❌ Error running authentication test: {e}"

-     def run_training(model_size, training_steps):
-         """Run the OpenLLM training with authentication."""
          try:
-             if not MODULES_AVAILABLE:
-                 return "❌ Required modules not available. Please check deployment."

-             # Capture output from training
-             import io
-             import contextlib

-             output = io.StringIO()
-             with contextlib.redirect_stdout(output):
-                 training_manager = OpenLLMTrainingManager()
-                 repo_id = training_manager.run_training(
-                     model_size=model_size,
-                     steps=int(training_steps)
                  )

-             result = output.getvalue()

-             return f"✅ Training Results:\n\n{result}\n\n🎉 Model available at: https://huggingface.co/{repo_id}"

          except Exception as e:
-             return f"❌ Error running training: {e}"

-     def check_space_environment():
-         """Check the Space environment and configuration."""
          try:
-             # Check if we're in a Space
-             space_vars = ["SPACE_ID", "SPACE_HOST", "SPACE_REPO_ID"]
-             is_space = any(os.getenv(var) for var in space_vars)

-             # Check HF_TOKEN
-             hf_token = os.getenv("HF_TOKEN")

-             result = "🔍 Space Environment Check:\n\n"

-             if is_space:
-                 result += "✅ Running in Hugging Face Space environment\n"
-                 for var in space_vars:
-                     value = os.getenv(var)
-                     if value:
-                         result += f" - {var}: {value}\n"
-             else:
-                 result += "ℹ️ Running in local environment\n"

-             if hf_token:
-                 result += f"✅ HF access token found: {hf_token[:8]}...{hf_token[-4:]}\n"
-                 result += " - Source: HF access token in Space settings\n"
-             else:
-                 result += "❌ HF access token not found\n"
-                 result += " - Please set HF_TOKEN in Space settings with HF access token\n"

-             result += f"\n📁 Available modules: {'✅' if MODULES_AVAILABLE else '❌'}"

-             return result

          except Exception as e:
-             return f"❌ Error checking environment: {e}"

-     # Create the Gradio interface
-     with gr.Blocks(title="OpenLLM Training Space", theme=gr.themes.Soft()) as interface:
-         gr.Markdown("""
-         # 🚀 OpenLLM Training Space

-         Welcome to the OpenLLM Training Space! This Space provides a complete environment for training OpenLLM models with automatic Hugging Face authentication and model upload.

-         ## 🔐 Authentication
-
-         This Space uses an HF access token for secure authentication. The HF_TOKEN is automatically available from your Space settings.

-         ## 📋 Available Actions

-         1. **Environment Check**: Verify Space configuration and authentication
-         2. **Authentication Test**: Test Hugging Face authentication
-         3. **Run Training**: Start OpenLLM training with automatic upload
-         """)

-         with gr.Tab("🔍 Environment Check"):
-             gr.Markdown("Check the Space environment and configuration.")
-             env_check_btn = gr.Button("Check Environment", variant="primary")
-             env_output = gr.Textbox(label="Environment Status", lines=10, interactive=False)
-             env_check_btn.click(check_space_environment, outputs=env_output)
-
-         with gr.Tab("🔐 Authentication Test"):
-             gr.Markdown("Test Hugging Face authentication using an HF access token.")
-             auth_test_btn = gr.Button("Run Authentication Test", variant="primary")
-             auth_output = gr.Textbox(label="Authentication Results", lines=15, interactive=False)
-             auth_test_btn.click(run_authentication_test, outputs=auth_output)
-
-         with gr.Tab("🚀 Run Training"):
-             gr.Markdown("""
-             Start OpenLLM training with automatic model upload.
-
-             **Training Parameters:**
-             - **Model Size**: Choose the model size (small, medium, large)
-             - **Training Steps**: Number of training steps (default: 8000)
-
-             **Expected Results:**
-             - Training will complete successfully
-             - Model will be uploaded to Hugging Face Hub
-             - Repository will be created with proper model files
-             """)
-
-             with gr.Row():
                  model_size = gr.Dropdown(
                      choices=["small", "medium", "large"],
                      value="small",
                      label="Model Size",
-                     info="Choose the model size for training"
                  )
-                 training_steps = gr.Number(
-                     value=8000,
-                     label="Training Steps",
-                     info="Number of training steps",
-                     minimum=1000,
-                     maximum=50000
                  )

-             train_btn = gr.Button("Start Training", variant="primary", size="lg")
-             train_output = gr.Textbox(label="Training Results", lines=20, interactive=False)
-
-             train_btn.click(
-                 run_training,
-                 inputs=[model_size, training_steps],
-                 outputs=train_output
-             )

-         with gr.Tab("📚 Documentation"):
-             gr.Markdown("""
-             ## 📖 Available Documentation
-
-             - **HUGGINGFACE_SPACE_SETUP_GUIDE.md**: Complete setup guide
-             - **SPACE_AUTHENTICATION_SUMMARY.md**: Authentication summary
-             - **SPACE_READY_SUMMARY.md**: Deployment summary

-             ## 🔧 Available Scripts

-             - **space_auth_test.py**: Authentication verification
-             - **openllm_training_with_auth.py**: Complete training script
-             - **integrate_auth_into_training.py**: Integration guide
-             - **setup_hf_space_auth.py**: Space authentication setup
-             - **verify_space_auth.py**: Space verification script

-             ## 🎯 Quick Start

-             1. Check the environment to verify configuration
-             2. Run the authentication test to ensure GitHub secrets are working
-             3. Start training with your desired parameters
-             4. Monitor the training progress and model upload

-             ## 🔒 Security

-             - HF_TOKEN is securely stored in GitHub repository secrets
-             - No hardcoded tokens in any scripts
-             - Automatic cleanup of test repositories
-             - Proper error handling and logging
-             """)

-     return interface
-

  if __name__ == "__main__":
-     # Create and launch the interface
-     interface = create_space_interface()
-     interface.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False
-     )
  #!/usr/bin/env python3
  """
+ OpenLLM Training Space Application - Fixed with Uploaded Modules

+ This version imports OpenLLM modules from the uploaded files in the HF Space:
+ - Imports the model.py and data_loader.py that were uploaded to the Space
+ - Uses OpenLLM's actual custom model architecture
+ - Compatible with OpenLLM's implementation
+
+ This application provides a complete training interface for OpenLLM models on Hugging Face Spaces.
+ It uses OpenLLM's custom GPTModel architecture instead of Hugging Face Transformers,
+ ensuring compatibility with the actual OpenLLM implementation.
+
+ Key Features:
+ - Real model training using OpenLLM's custom architecture
+ - SentencePiece tokenization for OpenLLM models
+ - Complete training pipeline with progress monitoring
+ - Automatic model saving and uploading to Hugging Face Hub
+ - Gradio 4.44.1 compatible user interface
+
+ Technical Architecture:
+ - Uses OpenLLM's GPTModel class (not Hugging Face Transformers)
+ - Imports custom modules from uploaded files in the Space
+ - Uses sentencepiece.SentencePieceProcessor() for tokenization
+ - Implements OpenLLM's training loop and optimization strategy
+ - Saves checkpoints in OpenLLM's format

  Author: Louis Chua Bean Chong
+ License: GPL-3.0
+ Version: 2.1.1
+ Last Updated: 2024
  """

  import gradio as gr
+ import torch
+ import torch.nn as nn
+ import os
+ import time
+ import math
+ import gc
+ from typing import Dict, Any, Optional
+ import threading
+ from dataclasses import dataclass
  from pathlib import Path

+ # Import OpenLLM's custom model architecture from uploaded files
+ # These files were uploaded to the HF Space and contain OpenLLM's actual implementation
+ try:
+     # Import from the uploaded files in the HF Space
+     # model.py contains GPTModel, GPTConfig, and the create_model factory function
+     from model import GPTModel, GPTConfig, create_model
+     # data_loader.py contains TextDataLoader for OpenLLM's data loading approach
+     from data_loader import TextDataLoader
+     OPENLLM_AVAILABLE = True
+     print("✅ OpenLLM custom model architecture imported successfully from uploaded files")
+     print("   - GPTModel: Custom PyTorch model architecture")
+     print("   - GPTConfig: Model configuration dataclass")
+     print("   - create_model: Factory function for model creation")
+     print("   - TextDataLoader: Custom data loading implementation")
+ except ImportError as e:
+     print(f"❌ OpenLLM imports failed: {e}")
+     print("   This indicates the uploaded OpenLLM source files are not available")
+     print("   The training functionality will be disabled")
+     OPENLLM_AVAILABLE = False
+
+ # Try to import sentencepiece - CRITICAL for OpenLLM tokenization
+ # OpenLLM uses SentencePiece for tokenization, not Hugging Face tokenizers
+ try:
+     import sentencepiece as spm
+     SENTENCEPIECE_AVAILABLE = True
+     print(f"✅ SentencePiece available: {spm.__version__}")
+     print("   - Required for OpenLLM tokenization")
+     print("   - Used for loading tokenizer.model files")
+ except ImportError:
+     SENTENCEPIECE_AVAILABLE = False
+     print("❌ SentencePiece not available")
+     print("   - This will prevent tokenizer loading")
+     print("   - Training functionality will be limited")
+
+ # Import other dependencies for the complete training pipeline
  try:
+     from datasets import load_dataset  # For loading training data from the HF Hub
+     from huggingface_hub import HfApi, hf_hub_download  # For model uploads and downloads
+     DEPENDENCIES_AVAILABLE = True
+     print("✅ Training dependencies available")
+     print("   - datasets: For loading training data")
+     print("   - huggingface_hub: For model uploads/downloads")
  except ImportError as e:
+     print(f"❌ Dependencies not available: {e}")
+     print("   - This will prevent dataset loading and model uploading")
+     DEPENDENCIES_AVAILABLE = False

+ @dataclass
+ class TrainingConfig:
+     """
+     Configuration class for training parameters.
+
+     This dataclass encapsulates all the training hyperparameters and settings
+     that control the OpenLLM training process. It provides a clean interface
+     for passing configuration between different components of the training pipeline.
+
+     Attributes:
+         model_size: Size of the model to train ("small", "medium", "large")
+         max_steps: Maximum number of training iterations
+         learning_rate: Learning rate for the optimizer
+         batch_size: Number of samples per training batch
+         output_dir: Directory to save trained models and checkpoints
+         save_steps: Frequency of checkpoint saving (every N steps)
+         logging_steps: Frequency of progress logging (every N steps)
+         warmup_steps: Number of warmup steps for learning rate scheduling
+         gradient_accumulation_steps: Number of steps to accumulate gradients
+     """
+     model_size: str
+     max_steps: int
+     learning_rate: float
+     batch_size: int
+     output_dir: str = "./openllm-trained"
+     save_steps: int = 100
+     logging_steps: int = 10
+     warmup_steps: int = 50
+     gradient_accumulation_steps: int = 4
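
For orientation, a minimal sketch of how this dataclass is consumed by the pipeline below; the values mirror the UI defaults later in this file (1000 steps, 3e-4, batch size 4) and are illustrative, not part of the commit:

    config = TrainingConfig(
        model_size="small",   # one of "small", "medium", "large"
        max_steps=1000,
        learning_rate=3e-4,
        batch_size=4,
    )
    # With the defaults above, each optimizer step effectively sees
    # batch_size * gradient_accumulation_steps = 4 * 4 = 16 samples.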
 
+ class OpenLLMTrainer:
+     """
+     Complete training implementation using OpenLLM's actual architecture.
+
+     This class handles the entire training pipeline including:
+     - Model loading using OpenLLM's custom GPTModel
+     - Tokenizer loading using sentencepiece.SentencePieceProcessor()
+     - Dataset preparation using OpenLLM's TextDataLoader
+     - Training execution using OpenLLM's approach
+     - Model saving and uploading to Hugging Face Hub
+
+     The trainer implements OpenLLM's actual training methodology rather than
+     using Hugging Face Transformers, ensuring compatibility with the real
+     OpenLLM implementation.
+
+     Key Features:
+     - Custom model architecture (GPTModel, not PreTrainedModel)
+     - SentencePiece tokenization (not Hugging Face tokenizers)
+     - OpenLLM's training loop and optimization strategy
+     - Gradient accumulation for memory efficiency
+     - Learning rate scheduling with warmup
+     - Automatic checkpoint saving and model uploading
+     """

+     def __init__(self):
+         """
+         Initialize the trainer with default settings.
+
+         Sets up the trainer with default values and initializes the Hugging Face
+         API for model uploading. All components start as None and are initialized
+         during the training process.
+         """
+         # Core training components - initialized during training
+         self.model = None           # OpenLLM's GPTModel instance
+         self.tokenizer = None       # SentencePieceProcessor instance
+         self.data_loader = None     # OpenLLM's TextDataLoader instance
+         self.optimizer = None       # PyTorch optimizer (AdamW)
+         self.scheduler = None       # Learning rate scheduler
+
+         # Training state management
+         self.is_training = False    # Flag to track training status
+         self.tokenizer_path = None  # Path to the tokenizer.model file
+
+         # Progress tracking for UI updates
+         self.training_progress = {
+             "status": "Ready",      # Current training status
+             "current_step": 0,      # Current training step
+             "total_steps": 0,       # Total steps to complete
+             "loss": 0.0,            # Current training loss
+             "learning_rate": 0.0    # Current learning rate
+         }
+
+         # Initialize Hugging Face API for model uploading
+         # This allows the trained model to be automatically uploaded to HF Hub
          try:
+             self.hf_api = HfApi()
+             print("✅ Hugging Face API initialized for model uploading")
+         except Exception as e:
+             print(f"Failed to initialize HF API: {e}")
+             print("   - Model uploading will be disabled")
+             self.hf_api = None
+
+     def load_model_and_tokenizer(self, model_size: str) -> str:
+         """
+         Load the pre-trained OpenLLM model and tokenizer using OpenLLM's approach.
+
+         This method implements OpenLLM's actual model loading strategy:
+         1. Creates a new GPTModel using OpenLLM's factory function
+         2. Downloads the tokenizer.model file from Hugging Face Hub
+         3. Loads the tokenizer using SentencePieceProcessor
+         4. Stores both components for use in training
+
+         This approach differs from Hugging Face Transformers because:
+         - Uses OpenLLM's custom GPTModel (not AutoModelForCausalLM)
+         - Uses SentencePiece directly (not AutoTokenizer)
+         - Downloads specific files rather than using from_pretrained()
+
+         Args:
+             model_size: Size of the model to load ("small", "medium", "large")
+                 Determines which pre-trained model to download

+         Returns:
+             Status message indicating success or failure
+             Success: "✅ Successfully loaded OpenLLM {model_size} model with custom architecture"
+             Failure: "❌ Failed to load OpenLLM model and tokenizer: {error details}"
+         """
+         try:
+             # Verify OpenLLM modules are available
+             if not OPENLLM_AVAILABLE:
+                 return "❌ OpenLLM custom model architecture not available"

+             print(f"🔄 Loading OpenLLM {model_size} model using custom architecture...")
+             print(f"   - Using OpenLLM's create_model factory function")
+             print(f"   - Not using Hugging Face Transformers")

+             # Step 1: Create model using OpenLLM's factory function
+             # This creates a fresh GPTModel instance with the specified size
+             try:
+                 self.model = create_model(model_size)
+                 print(f"✅ OpenLLM {model_size} model created: {type(self.model).__name__}")
+                 print(f"   - Model type: {type(self.model).__name__}")
+                 print(f"   - Parameters: {self.model.get_num_params():,}")
+                 print(f"   - Architecture: Custom GPTModel (not PreTrainedModel)")
+             except Exception as e:
+                 print(f"❌ Failed to create model: {e}")
+                 return f"❌ Failed to create OpenLLM model: {str(e)}"

+             # Step 2: Load tokenizer using sentencepiece
+             # OpenLLM uses SentencePiece directly, not Hugging Face tokenizers
+             try:
+                 print("🔄 Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
+                 print("   - Using SentencePiece directly (not AutoTokenizer)")
+                 print("   - Downloading tokenizer.model from Hugging Face Hub")
+
+                 # Download tokenizer.model from HF Hub
+                 # This is the actual tokenizer file used by OpenLLM models
+                 model_name = f"lemms/openllm-{model_size}-extended-7k"
+                 tokenizer_path = hf_hub_download(
+                     repo_id=model_name,
+                     filename="tokenizer.model"  # Specific file name for OpenLLM
+                 )
+
+                 print(f"✅ Tokenizer downloaded to: {tokenizer_path}")
+                 print(f"   - Source: {model_name}")
+                 print(f"   - File: tokenizer.model")
+
+                 # Create SentencePieceProcessor and load the tokenizer
+                 # This is OpenLLM's actual tokenization approach
+                 sp_processor = spm.SentencePieceProcessor()
+                 sp_processor.load(tokenizer_path)

+                 # Store tokenizer and its path separately
+                 # We need the path for the TextDataLoader later
+                 self.tokenizer = sp_processor
+                 self.tokenizer_path = tokenizer_path  # Store the path separately
+
+                 print(f"✅ Tokenizer loaded successfully using SentencePieceProcessor")
+                 print(f"   - Vocabulary size: {sp_processor.vocab_size()}")
+                 print(f"   - Tokenizer path: {tokenizer_path}")
+                 print(f"   - Tokenizer type: {type(sp_processor).__name__}")
+
+             except Exception as e:
+                 print(f"❌ Failed to load tokenizer: {e}")
+                 return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"
+
+             return f"✅ Successfully loaded OpenLLM {model_size} model with custom architecture"
+
          except Exception as e:
+             return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"
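
As a quick sanity check on the tokenizer loading above, a hedged round-trip sketch using the standard SentencePiece API (load, encode, decode, and vocab_size are real SentencePieceProcessor methods; the sample text is made up):

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load("tokenizer.model")  # the path hf_hub_download returned above

    ids = sp.encode("OpenLLM is a small open language model.")  # list of subword IDs
    print(sp.vocab_size())   # should match the model's embedding table size
    print(sp.decode(ids))    # round-trips back to (approximately) the input string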
 
+     def prepare_dataset(self) -> str:
+         """
+         Load and prepare the training dataset using OpenLLM's approach.
+
+         This method implements OpenLLM's data preparation strategy:
+         1. Loads training data from Hugging Face Hub dataset
+         2. Creates a temporary text file for OpenLLM's TextDataLoader
+         3. Initializes OpenLLM's TextDataLoader with the tokenizer
+         4. Prepares the data for training
+
+         OpenLLM's approach differs from Hugging Face because:
+         - Uses a simple text file format (not tokenized datasets)
+         - Uses OpenLLM's TextDataLoader (not Hugging Face datasets)
+         - Tokenization happens on-the-fly during training
+
+         Returns:
+             Status message indicating success or failure
+             Success: "✅ Successfully prepared dataset with {count} samples"
+             Failure: "❌ Failed to prepare dataset: {error details}"
+         """
          try:
+             # Verify dependencies are available
+             if not DEPENDENCIES_AVAILABLE:
+                 return "❌ Required dependencies not available"

+             print("🔄 Loading training dataset...")
+             print("   - Loading from Hugging Face Hub dataset")
+             print("   - Using OpenLLM's data preparation approach")

+             # Load dataset from HF Hub
+             # This contains the training text data for continuing model training
+             dataset = load_dataset("lemms/openllm-training-data")
+             print(f"✅ Dataset loaded: {len(dataset['train'])} samples")
+             print(f"   - Dataset: lemms/openllm-training-data")
+             print(f"   - Samples: {len(dataset['train'])}")
+
+             # Create temporary data file for OpenLLM's TextDataLoader
+             # OpenLLM expects a simple text file with one text sample per line
+             temp_data_file = "temp_training_data.txt"
+             with open(temp_data_file, 'w', encoding='utf-8') as f:
+                 for item in dataset['train']:
+                     f.write(item['text'] + '\n')
+
+             print(f"✅ Temporary data file created: {temp_data_file}")
+             print(f"   - Format: One text sample per line")
+             print(f"   - Encoding: UTF-8")
+
+             # Create OpenLLM's TextDataLoader
+             # This is OpenLLM's custom data loading implementation
+             try:
+                 # Use the stored tokenizer path instead of trying to access model_file_path
+                 # SentencePieceProcessor doesn't have a model_file_path attribute
+                 tokenizer_path = self.tokenizer_path  # Use the stored path
+
+                 print(f"🔄 Creating OpenLLM TextDataLoader...")
+                 print(f"   - Data file: {temp_data_file}")
+                 print(f"   - Tokenizer path: {tokenizer_path}")
+                 print(f"   - Sequence length: 512")
+                 print(f"   - Batch size: 4 (will be overridden by training config)")
+
+                 self.data_loader = TextDataLoader(
+                     data_file=temp_data_file,
+                     tokenizer_path=tokenizer_path,
+                     seq_len=512,    # Maximum sequence length for training
+                     batch_size=4,   # Will be overridden by training config
+                     shuffle=True    # Shuffle data for better training
                  )
+
+                 print(f"✅ OpenLLM TextDataLoader created successfully")
+                 print(f"   - DataLoader type: {type(self.data_loader).__name__}")
+                 print(f"   - Uses OpenLLM's custom implementation")
+
+             except Exception as e:
+                 print(f"❌ Failed to create TextDataLoader: {e}")
+                 return f"❌ Failed to create data loader: {str(e)}"
+
+             return f"✅ Successfully prepared dataset with {len(dataset['train'])} samples"
+
+         except Exception as e:
+             return f"❌ Failed to prepare dataset: {str(e)}"
+
+     def setup_training(self, config: TrainingConfig) -> str:
+         """
+         Set up the training configuration using OpenLLM's approach.
+
+         This method configures the training environment with:
+         1. Output directory creation
+         2. Optimizer setup with weight decay groups
+         3. Learning rate scheduler with warmup
+         4. Training hyperparameters
+
+         The setup follows OpenLLM's training methodology:
+         - Uses AdamW optimizer with weight decay
+         - Implements learning rate warmup followed by cosine annealing
+         - Separates parameters for different weight decay rates
+         - Uses gradient clipping for stability
+
+         Args:
+             config: Training configuration object containing all hyperparameters
+
+         Returns:
+             Status message indicating success or failure
+             Success: "✅ Training setup completed successfully"
+             Failure: "❌ Failed to setup training: {error details}"
+         """
+         try:
+             print("🔄 Setting up training configuration...")
+             print(f"   - Output directory: {config.output_dir}")
+             print(f"   - Learning rate: {config.learning_rate}")
+             print(f"   - Max steps: {config.max_steps}")
+
+             # Create output directory for saving models and checkpoints
+             os.makedirs(config.output_dir, exist_ok=True)
+             print(f"✅ Output directory created: {config.output_dir}")

+             # Set up optimizer (AdamW with weight decay)
+             # This follows OpenLLM's optimization strategy
+             print("🔄 Setting up AdamW optimizer with weight decay...")

+             # Separate parameters for different weight decay rates
+             # This is a common practice for transformer training
+             decay_params = []     # Parameters that should have weight decay
+             no_decay_params = []  # Parameters that should not have weight decay
+
+             for name, param in self.model.named_parameters():
+                 if not param.requires_grad:
+                     continue

+                 # Apply weight decay to all parameters except biases and layer norm weights
+                 if len(param.shape) == 1 or name.endswith('.bias'):
+                     no_decay_params.append(param)
+                 else:
+                     decay_params.append(param)
+
+             # Create parameter groups with different weight decay rates
+             param_groups = [
+                 {'params': decay_params, 'weight_decay': 0.01},   # 1% weight decay
+                 {'params': no_decay_params, 'weight_decay': 0.0}  # No weight decay
+             ]
+
+             print(f"   - Decay parameters: {len(decay_params)}")
+             print(f"   - No-decay parameters: {len(no_decay_params)}")
+
+             # Initialize AdamW optimizer with OpenLLM's recommended settings
+             self.optimizer = torch.optim.AdamW(
+                 param_groups,
+                 lr=config.learning_rate,
+                 betas=(0.9, 0.95),  # Beta values for momentum
+                 eps=1e-8            # Epsilon for numerical stability
+             )
+
+             print(f"✅ AdamW optimizer configured")
+             print(f"   - Learning rate: {config.learning_rate}")
+             print(f"   - Betas: (0.9, 0.95)")
+             print(f"   - Epsilon: 1e-8")
+
+             # Set up learning rate scheduler
+             # OpenLLM uses a warmup followed by cosine annealing
+             print("🔄 Setting up learning rate scheduler...")
+
+             # Warmup scheduler: linearly increase LR from 1% to 100%
+             warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
+                 self.optimizer,
+                 start_factor=0.01,  # Start at 1% of target LR
+                 end_factor=1.0,     # End at 100% of target LR
+                 total_iters=config.warmup_steps
+             )
+
+             # Main scheduler: cosine annealing after warmup
+             main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+                 self.optimizer,
+                 T_max=config.max_steps - config.warmup_steps  # Duration of cosine annealing
+             )
+
+             # Combine warmup and main schedulers
+             self.scheduler = torch.optim.lr_scheduler.SequentialLR(
+                 self.optimizer,
+                 schedulers=[warmup_scheduler, main_scheduler],
+                 milestones=[config.warmup_steps]  # Switch to main scheduler after warmup
+             )
+
+             print(f"✅ Learning rate scheduler configured")
+             print(f"   - Warmup steps: {config.warmup_steps}")
+             print(f"   - Total steps: {config.max_steps}")
+             print(f"   - Schedule: Linear warmup → Cosine annealing")
+
+             print("✅ Training setup completed successfully")
+             return f"✅ Training setup completed successfully"
+
          except Exception as e:
+             return f"❌ Failed to setup training: {str(e)}"
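
To see the shape of the schedule this method builds, a small self-contained sketch with a dummy parameter; it assumes the defaults above (warmup_steps=50, max_steps=1000, lr=3e-4) and is illustrative only:

    import torch

    p = torch.nn.Parameter(torch.zeros(1))
    opt = torch.optim.AdamW([p], lr=3e-4)
    warmup = torch.optim.lr_scheduler.LinearLR(opt, start_factor=0.01, end_factor=1.0, total_iters=50)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=950)
    sched = torch.optim.lr_scheduler.SequentialLR(opt, schedulers=[warmup, cosine], milestones=[50])

    for step in range(1000):
        opt.step()
        sched.step()
        if step in (0, 49, 500, 999):
            # Ramps from 1% of 3e-4 up to 3e-4 over the first 50 steps,
            # then cosine-decays toward 0 by the final step.
            print(step, sched.get_last_lr()[0])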
 
+     def train_model(self, config: TrainingConfig, progress_callback=None) -> str:
+         """
+         Execute the actual model training using OpenLLM's approach.
+
+         This method implements OpenLLM's training loop:
+         1. Sets up training mode and progress tracking
+         2. Iterates through data batches using OpenLLM's TextDataLoader
+         3. Performs forward pass, loss computation, and backward pass
+         4. Implements gradient accumulation for memory efficiency
+         5. Updates model parameters and learning rate
+         6. Saves checkpoints and logs progress
+
+         The training loop follows OpenLLM's methodology:
+         - Uses OpenLLM's GPTModel forward pass (returns logits and loss)
+         - Implements gradient accumulation for effective larger batch sizes
+         - Uses gradient clipping for training stability
+         - Saves checkpoints in OpenLLM's format
+         - Updates progress for UI monitoring
+
+         Args:
+             config: Training configuration object containing hyperparameters
+             progress_callback: Optional callback function for progress updates
+                 (Not used in current implementation)
+
+         Returns:
+             Status message indicating success or failure
+             Success: "✅ Training completed successfully! Final step: {step}"
+             Failure: "❌ Training failed: {error details}"
+         """
          try:
+             # Set training state
+             self.is_training = True
+             self.training_progress["status"] = "Training"
+             self.training_progress["total_steps"] = config.max_steps

+             print(f"🚀 Starting OpenLLM training for {config.max_steps} steps...")
+             print(f"   - Model: {type(self.model).__name__}")
+             print(f"   - DataLoader: {type(self.data_loader).__name__}")
+             print(f"   - Optimizer: {type(self.optimizer).__name__}")
+             print(f"   - Gradient accumulation: {config.gradient_accumulation_steps}")

+             # Training loop using OpenLLM's approach
+             self.model.train()          # Set model to training mode
+             accumulated_loss = 0.0      # Track loss across accumulation steps
+             self.optimizer.zero_grad()  # Clear gradients

+             step = 0  # Current training step
+             for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
+                 # Check if we've reached the maximum number of steps
+                 if step >= config.max_steps:
+                     break
+
+                 # Forward pass (model computes loss internally when targets provided)
+                 # OpenLLM's GPTModel returns both logits and loss
+                 logits, loss = self.model(input_ids, target_ids)
+
+                 # Scale loss for gradient accumulation
+                 # This allows us to simulate larger batch sizes
+                 loss = loss / config.gradient_accumulation_steps
+                 accumulated_loss += loss.item()
+
+                 # Backward pass - compute gradients
+                 loss.backward()
+
+                 # Update weights every gradient_accumulation_steps
+                 if (batch_idx + 1) % config.gradient_accumulation_steps == 0:
+                     # Clip gradients for training stability
+                     # This prevents exploding gradients
+                     torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+
+                     # Update parameters using the optimizer
+                     self.optimizer.step()
+
+                     # Update learning rate using the scheduler
+                     self.scheduler.step()
+
+                     # Clear gradients for the next accumulation cycle
+                     self.optimizer.zero_grad()
+
+                     # Update step count
+                     step += 1
+
+                     # Update progress for UI monitoring
+                     self.training_progress["current_step"] = step
+                     self.training_progress["loss"] = accumulated_loss
+                     self.training_progress["learning_rate"] = self.scheduler.get_last_lr()[0]
+
+                     # Log progress at specified intervals
+                     if step % config.logging_steps == 0:
+                         current_lr = self.scheduler.get_last_lr()[0]
+                         print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {current_lr:.2e}")
+
+                     # Save checkpoint at specified intervals
+                     if step % config.save_steps == 0:
+                         self._save_checkpoint(config.output_dir, step)
+                         print(f"💾 Checkpoint saved at step {step}")
+
+                     # Reset accumulated loss for the next accumulation cycle
+                     accumulated_loss = 0.0
+
+                     # Clean up memory periodically
+                     if step % 100 == 0:
+                         gc.collect()
+                         print(f"🧹 Memory cleanup at step {step}")

+             # Save final checkpoint
+             self._save_checkpoint(config.output_dir, step, is_best=True)
+             print(f"💾 Final checkpoint saved at step {step}")
+
+             # Update final progress
+             self.training_progress["status"] = "Completed"
+             self.training_progress["current_step"] = step

+             print(f"✅ Training completed! Final step: {step}")
+             print(f"   - Total steps completed: {step}")
+             print(f"   - Final loss: {self.training_progress['loss']:.4f}")
+             print(f"   - Final learning rate: {self.training_progress['learning_rate']:.2e}")

+             return f"✅ Training completed successfully! Final step: {step}"

          except Exception as e:
+             self.training_progress["status"] = "Failed"
+             print(f"❌ Training failed: {e}")
+             print(f"   - Error occurred during training")
+             print(f"   - Training state: {self.training_progress['status']}")
+             return f"❌ Training failed: {str(e)}"
+         finally:
+             self.is_training = False
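
The net effect of the accumulation logic above: with batch_size=4 and gradient_accumulation_steps=4, each optimizer step averages gradients over 16 samples. The loop reduces to this pattern (a sketch; model, loader, optimizer, and scheduler stand in for the objects built earlier in this file):

    ACCUM = 4  # config.gradient_accumulation_steps

    optimizer.zero_grad()
    for batch_idx, (x, y) in enumerate(loader):
        _, loss = model(x, y)        # OpenLLM-style forward returns (logits, loss)
        (loss / ACCUM).backward()    # scale so accumulated gradients average rather than sum
        if (batch_idx + 1) % ACCUM == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()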
 
+     def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
+         """
+         Save model checkpoint using OpenLLM's approach.
+
+         This method saves the model state in OpenLLM's checkpoint format:
+         - Model state dictionary
+         - Optimizer state dictionary
+         - Scheduler state dictionary
+         - Model configuration
+         - Training step information
+
+         The checkpoint format is compatible with OpenLLM's loading mechanism
+         and can be used to resume training or load the model for inference.

+         Args:
+             output_dir: Directory to save the checkpoint
+             step: Current training step number
+             is_best: Whether this is the best model so far
+         """
+         try:
+             # Create checkpoint dictionary with all necessary components
+             checkpoint = {
+                 'step': step,                                          # Current training step
+                 'model_state_dict': self.model.state_dict(),           # Model parameters
+                 'optimizer_state_dict': self.optimizer.state_dict(),   # Optimizer state
+                 'scheduler_state_dict': self.scheduler.state_dict(),   # Scheduler state
+                 'config': self.model.config.__dict__                   # Model configuration
+             }
+
+             # Save latest checkpoint
+             checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
+             torch.save(checkpoint, checkpoint_path)
+
+             # Save best checkpoint if this is the best model
+             if is_best:
+                 best_path = os.path.join(output_dir, "best_model.pt")
+                 torch.save(checkpoint, best_path)
+                 print(f"💾 Best model saved: {best_path}")
+
+             print(f"💾 Checkpoint saved: {checkpoint_path}")
+
+         except Exception as e:
+             print(f"❌ Failed to save checkpoint: {e}")
+
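Because the checkpoint above is a plain torch.save dictionary, loading it back is symmetric. A hedged sketch for resuming or inference (create_model is the factory imported from the uploaded model.py; the size must match what was trained):

    import torch
    from model import create_model

    ckpt = torch.load("./openllm-trained/best_model.pt", map_location="cpu")
    model = create_model("small")
    model.load_state_dict(ckpt["model_state_dict"])  # restore trained weights
    model.eval()                                     # switch to inference mode
    print(f"checkpoint was written at step {ckpt['step']}")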
+     def save_and_upload_model(self, config: TrainingConfig) -> str:
+         """
+         Save the trained model and upload it to Hugging Face Hub.

+         This method completes the training pipeline by:
+         1. Saving the final model checkpoint
+         2. Copying the tokenizer files
+         3. Uploading the complete model to Hugging Face Hub
+         4. Creating a new model repository for the trained model

+         The uploaded model will be available at:
+         https://huggingface.co/lemms/openllm-{size}-extended-8k

+         Args:
+             config: Training configuration object
+
+         Returns:
+             Status message indicating success or failure
+             Success: "✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
+             Failure: "❌ Failed to save/upload model: {error details}"
+         """
+         try:
+             print("🔄 Saving trained model...")
+             print(f"   - Output directory: {config.output_dir}")
+             print(f"   - Model size: {config.model_size}")
+
+             # Save the final model checkpoint
+             self._save_checkpoint(config.output_dir, config.max_steps, is_best=True)
+
+             # Save tokenizer files
+             # Create a tokenizer directory within the output directory
+             tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
+             os.makedirs(tokenizer_dir, exist_ok=True)
+
+             # Copy the tokenizer.model file using the stored path
+             # This ensures the tokenizer is included with the model
+             import shutil
+             shutil.copy2(self.tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))
+
+             print("✅ Model saved locally")
+             print(f"   - Model checkpoint: {config.output_dir}/best_model.pt")
+             print(f"   - Tokenizer: {tokenizer_dir}/tokenizer.model")
+
+             # Generate model name for upload
+             # The naming convention follows: openllm-{size}-extended-8k
+             model_name = f"openllm-{config.model_size}-extended-8k"
+             repo_id = f"lemms/{model_name}"
+
+             # Upload to Hugging Face Hub
+             if self.hf_api:
+                 print(f"🔄 Uploading model to {repo_id}...")
+                 print(f"   - Repository: {repo_id}")
+                 print(f"   - Type: model")
+                 print(f"   - Source: {config.output_dir}")
+
+                 # Create the repository first if it doesn't exist
+                 try:
+                     from huggingface_hub import create_repo
+                     create_repo(
+                         repo_id=repo_id,
+                         repo_type="model",
+                         exist_ok=True,
+                         private=False
+                     )
+                     print(f"✅ Repository {repo_id} ready for upload")
+                 except Exception as create_error:
+                     print(f"⚠️ Repository creation warning: {create_error}")
+                     print("   Continuing with upload attempt...")
+
+                 # Upload model files to Hugging Face Hub
+                 # This creates a new model repository with all the files
+                 self.hf_api.upload_folder(
+                     folder_path=config.output_dir,
+                     repo_id=repo_id,
+                     repo_type="model",
+                     commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
+                 )
+
+                 print(f"✅ Model uploaded successfully to {repo_id}")
+                 print(f"   - Available at: https://huggingface.co/{repo_id}")
+                 return f"✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
+             else:
+                 print("⚠️ Hugging Face API not available - model saved locally only")
+                 return f"✅ Model saved locally to {config.output_dir}"
+
+         except Exception as e:
+             print(f"❌ Failed to save/upload model: {e}")
+             return f"❌ Failed to save/upload model: {str(e)}"
+
+     def get_training_progress(self) -> Dict[str, Any]:
+         """
+         Get current training progress information.
+
+         This method returns a copy of the current training progress
+         for display in the Gradio UI. The progress information includes:
+         - Current training status
+         - Current step and total steps
+         - Current loss value
+         - Current learning rate

+         Returns:
+             Dictionary containing current training progress information
+         """
+         return self.training_progress.copy()
+
+ def main():
+     """
+     Main function that creates the complete Gradio application interface.
+
+     This function sets up the entire Gradio application with:
+     1. Application header and status information
+     2. Training configuration controls
+     3. Training status and progress display
+     4. Training control buttons
+     5. Instructions and resource links
+     6. Training function implementation
+
+     The interface provides a complete training experience for OpenLLM models
+     with real-time progress monitoring and comprehensive configuration options.
+
+     Returns:
+         Gradio Blocks interface for the training application
+     """
+
+     # Initialize the trainer
+     # This creates the OpenLLMTrainer instance that will handle all training operations
+     trainer = OpenLLMTrainer()
+
+     # Create the main Gradio application interface
+     # Using Gradio 4.44.1 with Soft theme for modern appearance
+     with gr.Blocks(
+         title="OpenLLM Training Space - Fixed with Uploaded Modules",
+         theme=gr.themes.Soft()
+     ) as demo:
+
+         # Application Header
+         # Provides clear identification and description of the application
+         gr.Markdown("# 🚀 OpenLLM Training Space - Fixed with Uploaded Modules")
+         gr.Markdown("### *Uses OpenLLM's Custom Model Architecture from Uploaded Files*")
+         gr.Markdown("---")
+
+         # Status Information
+         # Shows the availability of key components and dependencies
+         gr.Markdown(f"**OpenLLM Available**: {'✅ Yes' if OPENLLM_AVAILABLE else '❌ No'}")
+         gr.Markdown(f"**SentencePiece Available**: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
+         gr.Markdown(f"**Dependencies Available**: {'✅ Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
+         gr.Markdown("**Architecture**: ✅ OpenLLM Custom GPTModel (From Uploaded Files)")
+
+         # Main Content Area
+         # Two-column layout for configuration and status
+         with gr.Row():
+
+             # Left Column: Training Configuration
+             # Contains all the training hyperparameters and settings
+             with gr.Column(scale=1):
+                 gr.Markdown("## 📊 Training Configuration")
+
+                 # Model Size Selection
+                 # Allows users to choose which base model to train from
                  model_size = gr.Dropdown(
                      choices=["small", "medium", "large"],
                      value="small",
                      label="Model Size",
+                     info="Select the base model size to train from"
                  )
+
+                 # Training Steps Configuration
+                 # Controls the number of training iterations
+                 max_steps = gr.Slider(
+                     minimum=100,
+                     maximum=10000,
+                     value=1000,
+                     step=100,
+                     label="Max Training Steps",
+                     info="Number of training iterations (100-10,000)"
+                 )
+
+                 # Learning Rate Configuration
+                 # Controls the learning rate for the optimizer
+                 learning_rate = gr.Slider(
+                     minimum=1e-5,
+                     maximum=1e-3,
+                     value=3e-4,
+                     step=1e-5,
+                     label="Learning Rate",
+                     info="Training rate (0.00001-0.001)"
+                 )
+
+                 # Batch Size Configuration
+                 # Controls the number of samples per training batch
+                 batch_size = gr.Slider(
+                     minimum=1,
+                     maximum=16,
+                     value=4,
+                     step=1,
+                     label="Batch Size",
+                     info="Samples per training batch (1-16)"
                  )

+             # Right Column: Training Status and Controls
+             # Contains status display and control buttons
+             with gr.Column(scale=1):
+                 gr.Markdown("## 🎯 Training Status")
+
+                 # Training Status Display
+                 # Shows current training status and any error messages
+                 status_text = gr.Textbox(
+                     value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
+                     label="Current Status",
+                     interactive=False,
+                     lines=5,
+                     info="Shows current training status and progress updates"
+                 )
+
+                 # Progress Information
+                 # Displays detailed training progress in JSON format
+                 progress_info = gr.JSON(
+                     value=trainer.get_training_progress(),
+                     label="Training Progress"
+                 )
+
+                 # Training Control Buttons
+                 # Buttons to start and stop training
+                 with gr.Row():
+                     start_btn = gr.Button("🚀 Start Training", variant="primary")
+                     stop_btn = gr.Button("⏹️ Stop Training", variant="stop")

+         # Instructions Section
+         # Provides detailed instructions for using the training interface
+         gr.Markdown("## 📋 OpenLLM Training Instructions")
+         gr.Markdown("""
+         This interface uses **OpenLLM's actual custom model architecture** from uploaded files:
+
+         ### **Step 1: Configure Parameters**
+         - **Model Size**: Select the base model to train from (small, medium, large)
+         - **Max Steps**: Number of training iterations (100-10,000)
+         - **Learning Rate**: Training rate (0.00001-0.001)
+         - **Batch Size**: Samples per training batch (1-16)
+
+         ### **Step 2: Start Training**
+         - Click "Start Training" to begin the actual training process
+         - Uses OpenLLM's custom GPTModel class from uploaded files
+         - Uses sentencepiece.SentencePieceProcessor() for tokenization
+         - Compatible with OpenLLM's actual implementation
+
+         ### **Step 3: Monitor Progress**
+         - Watch the status updates and progress information
+         - Training may take several minutes depending on steps
+         - The final model will be uploaded to Hugging Face Hub
+
+         ### **Step 4: Access Results**
+         - Trained models are automatically pushed to: `lemms/openllm-{size}-extended-8k`
+         - Check the model repository for your trained model
+         - Use the model for inference or further training
+         """)
+
+         # Resource Links Section
+         # Provides links to related models and resources
+         gr.Markdown("## 🔗 Model Resources")
+         gr.Markdown("""
+         - [📚 7k Small Model](https://huggingface.co/lemms/openllm-small-extended-7k)
+         - [🎯 8k Small Model](https://huggingface.co/lemms/openllm-small-extended-8k)
+         - [📊 Training Dataset](https://huggingface.co/datasets/lemms/openllm-training-data)
+         - [📖 Main Project](https://github.com/louischua/openllm)
+         """)
+
+         # Training Function Definition
+         # This function is called when the Start Training button is clicked
+         def start_complete_training(model_size, max_steps, learning_rate, batch_size):
+             """
+             Execute the complete training process using OpenLLM's approach.
+
+             This function orchestrates the entire training pipeline:
+             1. Validates OpenLLM availability
+             2. Creates training configuration
+             3. Loads model and tokenizer
+             4. Prepares dataset
+             5. Sets up training environment
+             6. Executes training
+             7. Saves and uploads the trained model
+
+             The function provides comprehensive error handling and status updates
+             throughout the training process.
+
+             Args:
+                 model_size: Size of the model to train ("small", "medium", "large")
+                 max_steps: Maximum number of training steps
+                 learning_rate: Learning rate for the optimizer
+                 batch_size: Batch size for training
+
+             Returns:
+                 Status message indicating the result of the training process
+             """
+             # Validate OpenLLM availability
+             if not OPENLLM_AVAILABLE:
+                 return "❌ OpenLLM custom model architecture not available. Please check the installation."

+             try:
+                 print(f"🚀 Starting complete training process...")
+                 print(f"   - Model size: {model_size}")
+                 print(f"   - Max steps: {max_steps}")
+                 print(f"   - Learning rate: {learning_rate}")
+                 print(f"   - Batch size: {batch_size}")
+
+                 # Create training configuration
+                 # This encapsulates all training parameters
+                 config = TrainingConfig(
+                     model_size=model_size,
+                     max_steps=max_steps,
+                     learning_rate=learning_rate,
+                     batch_size=batch_size
+                 )
+
+                 # Step 1: Load model and tokenizer using OpenLLM's approach
+                 print("🔄 Step 1: Loading model and tokenizer...")
+                 status = trainer.load_model_and_tokenizer(model_size)
+                 if "❌" in status:
+                     return status
+
+                 # Step 2: Prepare dataset
+                 print("🔄 Step 2: Preparing dataset...")
+                 status = trainer.prepare_dataset()
+                 if "❌" in status:
+                     return status
+
+                 # Step 3: Setup training
+                 print("🔄 Step 3: Setting up training...")
+                 status = trainer.setup_training(config)
+                 if "❌" in status:
+                     return status
+
+                 # Step 4: Execute training
+                 print("🔄 Step 4: Executing training...")
+                 status = trainer.train_model(config)
+                 if "❌" in status:
+                     return status
+
+                 # Step 5: Save and upload model
+                 print("🔄 Step 5: Saving and uploading model...")
+                 status = trainer.save_and_upload_model(config)
+
+                 print("🎉 Complete training process finished!")
+                 return f"🚀 Complete training process finished!\n{status}"
+
+             except Exception as e:
+                 print(f"❌ Training process failed: {str(e)}")
+                 return f"❌ Training process failed: {str(e)}"
+
+         def update_progress():
+             """
+             Update the progress display.
+
+             This function is called to refresh the progress information
+             displayed in the Gradio interface. It returns the
+             current training progress from the trainer.
+
+             Returns:
+                 Current training progress dictionary
+             """
+             return trainer.get_training_progress()
+
+         # Connect UI Components to Functions
+         # This connects the Start Training button to the training function
+         start_btn.click(
+             fn=start_complete_training,
+             inputs=[model_size, max_steps, learning_rate, batch_size],
+             outputs=[status_text]
+         )
+
+         # Refresh the progress display when a browser session loads
+         # (see the note below for a periodic-refresh variant)
+         demo.load(update_progress, outputs=[progress_info])
1011
+ # Application Footer
1012
+ # Provides attribution and technical information
1013
+ gr.Markdown("---")
1014
+ gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
1015
+ gr.Markdown("**Architecture**: OpenLLM Custom GPTModel (From Uploaded Files)")
1016
+ gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor()")
1017
 
1018
+ return demo
 
1019
 
1020
  if __name__ == "__main__":
1021
+ # Launch the Gradio application
1022
+ # This starts the web interface for the training application
1023
+ demo = main()
1024
+ demo.launch()
 
 
 
requirements.txt CHANGED
@@ -1,26 +1,51 @@
- # OpenLLM Training Space Requirements
- # Core dependencies for Space deployment

- # Hugging Face Hub for authentication and model upload
- huggingface_hub>=0.19.0

- # Gradio for web interface (latest stable version)
- gradio==4.44.1

- # PyTorch for model training
- torch>=2.0.0
- torchvision>=0.15.0

- # Transformers for model handling
- transformers>=4.35.0

- # SentencePiece for tokenization
- sentencepiece>=0.1.99

- # NumPy and other utilities
- numpy>=1.24.0
- pandas>=2.0.0

- # Additional utilities
- requests>=2.31.0
- tqdm>=4.65.0

+ # Complete Training Dependencies for OpenLLM Space - Updated for Gradio 4.44.1
+ # This file includes all necessary packages for real model training

+ # Core Machine Learning Framework
+ torch>=2.0.0           # PyTorch deep learning framework
+ torchvision>=0.15.0    # Computer vision utilities
+ torchaudio>=2.0.0      # Audio processing utilities

+ # Hugging Face Ecosystem - Complete Training Stack
+ transformers>=4.30.0   # Pre-trained models and training utilities
+ datasets>=2.12.0       # Dataset loading and processing
+ tokenizers>=0.13.0     # Fast tokenization library
+ sentencepiece>=0.1.99  # SentencePiece tokenization (CRITICAL for OpenLLM models)
+ huggingface_hub>=0.34.0  # Hugging Face Hub integration
+ accelerate>=0.20.0     # Distributed training acceleration

+ # User Interface Framework - Updated to 4.44.1
+ gradio==4.44.1         # Web UI framework for ML applications (pinned version)

+ # Data Processing and Scientific Computing
+ numpy>=1.24.0          # Numerical computing library
+ pandas>=2.0.0          # Data manipulation and analysis
+ scipy>=1.10.0          # Scientific computing utilities

+ # Progress and Monitoring
+ tqdm>=4.65.0           # Progress bars for long-running operations
+ psutil>=5.9.0          # System and process utilities

+ # Memory and Performance Optimization
+ bitsandbytes>=0.41.0   # Quantization utilities for memory efficiency
+ peft>=0.4.0            # Parameter-Efficient Fine-Tuning

+ # Logging and Debugging
+ wandb>=0.15.0          # Experiment tracking (optional)
+ tensorboard>=2.13.0    # Training visualization (optional)
+
+ # Additional Utilities
+ requests>=2.31.0       # HTTP library for API calls
+ pillow>=9.5.0          # Image processing (if needed)
+ matplotlib>=3.7.0      # Plotting and visualization
+ seaborn>=0.12.0        # Statistical data visualization
+
+ # Development and Testing (optional)
+ pytest>=7.4.0          # Testing framework
+ black>=23.0.0          # Code formatting
+ flake8>=6.0.0          # Code linting
+
+ # Note: These versions are compatible with Hugging Face Spaces
+ # and provide stable training performance for OpenLLM models
+ # Gradio 4.44.1 fixes compatibility issues with JSON components
+ # SentencePiece is CRITICAL for OpenLLM model tokenization
training/data_loader.py CHANGED
@@ -13,12 +13,12 @@
  Training Data Loader for Language Model Training

  This module provides efficient data loading and batching for training GPT-style
- language models. It handles text preprocessing, tokenization, and creates
  batches suitable for autoregressive language modeling.

  FEATURES:
  - Memory-efficient text loading with sliding window
- - Automatic tokenization using trained SentencePiece model
  - Configurable sequence length and batch size
  - CPU-optimized data loading for limited hardware
  - Support for training data validation and statistics
@@ -31,20 +31,20 @@ MEMORY OPTIMIZATION:

  Usage:
      from data_loader import TextDataLoader
-
      loader = TextDataLoader(
          data_file="data/clean/training_data.txt",
-         tokenizer_path="data/tokenizer/tokenizer.model",
          seq_len=512,
          batch_size=4
      )
-
      for batch in loader:
          input_ids, targets = batch
          # input_ids: (batch_size, seq_len)
          # targets: (batch_size, seq_len) - shifted by 1 for next token prediction

- Author: Louis Chua Bean Chong
  License: GPLv3
  """
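
"Shifted by 1" in the usage example above is the standard next-token objective. A tiny illustration with made-up token IDs:

    tokens = [2, 15, 87, 30, 9, 3]   # [BOS, ...subword IDs..., EOS]
    input_ids = tokens[:-1]          # [2, 15, 87, 30, 9]
    targets   = tokens[1:]           # [15, 87, 30, 9, 3]
    # At position i the model sees input_ids[:i+1] and is trained
    # to predict targets[i], i.e. the next token in the sequence.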
 
@@ -66,11 +66,11 @@ except ImportError:
  class TextDataLoader:
      """
      Efficient data loader for autoregressive language model training.
-
      This class handles loading text data, tokenizing it using SentencePiece,
      and creating batches suitable for next-token prediction training.
      """
-
      def __init__(
          self,
          data_file: str,
@@ -79,11 +79,11 @@ class TextDataLoader:
          batch_size: int = 4,
          chunk_size: int = 1000000,  # Lines to read at once
          shuffle: bool = True,
-         seed: int = 42
      ):
          """
          Initialize the data loader.
-
          Args:
              data_file: Path to training text file (one passage per line)
              tokenizer_path: Path to trained SentencePiece model
@@ -100,44 +100,44 @@ class TextDataLoader:
          self.chunk_size = chunk_size
          self.shuffle = shuffle
          self.seed = seed
-
          # Validate inputs
          self._validate_inputs()
-
          # Load tokenizer
          self.tokenizer = self._load_tokenizer()
-
          # Get data statistics
          self.total_lines = self._count_lines()
          self.current_line = 0
-
          # Set random seed for reproducibility
          random.seed(seed)
-
          print(f"📊 TextDataLoader initialized")
          print(f"   Data file: {data_file}")
          print(f"   Total passages: {self.total_lines:,}")
          print(f"   Sequence length: {seq_len}")
          print(f"   Batch size: {batch_size}")
          print(f"   Vocabulary size: {self.tokenizer.vocab_size():,}")
-
      def _validate_inputs(self) -> None:
          """Validate input parameters and file paths."""
          if not os.path.exists(self.data_file):
              raise FileNotFoundError(f"Training data file not found: {self.data_file}")
-
          if not os.path.exists(self.tokenizer_path):
              raise FileNotFoundError(f"Tokenizer model not found: {self.tokenizer_path}")
-
          if self.seq_len <= 0:
              raise ValueError(f"Sequence length must be positive, got {self.seq_len}")
-
          if self.batch_size <= 0:
              raise ValueError(f"Batch size must be positive, got {self.batch_size}")
-
          if self.chunk_size <= 0:
              raise ValueError(f"Chunk size must be positive, got {self.chunk_size}")
-
      def _load_tokenizer(self) -> spm.SentencePieceProcessor:
          """Load the trained SentencePiece tokenizer."""
          try:
@@ -146,222 +146,226 @@ class TextDataLoader:
              return tokenizer
          except Exception as e:
              raise RuntimeError(f"Failed to load tokenizer: {e}")
-
      def _count_lines(self) -> int:
          """Count total number of lines in the data file."""
          print("📝 Counting training passages...")
          start_time = time.time()
-
          line_count = 0
-         with open(self.data_file, 'r', encoding='utf-8') as f:
              for line in f:
                  if line.strip():  # Only count non-empty lines
                      line_count += 1
-
          count_time = time.time() - start_time
          print(f"✓ Found {line_count:,} passages in {count_time:.1f}s")
-
          return line_count
-
      def _read_chunk(self, start_line: int = 0) -> List[str]:
          """
          Read a chunk of lines from the data file.
-
          Args:
              start_line: Line number to start reading from
-
          Returns:
              List of text passages
          """
          chunk = []
          current_line = 0
          lines_read = 0
-
-         with open(self.data_file, 'r', encoding='utf-8') as f:
              for line in f:
                  if current_line < start_line:
                      current_line += 1
                      continue
-
                  text = line.strip()
                  if text:  # Only include non-empty lines
                      chunk.append(text)
                      lines_read += 1
-
                      if lines_read >= self.chunk_size:
                          break
-
                  current_line += 1
-
          return chunk
-
      def _tokenize_texts(self, texts: List[str]) -> List[List[int]]:
          """
          Tokenize a list of text passages using SentencePiece tokenizer.
-
          This method converts raw text into token ID sequences suitable for language model training.
          It handles special tokens (BOS/EOS) and length constraints for efficient training.
-
          Text processing pipeline:
          1. Add BOS (Beginning of Sequence) token to mark sequence start
          2. Tokenize text using trained SentencePiece model (subword tokenization)
          3. Truncate sequences that exceed maximum length
          4. Add EOS (End of Sequence) token to mark sequence end
-
          Special token handling:
          - BOS token helps model learn to generate text from scratch
          - EOS token signals natural sequence endings
          - These tokens are crucial for proper autoregressive generation
-
          Args:
              texts: List of text passages (typically Wikipedia passages from SQUAD)
                     Each passage should be a complete, coherent text segment
-
          Returns:
              List of token ID sequences, where each sequence is a list of integers
              representing subword tokens from the SentencePiece vocabulary
          """
          tokenized = []
-
          for text in texts:
              try:
                  # Add BOS (Beginning of Sequence) token at the start
                  # BOS token ID=2 by default in SentencePiece, signals sequence start
                  # This helps the model learn proper sequence initialization during generation
                  tokens = [self.tokenizer.bos_id()] + self.tokenizer.encode(text)
-
                  # Truncate sequences that exceed maximum context length
                  # Reserve one position for EOS token by using (seq_len - 1)
                  # This ensures we never exceed the model's context window during training
                  if len(tokens) > self.seq_len - 1:
-                     tokens = tokens[:self.seq_len - 1]
                  # NOTE: Truncation may cut off text mid-sentence, but this is acceptable
239
  # for language modeling where the model learns from partial contexts
240
-
241
  # Add EOS (End of Sequence) token at the end
242
  # EOS token ID=1 by default in SentencePiece, signals sequence completion
243
  # This teaches the model when to stop generating text naturally
244
  tokens.append(self.tokenizer.eos_id())
245
-
246
  # Validate tokenization result
247
  if len(tokens) <= 2: # Only BOS + EOS tokens, no actual content
248
  print(f"⚠️ Skipping very short text: {text[:50]}...")
249
  continue
250
-
251
  tokenized.append(tokens)
252
-
253
  except Exception as e:
254
  # Handle tokenization errors gracefully to avoid stopping training
255
  # Common causes: encoding issues, very long texts, special characters
256
  print(f"⚠️ Failed to tokenize passage: {text[:50]}... Error: {e}")
257
  continue
258
-
259
  # Log tokenization statistics for monitoring
260
  if tokenized:
261
  avg_length = sum(len(tokens) for tokens in tokenized) / len(tokenized)
262
  print(f"πŸ“Š Tokenized {len(tokenized)} passages, avg length: {avg_length:.1f} tokens")
263
-
264
  return tokenized
265
-
266
- def _create_training_examples(self, token_sequences: List[List[int]]) -> List[Tuple[List[int], List[int]]]:
267
  """
268
  Create training examples with input and target sequences.
269
-
270
  For autoregressive training, targets are inputs shifted by one position.
271
-
272
  Args:
273
  token_sequences: List of tokenized sequences
274
-
275
  Returns:
276
  List of (input_ids, target_ids) tuples
277
  """
278
  examples = []
279
-
280
  for tokens in token_sequences:
281
  if len(tokens) < 2: # Need at least 2 tokens for input/target pair
282
  continue
283
-
284
  # For sequences longer than seq_len, create multiple examples with sliding window
285
  if len(tokens) > self.seq_len:
286
  # Create overlapping windows (50% overlap for better learning)
287
  stride = self.seq_len // 2
288
  for i in range(0, len(tokens) - self.seq_len, stride):
289
- input_ids = tokens[i:i + self.seq_len]
290
- target_ids = tokens[i + 1:i + self.seq_len + 1]
291
  examples.append((input_ids, target_ids))
292
  else:
293
  # Pad shorter sequences
294
  input_ids = tokens[:-1] # All but last token
295
  target_ids = tokens[1:] # All but first token
296
-
297
  # Pad to seq_len if necessary
298
  while len(input_ids) < self.seq_len:
299
  input_ids.append(self.tokenizer.pad_id())
300
  target_ids.append(-1) # Use -1 for padding in targets (ignored in loss)
301
-
302
  # Truncate if still too long
303
- input_ids = input_ids[:self.seq_len]
304
- target_ids = target_ids[:self.seq_len]
305
-
306
  examples.append((input_ids, target_ids))
307
-
308
  return examples
309
-
310
- def _create_batch(self, examples: List[Tuple[List[int], List[int]]]) -> Tuple[torch.Tensor, torch.Tensor]:
311
  """
312
  Create a batch tensor from training examples.
313
-
314
  Args:
315
  examples: List of (input_ids, target_ids) tuples
316
-
317
  Returns:
318
  Tuple of (input_tensor, target_tensor)
319
  """
320
  if not examples:
321
  raise ValueError("Cannot create batch from empty examples")
322
-
323
  batch_size = len(examples)
324
-
325
  # Initialize tensors
326
  input_ids = torch.zeros((batch_size, self.seq_len), dtype=torch.long)
327
  target_ids = torch.full((batch_size, self.seq_len), -1, dtype=torch.long)
328
-
329
  # Fill tensors
330
  for i, (inp, tgt) in enumerate(examples):
331
- input_ids[i, :len(inp)] = torch.tensor(inp, dtype=torch.long)
332
- target_ids[i, :len(tgt)] = torch.tensor(tgt, dtype=torch.long)
333
-
334
  return input_ids, target_ids
335
-
336
  def __iter__(self) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]:
337
  """
338
  Iterate over training batches.
339
-
340
  Yields:
341
  Tuple of (input_ids, target_ids) tensors
342
  """
343
  self.current_line = 0
344
-
345
  while self.current_line < self.total_lines:
346
  # Read chunk of text
347
  texts = self._read_chunk(self.current_line)
348
  if not texts:
349
  break
350
-
351
  # Tokenize texts
352
  token_sequences = self._tokenize_texts(texts)
353
-
354
  # Create training examples
355
  examples = self._create_training_examples(token_sequences)
356
-
357
  # Shuffle examples if requested
358
  if self.shuffle:
359
  random.shuffle(examples)
360
-
361
  # Create batches
362
  for i in range(0, len(examples), self.batch_size):
363
- batch_examples = examples[i:i + self.batch_size]
364
-
365
  if len(batch_examples) == self.batch_size: # Only yield full batches
366
  try:
367
  input_ids, target_ids = self._create_batch(batch_examples)
@@ -369,27 +373,27 @@ class TextDataLoader:
369
  except Exception as e:
370
  print(f"⚠️ Failed to create batch: {e}")
371
  continue
372
-
373
  # Update progress
374
  self.current_line += len(texts)
375
-
376
  # Clean up memory
377
  del texts, token_sequences, examples
378
  gc.collect()
379
-
380
  def get_data_stats(self) -> dict:
381
  """
382
  Get statistics about the training data.
383
-
384
  Returns:
385
  Dictionary with data statistics
386
  """
387
  print("πŸ“Š Analyzing training data...")
388
-
389
  # Sample some data to get statistics
390
  sample_texts = self._read_chunk(0)[:100] # Sample first 100 passages
391
  token_sequences = self._tokenize_texts(sample_texts)
392
-
393
  if token_sequences:
394
  sequence_lengths = [len(seq) for seq in token_sequences]
395
  avg_length = sum(sequence_lengths) / len(sequence_lengths)
@@ -397,15 +401,15 @@ class TextDataLoader:
397
  min_length = min(sequence_lengths)
398
  else:
399
  avg_length = max_length = min_length = 0
400
-
401
  # Estimate total tokens
402
  estimated_total_tokens = int(avg_length * self.total_lines)
403
-
404
  # Estimate number of batches per epoch
405
  examples_per_passage = max(1, avg_length // self.seq_len)
406
  total_examples = int(self.total_lines * examples_per_passage)
407
  batches_per_epoch = total_examples // self.batch_size
408
-
409
  stats = {
410
  "total_passages": self.total_lines,
411
  "avg_tokens_per_passage": avg_length,
@@ -416,22 +420,22 @@ class TextDataLoader:
416
  "estimated_batches_per_epoch": batches_per_epoch,
417
  "sequence_length": self.seq_len,
418
  "batch_size": self.batch_size,
419
- "vocabulary_size": self.tokenizer.vocab_size()
420
  }
421
-
422
  print(f"βœ“ Data analysis complete:")
423
  print(f" Total passages: {stats['total_passages']:,}")
424
  print(f" Avg tokens per passage: {stats['avg_tokens_per_passage']:.1f}")
425
  print(f" Estimated total tokens: {stats['estimated_total_tokens']:,}")
426
  print(f" Estimated batches per epoch: {stats['estimated_batches_per_epoch']:,}")
427
-
428
  return stats
429
 
430
 
431
  def test_data_loader():
432
  """Test function for the data loader."""
433
  print("πŸ§ͺ Testing TextDataLoader...")
434
-
435
  # Test with small parameters
436
  try:
437
  loader = TextDataLoader(
@@ -439,42 +443,43 @@ def test_data_loader():
439
  tokenizer_path="data/tokenizer/tokenizer.model",
440
  seq_len=128,
441
  batch_size=2,
442
- chunk_size=10 # Small for testing
443
  )
444
-
445
  # Get data statistics
446
  stats = loader.get_data_stats()
447
-
448
  # Test iteration
449
  print("\nπŸ”„ Testing batch iteration...")
450
  start_time = time.time()
451
  batch_count = 0
452
-
453
  for batch_idx, (input_ids, target_ids) in enumerate(loader):
454
  batch_count += 1
455
-
456
  print(f"Batch {batch_idx + 1}:")
457
  print(f" Input shape: {input_ids.shape}")
458
  print(f" Target shape: {target_ids.shape}")
459
  print(f" Sample input tokens: {input_ids[0][:10].tolist()}")
460
  print(f" Sample target tokens: {target_ids[0][:10].tolist()}")
461
-
462
  if batch_idx >= 2: # Only test first few batches
463
  break
464
-
465
  test_time = time.time() - start_time
466
  print(f"\nβœ“ Data loader test completed successfully!")
467
  print(f" Processed {batch_count} batches in {test_time:.2f}s")
468
  print(f" Average time per batch: {test_time/max(1, batch_count):.2f}s")
469
-
470
  return True
471
-
472
  except Exception as e:
473
  print(f"❌ Data loader test failed: {e}")
474
  import traceback
475
  traceback.print_exc()
476
  return False
477
 
478
 
479
  if __name__ == "__main__":
480
- test_data_loader()
 
13
  Training Data Loader for Language Model Training
14
 
15
  This module provides efficient data loading and batching for training GPT-style
16
+ language models. It handles text preprocessing, tokenization, and creates
17
  batches suitable for autoregressive language modeling.
18
 
19
  FEATURES:
20
  - Memory-efficient text loading with sliding window
21
+ - Automatic tokenization using trained SentencePiece model
22
  - Configurable sequence length and batch size
23
  - CPU-optimized data loading for limited hardware
24
  - Support for training data validation and statistics
 
31
 
32
  Usage:
33
  from data_loader import TextDataLoader
34
+
35
  loader = TextDataLoader(
36
  data_file="data/clean/training_data.txt",
37
+ tokenizer_path="data/tokenizer/tokenizer.model",
38
  seq_len=512,
39
  batch_size=4
40
  )
41
+
42
  for batch in loader:
43
  input_ids, targets = batch
44
  # input_ids: (batch_size, seq_len)
45
  # targets: (batch_size, seq_len) - shifted by 1 for next token prediction
46
 
47
+ Author: Louis Chua Bean Chong
48
  License: GPLv3
49
  """
50
 
 
66
  class TextDataLoader:
67
  """
68
  Efficient data loader for autoregressive language model training.
69
+
70
  This class handles loading text data, tokenizing it using SentencePiece,
71
  and creating batches suitable for next-token prediction training.
72
  """
73
+
74
  def __init__(
75
  self,
76
  data_file: str,
 
79
  batch_size: int = 4,
80
  chunk_size: int = 1000000, # Lines to read at once
81
  shuffle: bool = True,
82
+ seed: int = 42,
83
  ):
84
  """
85
  Initialize the data loader.
86
+
87
  Args:
88
  data_file: Path to training text file (one passage per line)
89
  tokenizer_path: Path to trained SentencePiece model
 
100
  self.chunk_size = chunk_size
101
  self.shuffle = shuffle
102
  self.seed = seed
103
+
104
  # Validate inputs
105
  self._validate_inputs()
106
+
107
  # Load tokenizer
108
  self.tokenizer = self._load_tokenizer()
109
+
110
  # Get data statistics
111
  self.total_lines = self._count_lines()
112
  self.current_line = 0
113
+
114
  # Set random seed for reproducibility
115
  random.seed(seed)
116
+
117
  print(f"πŸ“Š TextDataLoader initialized")
118
  print(f" Data file: {data_file}")
119
  print(f" Total passages: {self.total_lines:,}")
120
  print(f" Sequence length: {seq_len}")
121
  print(f" Batch size: {batch_size}")
122
  print(f" Vocabulary size: {self.tokenizer.vocab_size():,}")
123
+
124
  def _validate_inputs(self) -> None:
125
  """Validate input parameters and file paths."""
126
  if not os.path.exists(self.data_file):
127
  raise FileNotFoundError(f"Training data file not found: {self.data_file}")
128
+
129
  if not os.path.exists(self.tokenizer_path):
130
  raise FileNotFoundError(f"Tokenizer model not found: {self.tokenizer_path}")
131
+
132
  if self.seq_len <= 0:
133
  raise ValueError(f"Sequence length must be positive, got {self.seq_len}")
134
+
135
  if self.batch_size <= 0:
136
  raise ValueError(f"Batch size must be positive, got {self.batch_size}")
137
+
138
  if self.chunk_size <= 0:
139
  raise ValueError(f"Chunk size must be positive, got {self.chunk_size}")
140
+
141
  def _load_tokenizer(self) -> spm.SentencePieceProcessor:
142
  """Load the trained SentencePiece tokenizer."""
143
  try:
 
146
  return tokenizer
147
  except Exception as e:
148
  raise RuntimeError(f"Failed to load tokenizer: {e}")
149
+
150
  def _count_lines(self) -> int:
151
  """Count total number of lines in the data file."""
152
  print("πŸ“ Counting training passages...")
153
  start_time = time.time()
154
+
155
  line_count = 0
156
+ with open(self.data_file, "r", encoding="utf-8") as f:
157
  for line in f:
158
  if line.strip(): # Only count non-empty lines
159
  line_count += 1
160
+
161
  count_time = time.time() - start_time
162
  print(f"βœ“ Found {line_count:,} passages in {count_time:.1f}s")
163
+
164
  return line_count
165
+
166
  def _read_chunk(self, start_line: int = 0) -> List[str]:
167
  """
168
  Read a chunk of lines from the data file.
169
+
170
  Args:
171
  start_line: Line number to start reading from
172
+
173
  Returns:
174
  List of text passages
175
  """
176
  chunk = []
177
  current_line = 0
178
  lines_read = 0
179
+
180
+ with open(self.data_file, "r", encoding="utf-8") as f:
181
  for line in f:
182
  if current_line < start_line:
183
  current_line += 1
184
  continue
185
+
186
  text = line.strip()
187
  if text: # Only include non-empty lines
188
  chunk.append(text)
189
  lines_read += 1
190
+
191
  if lines_read >= self.chunk_size:
192
  break
193
+
194
  current_line += 1
195
+
196
  return chunk
197
+
198
  def _tokenize_texts(self, texts: List[str]) -> List[List[int]]:
199
  """
200
  Tokenize a list of text passages using SentencePiece tokenizer.
201
+
202
  This method converts raw text into token ID sequences suitable for language model training.
203
  It handles special tokens (BOS/EOS) and length constraints for efficient training.
204
+
205
  Text processing pipeline:
206
  1. Add BOS (Beginning of Sequence) token to mark sequence start
207
  2. Tokenize text using trained SentencePiece model (subword tokenization)
208
  3. Truncate sequences that exceed maximum length
209
  4. Add EOS (End of Sequence) token to mark sequence end
210
+
211
  Special token handling:
212
  - BOS token helps model learn to generate text from scratch
213
  - EOS token signals natural sequence endings
214
  - These tokens are crucial for proper autoregressive generation
215
+
216
  Args:
217
  texts: List of text passages (typically Wikipedia passages from SQUAD)
218
  Each passage should be a complete, coherent text segment
219
+
220
  Returns:
221
  List of token ID sequences, where each sequence is a list of integers
222
  representing subword tokens from the SentencePiece vocabulary
223
  """
224
  tokenized = []
225
+
226
  for text in texts:
227
  try:
228
  # Add BOS (Beginning of Sequence) token at the start
229
  # BOS token ID=2 by default in SentencePiece, signals sequence start
230
  # This helps the model learn proper sequence initialization during generation
231
  tokens = [self.tokenizer.bos_id()] + self.tokenizer.encode(text)
232
+
233
  # Truncate sequences that exceed maximum context length
234
  # Reserve one position for EOS token by using (seq_len - 1)
235
  # This ensures we never exceed the model's context window during training
236
  if len(tokens) > self.seq_len - 1:
237
+ tokens = tokens[: self.seq_len - 1]
238
  # NOTE: Truncation may cut off text mid-sentence, but this is acceptable
239
  # for language modeling where the model learns from partial contexts
240
+
241
  # Add EOS (End of Sequence) token at the end
242
  # EOS token ID=1 by default in SentencePiece, signals sequence completion
243
  # This teaches the model when to stop generating text naturally
244
  tokens.append(self.tokenizer.eos_id())
245
+
246
  # Validate tokenization result
247
  if len(tokens) <= 2: # Only BOS + EOS tokens, no actual content
248
  print(f"⚠️ Skipping very short text: {text[:50]}...")
249
  continue
250
+
251
  tokenized.append(tokens)
252
+
253
  except Exception as e:
254
  # Handle tokenization errors gracefully to avoid stopping training
255
  # Common causes: encoding issues, very long texts, special characters
256
  print(f"⚠️ Failed to tokenize passage: {text[:50]}... Error: {e}")
257
  continue
258
+
259
  # Log tokenization statistics for monitoring
260
  if tokenized:
261
  avg_length = sum(len(tokens) for tokens in tokenized) / len(tokenized)
262
  print(f"πŸ“Š Tokenized {len(tokenized)} passages, avg length: {avg_length:.1f} tokens")
263
+
264
  return tokenized
265
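The BOS/truncate/EOS pipeline above can be exercised without a trained model; a minimal sketch with a stub encoder and hypothetical token IDs (the real IDs come from the trained SentencePiece model):

# Sketch of the tokenization pipeline: BOS + encode + truncate + EOS.
BOS, EOS, SEQ_LEN = 2, 1, 8

def fake_encode(text):
    # Stub standing in for SentencePieceProcessor.encode(): one ID per word.
    return [100 + i for i in range(len(text.split()))]

tokens = [BOS] + fake_encode("a fairly long toy passage that will be cut short")
if len(tokens) > SEQ_LEN - 1:        # reserve one slot for EOS
    tokens = tokens[: SEQ_LEN - 1]
tokens.append(EOS)
print(tokens)  # fits in SEQ_LEN, starts with BOS, ends with EOS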
+
266
+ def _create_training_examples(
267
+ self, token_sequences: List[List[int]]
268
+ ) -> List[Tuple[List[int], List[int]]]:
269
  """
270
  Create training examples with input and target sequences.
271
+
272
  For autoregressive training, targets are inputs shifted by one position.
273
+
274
  Args:
275
  token_sequences: List of tokenized sequences
276
+
277
  Returns:
278
  List of (input_ids, target_ids) tuples
279
  """
280
  examples = []
281
+
282
  for tokens in token_sequences:
283
  if len(tokens) < 2: # Need at least 2 tokens for input/target pair
284
  continue
285
+
286
  # For sequences longer than seq_len, create multiple examples with sliding window
287
  if len(tokens) > self.seq_len:
288
  # Create overlapping windows (50% overlap for better learning)
289
  stride = self.seq_len // 2
290
  for i in range(0, len(tokens) - self.seq_len, stride):
291
+ input_ids = tokens[i : i + self.seq_len]
292
+ target_ids = tokens[i + 1 : i + self.seq_len + 1]
293
  examples.append((input_ids, target_ids))
294
  else:
295
  # Pad shorter sequences
296
  input_ids = tokens[:-1] # All but last token
297
  target_ids = tokens[1:] # All but first token
298
+
299
  # Pad to seq_len if necessary
300
  while len(input_ids) < self.seq_len:
301
  input_ids.append(self.tokenizer.pad_id())
302
  target_ids.append(-1) # Use -1 for padding in targets (ignored in loss)
303
+
304
  # Truncate if still too long
305
+ input_ids = input_ids[: self.seq_len]
306
+ target_ids = target_ids[: self.seq_len]
307
+
308
  examples.append((input_ids, target_ids))
309
+
310
  return examples
311
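The 50% overlap is easiest to see on plain integers; a quick sketch of the windowing loop above:

# Sliding windows with stride = seq_len // 2 over a stand-in token list.
tokens = list(range(12))
seq_len = 4
stride = seq_len // 2
for i in range(0, len(tokens) - seq_len, stride):
    print(tokens[i : i + seq_len], "->", tokens[i + 1 : i + seq_len + 1])
# Consecutive windows share seq_len // 2 tokens, so most tokens appear
# in two training examples.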
+
312
+ def _create_batch(
313
+ self, examples: List[Tuple[List[int], List[int]]]
314
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
315
  """
316
  Create a batch tensor from training examples.
317
+
318
  Args:
319
  examples: List of (input_ids, target_ids) tuples
320
+
321
  Returns:
322
  Tuple of (input_tensor, target_tensor)
323
  """
324
  if not examples:
325
  raise ValueError("Cannot create batch from empty examples")
326
+
327
  batch_size = len(examples)
328
+
329
  # Initialize tensors
330
  input_ids = torch.zeros((batch_size, self.seq_len), dtype=torch.long)
331
  target_ids = torch.full((batch_size, self.seq_len), -1, dtype=torch.long)
332
+
333
  # Fill tensors
334
  for i, (inp, tgt) in enumerate(examples):
335
+ input_ids[i, : len(inp)] = torch.tensor(inp, dtype=torch.long)
336
+ target_ids[i, : len(tgt)] = torch.tensor(tgt, dtype=torch.long)
337
+
338
  return input_ids, target_ids
339
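The -1 target padding only pays off if the loss skips it; a minimal sketch, assuming the training loop uses cross-entropy with ignore_index=-1 (the loader itself never computes the loss):

import torch
import torch.nn.functional as F

logits = torch.randn(1, 4, 10)            # (batch, seq_len, vocab)
targets = torch.tensor([[3, 7, -1, -1]])  # last two positions are padding
loss = F.cross_entropy(logits.view(-1, 10), targets.view(-1), ignore_index=-1)
print(loss)  # averaged over the two real positions only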
+
340
  def __iter__(self) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]:
341
  """
342
  Iterate over training batches.
343
+
344
  Yields:
345
  Tuple of (input_ids, target_ids) tensors
346
  """
347
  self.current_line = 0
348
+
349
  while self.current_line < self.total_lines:
350
  # Read chunk of text
351
  texts = self._read_chunk(self.current_line)
352
  if not texts:
353
  break
354
+
355
  # Tokenize texts
356
  token_sequences = self._tokenize_texts(texts)
357
+
358
  # Create training examples
359
  examples = self._create_training_examples(token_sequences)
360
+
361
  # Shuffle examples if requested
362
  if self.shuffle:
363
  random.shuffle(examples)
364
+
365
  # Create batches
366
  for i in range(0, len(examples), self.batch_size):
367
+ batch_examples = examples[i : i + self.batch_size]
368
+
369
  if len(batch_examples) == self.batch_size: # Only yield full batches
370
  try:
371
  input_ids, target_ids = self._create_batch(batch_examples)
 
373
  except Exception as e:
374
  print(f"⚠️ Failed to create batch: {e}")
375
  continue
376
+
377
  # Update progress
378
  self.current_line += len(texts)
379
+
380
  # Clean up memory
381
  del texts, token_sequences, examples
382
  gc.collect()
383
+
384
  def get_data_stats(self) -> dict:
385
  """
386
  Get statistics about the training data.
387
+
388
  Returns:
389
  Dictionary with data statistics
390
  """
391
  print("πŸ“Š Analyzing training data...")
392
+
393
  # Sample some data to get statistics
394
  sample_texts = self._read_chunk(0)[:100] # Sample first 100 passages
395
  token_sequences = self._tokenize_texts(sample_texts)
396
+
397
  if token_sequences:
398
  sequence_lengths = [len(seq) for seq in token_sequences]
399
  avg_length = sum(sequence_lengths) / len(sequence_lengths)
 
401
  min_length = min(sequence_lengths)
402
  else:
403
  avg_length = max_length = min_length = 0
404
+
405
  # Estimate total tokens
406
  estimated_total_tokens = int(avg_length * self.total_lines)
407
+
408
  # Estimate number of batches per epoch
409
  examples_per_passage = max(1, avg_length // self.seq_len)
410
  total_examples = int(self.total_lines * examples_per_passage)
411
  batches_per_epoch = total_examples // self.batch_size
412
+
413
  stats = {
414
  "total_passages": self.total_lines,
415
  "avg_tokens_per_passage": avg_length,
 
420
  "estimated_batches_per_epoch": batches_per_epoch,
421
  "sequence_length": self.seq_len,
422
  "batch_size": self.batch_size,
423
+ "vocabulary_size": self.tokenizer.vocab_size(),
424
  }
425
+
426
  print(f"βœ“ Data analysis complete:")
427
  print(f" Total passages: {stats['total_passages']:,}")
428
  print(f" Avg tokens per passage: {stats['avg_tokens_per_passage']:.1f}")
429
  print(f" Estimated total tokens: {stats['estimated_total_tokens']:,}")
430
  print(f" Estimated batches per epoch: {stats['estimated_batches_per_epoch']:,}")
431
+
432
  return stats
433
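The epoch-size estimate above is deliberately rough; a worked example with made-up numbers:

# Back-of-envelope version of the estimate in get_data_stats().
total_passages = 50_000
avg_tokens_per_passage = 180.0
seq_len, batch_size = 512, 4

examples_per_passage = max(1, avg_tokens_per_passage // seq_len)  # 1
total_examples = int(total_passages * examples_per_passage)       # 50,000
batches_per_epoch = total_examples // batch_size                  # 12,500
print(batches_per_epoch)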
 
434
 
435
  def test_data_loader():
436
  """Test function for the data loader."""
437
  print("πŸ§ͺ Testing TextDataLoader...")
438
+
439
  # Test with small parameters
440
  try:
441
  loader = TextDataLoader(
 
443
  tokenizer_path="data/tokenizer/tokenizer.model",
444
  seq_len=128,
445
  batch_size=2,
446
+ chunk_size=10, # Small for testing
447
  )
448
+
449
  # Get data statistics
450
  stats = loader.get_data_stats()
451
+
452
  # Test iteration
453
  print("\nπŸ”„ Testing batch iteration...")
454
  start_time = time.time()
455
  batch_count = 0
456
+
457
  for batch_idx, (input_ids, target_ids) in enumerate(loader):
458
  batch_count += 1
459
+
460
  print(f"Batch {batch_idx + 1}:")
461
  print(f" Input shape: {input_ids.shape}")
462
  print(f" Target shape: {target_ids.shape}")
463
  print(f" Sample input tokens: {input_ids[0][:10].tolist()}")
464
  print(f" Sample target tokens: {target_ids[0][:10].tolist()}")
465
+
466
  if batch_idx >= 2: # Only test first few batches
467
  break
468
+
469
  test_time = time.time() - start_time
470
  print(f"\nβœ“ Data loader test completed successfully!")
471
  print(f" Processed {batch_count} batches in {test_time:.2f}s")
472
  print(f" Average time per batch: {test_time/max(1, batch_count):.2f}s")
473
+
474
  return True
475
+
476
  except Exception as e:
477
  print(f"❌ Data loader test failed: {e}")
478
  import traceback
479
+
480
  traceback.print_exc()
481
  return False
482
 
483
 
484
  if __name__ == "__main__":
485
+ test_data_loader()
training/evaluate_model.py CHANGED
@@ -56,20 +56,15 @@ from data_loader import TextDataLoader
56
  class ModelEvaluator:
57
  """
58
  Comprehensive evaluator for OpenLLM models.
59
-
60
  Implements intrinsic evaluation metrics and text generation quality
61
  assessment following the training pipeline specifications.
62
  """
63
-
64
- def __init__(
65
- self,
66
- model: GPTModel,
67
- tokenizer_path: str,
68
- device: str = "cpu"
69
- ):
70
  """
71
  Initialize the model evaluator.
72
-
73
  Args:
74
  model: Trained GPT model
75
  tokenizer_path: Path to tokenizer model file
@@ -77,30 +72,27 @@ class ModelEvaluator:
77
  """
78
  self.model = model.to(device)
79
  self.device = device
80
-
81
  # Load tokenizer
82
  self.tokenizer = smp.SentencePieceProcessor()
83
  self.tokenizer.load(tokenizer_path)
84
-
85
  print(f"πŸ”§ ModelEvaluator initialized")
86
  print(f" Device: {device}")
87
  print(f" Model parameters: {model.get_num_params():,}")
88
  print(f" Vocabulary size: {self.tokenizer.vocab_size():,}")
89
-
90
  def evaluate_perplexity(
91
- self,
92
- eval_data: List[str],
93
- max_seq_len: int = 512,
94
- batch_size: int = 1
95
  ) -> Dict[str, float]:
96
  """
97
  Calculate perplexity on evaluation data.
98
-
99
  Args:
100
  eval_data: List of text passages for evaluation
101
  max_seq_len: Maximum sequence length for evaluation
102
  batch_size: Batch size for evaluation
103
-
104
  Returns:
105
  Dictionary with loss and perplexity metrics
106
  """
@@ -108,400 +100,394 @@ class ModelEvaluator:
108
  total_loss = 0.0
109
  total_tokens = 0
110
  num_sequences = 0
111
-
112
  print(f"πŸ“Š Calculating perplexity on {len(eval_data)} passages...")
113
-
114
  with torch.no_grad():
115
  for i, text in enumerate(eval_data):
116
  if i % 100 == 0:
117
  print(f" Progress: {i}/{len(eval_data)} passages")
118
-
119
  # Tokenize text
120
  tokens = self.tokenizer.encode(text)
121
  if len(tokens) < 2:
122
  continue
123
-
124
  # Truncate if too long
125
  if len(tokens) > max_seq_len:
126
  tokens = tokens[:max_seq_len]
127
-
128
  # Create input and target tensors
129
  input_ids = torch.tensor([tokens[:-1]], dtype=torch.long, device=self.device)
130
  target_ids = torch.tensor([tokens[1:]], dtype=torch.long, device=self.device)
131
-
132
  # Forward pass
133
  logits, loss = self.model(input_ids, target_ids)
134
-
135
  # Accumulate loss
136
  seq_length = len(tokens) - 1
137
  total_loss += loss.item() * seq_length
138
  total_tokens += seq_length
139
  num_sequences += 1
140
-
141
  # Calculate metrics
142
- avg_loss = total_loss / total_tokens if total_tokens > 0 else float('inf')
143
  perplexity = math.exp(min(avg_loss, 10)) # Cap to prevent overflow
144
-
145
  return {
146
- 'loss': avg_loss,
147
- 'perplexity': perplexity,
148
- 'total_tokens': total_tokens,
149
- 'num_sequences': num_sequences
150
  }
151
-
152
  def evaluate_text_generation(
153
  self,
154
  prompts: List[str],
155
  max_length: int = 256,
156
  temperature: float = 0.7,
157
  top_k: Optional[int] = 40,
158
- num_samples: int = 1
159
  ) -> List[Dict[str, Any]]:
160
  """
161
  Evaluate text generation quality.
162
-
163
  Args:
164
  prompts: List of input prompts
165
  max_length: Maximum generation length
166
  temperature: Sampling temperature
167
  top_k: Top-k sampling parameter
168
  num_samples: Number of samples per prompt
169
-
170
  Returns:
171
  List of generation results with quality metrics
172
  """
173
  self.model.eval()
174
  results = []
175
-
176
  print(f"✍️ Evaluating text generation on {len(prompts)} prompts...")
177
-
178
  with torch.no_grad():
179
  for prompt in prompts:
180
  prompt_results = []
181
-
182
  for sample_idx in range(num_samples):
183
  # Tokenize prompt
184
  input_ids = self.tokenizer.encode(prompt)
185
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
186
-
187
  start_time = time.time()
188
-
189
  # Generate text
190
  output = self.model.generate(
191
  input_tensor,
192
  max_new_tokens=max_length,
193
  temperature=temperature,
194
- top_k=top_k
195
  )
196
-
197
  generation_time = time.time() - start_time
198
-
199
  # Decode output
200
  generated_ids = output[0].tolist()
201
  full_text = self.tokenizer.decode(generated_ids)
202
- generated_text = self.tokenizer.decode(generated_ids[len(input_ids):])
203
-
204
  # Calculate quality metrics
205
  quality_metrics = self._assess_generation_quality(generated_text)
206
-
207
- prompt_results.append({
208
- 'prompt': prompt,
209
- 'generated_text': generated_text,
210
- 'full_text': full_text,
211
- 'generation_time': generation_time,
212
- 'tokens_generated': len(generated_ids) - len(input_ids),
213
- 'tokens_per_second': (len(generated_ids) - len(input_ids)) / generation_time,
214
- 'quality_metrics': quality_metrics
215
- })
216
-
217
  results.extend(prompt_results)
218
-
219
  return results
220
-
221
  def _assess_generation_quality(self, text: str) -> Dict[str, float]:
222
  """
223
  Assess basic quality metrics for generated text.
224
-
225
  Args:
226
  text: Generated text to assess
227
-
228
  Returns:
229
  Dictionary of quality metrics
230
  """
231
  if not text.strip():
232
  return {
233
- 'length': 0,
234
- 'avg_word_length': 0,
235
- 'repetition_rate': 1.0,
236
- 'coherence_score': 0.0
237
  }
238
-
239
  words = text.split()
240
-
241
  # Basic metrics
242
  length = len(words)
243
  avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
244
-
245
  # Repetition rate (simple n-gram repetition)
246
- bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words)-1)]
247
  unique_bigrams = len(set(bigrams))
248
  repetition_rate = 1 - (unique_bigrams / len(bigrams) if bigrams else 0)
249
-
250
  # Simple coherence score (based on sentence structure)
251
- sentences = text.split('.')
252
  valid_sentences = [s for s in sentences if len(s.strip().split()) > 3]
253
  coherence_score = len(valid_sentences) / len(sentences) if sentences else 0
254
-
255
  return {
256
- 'length': length,
257
- 'avg_word_length': avg_word_length,
258
- 'repetition_rate': repetition_rate,
259
- 'coherence_score': coherence_score
260
  }
261
-
262
  def evaluate_downstream_tasks(self) -> Dict[str, Any]:
263
  """
264
  Evaluate model performance on downstream tasks.
265
-
266
  This function implements basic downstream task evaluation including:
267
  - Reading comprehension (simplified SQUAD-style)
268
  - Sentiment analysis (few-shot)
269
  - Common sense reasoning
270
-
271
  Returns:
272
  Dictionary of downstream task results
273
  """
274
  results = {}
275
-
276
  # 1. Reading Comprehension (Simplified SQUAD-style)
277
- results['reading_comprehension'] = self._evaluate_reading_comprehension()
278
-
279
  # 2. Sentiment Analysis (Few-shot learning)
280
- results['sentiment_analysis'] = self._evaluate_sentiment_analysis()
281
-
282
  # 3. Common Sense Reasoning
283
- results['reasoning'] = self._evaluate_reasoning()
284
-
285
  # 4. Text Completion Quality
286
- results['text_completion'] = self._evaluate_text_completion()
287
-
288
  return results
289
-
290
  def _evaluate_reading_comprehension(self) -> Dict[str, Any]:
291
  """Simplified reading comprehension evaluation."""
292
  # Sample reading comprehension tasks
293
  tasks = [
294
  {
295
- 'context': 'The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower.',
296
- 'question': 'Who is the Eiffel Tower named after?',
297
- 'expected': 'Gustave Eiffel'
298
  },
299
  {
300
- 'context': 'Python is a high-level programming language. It was created by Guido van Rossum and first released in 1991.',
301
- 'question': 'When was Python first released?',
302
- 'expected': '1991'
303
  },
304
  {
305
- 'context': 'Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.',
306
- 'question': 'What is machine learning a subset of?',
307
- 'expected': 'artificial intelligence'
308
- }
309
  ]
310
-
311
  correct = 0
312
  total = len(tasks)
313
-
314
  for task in tasks:
315
  prompt = f"Context: {task['context']}\nQuestion: {task['question']}\nAnswer:"
316
-
317
  # Generate answer
318
  input_ids = self.tokenizer.encode(prompt)
319
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
320
-
321
  with torch.no_grad():
322
  output = self.model.generate(input_tensor, max_new_tokens=20, temperature=0.1)
323
-
324
  generated_ids = output[0].tolist()
325
- answer = self.tokenizer.decode(generated_ids[len(input_ids):]).strip().lower()
326
-
327
  # Simple substring matching
328
- if task['expected'].lower() in answer:
329
  correct += 1
330
-
331
  return {
332
- 'accuracy': correct / total,
333
- 'correct': correct,
334
- 'total': total,
335
- 'score': correct / total
336
  }
337
-
338
  def _evaluate_sentiment_analysis(self) -> Dict[str, Any]:
339
  """Few-shot sentiment analysis evaluation."""
340
  # Few-shot examples
341
  examples = "Examples:\nText: 'I love this movie!' Sentiment: Positive\nText: 'This is terrible.' Sentiment: Negative\nText: 'It was okay.' Sentiment: Neutral\n\n"
342
-
343
  # Test cases
344
  test_cases = [
345
- {'text': 'This is amazing!', 'expected': 'positive'},
346
- {'text': 'I hate this.', 'expected': 'negative'},
347
- {'text': 'This is wonderful.', 'expected': 'positive'},
348
- {'text': 'This is awful.', 'expected': 'negative'},
349
- {'text': 'It was fine.', 'expected': 'neutral'}
350
  ]
351
-
352
  correct = 0
353
  total = len(test_cases)
354
-
355
  for case in test_cases:
356
  prompt = f"{examples}Text: '{case['text']}' Sentiment:"
357
-
358
  # Generate sentiment
359
  input_ids = self.tokenizer.encode(prompt)
360
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
361
-
362
  with torch.no_grad():
363
  output = self.model.generate(input_tensor, max_new_tokens=5, temperature=0.1)
364
-
365
  generated_ids = output[0].tolist()
366
- sentiment = self.tokenizer.decode(generated_ids[len(input_ids):]).strip().lower()
367
-
368
  # Check if expected sentiment is in the generated response
369
- if case['expected'] in sentiment:
370
  correct += 1
371
-
372
  return {
373
- 'accuracy': correct / total,
374
- 'correct': correct,
375
- 'total': total,
376
- 'score': correct / total
377
  }
378
-
379
  def _evaluate_reasoning(self) -> Dict[str, Any]:
380
  """Simple reasoning evaluation."""
381
  # Basic reasoning tasks
382
  tasks = [
383
  {
384
- 'question': 'If all birds can fly and a penguin is a bird, can a penguin fly?',
385
- 'expected': 'no' # This tests if model knows real-world facts
386
  },
387
  {
388
- 'question': 'If it is raining outside, should you take an umbrella?',
389
- 'expected': 'yes'
390
  },
391
- {
392
- 'question': 'What comes after Monday?',
393
- 'expected': 'tuesday'
394
- },
395
- {
396
- 'question': 'Is the sun larger than the earth?',
397
- 'expected': 'yes'
398
- }
399
  ]
400
-
401
  correct = 0
402
  total = len(tasks)
403
-
404
  for task in tasks:
405
  prompt = f"Question: {task['question']}\nAnswer:"
406
-
407
  # Generate answer
408
  input_ids = self.tokenizer.encode(prompt)
409
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
410
-
411
  with torch.no_grad():
412
  output = self.model.generate(input_tensor, max_new_tokens=10, temperature=0.1)
413
-
414
  generated_ids = output[0].tolist()
415
- answer = self.tokenizer.decode(generated_ids[len(input_ids):]).strip().lower()
416
-
417
  # Check if expected answer is in the response
418
- if task['expected'] in answer:
419
  correct += 1
420
-
421
  return {
422
- 'accuracy': correct / total,
423
- 'correct': correct,
424
- 'total': total,
425
- 'score': correct / total
426
  }
427
-
428
  def _evaluate_text_completion(self) -> Dict[str, Any]:
429
  """Evaluate text completion quality."""
430
  # Common phrases that should be completed predictably
431
  completions = [
432
- {'prompt': 'The capital of France is', 'expected_word': 'paris'},
433
- {'prompt': 'Two plus two equals', 'expected_word': 'four'},
434
- {'prompt': 'The largest planet in our solar system is', 'expected_word': 'jupiter'},
435
- {'prompt': 'Water boils at', 'expected_word': '100'}
436
  ]
437
-
438
  correct = 0
439
  total = len(completions)
440
-
441
  for completion in completions:
442
  # Generate completion
443
- input_ids = self.tokenizer.encode(completion['prompt'])
444
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
445
-
446
  with torch.no_grad():
447
  output = self.model.generate(input_tensor, max_new_tokens=5, temperature=0.1)
448
-
449
  generated_ids = output[0].tolist()
450
- generated_text = self.tokenizer.decode(generated_ids[len(input_ids):]).strip().lower()
451
-
452
  # Check if expected word appears in completion
453
- if completion['expected_word'] in generated_text:
454
  correct += 1
455
-
456
  return {
457
- 'accuracy': correct / total,
458
- 'correct': correct,
459
- 'total': total,
460
- 'score': correct / total
461
  }
462
-
463
  def run_comprehensive_evaluation(
464
- self,
465
- eval_data_path: str,
466
- metrics: List[str] = None,
467
- generation_prompts: List[str] = None
468
  ) -> Dict[str, Any]:
469
  """
470
  Run comprehensive model evaluation.
471
-
472
  Args:
473
  eval_data_path: Path to evaluation text file
474
  metrics: List of metrics to compute
475
  generation_prompts: Prompts for text generation evaluation
476
-
477
  Returns:
478
  Complete evaluation results
479
  """
480
  if metrics is None:
481
- metrics = ['perplexity', 'loss', 'generation']
482
-
483
  if generation_prompts is None:
484
  generation_prompts = [
485
  "The history of artificial intelligence",
486
  "Machine learning algorithms",
487
  "The future of technology",
488
  "In a world where",
489
- "Scientists have discovered"
490
  ]
491
-
492
  results = {
493
- 'model_info': {
494
- 'parameters': self.model.get_num_params(),
495
- 'device': self.device,
496
- 'vocab_size': self.tokenizer.vocab_size()
497
  },
498
- 'evaluation_timestamp': time.time()
499
  }
500
-
501
  # Load evaluation data
502
  print(f"πŸ“‚ Loading evaluation data from {eval_data_path}")
503
  if os.path.exists(eval_data_path):
504
- with open(eval_data_path, 'r', encoding='utf-8') as f:
505
  eval_texts = [line.strip() for line in f if line.strip()]
506
  else:
507
  print(f"⚠️ Evaluation file not found, using sample texts")
@@ -510,96 +496,103 @@ class ModelEvaluator:
510
  "Machine learning algorithms can learn patterns from data automatically.",
511
  "Natural language processing helps computers understand human language.",
512
  "Deep learning uses neural networks with multiple layers for complex tasks.",
513
- "The development of large language models has transformed AI applications."
514
  ]
515
-
516
  # Intrinsic evaluation
517
- if 'perplexity' in metrics or 'loss' in metrics:
518
  perplexity_results = self.evaluate_perplexity(eval_texts)
519
- results['intrinsic_evaluation'] = perplexity_results
520
-
521
  # Text generation evaluation
522
- if 'generation' in metrics:
523
  generation_results = self.evaluate_text_generation(generation_prompts)
524
- results['generation_evaluation'] = {
525
- 'results': generation_results,
526
- 'summary': self._summarize_generation_results(generation_results)
527
  }
528
-
529
  # Downstream tasks (placeholder)
530
- results['downstream_evaluation'] = self.evaluate_downstream_tasks()
531
-
532
  # Overall quality assessment
533
- results['quality_assessment'] = self._assess_overall_quality(results)
534
-
535
  return results
536
-
537
  def _summarize_generation_results(self, results: List[Dict[str, Any]]) -> Dict[str, float]:
538
  """Summarize text generation results."""
539
  if not results:
540
  return {}
541
-
542
- total_time = sum(r['generation_time'] for r in results)
543
- total_tokens = sum(r['tokens_generated'] for r in results)
544
-
545
- quality_metrics = [r['quality_metrics'] for r in results]
546
-
547
  return {
548
- 'avg_generation_time': total_time / len(results),
549
- 'avg_tokens_per_second': total_tokens / total_time if total_time > 0 else 0,
550
- 'avg_length': sum(q['length'] for q in quality_metrics) / len(quality_metrics),
551
- 'avg_repetition_rate': sum(q['repetition_rate'] for q in quality_metrics) / len(quality_metrics),
552
- 'avg_coherence_score': sum(q['coherence_score'] for q in quality_metrics) / len(quality_metrics)
553
  }
554
-
555
  def _assess_overall_quality(self, results: Dict[str, Any]) -> Dict[str, Any]:
556
  """Assess overall model quality based on evaluation results."""
557
- assessment = {
558
- 'quality_level': 'unknown',
559
- 'recommendations': []
560
- }
561
-
562
  # Check intrinsic metrics
563
- if 'intrinsic_evaluation' in results:
564
- perplexity = results['intrinsic_evaluation'].get('perplexity', float('inf'))
565
-
566
  if perplexity < 12:
567
- assessment['quality_level'] = 'good'
568
- assessment['recommendations'].append('Model shows good perplexity scores')
569
  elif perplexity < 50:
570
- assessment['quality_level'] = 'fair'
571
- assessment['recommendations'].append('Model shows fair performance, could benefit from more training')
572
  else:
573
- assessment['quality_level'] = 'poor'
574
- assessment['recommendations'].append('Model needs significantly more training or data improvements')
575
-
576
  # Check generation quality
577
- if 'generation_evaluation' in results:
578
- summary = results['generation_evaluation'].get('summary', {})
579
- repetition_rate = summary.get('avg_repetition_rate', 1.0)
580
- coherence_score = summary.get('avg_coherence_score', 0.0)
581
-
582
  if repetition_rate > 0.7:
583
- assessment['recommendations'].append('High repetition rate - consider training longer or adjusting data')
 
 
584
  if coherence_score < 0.3:
585
- assessment['recommendations'].append('Low coherence - model may need more training steps')
586
-
587
  return assessment
588
 
589
 
590
  def load_model_from_directory(model_dir: str, device: str = "cpu") -> Tuple[GPTModel, str]:
591
  """
592
  Load model from directory containing checkpoints.
593
-
594
  Args:
595
  model_dir: Directory containing model files
596
  device: Device to load model on
597
-
598
  Returns:
599
  Tuple of (model, tokenizer_path)
600
  """
601
  model_dir = Path(model_dir)
602
-
603
  # Find best model checkpoint
604
  best_model_path = model_dir / "best_model.pt"
605
  if not best_model_path.exists():
@@ -607,41 +600,41 @@ def load_model_from_directory(model_dir: str, device: str = "cpu") -> Tuple[GPTM
607
  checkpoints = list(model_dir.glob("checkpoint_step_*.pt"))
608
  if not checkpoints:
609
  raise FileNotFoundError(f"No model checkpoints found in {model_dir}")
610
-
611
  # Get latest checkpoint
612
- latest_checkpoint = max(checkpoints, key=lambda p: int(p.stem.split('_')[-1]))
613
  best_model_path = latest_checkpoint
614
-
615
  print(f"πŸ“‚ Loading model from {best_model_path}")
616
-
617
  # Load checkpoint
618
  checkpoint = torch.load(best_model_path, map_location=device)
619
-
620
  # Determine model size from config
621
- config = checkpoint.get('config', {})
622
- n_layer = config.get('n_layer', 12)
623
-
624
  if n_layer <= 6:
625
  model_size = "small"
626
  elif n_layer <= 12:
627
  model_size = "medium"
628
  else:
629
  model_size = "large"
630
-
631
  # Create and load model
632
  model = create_model(model_size)
633
- model.load_state_dict(checkpoint['model_state_dict'])
634
-
635
  print(f"βœ… Model loaded successfully ({model_size}, {model.get_num_params():,} parameters)")
636
-
637
  # Find tokenizer
638
  tokenizer_path = model_dir.parent / "tokenizer" / "tokenizer.model"
639
  if not tokenizer_path.exists():
640
  tokenizer_path = Path("data/tokenizer/tokenizer.model")
641
-
642
  if not tokenizer_path.exists():
643
  raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
644
-
645
  return model, str(tokenizer_path)
646
 
647
 
@@ -662,121 +655,115 @@ Examples:
662
  --model_dir models/small-extended-4k \\
663
  --metrics perplexity,generation \\
664
  --output results.json
665
- """
666
  )
667
-
668
- parser.add_argument(
669
- "--model_dir",
670
- required=True,
671
- help="Directory containing trained model"
672
- )
673
-
674
  parser.add_argument(
675
- "--eval_data",
676
- help="Path to evaluation text file (default: use sample texts)"
677
  )
678
-
679
  parser.add_argument(
680
  "--metrics",
681
  default="perplexity,loss,generation",
682
- help="Comma-separated list of metrics to evaluate (default: perplexity,loss,generation)"
683
  )
684
-
685
- parser.add_argument(
686
- "--output",
687
- help="Output JSON file for results (default: print to console)"
688
- )
689
-
690
  parser.add_argument(
691
  "--device",
692
  choices=["cpu", "cuda", "auto"],
693
  default="auto",
694
- help="Device for evaluation (default: auto)"
695
  )
696
-
697
  parser.add_argument(
698
- "--generation_prompts",
699
- help="File containing prompts for text generation evaluation"
700
  )
701
-
702
  args = parser.parse_args()
703
-
704
  print("πŸ“Š OpenLLM Model Evaluation")
705
  print("=" * 50)
706
-
707
  # Determine device
708
  if args.device == "auto":
709
  device = "cuda" if torch.cuda.is_available() else "cpu"
710
  else:
711
  device = args.device
712
-
713
  print(f"Using device: {device}")
714
-
715
  try:
716
  # Load model
717
  model, tokenizer_path = load_model_from_directory(args.model_dir, device)
718
-
719
  # Create evaluator
720
  evaluator = ModelEvaluator(model, tokenizer_path, device)
721
-
722
  # Parse metrics
723
- metrics = [m.strip() for m in args.metrics.split(',')]
724
-
725
  # Load generation prompts if specified
726
  generation_prompts = None
727
  if args.generation_prompts and os.path.exists(args.generation_prompts):
728
- with open(args.generation_prompts, 'r', encoding='utf-8') as f:
729
  generation_prompts = [line.strip() for line in f if line.strip()]
730
-
731
  # Run evaluation
732
  eval_data_path = args.eval_data or "data/clean/training_data.txt"
733
  results = evaluator.run_comprehensive_evaluation(
734
  eval_data_path, metrics, generation_prompts
735
  )
736
-
737
  # Output results
738
  if args.output:
739
- with open(args.output, 'w', encoding='utf-8') as f:
740
  json.dump(results, f, indent=2)
741
  print(f"\nπŸ’Ύ Results saved to {args.output}")
742
  else:
743
  print(f"\nπŸ“Š Evaluation Results:")
744
  print("=" * 50)
745
-
746
  # Print key metrics
747
- if 'intrinsic_evaluation' in results:
748
- intrinsic = results['intrinsic_evaluation']
749
  print(f"πŸ“ˆ Intrinsic Metrics:")
750
  print(f" Loss: {intrinsic['loss']:.4f}")
751
  print(f" Perplexity: {intrinsic['perplexity']:.2f}")
752
  print(f" Sequences evaluated: {intrinsic['num_sequences']:,}")
753
-
754
- if 'generation_evaluation' in results:
755
- gen_summary = results['generation_evaluation']['summary']
756
  print(f"\n✍️ Generation Quality:")
757
- print(f" Avg generation speed: {gen_summary['avg_tokens_per_second']:.1f} tokens/sec")
 
 
758
  print(f" Avg text length: {gen_summary['avg_length']:.1f} words")
759
  print(f" Repetition rate: {gen_summary['avg_repetition_rate']:.3f}")
760
  print(f" Coherence score: {gen_summary['avg_coherence_score']:.3f}")
761
-
762
  # Quality assessment
763
- if 'quality_assessment' in results:
764
- assessment = results['quality_assessment']
765
  print(f"\n🎯 Overall Assessment:")
766
  print(f" Quality Level: {assessment['quality_level'].upper()}")
767
- for rec in assessment['recommendations']:
768
  print(f" β€’ {rec}")
769
-
770
  print(f"\nπŸŽ‰ Evaluation completed successfully!")
771
-
772
  except Exception as e:
773
  print(f"\n❌ Evaluation failed: {e}")
774
  import traceback
775
  traceback.print_exc()
776
  return False
777
-
778
  return True
779
 
780
 
781
  if __name__ == "__main__":
782
- main()
 
56
  class ModelEvaluator:
57
  """
58
  Comprehensive evaluator for OpenLLM models.
59
+
60
  Implements intrinsic evaluation metrics and text generation quality
61
  assessment following the training pipeline specifications.
62
  """
63
+
64
+ def __init__(self, model: GPTModel, tokenizer_path: str, device: str = "cpu"):
65
  """
66
  Initialize the model evaluator.
67
+
68
  Args:
69
  model: Trained GPT model
70
  tokenizer_path: Path to tokenizer model file
 
72
  """
73
  self.model = model.to(device)
74
  self.device = device
75
+
76
  # Load tokenizer
77
  self.tokenizer = smp.SentencePieceProcessor()
78
  self.tokenizer.load(tokenizer_path)
79
+
80
  print(f"πŸ”§ ModelEvaluator initialized")
81
  print(f" Device: {device}")
82
  print(f" Model parameters: {model.get_num_params():,}")
83
  print(f" Vocabulary size: {self.tokenizer.vocab_size():,}")
84
+
85
  def evaluate_perplexity(
86
+ self, eval_data: List[str], max_seq_len: int = 512, batch_size: int = 1
87
  ) -> Dict[str, float]:
88
  """
89
  Calculate perplexity on evaluation data.
90
+
91
  Args:
92
  eval_data: List of text passages for evaluation
93
  max_seq_len: Maximum sequence length for evaluation
94
  batch_size: Batch size for evaluation
95
+
96
  Returns:
97
  Dictionary with loss and perplexity metrics
98
  """
 
100
  total_loss = 0.0
101
  total_tokens = 0
102
  num_sequences = 0
103
+
104
  print(f"πŸ“Š Calculating perplexity on {len(eval_data)} passages...")
105
+
106
  with torch.no_grad():
107
  for i, text in enumerate(eval_data):
108
  if i % 100 == 0:
109
  print(f" Progress: {i}/{len(eval_data)} passages")
110
+
111
  # Tokenize text
112
  tokens = self.tokenizer.encode(text)
113
  if len(tokens) < 2:
114
  continue
115
+
116
  # Truncate if too long
117
  if len(tokens) > max_seq_len:
118
  tokens = tokens[:max_seq_len]
119
+
120
  # Create input and target tensors
121
  input_ids = torch.tensor([tokens[:-1]], dtype=torch.long, device=self.device)
122
  target_ids = torch.tensor([tokens[1:]], dtype=torch.long, device=self.device)
123
+
124
  # Forward pass
125
  logits, loss = self.model(input_ids, target_ids)
126
+
127
  # Accumulate loss
128
  seq_length = len(tokens) - 1
129
  total_loss += loss.item() * seq_length
130
  total_tokens += seq_length
131
  num_sequences += 1
132
+
133
  # Calculate metrics
134
+ avg_loss = total_loss / total_tokens if total_tokens > 0 else float("inf")
135
  perplexity = math.exp(min(avg_loss, 10)) # Cap to prevent overflow
136
+
137
  return {
138
+ "loss": avg_loss,
139
+ "perplexity": perplexity,
140
+ "total_tokens": total_tokens,
141
+ "num_sequences": num_sequences,
142
  }
143
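Perplexity here is the exponential of the token-weighted mean loss, and the exponent cap keeps untrained models from overflowing; a standalone sketch with made-up per-sequence losses:

import math

per_sequence = [(3.2, 120), (2.9, 95), (3.5, 210)]  # (loss, target tokens)
total_loss = sum(loss * n for loss, n in per_sequence)
total_tokens = sum(n for _, n in per_sequence)
avg_loss = total_loss / total_tokens if total_tokens > 0 else float("inf")
perplexity = math.exp(min(avg_loss, 10))  # exp(10) is roughly 22,026
print(f"avg loss {avg_loss:.4f} -> perplexity {perplexity:.2f}")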
+
144
  def evaluate_text_generation(
145
  self,
146
  prompts: List[str],
147
  max_length: int = 256,
148
  temperature: float = 0.7,
149
  top_k: Optional[int] = 40,
150
+ num_samples: int = 1,
151
  ) -> List[Dict[str, Any]]:
152
  """
153
  Evaluate text generation quality.
154
+
155
  Args:
156
  prompts: List of input prompts
157
  max_length: Maximum generation length
158
  temperature: Sampling temperature
159
  top_k: Top-k sampling parameter
160
  num_samples: Number of samples per prompt
161
+
162
  Returns:
163
  List of generation results with quality metrics
164
  """
165
  self.model.eval()
166
  results = []
167
+
168
  print(f"✍️ Evaluating text generation on {len(prompts)} prompts...")
169
+
170
  with torch.no_grad():
171
  for prompt in prompts:
172
  prompt_results = []
173
+
174
  for sample_idx in range(num_samples):
175
  # Tokenize prompt
176
  input_ids = self.tokenizer.encode(prompt)
177
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
178
+
179
  start_time = time.time()
180
+
181
  # Generate text
182
  output = self.model.generate(
183
  input_tensor,
184
  max_new_tokens=max_length,
185
  temperature=temperature,
186
+ top_k=top_k,
187
  )
188
+
189
  generation_time = time.time() - start_time
190
+
191
  # Decode output
192
  generated_ids = output[0].tolist()
193
  full_text = self.tokenizer.decode(generated_ids)
194
+ generated_text = self.tokenizer.decode(generated_ids[len(input_ids) :])
195
+
196
  # Calculate quality metrics
197
  quality_metrics = self._assess_generation_quality(generated_text)
198
+
199
+ prompt_results.append(
200
+ {
201
+ "prompt": prompt,
202
+ "generated_text": generated_text,
203
+ "full_text": full_text,
204
+ "generation_time": generation_time,
205
+ "tokens_generated": len(generated_ids) - len(input_ids),
206
+ "tokens_per_second": (len(generated_ids) - len(input_ids))
207
+ / generation_time,
208
+ "quality_metrics": quality_metrics,
209
+ }
210
+ )
211
+
212
  results.extend(prompt_results)
213
+
214
  return results
215
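The tokens-per-second figure recorded above is raw token count over wall-clock time; note that the division can fail if generation returns instantly, so a small guard is worth considering. A sketch with a stubbed generator:

import time

def fake_generate(n_tokens):
    # Stub standing in for model.generate(); sleep simulates decode work.
    time.sleep(0.001 * n_tokens)
    return list(range(n_tokens))

start = time.time()
out = fake_generate(32)
elapsed = max(time.time() - start, 1e-9)  # guard against zero elapsed time
print(f"{len(out) / elapsed:.1f} tokens/sec")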
+
216
  def _assess_generation_quality(self, text: str) -> Dict[str, float]:
217
  """
218
  Assess basic quality metrics for generated text.
219
+
220
  Args:
221
  text: Generated text to assess
222
+
223
  Returns:
224
  Dictionary of quality metrics
225
  """
226
  if not text.strip():
227
  return {
228
+ "length": 0,
229
+ "avg_word_length": 0,
230
+ "repetition_rate": 1.0,
231
+ "coherence_score": 0.0,
232
  }
233
+
234
  words = text.split()
235
+
236
  # Basic metrics
237
  length = len(words)
238
  avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
239
+
240
  # Repetition rate (simple n-gram repetition)
241
+ bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words) - 1)]
242
  unique_bigrams = len(set(bigrams))
243
  repetition_rate = 1 - (unique_bigrams / len(bigrams) if bigrams else 0)
244
+
245
  # Simple coherence score (based on sentence structure)
246
+ sentences = text.split(".")
247
  valid_sentences = [s for s in sentences if len(s.strip().split()) > 3]
248
  coherence_score = len(valid_sentences) / len(sentences) if sentences else 0
249
+
250
  return {
251
+ "length": length,
252
+ "avg_word_length": avg_word_length,
253
+ "repetition_rate": repetition_rate,
254
+ "coherence_score": coherence_score,
255
  }
256
+
257
  def evaluate_downstream_tasks(self) -> Dict[str, Any]:
258
  """
259
  Evaluate model performance on downstream tasks.
260
+
261
  This function implements basic downstream task evaluation including:
262
  - Reading comprehension (simplified SQUAD-style)
263
  - Sentiment analysis (few-shot)
264
  - Common sense reasoning
265
+
266
  Returns:
267
  Dictionary of downstream task results
268
  """
269
  results = {}
270
+
271
  # 1. Reading Comprehension (Simplified SQUAD-style)
272
+ results["reading_comprehension"] = self._evaluate_reading_comprehension()
273
+
274
  # 2. Sentiment Analysis (Few-shot learning)
275
+ results["sentiment_analysis"] = self._evaluate_sentiment_analysis()
276
+
277
  # 3. Common Sense Reasoning
278
+ results["reasoning"] = self._evaluate_reasoning()
279
+
280
  # 4. Text Completion Quality
281
+ results["text_completion"] = self._evaluate_text_completion()
282
+
283
  return results
284
+
285
  def _evaluate_reading_comprehension(self) -> Dict[str, Any]:
286
  """Simplified reading comprehension evaluation."""
287
  # Sample reading comprehension tasks
288
  tasks = [
289
  {
290
+ "context": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower.",
291
+ "question": "Who is the Eiffel Tower named after?",
292
+ "expected": "Gustave Eiffel",
293
  },
294
  {
295
+ "context": "Python is a high-level programming language. It was created by Guido van Rossum and first released in 1991.",
296
+ "question": "When was Python first released?",
297
+ "expected": "1991",
298
  },
299
  {
300
+ "context": "Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
301
+ "question": "What is machine learning a subset of?",
302
+ "expected": "artificial intelligence",
303
+ },
304
  ]
305
+
306
  correct = 0
307
  total = len(tasks)
308
+
309
  for task in tasks:
310
  prompt = f"Context: {task['context']}\nQuestion: {task['question']}\nAnswer:"
311
+
312
  # Generate answer
313
  input_ids = self.tokenizer.encode(prompt)
314
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
315
+
316
  with torch.no_grad():
317
  output = self.model.generate(input_tensor, max_new_tokens=20, temperature=0.1)
318
+
319
  generated_ids = output[0].tolist()
320
+ answer = self.tokenizer.decode(generated_ids[len(input_ids) :]).strip().lower()
321
+
322
  # Simple substring matching
323
+ if task["expected"].lower() in answer:
324
  correct += 1
325
+
326
  return {
327
+ "accuracy": correct / total,
328
+ "correct": correct,
329
+ "total": total,
330
+ "score": correct / total,
331
  }
332
+
333
  def _evaluate_sentiment_analysis(self) -> Dict[str, Any]:
334
  """Few-shot sentiment analysis evaluation."""
335
  # Few-shot examples
336
  examples = "Examples:\nText: 'I love this movie!' Sentiment: Positive\nText: 'This is terrible.' Sentiment: Negative\nText: 'It was okay.' Sentiment: Neutral\n\n"
337
+
338
  # Test cases
339
  test_cases = [
340
+ {"text": "This is amazing!", "expected": "positive"},
341
+ {"text": "I hate this.", "expected": "negative"},
342
+ {"text": "This is wonderful.", "expected": "positive"},
343
+ {"text": "This is awful.", "expected": "negative"},
344
+ {"text": "It was fine.", "expected": "neutral"},
345
  ]
346
+
347
  correct = 0
348
  total = len(test_cases)
349
+
350
  for case in test_cases:
351
  prompt = f"{examples}Text: '{case['text']}' Sentiment:"
352
+
353
  # Generate sentiment
354
  input_ids = self.tokenizer.encode(prompt)
355
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
356
+
357
  with torch.no_grad():
358
  output = self.model.generate(input_tensor, max_new_tokens=5, temperature=0.1)
359
+
360
  generated_ids = output[0].tolist()
361
+ sentiment = self.tokenizer.decode(generated_ids[len(input_ids) :]).strip().lower()
362
+
363
  # Check if expected sentiment is in the generated response
364
+ if case["expected"] in sentiment:
365
  correct += 1
366
+
367
  return {
368
+ "accuracy": correct / total,
369
+ "correct": correct,
370
+ "total": total,
371
+ "score": correct / total,
372
  }
373
+
374
  def _evaluate_reasoning(self) -> Dict[str, Any]:
375
  """Simple reasoning evaluation."""
376
  # Basic reasoning tasks
377
  tasks = [
378
  {
379
+ "question": "If all birds can fly and a penguin is a bird, can a penguin fly?",
380
+ "expected": "no", # Tests whether the model overrides the faulty premise with real-world knowledge
381
  },
382
  {
383
+ "question": "If it is raining outside, should you take an umbrella?",
384
+ "expected": "yes",
385
  },
386
+ {"question": "What comes after Monday?", "expected": "tuesday"},
387
+ {"question": "Is the sun larger than the earth?", "expected": "yes"},
388
  ]
389
+
390
  correct = 0
391
  total = len(tasks)
392
+
393
  for task in tasks:
394
  prompt = f"Question: {task['question']}\nAnswer:"
395
+
396
  # Generate answer
397
  input_ids = self.tokenizer.encode(prompt)
398
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
399
+
400
  with torch.no_grad():
401
  output = self.model.generate(input_tensor, max_new_tokens=10, temperature=0.1)
402
+
403
  generated_ids = output[0].tolist()
404
+ answer = self.tokenizer.decode(generated_ids[len(input_ids) :]).strip().lower()
405
+
406
  # Check if expected answer is in the response
407
+ if task["expected"] in answer:
408
  correct += 1
409
+
410
  return {
411
+ "accuracy": correct / total,
412
+ "correct": correct,
413
+ "total": total,
414
+ "score": correct / total,
415
  }
416
+
417
  def _evaluate_text_completion(self) -> Dict[str, Any]:
418
  """Evaluate text completion quality."""
419
  # Common phrases that should be completed predictably
420
  completions = [
421
+ {"prompt": "The capital of France is", "expected_word": "paris"},
422
+ {"prompt": "Two plus two equals", "expected_word": "four"},
423
+ {"prompt": "The largest planet in our solar system is", "expected_word": "jupiter"},
424
+ {"prompt": "Water boils at", "expected_word": "100"},
425
  ]
426
+
427
  correct = 0
428
  total = len(completions)
429
+
430
  for completion in completions:
431
  # Generate completion
432
+ input_ids = self.tokenizer.encode(completion["prompt"])
433
  input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
434
+
435
  with torch.no_grad():
436
  output = self.model.generate(input_tensor, max_new_tokens=5, temperature=0.1)
437
+
438
  generated_ids = output[0].tolist()
439
+ generated_text = self.tokenizer.decode(generated_ids[len(input_ids) :]).strip().lower()
440
+
441
  # Check if expected word appears in completion
442
+ if completion["expected_word"] in generated_text:
443
  correct += 1
444
+
445
  return {
446
+ "accuracy": correct / total,
447
+ "correct": correct,
448
+ "total": total,
449
+ "score": correct / total,
450
  }
451
+
452
  def run_comprehensive_evaluation(
453
+ self, eval_data_path: str, metrics: List[str] = None, generation_prompts: List[str] = None
454
  ) -> Dict[str, Any]:
455
  """
456
  Run comprehensive model evaluation.
457
+
458
  Args:
459
  eval_data_path: Path to evaluation text file
460
  metrics: List of metrics to compute
461
  generation_prompts: Prompts for text generation evaluation
462
+
463
  Returns:
464
  Complete evaluation results
465
  """
466
  if metrics is None:
467
+ metrics = ["perplexity", "loss", "generation"]
468
+
469
  if generation_prompts is None:
470
  generation_prompts = [
471
  "The history of artificial intelligence",
472
  "Machine learning algorithms",
473
  "The future of technology",
474
  "In a world where",
475
+ "Scientists have discovered",
476
  ]
477
+
478
  results = {
479
+ "model_info": {
480
+ "parameters": self.model.get_num_params(),
481
+ "device": self.device,
482
+ "vocab_size": self.tokenizer.vocab_size(),
483
  },
484
+ "evaluation_timestamp": time.time(),
485
  }
486
+
487
  # Load evaluation data
488
  print(f"πŸ“‚ Loading evaluation data from {eval_data_path}")
489
  if os.path.exists(eval_data_path):
490
+ with open(eval_data_path, "r", encoding="utf-8") as f:
491
  eval_texts = [line.strip() for line in f if line.strip()]
492
  else:
493
  print(f"⚠️ Evaluation file not found, using sample texts")
 
496
  "Machine learning algorithms can learn patterns from data automatically.",
497
  "Natural language processing helps computers understand human language.",
498
  "Deep learning uses neural networks with multiple layers for complex tasks.",
499
+ "The development of large language models has transformed AI applications.",
500
  ]
501
+
502
  # Intrinsic evaluation
503
+ if "perplexity" in metrics or "loss" in metrics:
504
  perplexity_results = self.evaluate_perplexity(eval_texts)
505
+ results["intrinsic_evaluation"] = perplexity_results
506
+
507
  # Text generation evaluation
508
+ if "generation" in metrics:
509
  generation_results = self.evaluate_text_generation(generation_prompts)
510
+ results["generation_evaluation"] = {
511
+ "results": generation_results,
512
+ "summary": self._summarize_generation_results(generation_results),
513
  }
514
+
515
  # Downstream task evaluation
516
+ results["downstream_evaluation"] = self.evaluate_downstream_tasks()
517
+
518
  # Overall quality assessment
519
+ results["quality_assessment"] = self._assess_overall_quality(results)
520
+
521
  return results
522
+
523
  def _summarize_generation_results(self, results: List[Dict[str, Any]]) -> Dict[str, float]:
524
  """Summarize text generation results."""
525
  if not results:
526
  return {}
527
+
528
+ total_time = sum(r["generation_time"] for r in results)
529
+ total_tokens = sum(r["tokens_generated"] for r in results)
530
+
531
+ quality_metrics = [r["quality_metrics"] for r in results]
532
+
533
  return {
534
+ "avg_generation_time": total_time / len(results),
535
+ "avg_tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
536
+ "avg_length": sum(q["length"] for q in quality_metrics) / len(quality_metrics),
537
+ "avg_repetition_rate": sum(q["repetition_rate"] for q in quality_metrics)
538
+ / len(quality_metrics),
539
+ "avg_coherence_score": sum(q["coherence_score"] for q in quality_metrics)
540
+ / len(quality_metrics),
541
  }
542
+
543
  def _assess_overall_quality(self, results: Dict[str, Any]) -> Dict[str, Any]:
544
  """Assess overall model quality based on evaluation results."""
545
+ assessment = {"quality_level": "unknown", "recommendations": []}
546
+
547
  # Check intrinsic metrics
548
+ if "intrinsic_evaluation" in results:
549
+ perplexity = results["intrinsic_evaluation"].get("perplexity", float("inf"))
550
+
551
  if perplexity < 12:
552
+ assessment["quality_level"] = "good"
553
+ assessment["recommendations"].append("Model shows good perplexity scores")
554
  elif perplexity < 50:
555
+ assessment["quality_level"] = "fair"
556
+ assessment["recommendations"].append(
557
+ "Model shows fair performance and could benefit from more training"
558
+ )
559
  else:
560
+ assessment["quality_level"] = "poor"
561
+ assessment["recommendations"].append(
562
+ "Model needs significantly more training or improved training data"
563
+ )
564
+
565
  # Check generation quality
566
+ if "generation_evaluation" in results:
567
+ summary = results["generation_evaluation"].get("summary", {})
568
+ repetition_rate = summary.get("avg_repetition_rate", 1.0)
569
+ coherence_score = summary.get("avg_coherence_score", 0.0)
570
+
571
  if repetition_rate > 0.7:
572
+ assessment["recommendations"].append(
573
+ "High repetition rate - consider training longer or adjusting data"
574
+ )
575
  if coherence_score < 0.3:
576
+ assessment["recommendations"].append(
577
+ "Low coherence - model may need more training steps"
578
+ )
579
+
580
  return assessment
581
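Since perplexity is the exponential of the mean cross-entropy loss, the thresholds above map directly onto loss values; a quick illustrative check:

import math

# Illustrative only: perplexity < 12 corresponds to loss < ln(12), about 2.48,
# and perplexity < 50 to loss < ln(50), about 3.91.
for loss in (2.0, 2.5, 3.5, 4.5):
    print(f"loss={loss:.2f} -> perplexity={math.exp(loss):.1f}")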
 
582
 
583
  def load_model_from_directory(model_dir: str, device: str = "cpu") -> Tuple[GPTModel, str]:
584
  """
585
  Load model from directory containing checkpoints.
586
+
587
  Args:
588
  model_dir: Directory containing model files
589
  device: Device to load model on
590
+
591
  Returns:
592
  Tuple of (model, tokenizer_path)
593
  """
594
  model_dir = Path(model_dir)
595
+
596
  # Find best model checkpoint
597
  best_model_path = model_dir / "best_model.pt"
598
  if not best_model_path.exists():
 
600
  checkpoints = list(model_dir.glob("checkpoint_step_*.pt"))
601
  if not checkpoints:
602
  raise FileNotFoundError(f"No model checkpoints found in {model_dir}")
603
+
604
  # Get latest checkpoint
605
+ latest_checkpoint = max(checkpoints, key=lambda p: int(p.stem.split("_")[-1]))
606
  best_model_path = latest_checkpoint
607
+
608
  print(f"πŸ“‚ Loading model from {best_model_path}")
609
+
610
  # Load checkpoint
611
  checkpoint = torch.load(best_model_path, map_location=device)
612
+
613
  # Determine model size from config
614
+ config = checkpoint.get("config", {})
615
+ n_layer = config.get("n_layer", 12)
616
+
617
  if n_layer <= 6:
618
  model_size = "small"
619
  elif n_layer <= 12:
620
  model_size = "medium"
621
  else:
622
  model_size = "large"
623
+
624
  # Create and load model
625
  model = create_model(model_size)
626
+ model.load_state_dict(checkpoint["model_state_dict"])
627
+
628
  print(f"βœ… Model loaded successfully ({model_size}, {model.get_num_params():,} parameters)")
629
+
630
  # Find tokenizer
631
  tokenizer_path = model_dir.parent / "tokenizer" / "tokenizer.model"
632
  if not tokenizer_path.exists():
633
  tokenizer_path = Path("data/tokenizer/tokenizer.model")
634
+
635
  if not tokenizer_path.exists():
636
  raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
637
+
638
  return model, str(tokenizer_path)
639
 
640
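Putting the pieces together programmatically, a minimal sketch; the model directory below is a placeholder for wherever your checkpoints actually live:

# Hedged end-to-end sketch; "models/small-extended-4k" is a placeholder path.
model, tokenizer_path = load_model_from_directory("models/small-extended-4k", device="cpu")
evaluator = ModelEvaluator(model, tokenizer_path, "cpu")
results = evaluator.run_comprehensive_evaluation(
    eval_data_path="data/clean/training_data.txt",  # falls back to sample texts if missing
    metrics=["perplexity", "generation"],
)
print(results["quality_assessment"]["quality_level"])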
 
 
655
  --model_dir models/small-extended-4k \\
656
  --metrics perplexity,generation \\
657
  --output results.json
658
+ """,
659
  )
660
+
661
+ parser.add_argument("--model_dir", required=True, help="Directory containing trained model")
662
+
663
  parser.add_argument(
664
+ "--eval_data", help="Path to evaluation text file (default: use sample texts)"
665
  )
666
+
667
  parser.add_argument(
668
  "--metrics",
669
  default="perplexity,loss,generation",
670
+ help="Comma-separated list of metrics to evaluate (default: perplexity,loss,generation)",
671
  )
672
+
673
+ parser.add_argument("--output", help="Output JSON file for results (default: print to console)")
674
+
675
  parser.add_argument(
676
  "--device",
677
  choices=["cpu", "cuda", "auto"],
678
  default="auto",
679
+ help="Device for evaluation (default: auto)",
680
  )
681
+
682
  parser.add_argument(
683
+ "--generation_prompts", help="File containing prompts for text generation evaluation"
684
  )
685
+
686
  args = parser.parse_args()
687
+
688
  print("πŸ“Š OpenLLM Model Evaluation")
689
  print("=" * 50)
690
+
691
  # Determine device
692
  if args.device == "auto":
693
  device = "cuda" if torch.cuda.is_available() else "cpu"
694
  else:
695
  device = args.device
696
+
697
  print(f"Using device: {device}")
698
+
699
  try:
700
  # Load model
701
  model, tokenizer_path = load_model_from_directory(args.model_dir, device)
702
+
703
  # Create evaluator
704
  evaluator = ModelEvaluator(model, tokenizer_path, device)
705
+
706
  # Parse metrics
707
+ metrics = [m.strip() for m in args.metrics.split(",")]
708
+
709
  # Load generation prompts if specified
710
  generation_prompts = None
711
  if args.generation_prompts and os.path.exists(args.generation_prompts):
712
+ with open(args.generation_prompts, "r", encoding="utf-8") as f:
713
  generation_prompts = [line.strip() for line in f if line.strip()]
714
+
715
  # Run evaluation
716
  eval_data_path = args.eval_data or "data/clean/training_data.txt"
717
  results = evaluator.run_comprehensive_evaluation(
718
  eval_data_path, metrics, generation_prompts
719
  )
720
+
721
  # Output results
722
  if args.output:
723
+ with open(args.output, "w", encoding="utf-8") as f:
724
  json.dump(results, f, indent=2)
725
  print(f"\nπŸ’Ύ Results saved to {args.output}")
726
  else:
727
  print(f"\nπŸ“Š Evaluation Results:")
728
  print("=" * 50)
729
+
730
  # Print key metrics
731
+ if "intrinsic_evaluation" in results:
732
+ intrinsic = results["intrinsic_evaluation"]
733
  print(f"πŸ“ˆ Intrinsic Metrics:")
734
  print(f" Loss: {intrinsic['loss']:.4f}")
735
  print(f" Perplexity: {intrinsic['perplexity']:.2f}")
736
  print(f" Sequences evaluated: {intrinsic['num_sequences']:,}")
737
+
738
+ if "generation_evaluation" in results:
739
+ gen_summary = results["generation_evaluation"]["summary"]
740
  print(f"\n✍️ Generation Quality:")
741
+ print(
742
+ f" Avg generation speed: {gen_summary['avg_tokens_per_second']:.1f} tokens/sec"
743
+ )
744
  print(f" Avg text length: {gen_summary['avg_length']:.1f} words")
745
  print(f" Repetition rate: {gen_summary['avg_repetition_rate']:.3f}")
746
  print(f" Coherence score: {gen_summary['avg_coherence_score']:.3f}")
747
+
748
  # Quality assessment
749
+ if "quality_assessment" in results:
750
+ assessment = results["quality_assessment"]
751
  print(f"\n🎯 Overall Assessment:")
752
  print(f" Quality Level: {assessment['quality_level'].upper()}")
753
+ for rec in assessment["recommendations"]:
754
  print(f" β€’ {rec}")
755
+
756
  print(f"\nπŸŽ‰ Evaluation completed successfully!")
757
+
758
  except Exception as e:
759
  print(f"\n❌ Evaluation failed: {e}")
760
  import traceback
761
+
762
  traceback.print_exc()
763
  return False
764
+
765
  return True
766
 
767
 
768
  if __name__ == "__main__":
769
+ main()
training/model.py CHANGED
@@ -18,7 +18,7 @@ language modeling (next-token prediction).
18
 
19
  ARCHITECTURE OVERVIEW:
20
  - Token Embedding: Maps token IDs to dense vectors
21
- - Positional Embedding: Adds position information to token embeddings
22
  - Transformer Blocks: Stack of multi-head attention + feed-forward layers
23
  - Layer Normalization: Pre-norm placement for training stability
24
  - Output Head: Linear projection to vocabulary for next-token prediction
@@ -32,16 +32,16 @@ FEATURES:
32
 
33
  Usage:
34
  from model import GPTConfig, GPTModel
35
-
36
  config = GPTConfig(vocab_size=32000, n_layer=12, n_head=12, n_embd=768)
37
  model = GPTModel(config)
38
-
39
  # Forward pass
40
  logits = model(input_ids) # Shape: (batch_size, seq_len, vocab_size)
41
 
42
  Hardware Requirements:
43
  - Small Model (25M params): 4-8GB RAM, CPU/integrated GPU
44
- - Medium Model (117M params): 8-16GB RAM, dedicated GPU recommended
45
  - Large Model (350M params): 16GB+ RAM, high-end GPU required
46
 
47
  Author: Louis Chua Bean Chong
@@ -60,43 +60,43 @@ from typing import Optional, Tuple
60
  class GPTConfig:
61
  """
62
  Configuration class for GPT model hyperparameters.
63
-
64
  This class defines all the architectural parameters needed to instantiate
65
  a GPT model. Use the provided class methods to get pre-configured setups
66
  for different model sizes.
67
  """
68
-
69
  # Model architecture
70
- vocab_size: int = 32000 # Vocabulary size (from tokenizer)
71
- n_layer: int = 12 # Number of transformer layers
72
- n_head: int = 12 # Number of attention heads
73
- n_embd: int = 768 # Embedding dimension
74
-
75
  # Sequence and context
76
- block_size: int = 1024 # Maximum sequence length
77
-
78
  # Training hyperparameters
79
- dropout: float = 0.1 # Dropout probability
80
- bias: bool = True # Use bias in linear layers
81
-
82
  # Model size identifier
83
- model_name: str = "gpt-medium" # Human-readable model identifier
84
-
85
  @classmethod
86
- def small(cls) -> 'GPTConfig':
87
  """Small model configuration (~25M parameters) - Good for CPU training"""
88
  return cls(
89
  vocab_size=32000,
90
  n_layer=6,
91
- n_head=8,
92
  n_embd=512,
93
  block_size=1024,
94
  dropout=0.1,
95
- model_name="gpt-small"
96
  )
97
-
98
- @classmethod
99
- def medium(cls) -> 'GPTConfig':
100
  """Medium model configuration (~117M parameters) - Balanced performance"""
101
  return cls(
102
  vocab_size=32000,
@@ -105,11 +105,11 @@ class GPTConfig:
105
  n_embd=768,
106
  block_size=2048,
107
  dropout=0.1,
108
- model_name="gpt-medium"
109
  )
110
-
111
  @classmethod
112
- def large(cls) -> 'GPTConfig':
113
  """Large model configuration (~350M parameters) - High performance"""
114
  return cls(
115
  vocab_size=32000,
@@ -118,29 +118,29 @@ class GPTConfig:
118
  n_embd=1024,
119
  block_size=2048,
120
  dropout=0.1,
121
- model_name="gpt-large"
122
  )
123
-
124
  def estimate_parameters(self) -> int:
125
  """
126
  Estimate the total number of trainable parameters.
127
-
128
  Returns:
129
  int: Estimated parameter count
130
  """
131
  # Token embeddings
132
  token_emb = self.vocab_size * self.n_embd
133
-
134
- # Position embeddings
135
  pos_emb = self.block_size * self.n_embd
136
-
137
  # Transformer layers
138
  # Each layer: attention (4 * n_embd^2) + mlp (8 * n_embd^2) + layer_norms
139
  layer_params = self.n_layer * (12 * self.n_embd**2 + 4 * self.n_embd)
140
-
141
  # Output head
142
  output_head = self.vocab_size * self.n_embd
143
-
144
  total = token_emb + pos_emb + layer_params + output_head
145
  return total
146
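As a sanity check on the formula, here is the arithmetic for the small configuration (vocab_size=32000, n_layer=6, n_embd=512, block_size=1024). Note the estimate counts the output head separately even though GPTModel ties it to the token embedding, so the trainable count is lower in practice:

# Worked example of estimate_parameters() for GPTConfig.small().
token_emb = 32000 * 512                     # 16,384,000
pos_emb = 1024 * 512                        #    524,288
layer_params = 6 * (12 * 512**2 + 4 * 512)  # 18,886,656
output_head = 32000 * 512                   # 16,384,000 (weight-tied in the model)
print(token_emb + pos_emb + layer_params + output_head)  # 52,178,944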
 
@@ -148,75 +148,78 @@ class GPTConfig:
148
  class CausalSelfAttention(nn.Module):
149
  """
150
  Multi-head causal self-attention mechanism.
151
-
152
  This implements the core attention mechanism of the transformer, with causal
153
  masking to ensure autoregressive behavior (tokens can only attend to previous
154
  tokens, not future ones).
155
  """
156
-
157
  def __init__(self, config: GPTConfig):
158
  super().__init__()
159
- assert config.n_embd % config.n_head == 0, "Embedding dim must be divisible by number of heads"
160
-
161
  self.config = config
162
  self.n_head = config.n_head
163
  self.n_embd = config.n_embd
164
  self.head_dim = self.n_embd // self.n_head
165
-
166
  # Key, query, value projections for all heads (batched)
167
  self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
168
-
169
  # Output projection
170
  self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
171
-
172
  # Dropout
173
  self.attn_dropout = nn.Dropout(config.dropout)
174
  self.resid_dropout = nn.Dropout(config.dropout)
175
-
176
  # Causal mask - lower triangular matrix
177
  self.register_buffer(
178
  "bias",
179
- torch.tril(torch.ones(config.block_size, config.block_size))
180
- .view(1, 1, config.block_size, config.block_size)
181
  )
182
-
183
  def forward(self, x: torch.Tensor) -> torch.Tensor:
184
  """
185
  Forward pass of causal self-attention.
186
-
187
  This method implements the scaled dot-product attention mechanism with causal masking.
188
  The attention mechanism allows each token to attend to all previous tokens in the sequence,
189
  but not to future tokens, maintaining the autoregressive property essential for language modeling.
190
-
191
  Mathematical formulation:
192
  Attention(Q, K, V) = softmax(QK^T / sqrt(d_k))V
193
  where Q, K, V are query, key, value matrices derived from input x
194
-
195
  Implementation details:
196
  - Uses batch matrix multiplication for efficiency
197
  - Applies causal mask to prevent future token attention
198
  - Implements multi-head attention by reshaping and parallel processing
199
  - Applies dropout for regularization during training
200
-
201
  Args:
202
  x: Input tensor of shape (batch_size, seq_len, n_embd)
203
  Contains embedded token representations from previous layer
204
-
205
  Returns:
206
  torch.Tensor: Output tensor of shape (batch_size, seq_len, n_embd)
207
  """
208
  # Extract tensor dimensions for clear variable naming and validation
209
  # B = batch size (number of sequences processed in parallel)
210
- # T = sequence length (number of tokens in each sequence)
211
  # C = embedding dimensionality (n_embd from config)
212
  B, T, C = x.size()
213
-
214
  # Generate query, key, and value projections for all attention heads
215
  # The c_attn linear layer outputs 3 * n_embd features, which we split into Q, K, V
216
  # This batched approach is more efficient than separate linear layers
217
  # Input shape: (B, T, C) -> Output shape: (B, T, 3*C) -> Split to 3x (B, T, C)
218
  q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
219
-
220
  # Reshape tensors for multi-head attention computation
221
  # Transform from (B, T, C) to (B, nh, T, hs) where:
222
  # - nh = number of heads (self.n_head)
@@ -225,41 +228,41 @@ class CausalSelfAttention(nn.Module):
225
  q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B, nh, T, hs)
226
  k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B, nh, T, hs)
227
  v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B, nh, T, hs)
228
-
229
  # Compute scaled dot-product attention scores
230
  # Matrix multiplication: Q @ K^T gives attention affinities between all token pairs
231
  # Scaling by 1/sqrt(head_dim) prevents softmax saturation for large embedding dimensions
232
  # Shape: (B, nh, T, hs) @ (B, nh, hs, T) -> (B, nh, T, T)
233
  # The resulting (T, T) matrix represents attention weights from each token to every other token
234
  att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
235
-
236
  # Apply causal masking to enforce autoregressive property
237
  # The causal mask ensures that token i can only attend to tokens j where j <= i
238
  # This prevents the model from "cheating" by looking at future tokens during training
239
  # We use -inf for masked positions so they become 0 after softmax
240
- att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
241
-
242
  # Convert attention scores to probabilities using softmax
243
  # Each row of the attention matrix now sums to 1, representing a probability distribution
244
  # over which tokens to attend to for each query position
245
  att = F.softmax(att, dim=-1)
246
-
247
  # Apply dropout to attention weights for regularization
248
  # This randomly zeros some attention connections during training to prevent overfitting
249
  att = self.attn_dropout(att)
250
-
251
  # Apply attention weights to value vectors
252
  # This weighted combination produces the actual output of the attention mechanism
253
  # Shape: (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs)
254
  # Each output position is a weighted sum of all value vectors, with weights from attention
255
  y = att @ v
256
-
257
  # Concatenate multi-head outputs back to original embedding dimension
258
  # Transform from (B, nh, T, hs) back to (B, T, C) where C = nh * hs
259
  # The transpose moves head dimension back, and contiguous() ensures memory layout efficiency
260
  # This combines information from all attention heads into a single representation
261
  y = y.transpose(1, 2).contiguous().view(B, T, C)
262
-
263
  # Apply final output projection and residual dropout
264
  # The output projection allows the model to learn how to best combine multi-head information
265
  # Residual dropout provides additional regularization before the residual connection
@@ -270,62 +273,62 @@ class CausalSelfAttention(nn.Module):
270
  class MLP(nn.Module):
271
  """
272
  Multi-Layer Perceptron (Feed-Forward Network) for Transformer.
273
-
274
  This implements the position-wise feed-forward network that appears in each transformer layer.
275
  The MLP provides additional non-linear transformation capacity beyond what attention provides.
276
-
277
  Architecture:
278
  Input -> Linear(n_embd -> 4*n_embd) -> GELU -> Linear(4*n_embd -> n_embd) -> Dropout -> Output
279
-
280
  Design rationale:
281
  - 4x expansion is standard in transformers (from "Attention Is All You Need")
282
  - GELU activation provides smoother gradients than ReLU for language modeling
283
  - Dropout prevents overfitting in the feed-forward layers
284
  - Two linear layers allow complex non-linear transformations of attention outputs
285
-
286
  Parameters:
287
  - First linear layer: n_embd * 4*n_embd parameters (expansion)
288
  - Second linear layer: 4*n_embd * n_embd parameters (projection back)
289
  - Total: 8 * n_embd^2 parameters (significant portion of model size)
290
  """
291
-
292
  def __init__(self, config: GPTConfig):
293
  super().__init__()
294
-
295
  # First linear layer: expand embedding dimension by 4x
296
  # This expansion gives the network more representational capacity
297
  # The 4x factor is a standard choice that balances capacity vs efficiency
298
  self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
299
-
300
  # GELU (Gaussian Error Linear Unit) activation function
301
  # GELU provides smoother gradients compared to ReLU and works better for language modeling
302
  # It's approximately: GELU(x) = x * Ξ¦(x) where Ξ¦ is the CDF of standard normal distribution
303
  self.gelu = nn.GELU()
304
-
305
  # Second linear layer: project back to original embedding dimension
306
  # This projection allows the network to combine information from the expanded representation
307
  self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
308
-
309
  # Dropout for regularization in the feed-forward network
310
  # Applied after the final projection to prevent overfitting
311
  self.dropout = nn.Dropout(config.dropout)
312
-
313
  def forward(self, x: torch.Tensor) -> torch.Tensor:
314
  """
315
  Forward pass of the feed-forward network.
316
-
317
  This method applies a two-layer MLP with GELU activation to transform
318
  the attention outputs. The MLP operates independently on each position
319
  in the sequence, providing position-wise non-linear transformations.
320
-
321
  Mathematical operation:
322
  MLP(x) = Dropout(Linearβ‚‚(GELU(Linear₁(x))))
323
  where Linear₁: R^n_embd -> R^4*n_embd and Linearβ‚‚: R^4*n_embd -> R^n_embd
324
-
325
  Args:
326
  x: Input tensor of shape (batch_size, seq_len, n_embd)
327
  Contains attended representations from the attention layer
328
-
329
  Returns:
330
  torch.Tensor: Output tensor of shape (batch_size, seq_len, n_embd)
331
  Contains transformed representations ready for residual connection
@@ -334,112 +337,116 @@ class MLP(nn.Module):
334
  # This expansion provides the network with a higher-dimensional space for computation
335
  # Shape: (batch_size, seq_len, n_embd) -> (batch_size, seq_len, 4*n_embd)
336
  x = self.c_fc(x)
337
-
338
  # Apply GELU activation function for non-linearity
339
  # GELU is smoother than ReLU and provides better gradients for language modeling
340
  # It introduces non-linearity while maintaining differentiability everywhere
341
  x = self.gelu(x)
342
-
343
  # Second linear transformation: project back to original n_embd dimensions
344
  # This projection combines information from the expanded representation
345
  # Shape: (batch_size, seq_len, 4*n_embd) -> (batch_size, seq_len, n_embd)
346
  x = self.c_proj(x)
347
-
348
  # Apply dropout for regularization before residual connection
349
  # Dropout randomly zeros some neurons during training to prevent overfitting
350
  # This is particularly important in the feed-forward layers which have many parameters
351
  x = self.dropout(x)
352
-
353
  return x
354
 
355
 
356
  class Block(nn.Module):
357
  """
358
  Single Transformer block.
359
-
360
  Consists of:
361
  1. Layer normalization
362
- 2. Multi-head causal self-attention
363
  3. Residual connection
364
  4. Layer normalization
365
  5. MLP (feed-forward network)
366
  6. Residual connection
367
-
368
  Uses pre-norm architecture for better training stability.
369
  """
370
-
371
  def __init__(self, config: GPTConfig):
372
  super().__init__()
373
  self.ln_1 = nn.LayerNorm(config.n_embd)
374
  self.attn = CausalSelfAttention(config)
375
  self.ln_2 = nn.LayerNorm(config.n_embd)
376
  self.mlp = MLP(config)
377
-
378
  def forward(self, x: torch.Tensor) -> torch.Tensor:
379
  """
380
  Forward pass of transformer block.
381
-
382
  Args:
383
  x: Input tensor of shape (batch_size, seq_len, n_embd)
384
-
385
  Returns:
386
  torch.Tensor: Output tensor of shape (batch_size, seq_len, n_embd)
387
  """
388
  # Pre-norm attention with residual connection
389
  x = x + self.attn(self.ln_1(x))
390
-
391
- # Pre-norm MLP with residual connection
392
  x = x + self.mlp(self.ln_2(x))
393
-
394
  return x
395
 
396
 
397
  class GPTModel(nn.Module):
398
  """
399
  Complete GPT Language Model.
400
-
401
  This is the main model class that combines all components:
402
  - Token and positional embeddings
403
  - Stack of transformer blocks
404
  - Final layer normalization
405
  - Language modeling head
406
-
407
  The model can be used for:
408
  - Training from scratch on text data
409
  - Fine-tuning on downstream tasks
410
  - Text generation (inference)
411
  """
412
-
413
  def __init__(self, config: GPTConfig):
414
  super().__init__()
415
  assert config.vocab_size is not None, "vocab_size must be specified"
416
  assert config.block_size is not None, "block_size must be specified"
417
-
418
  self.config = config
419
-
420
  # Embeddings
421
- self.transformer = nn.ModuleDict(dict(
422
- wte = nn.Embedding(config.vocab_size, config.n_embd), # Token embeddings
423
- wpe = nn.Embedding(config.block_size, config.n_embd), # Position embeddings
424
- drop = nn.Dropout(config.dropout),
425
- h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), # Transformer blocks
426
- ln_f = nn.LayerNorm(config.n_embd), # Final layer norm
427
- ))
428
-
429
  # Language modeling head (maps hidden states to vocabulary)
430
  self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
431
-
432
  # Tie weights between token embeddings and output head (common practice)
433
  self.transformer.wte.weight = self.lm_head.weight
434
-
435
  # Initialize weights
436
  self.apply(self._init_weights)
437
-
438
  # Report parameter count
439
  print(f"Model initialized: {self.config.model_name}")
440
  print(f"Parameters: {self.get_num_params():,}")
441
  print(f"Estimated: {self.config.estimate_parameters():,}")
442
-
443
  def _init_weights(self, module):
444
  """Initialize model weights using standard practices."""
445
  if isinstance(module, nn.Linear):
@@ -448,14 +455,14 @@ class GPTModel(nn.Module):
448
  torch.nn.init.zeros_(module.bias)
449
  elif isinstance(module, nn.Embedding):
450
  torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
451
-
452
  def get_num_params(self, non_embedding: bool = False) -> int:
453
  """
454
  Count the number of parameters in the model.
455
-
456
  Args:
457
  non_embedding: If True, subtract embedding parameters
458
-
459
  Returns:
460
  int: Number of parameters
461
  """
@@ -464,19 +471,17 @@ class GPTModel(nn.Module):
464
  n_params -= self.transformer.wpe.weight.numel()
465
  n_params -= self.transformer.wte.weight.numel()
466
  return n_params
467
-
468
  def forward(
469
- self,
470
- idx: torch.Tensor,
471
- targets: Optional[torch.Tensor] = None
472
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
473
  """
474
  Forward pass of the GPT model.
475
-
476
  Args:
477
  idx: Input token indices of shape (batch_size, seq_len)
478
  targets: Optional target tokens for loss calculation (batch_size, seq_len)
479
-
480
  Returns:
481
  Tuple containing:
482
  - logits: Output logits of shape (batch_size, seq_len, vocab_size)
@@ -484,53 +489,57 @@ class GPTModel(nn.Module):
484
  """
485
  device = idx.device
486
  b, t = idx.size()
487
- assert t <= self.config.block_size, f"Sequence length {t} exceeds block size {self.config.block_size}"
488
-
489
  # Token embeddings
490
  tok_emb = self.transformer.wte(idx) # (b, t, n_embd)
491
-
492
  # Position embeddings
493
  pos = torch.arange(0, t, dtype=torch.long, device=device) # (t,)
494
  pos_emb = self.transformer.wpe(pos) # (t, n_embd)
495
-
496
  # Combine embeddings and apply dropout
497
  x = self.transformer.drop(tok_emb + pos_emb)
498
-
499
  # Pass through transformer blocks
500
  for block in self.transformer.h:
501
  x = block(x)
502
-
503
  # Final layer normalization
504
  x = self.transformer.ln_f(x)
505
-
506
  # Language modeling head
507
  if targets is not None:
508
  # If we have targets, compute loss
509
  logits = self.lm_head(x)
510
- loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
511
  else:
512
  # If no targets, only compute logits for the last token (more efficient for generation)
513
  logits = self.lm_head(x[:, [-1], :]) # Note: using list [-1] to preserve the time dim
514
  loss = None
515
-
516
  return logits, loss
517
-
518
  def generate(
519
- self,
520
- idx: torch.Tensor,
521
  max_new_tokens: int = 100,
522
  temperature: float = 1.0,
523
- top_k: Optional[int] = None
524
  ) -> torch.Tensor:
525
  """
526
  Generate new tokens autoregressively.
527
-
528
  Args:
529
  idx: Starting token indices (batch_size, seq_len)
530
  max_new_tokens: Maximum number of new tokens to generate
531
  temperature: Sampling temperature (higher = more random)
532
  top_k: If set, only sample from top-k most likely tokens
533
-
534
  Returns:
535
  torch.Tensor: Generated sequence (batch_size, seq_len + max_new_tokens)
536
  """
@@ -538,57 +547,61 @@ class GPTModel(nn.Module):
538
  with torch.no_grad():
539
  for _ in range(max_new_tokens):
540
  # Crop sequence if it exceeds block size
541
- idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
542
-
543
  # Forward pass
544
  logits, _ = self(idx_cond)
545
-
546
  # Get logits for the last token and apply temperature
547
  logits = logits[:, -1, :] / temperature
548
-
549
  # Optionally crop to top-k most likely tokens
550
  if top_k is not None:
551
  v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
552
- logits[logits < v[:, [-1]]] = -float('Inf')
553
-
554
  # Apply softmax and sample
555
  probs = F.softmax(logits, dim=-1)
556
  idx_next = torch.multinomial(probs, num_samples=1)
557
-
558
  # Append to sequence
559
  idx = torch.cat((idx, idx_next), dim=1)
560
-
561
  self.train() # Return to training mode
562
  return idx
563
-
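A minimal generation sketch, assuming a tokenizer object with encode/decode methods like the one used by the evaluation script:

# Hedged usage sketch for generate(); the tokenizer object is an assumption here.
model = create_model("small")
model.eval()
prompt_ids = torch.tensor([tokenizer.encode("The future of technology")], dtype=torch.long)
out = model.generate(prompt_ids, max_new_tokens=50, temperature=0.8, top_k=40)
print(tokenizer.decode(out[0].tolist()))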
564
  def estimate_memory_usage(self, batch_size: int = 1, seq_len: int = None) -> dict:
565
  """
566
  Estimate memory usage for training and inference.
567
-
568
  Args:
569
  batch_size: Batch size for estimation
570
  seq_len: Sequence length (defaults to block_size)
571
-
572
  Returns:
573
  dict: Memory usage estimates in MB
574
  """
575
  if seq_len is None:
576
  seq_len = self.config.block_size
577
-
578
  # Model parameters (weights)
579
  param_memory = self.get_num_params() * 4 / (1024**2) # 4 bytes per float32
580
-
581
  # Activations (rough estimate)
582
  activation_memory = (
583
  batch_size * seq_len * self.config.n_embd * self.config.n_layer * 8 # Rough estimate
584
  ) / (1024**2)
585
-
586
  # Gradients (same size as parameters during training)
587
  gradient_memory = param_memory
588
-
589
  return {
590
  "parameters_mb": param_memory,
591
- "activations_mb": activation_memory,
592
  "gradients_mb": gradient_memory,
593
  "total_training_mb": param_memory + activation_memory + gradient_memory,
594
  "total_inference_mb": param_memory + activation_memory * 0.5, # No gradients needed
@@ -598,25 +611,25 @@ class GPTModel(nn.Module):
598
  def create_model(model_size: str = "medium") -> GPTModel:
599
  """
600
  Factory function to create a GPT model with predefined configurations.
601
-
602
  Args:
603
  model_size: Size of model to create ("small", "medium", "large")
604
-
605
  Returns:
606
  GPTModel: Initialized model
607
  """
608
  configs = {
609
  "small": GPTConfig.small(),
610
- "medium": GPTConfig.medium(),
611
  "large": GPTConfig.large(),
612
  }
613
-
614
  if model_size not in configs:
615
  raise ValueError(f"Unknown model size: {model_size}. Choose from {list(configs.keys())}")
616
-
617
  config = configs[model_size]
618
  model = GPTModel(config)
619
-
620
  return model
621
 
622
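For example, picking a configuration and checking its estimated footprint before committing to a training run:

# Quick capacity check using the factory and the memory estimator above.
model = create_model("small")  # ~25M parameter configuration
mem = model.estimate_memory_usage(batch_size=8, seq_len=1024)
print(f"Estimated training memory: {mem['total_training_mb']:.0f} MB")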
 
@@ -624,18 +637,20 @@ if __name__ == "__main__":
624
  # Example usage
625
  print("🧠 GPT Model Architecture")
626
  print("=" * 50)
627
-
628
  # Create models of different sizes
629
  for size in ["small", "medium", "large"]:
630
  print(f"\n{size.upper()} MODEL:")
631
  model = create_model(size)
632
-
633
  # Show memory estimates
634
  memory = model.estimate_memory_usage(batch_size=4, seq_len=512)
635
- print(f"Memory (4 batch, 512 seq): {memory['total_training_mb']:.1f}MB training, {memory['total_inference_mb']:.1f}MB inference")
636
-
637
  # Test forward pass
638
  x = torch.randint(0, 32000, (2, 64)) # Batch size 2, sequence length 64
639
  with torch.no_grad():
640
  logits, _ = model(x)
641
- print(f"Test forward pass: {x.shape} -> {logits.shape} βœ“")
 
18
 
19
  ARCHITECTURE OVERVIEW:
20
  - Token Embedding: Maps token IDs to dense vectors
21
+ - Positional Embedding: Adds position information to token embeddings
22
  - Transformer Blocks: Stack of multi-head attention + feed-forward layers
23
  - Layer Normalization: Pre-norm placement for training stability
24
  - Output Head: Linear projection to vocabulary for next-token prediction
 
32
 
33
  Usage:
34
  from model import GPTConfig, GPTModel
35
+
36
  config = GPTConfig(vocab_size=32000, n_layer=12, n_head=12, n_embd=768)
37
  model = GPTModel(config)
38
+
39
  # Forward pass
40
  logits = model(input_ids) # Shape: (batch_size, seq_len, vocab_size)
41
 
42
  Hardware Requirements:
43
  - Small Model (25M params): 4-8GB RAM, CPU/integrated GPU
44
+ - Medium Model (117M params): 8-16GB RAM, dedicated GPU recommended
45
  - Large Model (350M params): 16GB+ RAM, high-end GPU required
46
 
47
  Author: Louis Chua Bean Chong
 
60
  class GPTConfig:
61
  """
62
  Configuration class for GPT model hyperparameters.
63
+
64
  This class defines all the architectural parameters needed to instantiate
65
  a GPT model. Use the provided class methods to get pre-configured setups
66
  for different model sizes.
67
  """
68
+
69
  # Model architecture
70
+ vocab_size: int = 32000 # Vocabulary size (from tokenizer)
71
+ n_layer: int = 12 # Number of transformer layers
72
+ n_head: int = 12 # Number of attention heads
73
+ n_embd: int = 768 # Embedding dimension
74
+
75
  # Sequence and context
76
+ block_size: int = 1024 # Maximum sequence length
77
+
78
  # Training hyperparameters
79
+ dropout: float = 0.1 # Dropout probability
80
+ bias: bool = True # Use bias in linear layers
81
+
82
  # Model size identifier
83
+ model_name: str = "gpt-medium" # Human-readable model identifier
84
+
85
  @classmethod
86
+ def small(cls) -> "GPTConfig":
87
  """Small model configuration (~25M parameters) - Good for CPU training"""
88
  return cls(
89
  vocab_size=32000,
90
  n_layer=6,
91
+ n_head=8,
92
  n_embd=512,
93
  block_size=1024,
94
  dropout=0.1,
95
+ model_name="gpt-small",
96
  )
97
+
98
+ @classmethod
99
+ def medium(cls) -> "GPTConfig":
100
  """Medium model configuration (~117M parameters) - Balanced performance"""
101
  return cls(
102
  vocab_size=32000,
 
105
  n_embd=768,
106
  block_size=2048,
107
  dropout=0.1,
108
+ model_name="gpt-medium",
109
  )
110
+
111
  @classmethod
112
+ def large(cls) -> "GPTConfig":
113
  """Large model configuration (~350M parameters) - High performance"""
114
  return cls(
115
  vocab_size=32000,
 
118
  n_embd=1024,
119
  block_size=2048,
120
  dropout=0.1,
121
+ model_name="gpt-large",
122
  )
123
+
124
  def estimate_parameters(self) -> int:
125
  """
126
  Estimate the total number of trainable parameters.
127
+
128
  Returns:
129
  int: Estimated parameter count
130
  """
131
  # Token embeddings
132
  token_emb = self.vocab_size * self.n_embd
133
+
134
+ # Position embeddings
135
  pos_emb = self.block_size * self.n_embd
136
+
137
  # Transformer layers
138
  # Each layer: attention (4 * n_embd^2) + mlp (8 * n_embd^2) + layer_norms
139
  layer_params = self.n_layer * (12 * self.n_embd**2 + 4 * self.n_embd)
140
+
141
  # Output head
142
  output_head = self.vocab_size * self.n_embd
143
+
144
  total = token_emb + pos_emb + layer_params + output_head
145
  return total
146
 
 
148
  class CausalSelfAttention(nn.Module):
149
  """
150
  Multi-head causal self-attention mechanism.
151
+
152
  This implements the core attention mechanism of the transformer, with causal
153
  masking to ensure autoregressive behavior (tokens can only attend to previous
154
  tokens, not future ones).
155
  """
156
+
157
  def __init__(self, config: GPTConfig):
158
  super().__init__()
159
+ assert (
160
+ config.n_embd % config.n_head == 0
161
+ ), "Embedding dim must be divisible by number of heads"
162
+
163
  self.config = config
164
  self.n_head = config.n_head
165
  self.n_embd = config.n_embd
166
  self.head_dim = self.n_embd // self.n_head
167
+
168
  # Key, query, value projections for all heads (batched)
169
  self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
170
+
171
  # Output projection
172
  self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
173
+
174
  # Dropout
175
  self.attn_dropout = nn.Dropout(config.dropout)
176
  self.resid_dropout = nn.Dropout(config.dropout)
177
+
178
  # Causal mask - lower triangular matrix
179
  self.register_buffer(
180
  "bias",
181
+ torch.tril(torch.ones(config.block_size, config.block_size)).view(
182
+ 1, 1, config.block_size, config.block_size
183
+ ),
184
  )
185
+
186
  def forward(self, x: torch.Tensor) -> torch.Tensor:
187
  """
188
  Forward pass of causal self-attention.
189
+
190
  This method implements the scaled dot-product attention mechanism with causal masking.
191
  The attention mechanism allows each token to attend to all previous tokens in the sequence,
192
  but not to future tokens, maintaining the autoregressive property essential for language modeling.
193
+
194
  Mathematical formulation:
195
  Attention(Q, K, V) = softmax(QK^T / sqrt(d_k))V
196
  where Q, K, V are query, key, value matrices derived from input x
197
+
198
  Implementation details:
199
  - Uses batch matrix multiplication for efficiency
200
  - Applies causal mask to prevent future token attention
201
  - Implements multi-head attention by reshaping and parallel processing
202
  - Applies dropout for regularization during training
203
+
204
  Args:
205
  x: Input tensor of shape (batch_size, seq_len, n_embd)
206
  Contains embedded token representations from previous layer
207
+
208
  Returns:
209
  torch.Tensor: Output tensor of shape (batch_size, seq_len, n_embd)
210
  """
211
  # Extract tensor dimensions for clear variable naming and validation
212
  # B = batch size (number of sequences processed in parallel)
213
+ # T = sequence length (number of tokens in each sequence)
214
  # C = embedding dimensionality (n_embd from config)
215
  B, T, C = x.size()
216
+
217
  # Generate query, key, and value projections for all attention heads
218
  # The c_attn linear layer outputs 3 * n_embd features, which we split into Q, K, V
219
  # This batched approach is more efficient than separate linear layers
220
  # Input shape: (B, T, C) -> Output shape: (B, T, 3*C) -> Split to 3x (B, T, C)
221
  q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
222
+
223
  # Reshape tensors for multi-head attention computation
224
  # Transform from (B, T, C) to (B, nh, T, hs) where:
225
  # - nh = number of heads (self.n_head)
 
228
  q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B, nh, T, hs)
229
  k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B, nh, T, hs)
230
  v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B, nh, T, hs)
231
+
232
  # Compute scaled dot-product attention scores
233
  # Matrix multiplication: Q @ K^T gives attention affinities between all token pairs
234
  # Scaling by 1/sqrt(head_dim) prevents softmax saturation for large embedding dimensions
235
  # Shape: (B, nh, T, hs) @ (B, nh, hs, T) -> (B, nh, T, T)
236
  # The resulting (T, T) matrix represents attention weights from each token to every other token
237
  att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
238
+
239
  # Apply causal masking to enforce autoregressive property
240
  # The causal mask ensures that token i can only attend to tokens j where j <= i
241
  # This prevents the model from "cheating" by looking at future tokens during training
242
  # We use -inf for masked positions so they become 0 after softmax
243
+ att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
244
+
245
  # Convert attention scores to probabilities using softmax
246
  # Each row of the attention matrix now sums to 1, representing a probability distribution
247
  # over which tokens to attend to for each query position
248
  att = F.softmax(att, dim=-1)
249
+
250
  # Apply dropout to attention weights for regularization
251
  # This randomly zeros some attention connections during training to prevent overfitting
252
  att = self.attn_dropout(att)
253
+
254
  # Apply attention weights to value vectors
255
  # This weighted combination produces the actual output of the attention mechanism
256
  # Shape: (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs)
257
  # Each output position is a weighted sum of all value vectors, with weights from attention
258
  y = att @ v
259
+
260
  # Concatenate multi-head outputs back to original embedding dimension
261
  # Transform from (B, nh, T, hs) back to (B, T, C) where C = nh * hs
262
  # The transpose moves head dimension back, and contiguous() ensures memory layout efficiency
263
  # This combines information from all attention heads into a single representation
264
  y = y.transpose(1, 2).contiguous().view(B, T, C)
265
+
266
  # Apply final output projection and residual dropout
267
  # The output projection allows the model to learn how to best combine multi-head information
268
  # Residual dropout provides additional regularization before the residual connection
 
273
  class MLP(nn.Module):
274
  """
275
  Multi-Layer Perceptron (Feed-Forward Network) for Transformer.
276
+
277
  This implements the position-wise feed-forward network that appears in each transformer layer.
278
  The MLP provides additional non-linear transformation capacity beyond what attention provides.
279
+
280
  Architecture:
281
  Input -> Linear(n_embd -> 4*n_embd) -> GELU -> Linear(4*n_embd -> n_embd) -> Dropout -> Output
282
+
283
  Design rationale:
284
  - 4x expansion is standard in transformers (from "Attention Is All You Need")
285
  - GELU activation provides smoother gradients than ReLU for language modeling
286
  - Dropout prevents overfitting in the feed-forward layers
287
  - Two linear layers allow complex non-linear transformations of attention outputs
288
+
289
  Parameters:
290
  - First linear layer: n_embd * 4*n_embd parameters (expansion)
291
  - Second linear layer: 4*n_embd * n_embd parameters (projection back)
292
  - Total: 8 * n_embd^2 parameters (significant portion of model size)
293
  """
294
+
295
  def __init__(self, config: GPTConfig):
296
  super().__init__()
297
+
298
  # First linear layer: expand embedding dimension by 4x
299
  # This expansion gives the network more representational capacity
300
  # The 4x factor is a standard choice that balances capacity vs efficiency
301
  self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
302
+
303
  # GELU (Gaussian Error Linear Unit) activation function
304
  # GELU provides smoother gradients compared to ReLU and works better for language modeling
305
  # It's approximately: GELU(x) = x * Ξ¦(x) where Ξ¦ is the CDF of standard normal distribution
306
  self.gelu = nn.GELU()
307
+
308
  # Second linear layer: project back to original embedding dimension
309
  # This projection allows the network to combine information from the expanded representation
310
  self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
311
+
312
  # Dropout for regularization in the feed-forward network
313
  # Applied after the final projection to prevent overfitting
314
  self.dropout = nn.Dropout(config.dropout)
315
+
316
  def forward(self, x: torch.Tensor) -> torch.Tensor:
317
  """
318
  Forward pass of the feed-forward network.
319
+
320
  This method applies a two-layer MLP with GELU activation to transform
321
  the attention outputs. The MLP operates independently on each position
322
  in the sequence, providing position-wise non-linear transformations.
323
+
324
  Mathematical operation:
325
  MLP(x) = Dropout(Linearβ‚‚(GELU(Linear₁(x))))
326
  where Linear₁: R^n_embd -> R^4*n_embd and Linearβ‚‚: R^4*n_embd -> R^n_embd
327
+
328
  Args:
329
  x: Input tensor of shape (batch_size, seq_len, n_embd)
330
  Contains attended representations from the attention layer
331
+
332
  Returns:
333
  torch.Tensor: Output tensor of shape (batch_size, seq_len, n_embd)
334
  Contains transformed representations ready for residual connection
 
337
  # This expansion provides the network with a higher-dimensional space for computation
338
  # Shape: (batch_size, seq_len, n_embd) -> (batch_size, seq_len, 4*n_embd)
339
  x = self.c_fc(x)
340
+
341
  # Apply GELU activation function for non-linearity
342
  # GELU is smoother than ReLU and provides better gradients for language modeling
343
  # It introduces non-linearity while maintaining differentiability everywhere
344
  x = self.gelu(x)
345
+
346
  # Second linear transformation: project back to original n_embd dimensions
347
  # This projection combines information from the expanded representation
348
  # Shape: (batch_size, seq_len, 4*n_embd) -> (batch_size, seq_len, n_embd)
349
  x = self.c_proj(x)
350
+
351
  # Apply dropout for regularization before residual connection
352
  # Dropout randomly zeros some neurons during training to prevent overfitting
353
  # This is particularly important in the feed-forward layers which have many parameters
354
  x = self.dropout(x)
355
+
356
  return x
357
 
358
 
359
  class Block(nn.Module):
360
  """
361
  Single Transformer block.
362
+
363
  Consists of:
364
  1. Layer normalization
365
+ 2. Multi-head causal self-attention
366
  3. Residual connection
367
  4. Layer normalization
368
  5. MLP (feed-forward network)
369
  6. Residual connection
370
+
371
  Uses pre-norm architecture for better training stability.
372
  """
373
+
374
  def __init__(self, config: GPTConfig):
375
  super().__init__()
376
  self.ln_1 = nn.LayerNorm(config.n_embd)
377
  self.attn = CausalSelfAttention(config)
378
  self.ln_2 = nn.LayerNorm(config.n_embd)
379
  self.mlp = MLP(config)
380
+
381
  def forward(self, x: torch.Tensor) -> torch.Tensor:
382
  """
383
  Forward pass of transformer block.
384
+
385
  Args:
386
  x: Input tensor of shape (batch_size, seq_len, n_embd)
387
+
388
  Returns:
389
  torch.Tensor: Output tensor of shape (batch_size, seq_len, n_embd)
390
  """
391
  # Pre-norm attention with residual connection
392
  x = x + self.attn(self.ln_1(x))
393
+
394
+ # Pre-norm MLP with residual connection
395
  x = x + self.mlp(self.ln_2(x))
396
+
397
  return x
398
 
399
 
400
  class GPTModel(nn.Module):
401
  """
402
  Complete GPT Language Model.
403
+
404
  This is the main model class that combines all components:
405
  - Token and positional embeddings
406
  - Stack of transformer blocks
407
  - Final layer normalization
408
  - Language modeling head
409
+
410
  The model can be used for:
411
  - Training from scratch on text data
412
  - Fine-tuning on downstream tasks
413
  - Text generation (inference)
414
  """
415
+
416
  def __init__(self, config: GPTConfig):
417
  super().__init__()
418
  assert config.vocab_size is not None, "vocab_size must be specified"
419
  assert config.block_size is not None, "block_size must be specified"
420
+
421
  self.config = config
422
+
423
  # Embeddings
424
+ self.transformer = nn.ModuleDict(
425
+ dict(
426
+ wte=nn.Embedding(config.vocab_size, config.n_embd), # Token embeddings
427
+ wpe=nn.Embedding(config.block_size, config.n_embd), # Position embeddings
428
+ drop=nn.Dropout(config.dropout),
429
+ h=nn.ModuleList(
430
+ [Block(config) for _ in range(config.n_layer)]
431
+ ), # Transformer blocks
432
+ ln_f=nn.LayerNorm(config.n_embd), # Final layer norm
433
+ )
434
+ )
435
+
436
  # Language modeling head (maps hidden states to vocabulary)
437
  self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
438
+
439
  # Tie weights between token embeddings and output head (common practice)
440
  self.transformer.wte.weight = self.lm_head.weight
441
+
442
  # Initialize weights
443
  self.apply(self._init_weights)
444
+
445
  # Report parameter count
446
  print(f"Model initialized: {self.config.model_name}")
447
  print(f"Parameters: {self.get_num_params():,}")
448
  print(f"Estimated: {self.config.estimate_parameters():,}")
449
+
450
  def _init_weights(self, module):
451
  """Initialize model weights using standard practices."""
452
  if isinstance(module, nn.Linear):
 
455
  torch.nn.init.zeros_(module.bias)
456
  elif isinstance(module, nn.Embedding):
457
  torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
458
+
459
  def get_num_params(self, non_embedding: bool = False) -> int:
460
  """
461
  Count the number of parameters in the model.
462
+
463
  Args:
464
  non_embedding: If True, subtract embedding parameters
465
+
466
  Returns:
467
  int: Number of parameters
468
  """
 
471
  n_params -= self.transformer.wpe.weight.numel()
472
  n_params -= self.transformer.wte.weight.numel()
473
  return n_params
474
+
475
  def forward(
476
+ self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None
 
 
477
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
478
  """
479
  Forward pass of the GPT model.
480
+
481
  Args:
482
  idx: Input token indices of shape (batch_size, seq_len)
483
  targets: Optional target tokens for loss calculation (batch_size, seq_len)
484
+
485
  Returns:
486
  Tuple containing:
487
  - logits: Output logits of shape (batch_size, seq_len, vocab_size)
 
489
  """
490
  device = idx.device
491
  b, t = idx.size()
492
+ assert (
493
+ t <= self.config.block_size
494
+ ), f"Sequence length {t} exceeds block size {self.config.block_size}"
495
+
496
  # Token embeddings
497
  tok_emb = self.transformer.wte(idx) # (b, t, n_embd)
498
+
499
  # Position embeddings
500
  pos = torch.arange(0, t, dtype=torch.long, device=device) # (t,)
501
  pos_emb = self.transformer.wpe(pos) # (t, n_embd)
502
+
503
  # Combine embeddings and apply dropout
504
  x = self.transformer.drop(tok_emb + pos_emb)
505
+
506
  # Pass through transformer blocks
507
  for block in self.transformer.h:
508
  x = block(x)
509
+
510
  # Final layer normalization
511
  x = self.transformer.ln_f(x)
512
+
513
  # Language modeling head
514
  if targets is not None:
515
  # If we have targets, compute loss
516
  logits = self.lm_head(x)
517
+ loss = F.cross_entropy(
518
+ logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
519
+ )
520
  else:
521
  # If no targets, only compute logits for the last token (more efficient for generation)
522
  logits = self.lm_head(x[:, [-1], :]) # Note: using list [-1] to preserve the time dim
523
  loss = None
524
+
525
  return logits, loss
526
+
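A minimal sketch of how inputs and targets line up for the loss computed in forward (the token IDs are made up, and model stands for any GPTModel instance):

tokens = [5, 17, 42, 9, 3]
input_ids = torch.tensor([tokens[:-1]])   # model reads  [5, 17, 42, 9]
target_ids = torch.tensor([tokens[1:]])   # and predicts [17, 42, 9, 3]
logits, loss = model(input_ids, target_ids)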
    def generate(
        self,
        idx: torch.Tensor,
        max_new_tokens: int = 100,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
    ) -> torch.Tensor:
        """
        Generate new tokens autoregressively.

        Args:
            idx: Starting token indices (batch_size, seq_len)
            max_new_tokens: Maximum number of new tokens to generate
            temperature: Sampling temperature (higher = more random)
            top_k: If set, only sample from top-k most likely tokens

        Returns:
            torch.Tensor: Generated sequence (batch_size, seq_len + max_new_tokens)
        """
        self.eval()  # Switch to evaluation mode for generation
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop sequence if it exceeds block size
                idx_cond = (
                    idx
                    if idx.size(1) <= self.config.block_size
                    else idx[:, -self.config.block_size :]
                )

                # Forward pass
                logits, _ = self(idx_cond)

                # Get logits for the last token and apply temperature
                logits = logits[:, -1, :] / temperature

                # Optionally crop to top-k most likely tokens
                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float("Inf")

                # Apply softmax and sample
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)

                # Append to sequence
                idx = torch.cat((idx, idx_next), dim=1)

        self.train()  # Return to training mode
        return idx
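A usage sketch for generate, assuming a model built with create_model below and a prompt supplied directly as token IDs (the IDs are placeholders):

model = create_model("small")
prompt = torch.tensor([[2, 145, 89, 7]])  # (batch=1, seq_len=4)
out = model.generate(prompt, max_new_tokens=20, temperature=0.8, top_k=50)
print(out.shape)  # torch.Size([1, 24])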
    def estimate_memory_usage(self, batch_size: int = 1, seq_len: int = None) -> dict:
        """
        Estimate memory usage for training and inference.

        Args:
            batch_size: Batch size for estimation
            seq_len: Sequence length (defaults to block_size)

        Returns:
            dict: Memory usage estimates in MB
        """
        if seq_len is None:
            seq_len = self.config.block_size

        # Model parameters (weights)
        param_memory = self.get_num_params() * 4 / (1024**2)  # 4 bytes per float32

        # Activations (rough estimate)
        activation_memory = (
            batch_size * seq_len * self.config.n_embd * self.config.n_layer * 8  # Rough estimate
        ) / (1024**2)

        # Gradients (same size as parameters during training)
        gradient_memory = param_memory

        return {
            "parameters_mb": param_memory,
            "activations_mb": activation_memory,
            "gradients_mb": gradient_memory,
            "total_training_mb": param_memory + activation_memory + gradient_memory,
            "total_inference_mb": param_memory + activation_memory * 0.5,  # No gradients needed
        }
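As a rough worked example of the estimate above (the parameter count is hypothetical): a 25M-parameter model stored in float32 needs about 25_000_000 * 4 / 1024**2 β‰ˆ 95 MB for weights and the same again for gradients, so the training footprint is roughly twice the weight memory plus activations, while inference drops the gradient term.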
def create_model(model_size: str = "medium") -> GPTModel:
    """
    Factory function to create a GPT model with predefined configurations.

    Args:
        model_size: Size of model to create ("small", "medium", "large")

    Returns:
        GPTModel: Initialized model
    """
    configs = {
        "small": GPTConfig.small(),
        "medium": GPTConfig.medium(),
        "large": GPTConfig.large(),
    }

    if model_size not in configs:
        raise ValueError(f"Unknown model size: {model_size}. Choose from {list(configs.keys())}")

    config = configs[model_size]
    model = GPTModel(config)

    return model


if __name__ == "__main__":
    # Example usage
    print("🧠 GPT Model Architecture")
    print("=" * 50)

    # Create models of different sizes
    for size in ["small", "medium", "large"]:
        print(f"\n{size.upper()} MODEL:")
        model = create_model(size)

        # Show memory estimates
        memory = model.estimate_memory_usage(batch_size=4, seq_len=512)
        print(
            f"Memory (4 batch, 512 seq): {memory['total_training_mb']:.1f}MB training, {memory['total_inference_mb']:.1f}MB inference"
        )

        # Test forward pass
        x = torch.randint(0, 32000, (2, 64))  # Batch size 2, sequence length 64
        with torch.no_grad():
            logits, _ = model(x)
        print(f"Test forward pass: {x.shape} -> {logits.shape} βœ“")
training/train_model.py CHANGED
@@ -69,6 +69,7 @@ try:
69
  from data_loader import TextDataLoader
70
  except ImportError:
71
  import sys
 
72
  sys.path.append(os.path.dirname(__file__))
73
  from model import GPTModel, GPTConfig, create_model
74
  from data_loader import TextDataLoader
@@ -77,11 +78,11 @@ except ImportError:
77
  class ModelTrainer:
78
  """
79
  Comprehensive trainer for GPT-style language models.
80
-
81
  Handles the complete training pipeline including data loading, optimization,
82
  checkpointing, and progress monitoring.
83
  """
84
-
85
  def __init__(
86
  self,
87
  model: GPTModel,
@@ -96,11 +97,11 @@ class ModelTrainer:
96
  gradient_clipping: float = 1.0,
97
  save_every: int = 1000,
98
  eval_every: int = 500,
99
- log_every: int = 100
100
  ):
101
  """
102
  Initialize the model trainer.
103
-
104
  Args:
105
  model: GPT model to train
106
  data_loader: Data loader for training data
@@ -120,7 +121,7 @@ class ModelTrainer:
120
  self.data_loader = data_loader
121
  self.output_dir = Path(output_dir)
122
  self.device = device
123
-
124
  # Training hyperparameters
125
  self.learning_rate = learning_rate
126
  self.weight_decay = weight_decay
@@ -128,29 +129,29 @@ class ModelTrainer:
128
  self.max_steps = max_steps
129
  self.gradient_accumulation_steps = gradient_accumulation_steps
130
  self.gradient_clipping = gradient_clipping
131
-
132
  # Logging and saving
133
  self.save_every = save_every
134
  self.eval_every = eval_every
135
  self.log_every = log_every
136
-
137
  # Create output directory
138
  self.output_dir.mkdir(parents=True, exist_ok=True)
139
-
140
  # Initialize optimizer and scheduler
141
  self.optimizer = self._create_optimizer()
142
  self.scheduler = self._create_scheduler()
143
-
144
  # Training state
145
  self.step = 0
146
  self.epoch = 0
147
- self.best_loss = float('inf')
148
  self.training_log = []
149
-
150
  # Performance tracking
151
  self.start_time = None
152
  self.step_times = []
153
-
154
  print(f"πŸš€ ModelTrainer initialized")
155
  print(f" Device: {device}")
156
  print(f" Model parameters: {model.get_num_params():,}")
@@ -158,38 +159,38 @@ class ModelTrainer:
158
  print(f" Max steps: {max_steps:,}")
159
  print(f" Gradient accumulation: {gradient_accumulation_steps}")
160
  print(f" Output directory: {output_dir}")
161
-
162
  def _create_optimizer(self) -> optim.Optimizer:
163
  """Create AdamW optimizer with weight decay."""
164
  # Separate parameters for weight decay
165
  decay_params = []
166
  no_decay_params = []
167
-
168
  for name, param in self.model.named_parameters():
169
  if not param.requires_grad:
170
  continue
171
-
172
  # Don't apply weight decay to biases and layer norm parameters
173
- if len(param.shape) == 1 or name.endswith('.bias'):
174
  no_decay_params.append(param)
175
  else:
176
  decay_params.append(param)
177
-
178
  param_groups = [
179
- {'params': decay_params, 'weight_decay': self.weight_decay},
180
- {'params': no_decay_params, 'weight_decay': 0.0}
181
  ]
182
-
183
  # Use AdamW with lower memory usage for CPU
184
  optimizer = optim.AdamW(
185
  param_groups,
186
  lr=self.learning_rate,
187
  betas=(0.9, 0.95), # Slightly different from default for LLM training
188
- eps=1e-8
189
  )
190
-
191
  return optimizer
192
-
193
  def _create_scheduler(self) -> torch.optim.lr_scheduler._LRScheduler:
194
  """Create learning rate scheduler with warmup and cosine decay."""
195
  if self.warmup_steps > 0:
@@ -201,59 +202,59 @@ class ModelTrainer:
201
  self.max_steps = max_steps
202
  self.min_lr_factor = min_lr_factor
203
  super().__init__(optimizer)
204
-
205
  def get_lr(self):
206
  if self.last_epoch < self.warmup_steps:
207
- # Linear warmup
208
  factor = self.last_epoch / self.warmup_steps
209
  return [base_lr * (0.01 + 0.99 * factor) for base_lr in self.base_lrs]
210
  else:
211
  # Cosine decay
212
- progress = (self.last_epoch - self.warmup_steps) / (self.max_steps - self.warmup_steps)
 
 
213
  progress = min(progress, 1.0) # Clamp to 1.0
214
  factor = 0.5 * (1 + math.cos(math.pi * progress))
215
  factor = self.min_lr_factor + (1 - self.min_lr_factor) * factor
216
  return [base_lr * factor for base_lr in self.base_lrs]
217
-
218
  scheduler = WarmupCosineScheduler(
219
  self.optimizer,
220
  warmup_steps=self.warmup_steps,
221
  max_steps=self.max_steps,
222
- min_lr_factor=0.1
223
  )
224
  else:
225
  # Just cosine decay - this should not trigger warnings
226
  scheduler = CosineAnnealingLR(
227
- self.optimizer,
228
- T_max=self.max_steps,
229
- eta_min=self.learning_rate * 0.1
230
  )
231
-
232
  return scheduler
233
-
234
  def _calculate_loss(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
235
  """
236
  Calculate cross-entropy loss for autoregressive language modeling.
237
-
238
  This method computes the standard cross-entropy loss used in language model training.
239
  The loss measures how well the model predicts the next token in the sequence.
240
-
241
  Mathematical formulation:
242
  Loss = -βˆ‘ log(P(target_token | context))
243
  where P is the softmax probability distribution over vocabulary
244
-
245
  Implementation details:
246
  - Reshapes 3D tensors to 2D for efficient computation
247
  - Uses PyTorch's optimized cross_entropy function
248
  - Handles padding tokens by ignoring them in loss calculation
249
  - Computes mean loss across all valid positions
250
-
251
  Why cross-entropy for language modeling:
252
  - Natural choice for multi-class classification (next token prediction)
253
  - Provides strong gradient signal for correct token probabilities
254
  - Mathematically equivalent to minimizing negative log-likelihood
255
  - Well-studied optimization properties for neural language models
256
-
257
  Args:
258
  logits: Raw model predictions of shape (batch_size, seq_len, vocab_size)
259
  Contains unnormalized scores for each token in vocabulary
@@ -261,7 +262,7 @@ class ModelTrainer:
261
  targets: Ground truth next tokens of shape (batch_size, seq_len)
262
  Contains token IDs representing the true next tokens
263
  Should be input sequence shifted by one position
264
-
265
  Returns:
266
  torch.Tensor: Scalar loss value representing prediction error
267
  Lower values indicate better next-token prediction accuracy
@@ -271,123 +272,126 @@ class ModelTrainer:
271
  # where each row represents one prediction over the entire vocabulary
272
  logits = logits.view(-1, logits.size(-1)) # (batch_size * seq_len, vocab_size)
273
  targets = targets.view(-1) # (batch_size * seq_len,)
274
-
275
  # Calculate cross-entropy loss with proper handling of special tokens
276
  # ignore_index=-1 excludes padding tokens from loss calculation
277
  # This prevents the model from learning to predict padding, which would skew training
278
  # The function internally applies softmax to logits and computes negative log-likelihood
279
  loss = nn.functional.cross_entropy(logits, targets, ignore_index=-1)
280
-
281
  # Return scalar loss for backpropagation
282
  # This loss will be used to compute gradients via automatic differentiation
283
  return loss
284
-
285
  def _get_memory_usage(self) -> Dict[str, float]:
286
  """Get current memory usage statistics."""
287
  memory_stats = {}
288
-
289
- if torch.cuda.is_available() and self.device.startswith('cuda'):
290
- memory_stats['gpu_allocated_mb'] = torch.cuda.memory_allocated() / (1024**2)
291
- memory_stats['gpu_cached_mb'] = torch.cuda.memory_reserved() / (1024**2)
292
-
293
  # Estimate CPU memory (approximate)
294
  import psutil
 
295
  process = psutil.Process()
296
- memory_stats['cpu_memory_mb'] = process.memory_info().rss / (1024**2)
297
-
298
  return memory_stats
299
-
300
  def _log_step(self, step: int, loss: float, lr: float, step_time: float) -> None:
301
  """Log training progress for a single step."""
302
  perplexity = math.exp(min(loss, 10)) # Cap at exp(10) to avoid overflow
303
-
304
  # Calculate tokens per second
305
  tokens_per_batch = self.data_loader.batch_size * self.data_loader.seq_len
306
  tokens_per_second = tokens_per_batch / step_time if step_time > 0 else 0
307
-
308
  # Get memory usage
309
  memory_stats = self._get_memory_usage()
310
-
311
  # Create log entry
312
  log_entry = {
313
- 'step': step,
314
- 'loss': loss,
315
- 'perplexity': perplexity,
316
- 'learning_rate': lr,
317
- 'step_time': step_time,
318
- 'tokens_per_second': tokens_per_second,
319
- 'memory_mb': memory_stats.get('cpu_memory_mb', 0)
320
  }
321
-
322
  self.training_log.append(log_entry)
323
-
324
  # Print progress
325
  elapsed_time = time.time() - self.start_time if self.start_time else 0
326
  eta_seconds = (self.max_steps - step) * step_time if step_time > 0 else 0
327
  eta_hours = eta_seconds / 3600
328
-
329
- print(f"Step {step:,}/{self.max_steps:,} | "
330
- f"Loss: {loss:.4f} | "
331
- f"PPL: {perplexity:.2f} | "
332
- f"LR: {lr:.2e} | "
333
- f"Time: {step_time:.2f}s | "
334
- f"Tokens/s: {tokens_per_second:.1f} | "
335
- f"Memory: {memory_stats.get('cpu_memory_mb', 0):.0f}MB | "
336
- f"ETA: {eta_hours:.1f}h")
337
-
 
 
338
  def _save_checkpoint(self, step: int, is_best: bool = False) -> None:
339
  """Save model checkpoint."""
340
  checkpoint = {
341
- 'step': step,
342
- 'epoch': self.epoch,
343
- 'model_state_dict': self.model.state_dict(),
344
- 'optimizer_state_dict': self.optimizer.state_dict(),
345
- 'scheduler_state_dict': self.scheduler.state_dict(),
346
- 'best_loss': self.best_loss,
347
- 'training_log': self.training_log,
348
- 'config': self.model.config.__dict__
349
  }
350
-
351
  # Save latest checkpoint
352
  checkpoint_path = self.output_dir / f"checkpoint_step_{step}.pt"
353
  torch.save(checkpoint, checkpoint_path)
354
-
355
  # Save best checkpoint
356
  if is_best:
357
  best_path = self.output_dir / "best_model.pt"
358
  torch.save(checkpoint, best_path)
359
  print(f"πŸ’Ύ New best model saved: {best_path}")
360
-
361
  # Save training log
362
  log_path = self.output_dir / "training_log.json"
363
- with open(log_path, 'w') as f:
364
  json.dump(self.training_log, f, indent=2)
365
-
366
  print(f"πŸ’Ύ Checkpoint saved: {checkpoint_path}")
367
-
368
  def _load_checkpoint(self, checkpoint_path: str) -> None:
369
  """Load model checkpoint to resume training."""
370
  if not os.path.exists(checkpoint_path):
371
  print(f"⚠️ Checkpoint not found: {checkpoint_path}")
372
  return
373
-
374
  print(f"πŸ“‚ Loading checkpoint: {checkpoint_path}")
375
-
376
  checkpoint = torch.load(checkpoint_path, map_location=self.device)
377
-
378
- self.model.load_state_dict(checkpoint['model_state_dict'])
379
- self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
380
- self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
381
-
382
- self.step = checkpoint['step']
383
- self.epoch = checkpoint['epoch']
384
- self.best_loss = checkpoint['best_loss']
385
- self.training_log = checkpoint.get('training_log', [])
386
-
387
  print(f"βœ“ Checkpoint loaded successfully")
388
  print(f" Resuming from step: {self.step:,}")
389
  print(f" Best loss so far: {self.best_loss:.4f}")
390
-
391
  def train(self) -> None:
392
  """Main training loop."""
393
  print(f"\nπŸš€ Starting training...")
@@ -396,85 +400,85 @@ class ModelTrainer:
396
  print(f" Device: {self.device}")
397
  print(f" Max steps: {self.max_steps:,}")
398
  print("=" * 80)
399
-
400
  self.model.train()
401
  self.start_time = time.time()
402
-
403
  # Initialize gradient accumulation
404
  accumulated_loss = 0.0
405
  self.optimizer.zero_grad()
406
-
407
  for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
408
  if self.step >= self.max_steps:
409
  break
410
-
411
  step_start_time = time.time()
412
-
413
  # Move batch to device
414
  input_ids = input_ids.to(self.device)
415
  target_ids = target_ids.to(self.device)
416
-
417
  # Forward pass (model computes loss internally when targets provided)
418
  logits, loss = self.model(input_ids, target_ids)
419
-
420
  # Scale loss for gradient accumulation
421
  loss = loss / self.gradient_accumulation_steps
422
  accumulated_loss += loss.item()
423
-
424
  # Backward pass
425
  loss.backward()
426
-
427
  # Update weights every gradient_accumulation_steps
428
  if (batch_idx + 1) % self.gradient_accumulation_steps == 0:
429
  # Clip gradients
430
  if self.gradient_clipping > 0:
431
  torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.gradient_clipping)
432
-
433
  # Update parameters
434
  self.optimizer.step()
435
  self.scheduler.step()
436
  self.optimizer.zero_grad()
437
-
438
  # Update step count
439
  self.step += 1
440
  step_time = time.time() - step_start_time
441
  self.step_times.append(step_time)
442
-
443
  # Get current learning rate
444
  current_lr = self.scheduler.get_last_lr()[0]
445
-
446
  # Log progress
447
  if self.step % self.log_every == 0:
448
  avg_loss = accumulated_loss
449
  self._log_step(self.step, avg_loss, current_lr, step_time)
450
-
451
  # Save checkpoint
452
  if self.step % self.save_every == 0:
453
  is_best = accumulated_loss < self.best_loss
454
  if is_best:
455
  self.best_loss = accumulated_loss
456
-
457
  self._save_checkpoint(self.step, is_best)
458
-
459
  # Clean up memory periodically
460
  if self.step % 100 == 0:
461
  gc.collect()
462
-
463
  # Reset accumulated loss
464
  accumulated_loss = 0.0
465
-
466
  # Check if training complete
467
  if self.step >= self.max_steps:
468
  break
469
-
470
  # Final checkpoint
471
  print(f"\nπŸŽ‰ Training completed!")
472
  self._save_checkpoint(self.step, is_best=True)
473
-
474
  # Training summary
475
  total_time = time.time() - self.start_time
476
  avg_step_time = sum(self.step_times) / len(self.step_times) if self.step_times else 0
477
-
478
  print(f"\nπŸ“Š Training Summary:")
479
  print(f" Steps completed: {self.step:,}")
480
  print(f" Total time: {total_time/3600:.2f} hours")
@@ -504,130 +508,105 @@ Examples:
504
  --batch-size 2 \\
505
  --max-steps 50000 \\
506
  --output-dir models/my-medium-model
507
- """
508
  )
509
-
510
  # Model and data arguments
511
  parser.add_argument(
512
  "--model-size",
513
  choices=["small", "medium", "large"],
514
  default="small",
515
- help="Model size to train (default: small)"
516
  )
517
-
518
  parser.add_argument(
519
  "--data-file",
520
  default="data/clean/training_data.txt",
521
- help="Path to training text file (default: data/clean/training_data.txt)"
522
  )
523
-
524
  parser.add_argument(
525
  "--tokenizer-dir",
526
  default="data/tokenizer/",
527
- help="Path to tokenizer directory (default: data/tokenizer/)"
528
  )
529
-
530
  parser.add_argument(
531
- "--output-dir",
532
- required=True,
533
- help="Output directory for model checkpoints"
534
  )
535
-
536
  # Training hyperparameters
537
  parser.add_argument(
538
- "--seq-len",
539
- type=int,
540
- default=512,
541
- help="Sequence length for training (default: 512)"
542
- )
543
-
544
- parser.add_argument(
545
- "--batch-size",
546
- type=int,
547
- default=4,
548
- help="Batch size (default: 4)"
549
  )
550
-
 
 
551
  parser.add_argument(
552
- "--learning-rate",
553
- type=float,
554
- default=3e-4,
555
- help="Learning rate (default: 3e-4)"
556
  )
557
-
558
  parser.add_argument(
559
- "--max-steps",
560
- type=int,
561
- default=10000,
562
- help="Maximum training steps (default: 10000)"
563
  )
564
-
565
  parser.add_argument(
566
- "--warmup-steps",
567
- type=int,
568
- default=1000,
569
- help="Warmup steps (default: 1000)"
570
  )
571
-
572
  parser.add_argument(
573
  "--gradient-accumulation-steps",
574
  type=int,
575
  default=4,
576
- help="Gradient accumulation steps (default: 4)"
577
  )
578
-
579
  parser.add_argument(
580
  "--device",
581
  choices=["cpu", "cuda", "auto"],
582
  default="auto",
583
- help="Training device (default: auto)"
584
- )
585
-
586
- parser.add_argument(
587
- "--resume",
588
- help="Path to checkpoint to resume training from"
589
  )
590
-
 
 
591
  parser.add_argument(
592
- "--save-every",
593
- type=int,
594
- default=1000,
595
- help="Save checkpoint every N steps (default: 1000)"
596
  )
597
-
598
  args = parser.parse_args()
599
-
600
  print("πŸš€ OpenLLM Model Training")
601
  print("=" * 60)
602
-
603
  # Determine device
604
  if args.device == "auto":
605
  device = "cuda" if torch.cuda.is_available() else "cpu"
606
  else:
607
  device = args.device
608
-
609
  print(f"Using device: {device}")
610
-
611
  try:
612
  # Create model
613
  print(f"\nπŸ—οΈ Creating {args.model_size} model...")
614
  model = create_model(args.model_size)
615
-
616
  # Create data loader
617
  print(f"\nπŸ“Š Setting up data loader...")
618
  tokenizer_path = os.path.join(args.tokenizer_dir, "tokenizer.model")
619
-
620
  data_loader = TextDataLoader(
621
  data_file=args.data_file,
622
  tokenizer_path=tokenizer_path,
623
  seq_len=args.seq_len,
624
  batch_size=args.batch_size,
625
- shuffle=True
626
  )
627
-
628
  # Get data statistics
629
  data_stats = data_loader.get_data_stats()
630
-
631
  # Create trainer
632
  print(f"\n🎯 Setting up trainer...")
633
  trainer = ModelTrainer(
@@ -639,26 +618,27 @@ Examples:
639
  max_steps=args.max_steps,
640
  warmup_steps=args.warmup_steps,
641
  gradient_accumulation_steps=args.gradient_accumulation_steps,
642
- save_every=args.save_every
643
  )
644
-
645
  # Resume from checkpoint if specified
646
  if args.resume:
647
  trainer._load_checkpoint(args.resume)
648
-
649
  # Start training
650
  trainer.train()
651
-
652
  print(f"\nπŸŽ‰ Training completed successfully!")
653
-
654
  except Exception as e:
655
  print(f"\n❌ Training failed: {e}")
656
  import traceback
 
657
  traceback.print_exc()
658
  return False
659
-
660
  return True
661
 
662
 
663
  if __name__ == "__main__":
664
- main()
 
69
  from data_loader import TextDataLoader
70
  except ImportError:
71
  import sys
72
+
73
  sys.path.append(os.path.dirname(__file__))
74
  from model import GPTModel, GPTConfig, create_model
75
  from data_loader import TextDataLoader
 
78
  class ModelTrainer:
79
  """
80
  Comprehensive trainer for GPT-style language models.
81
+
82
  Handles the complete training pipeline including data loading, optimization,
83
  checkpointing, and progress monitoring.
84
  """
85
+
86
  def __init__(
87
  self,
88
  model: GPTModel,
 
97
  gradient_clipping: float = 1.0,
98
  save_every: int = 1000,
99
  eval_every: int = 500,
100
+ log_every: int = 100,
101
  ):
102
  """
103
  Initialize the model trainer.
104
+
105
  Args:
106
  model: GPT model to train
107
  data_loader: Data loader for training data
 
121
  self.data_loader = data_loader
122
  self.output_dir = Path(output_dir)
123
  self.device = device
124
+
125
  # Training hyperparameters
126
  self.learning_rate = learning_rate
127
  self.weight_decay = weight_decay
 
129
  self.max_steps = max_steps
130
  self.gradient_accumulation_steps = gradient_accumulation_steps
131
  self.gradient_clipping = gradient_clipping
132
+
133
  # Logging and saving
134
  self.save_every = save_every
135
  self.eval_every = eval_every
136
  self.log_every = log_every
137
+
138
  # Create output directory
139
  self.output_dir.mkdir(parents=True, exist_ok=True)
140
+
141
  # Initialize optimizer and scheduler
142
  self.optimizer = self._create_optimizer()
143
  self.scheduler = self._create_scheduler()
144
+
145
  # Training state
146
  self.step = 0
147
  self.epoch = 0
148
+ self.best_loss = float("inf")
149
  self.training_log = []
150
+
151
  # Performance tracking
152
  self.start_time = None
153
  self.step_times = []
154
+
155
  print(f"πŸš€ ModelTrainer initialized")
156
  print(f" Device: {device}")
157
  print(f" Model parameters: {model.get_num_params():,}")
 
159
  print(f" Max steps: {max_steps:,}")
160
  print(f" Gradient accumulation: {gradient_accumulation_steps}")
161
  print(f" Output directory: {output_dir}")
162
+
163
  def _create_optimizer(self) -> optim.Optimizer:
164
  """Create AdamW optimizer with weight decay."""
165
  # Separate parameters for weight decay
166
  decay_params = []
167
  no_decay_params = []
168
+
169
  for name, param in self.model.named_parameters():
170
  if not param.requires_grad:
171
  continue
172
+
173
  # Don't apply weight decay to biases and layer norm parameters
174
+ if len(param.shape) == 1 or name.endswith(".bias"):
175
  no_decay_params.append(param)
176
  else:
177
  decay_params.append(param)
178
+
179
  param_groups = [
180
+ {"params": decay_params, "weight_decay": self.weight_decay},
181
+ {"params": no_decay_params, "weight_decay": 0.0},
182
  ]
183
+
184
  # Use AdamW with lower memory usage for CPU
185
  optimizer = optim.AdamW(
186
  param_groups,
187
  lr=self.learning_rate,
188
  betas=(0.9, 0.95), # Slightly different from default for LLM training
189
+ eps=1e-8,
190
  )
191
+
192
  return optimizer
193
+
194
  def _create_scheduler(self) -> torch.optim.lr_scheduler._LRScheduler:
195
  """Create learning rate scheduler with warmup and cosine decay."""
196
  if self.warmup_steps > 0:
 
202
  self.max_steps = max_steps
203
  self.min_lr_factor = min_lr_factor
204
  super().__init__(optimizer)
205
+
206
  def get_lr(self):
207
  if self.last_epoch < self.warmup_steps:
208
+ # Linear warmup
209
  factor = self.last_epoch / self.warmup_steps
210
  return [base_lr * (0.01 + 0.99 * factor) for base_lr in self.base_lrs]
211
  else:
212
  # Cosine decay
213
+ progress = (self.last_epoch - self.warmup_steps) / (
214
+ self.max_steps - self.warmup_steps
215
+ )
216
  progress = min(progress, 1.0) # Clamp to 1.0
217
  factor = 0.5 * (1 + math.cos(math.pi * progress))
218
  factor = self.min_lr_factor + (1 - self.min_lr_factor) * factor
219
  return [base_lr * factor for base_lr in self.base_lrs]
220
+
221
  scheduler = WarmupCosineScheduler(
222
  self.optimizer,
223
  warmup_steps=self.warmup_steps,
224
  max_steps=self.max_steps,
225
+ min_lr_factor=0.1,
226
  )
227
  else:
228
  # Just cosine decay - this should not trigger warnings
229
  scheduler = CosineAnnealingLR(
230
+ self.optimizer, T_max=self.max_steps, eta_min=self.learning_rate * 0.1
 
 
231
  )
232
+
233
  return scheduler
234
+
235
  def _calculate_loss(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
236
  """
237
  Calculate cross-entropy loss for autoregressive language modeling.
238
+
239
  This method computes the standard cross-entropy loss used in language model training.
240
  The loss measures how well the model predicts the next token in the sequence.
241
+
242
  Mathematical formulation:
243
  Loss = -βˆ‘ log(P(target_token | context))
244
  where P is the softmax probability distribution over vocabulary
245
+
246
  Implementation details:
247
  - Reshapes 3D tensors to 2D for efficient computation
248
  - Uses PyTorch's optimized cross_entropy function
249
  - Handles padding tokens by ignoring them in loss calculation
250
  - Computes mean loss across all valid positions
251
+
252
  Why cross-entropy for language modeling:
253
  - Natural choice for multi-class classification (next token prediction)
254
  - Provides strong gradient signal for correct token probabilities
255
  - Mathematically equivalent to minimizing negative log-likelihood
256
  - Well-studied optimization properties for neural language models
257
+
258
  Args:
259
  logits: Raw model predictions of shape (batch_size, seq_len, vocab_size)
260
  Contains unnormalized scores for each token in vocabulary
 
262
  targets: Ground truth next tokens of shape (batch_size, seq_len)
263
  Contains token IDs representing the true next tokens
264
  Should be input sequence shifted by one position
265
+
266
  Returns:
267
  torch.Tensor: Scalar loss value representing prediction error
268
  Lower values indicate better next-token prediction accuracy
 
272
  # where each row represents one prediction over the entire vocabulary
273
  logits = logits.view(-1, logits.size(-1)) # (batch_size * seq_len, vocab_size)
274
  targets = targets.view(-1) # (batch_size * seq_len,)
275
+
276
  # Calculate cross-entropy loss with proper handling of special tokens
277
  # ignore_index=-1 excludes padding tokens from loss calculation
278
  # This prevents the model from learning to predict padding, which would skew training
279
  # The function internally applies softmax to logits and computes negative log-likelihood
280
  loss = nn.functional.cross_entropy(logits, targets, ignore_index=-1)
281
+
282
  # Return scalar loss for backpropagation
283
  # This loss will be used to compute gradients via automatic differentiation
284
  return loss
285
+
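A toy illustration of the loss call above (vocabulary of 5, two positions; the numbers are made up):

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 0.1, 0.1, 0.1, 0.1],   # position 0 strongly predicts token 0
                       [0.1, 0.1, 2.0, 0.1, 0.1]])  # position 1 strongly predicts token 2
targets = torch.tensor([0, -1])                      # -1 is padding and is ignored
print(F.cross_entropy(logits, targets, ignore_index=-1))  # loss over position 0 only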
286
  def _get_memory_usage(self) -> Dict[str, float]:
287
  """Get current memory usage statistics."""
288
  memory_stats = {}
289
+
290
+ if torch.cuda.is_available() and self.device.startswith("cuda"):
291
+ memory_stats["gpu_allocated_mb"] = torch.cuda.memory_allocated() / (1024**2)
292
+ memory_stats["gpu_cached_mb"] = torch.cuda.memory_reserved() / (1024**2)
293
+
294
  # Estimate CPU memory (approximate)
295
  import psutil
296
+
297
  process = psutil.Process()
298
+ memory_stats["cpu_memory_mb"] = process.memory_info().rss / (1024**2)
299
+
300
  return memory_stats
301
+
302
  def _log_step(self, step: int, loss: float, lr: float, step_time: float) -> None:
303
  """Log training progress for a single step."""
304
  perplexity = math.exp(min(loss, 10)) # Cap at exp(10) to avoid overflow
305
+
306
  # Calculate tokens per second
307
  tokens_per_batch = self.data_loader.batch_size * self.data_loader.seq_len
308
  tokens_per_second = tokens_per_batch / step_time if step_time > 0 else 0
309
+
310
  # Get memory usage
311
  memory_stats = self._get_memory_usage()
312
+
313
  # Create log entry
314
  log_entry = {
315
+ "step": step,
316
+ "loss": loss,
317
+ "perplexity": perplexity,
318
+ "learning_rate": lr,
319
+ "step_time": step_time,
320
+ "tokens_per_second": tokens_per_second,
321
+ "memory_mb": memory_stats.get("cpu_memory_mb", 0),
322
  }
323
+
324
  self.training_log.append(log_entry)
325
+
326
  # Print progress
327
  elapsed_time = time.time() - self.start_time if self.start_time else 0
328
  eta_seconds = (self.max_steps - step) * step_time if step_time > 0 else 0
329
  eta_hours = eta_seconds / 3600
330
+
331
+ print(
332
+ f"Step {step:,}/{self.max_steps:,} | "
333
+ f"Loss: {loss:.4f} | "
334
+ f"PPL: {perplexity:.2f} | "
335
+ f"LR: {lr:.2e} | "
336
+ f"Time: {step_time:.2f}s | "
337
+ f"Tokens/s: {tokens_per_second:.1f} | "
338
+ f"Memory: {memory_stats.get('cpu_memory_mb', 0):.0f}MB | "
339
+ f"ETA: {eta_hours:.1f}h"
340
+ )
341
+
342
  def _save_checkpoint(self, step: int, is_best: bool = False) -> None:
343
  """Save model checkpoint."""
344
  checkpoint = {
345
+ "step": step,
346
+ "epoch": self.epoch,
347
+ "model_state_dict": self.model.state_dict(),
348
+ "optimizer_state_dict": self.optimizer.state_dict(),
349
+ "scheduler_state_dict": self.scheduler.state_dict(),
350
+ "best_loss": self.best_loss,
351
+ "training_log": self.training_log,
352
+ "config": self.model.config.__dict__,
353
  }
354
+
355
  # Save latest checkpoint
356
  checkpoint_path = self.output_dir / f"checkpoint_step_{step}.pt"
357
  torch.save(checkpoint, checkpoint_path)
358
+
359
  # Save best checkpoint
360
  if is_best:
361
  best_path = self.output_dir / "best_model.pt"
362
  torch.save(checkpoint, best_path)
363
  print(f"πŸ’Ύ New best model saved: {best_path}")
364
+
365
  # Save training log
366
  log_path = self.output_dir / "training_log.json"
367
+ with open(log_path, "w") as f:
368
  json.dump(self.training_log, f, indent=2)
369
+
370
  print(f"πŸ’Ύ Checkpoint saved: {checkpoint_path}")
371
+
372
  def _load_checkpoint(self, checkpoint_path: str) -> None:
373
  """Load model checkpoint to resume training."""
374
  if not os.path.exists(checkpoint_path):
375
  print(f"⚠️ Checkpoint not found: {checkpoint_path}")
376
  return
377
+
378
  print(f"πŸ“‚ Loading checkpoint: {checkpoint_path}")
379
+
380
  checkpoint = torch.load(checkpoint_path, map_location=self.device)
381
+
382
+ self.model.load_state_dict(checkpoint["model_state_dict"])
383
+ self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
384
+ self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
385
+
386
+ self.step = checkpoint["step"]
387
+ self.epoch = checkpoint["epoch"]
388
+ self.best_loss = checkpoint["best_loss"]
389
+ self.training_log = checkpoint.get("training_log", [])
390
+
391
  print(f"βœ“ Checkpoint loaded successfully")
392
  print(f" Resuming from step: {self.step:,}")
393
  print(f" Best loss so far: {self.best_loss:.4f}")
394
+
395
  def train(self) -> None:
396
  """Main training loop."""
397
  print(f"\nπŸš€ Starting training...")
 
400
  print(f" Device: {self.device}")
401
  print(f" Max steps: {self.max_steps:,}")
402
  print("=" * 80)
403
+
404
  self.model.train()
405
  self.start_time = time.time()
406
+
407
  # Initialize gradient accumulation
408
  accumulated_loss = 0.0
409
  self.optimizer.zero_grad()
410
+
411
  for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
412
  if self.step >= self.max_steps:
413
  break
414
+
415
  step_start_time = time.time()
416
+
417
  # Move batch to device
418
  input_ids = input_ids.to(self.device)
419
  target_ids = target_ids.to(self.device)
420
+
421
  # Forward pass (model computes loss internally when targets provided)
422
  logits, loss = self.model(input_ids, target_ids)
423
+
424
  # Scale loss for gradient accumulation
425
  loss = loss / self.gradient_accumulation_steps
426
  accumulated_loss += loss.item()
427
+
428
  # Backward pass
429
  loss.backward()
430
+
431
  # Update weights every gradient_accumulation_steps
432
  if (batch_idx + 1) % self.gradient_accumulation_steps == 0:
433
  # Clip gradients
434
  if self.gradient_clipping > 0:
435
  torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.gradient_clipping)
436
+
437
  # Update parameters
438
  self.optimizer.step()
439
  self.scheduler.step()
440
  self.optimizer.zero_grad()
441
+
442
  # Update step count
443
  self.step += 1
444
  step_time = time.time() - step_start_time
445
  self.step_times.append(step_time)
446
+
447
  # Get current learning rate
448
  current_lr = self.scheduler.get_last_lr()[0]
449
+
450
  # Log progress
451
  if self.step % self.log_every == 0:
452
  avg_loss = accumulated_loss
453
  self._log_step(self.step, avg_loss, current_lr, step_time)
454
+
455
  # Save checkpoint
456
  if self.step % self.save_every == 0:
457
  is_best = accumulated_loss < self.best_loss
458
  if is_best:
459
  self.best_loss = accumulated_loss
460
+
461
  self._save_checkpoint(self.step, is_best)
462
+
463
  # Clean up memory periodically
464
  if self.step % 100 == 0:
465
  gc.collect()
466
+
467
  # Reset accumulated loss
468
  accumulated_loss = 0.0
469
+
470
  # Check if training complete
471
  if self.step >= self.max_steps:
472
  break
473
+
474
  # Final checkpoint
475
  print(f"\nπŸŽ‰ Training completed!")
476
  self._save_checkpoint(self.step, is_best=True)
477
+
478
  # Training summary
479
  total_time = time.time() - self.start_time
480
  avg_step_time = sum(self.step_times) / len(self.step_times) if self.step_times else 0
481
+
482
  print(f"\nπŸ“Š Training Summary:")
483
  print(f" Steps completed: {self.step:,}")
484
  print(f" Total time: {total_time/3600:.2f} hours")
 
508
  --batch-size 2 \\
509
  --max-steps 50000 \\
510
  --output-dir models/my-medium-model
511
+ """,
512
  )
513
+
514
  # Model and data arguments
515
  parser.add_argument(
516
  "--model-size",
517
  choices=["small", "medium", "large"],
518
  default="small",
519
+ help="Model size to train (default: small)",
520
  )
521
+
522
  parser.add_argument(
523
  "--data-file",
524
  default="data/clean/training_data.txt",
525
+ help="Path to training text file (default: data/clean/training_data.txt)",
526
  )
527
+
528
  parser.add_argument(
529
  "--tokenizer-dir",
530
  default="data/tokenizer/",
531
+ help="Path to tokenizer directory (default: data/tokenizer/)",
532
  )
533
+
534
  parser.add_argument(
535
+ "--output-dir", required=True, help="Output directory for model checkpoints"
 
 
536
  )
537
+
538
  # Training hyperparameters
539
  parser.add_argument(
540
+ "--seq-len", type=int, default=512, help="Sequence length for training (default: 512)"
 
 
 
 
 
 
 
 
 
 
541
  )
542
+
543
+ parser.add_argument("--batch-size", type=int, default=4, help="Batch size (default: 4)")
544
+
545
  parser.add_argument(
546
+ "--learning-rate", type=float, default=3e-4, help="Learning rate (default: 3e-4)"
 
 
 
547
  )
548
+
549
  parser.add_argument(
550
+ "--max-steps", type=int, default=10000, help="Maximum training steps (default: 10000)"
 
 
 
551
  )
552
+
553
  parser.add_argument(
554
+ "--warmup-steps", type=int, default=1000, help="Warmup steps (default: 1000)"
 
 
 
555
  )
556
+
557
  parser.add_argument(
558
  "--gradient-accumulation-steps",
559
  type=int,
560
  default=4,
561
+ help="Gradient accumulation steps (default: 4)",
562
  )
563
+
564
  parser.add_argument(
565
  "--device",
566
  choices=["cpu", "cuda", "auto"],
567
  default="auto",
568
+ help="Training device (default: auto)",
 
 
 
 
 
569
  )
570
+
571
+ parser.add_argument("--resume", help="Path to checkpoint to resume training from")
572
+
573
  parser.add_argument(
574
+ "--save-every", type=int, default=1000, help="Save checkpoint every N steps (default: 1000)"
 
 
 
575
  )
576
+
577
  args = parser.parse_args()
578
+
579
  print("πŸš€ OpenLLM Model Training")
580
  print("=" * 60)
581
+
582
  # Determine device
583
  if args.device == "auto":
584
  device = "cuda" if torch.cuda.is_available() else "cpu"
585
  else:
586
  device = args.device
587
+
588
  print(f"Using device: {device}")
589
+
590
  try:
591
  # Create model
592
  print(f"\nπŸ—οΈ Creating {args.model_size} model...")
593
  model = create_model(args.model_size)
594
+
595
  # Create data loader
596
  print(f"\nπŸ“Š Setting up data loader...")
597
  tokenizer_path = os.path.join(args.tokenizer_dir, "tokenizer.model")
598
+
599
  data_loader = TextDataLoader(
600
  data_file=args.data_file,
601
  tokenizer_path=tokenizer_path,
602
  seq_len=args.seq_len,
603
  batch_size=args.batch_size,
604
+ shuffle=True,
605
  )
606
+
607
  # Get data statistics
608
  data_stats = data_loader.get_data_stats()
609
+
610
  # Create trainer
611
  print(f"\n🎯 Setting up trainer...")
612
  trainer = ModelTrainer(
 
618
  max_steps=args.max_steps,
619
  warmup_steps=args.warmup_steps,
620
  gradient_accumulation_steps=args.gradient_accumulation_steps,
621
+ save_every=args.save_every,
622
  )
623
+
624
  # Resume from checkpoint if specified
625
  if args.resume:
626
  trainer._load_checkpoint(args.resume)
627
+
628
  # Start training
629
  trainer.train()
630
+
631
  print(f"\nπŸŽ‰ Training completed successfully!")
632
+
633
  except Exception as e:
634
  print(f"\n❌ Training failed: {e}")
635
  import traceback
636
+
637
  traceback.print_exc()
638
  return False
639
+
640
  return True
641
 
642
 
643
  if __name__ == "__main__":
644
+ main()
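A command-line sketch using the flags defined in main() above (paths, output directory, and step counts are illustrative):

python training/train_model.py \
    --model-size small \
    --data-file data/clean/training_data.txt \
    --tokenizer-dir data/tokenizer/ \
    --output-dir models/small-test \
    --batch-size 4 \
    --gradient-accumulation-steps 4 \
    --max-steps 10000 \
    --device auto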
training/train_tokenizer.py CHANGED
@@ -76,46 +76,48 @@ except ImportError:
76
  def validate_input_file(input_path: str) -> None:
77
  """
78
  Validate that the input training file exists and is readable.
79
-
80
  Args:
81
  input_path (str): Path to the training text file
82
-
83
  Raises:
84
  FileNotFoundError: If input file doesn't exist
85
  ValueError: If input file is empty or unreadable
86
  """
87
  if not os.path.exists(input_path):
88
  raise FileNotFoundError(f"Training data file not found: {input_path}")
89
-
90
  # Check file size and readability
91
  file_size = os.path.getsize(input_path)
92
  if file_size == 0:
93
  raise ValueError(f"Training data file is empty: {input_path}")
94
-
95
  # Test that we can read the file
96
  try:
97
- with open(input_path, 'r', encoding='utf-8') as f:
98
  first_line = f.readline()
99
  if not first_line.strip():
100
- raise ValueError(f"Training data file appears to be empty or contains only whitespace")
 
 
101
  except UnicodeDecodeError as e:
102
  raise ValueError(f"Cannot read training data file as UTF-8: {e}")
103
-
104
  print(f"βœ“ Input file validated: {input_path} ({file_size:,} bytes)")
105
 
106
 
107
  def count_training_sentences(input_path: str) -> int:
108
  """
109
  Count the number of training sentences/lines in the input file.
110
-
111
  Args:
112
  input_path (str): Path to the training text file
113
-
114
  Returns:
115
  int: Number of lines in the file
116
  """
117
  print("Counting training sentences...")
118
- with open(input_path, 'r', encoding='utf-8') as f:
119
  count = sum(1 for line in f if line.strip())
120
  print(f"βœ“ Found {count:,} training sentences")
121
  return count
@@ -133,7 +135,7 @@ def train_sentencepiece_tokenizer(
133
  ) -> Dict[str, Any]:
134
  """
135
  Train a SentencePiece tokenizer with the specified parameters.
136
-
137
  Args:
138
  input_path (str): Path to training text file
139
  output_dir (str): Directory to save tokenizer files
@@ -143,16 +145,16 @@ def train_sentencepiece_tokenizer(
143
  max_sentence_length (int): Maximum sentence length in characters
144
  input_sentence_size (int): Maximum number of sentences to use for training
145
  shuffle_input_sentence (bool): Whether to shuffle input sentences
146
-
147
  Returns:
148
  Dict[str, Any]: Training statistics and configuration
149
  """
150
  # Ensure output directory exists
151
  os.makedirs(output_dir, exist_ok=True)
152
-
153
  # Define output paths
154
  model_prefix = os.path.join(output_dir, "tokenizer")
155
-
156
  # SentencePiece training parameters
157
  train_params = [
158
  f"--input={input_path}",
@@ -163,30 +165,28 @@ def train_sentencepiece_tokenizer(
163
  f"--max_sentence_length={max_sentence_length}",
164
  f"--input_sentence_size={input_sentence_size}",
165
  f"--shuffle_input_sentence={shuffle_input_sentence}",
166
-
167
  # Special tokens for language modeling
168
- "--pad_id=0", # Padding token
169
- "--unk_id=1", # Unknown token
170
- "--bos_id=2", # Beginning of sequence
171
- "--eos_id=3", # End of sequence
172
-
173
  # Additional useful parameters
174
- "--split_by_unicode_script=true", # Better handling of mixed scripts
175
- "--split_by_whitespace=true", # Split on whitespace
176
- "--remove_extra_whitespaces=true", # Clean up whitespace
177
- "--normalization_rule_name=identity", # Keep original text as-is
178
  ]
179
-
180
  print(f"\nTraining SentencePiece tokenizer...")
181
  print(f" Algorithm: {model_type.upper()}")
182
  print(f" Vocabulary size: {vocab_size:,}")
183
  print(f" Character coverage: {character_coverage}")
184
  print(f" Output directory: {output_dir}")
185
  print(f" Model files: {model_prefix}.model, {model_prefix}.vocab")
186
-
187
  # Record training start time
188
  start_time = time.time()
189
-
190
  # Train the tokenizer
191
  try:
192
  spm.SentencePieceTrainer.train(" ".join(train_params))
@@ -194,19 +194,19 @@ def train_sentencepiece_tokenizer(
194
  print(f"βœ“ Tokenizer training completed in {training_time:.1f} seconds")
195
  except Exception as e:
196
  raise RuntimeError(f"SentencePiece training failed: {e}")
197
-
198
  # Verify output files were created
199
  model_file = f"{model_prefix}.model"
200
  vocab_file = f"{model_prefix}.vocab"
201
-
202
  if not os.path.exists(model_file):
203
  raise RuntimeError(f"Expected model file not created: {model_file}")
204
  if not os.path.exists(vocab_file):
205
  raise RuntimeError(f"Expected vocab file not created: {vocab_file}")
206
-
207
  print(f"βœ“ Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)")
208
  print(f"βœ“ Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)")
209
-
210
  # Return training configuration and statistics
211
  config = {
212
  "model_type": model_type,
@@ -219,24 +219,24 @@ def train_sentencepiece_tokenizer(
219
  "model_file": model_file,
220
  "vocab_file": vocab_file,
221
  }
222
-
223
  return config
224
 
225
 
226
  def test_tokenizer(model_path: str, test_sentences: list = None) -> None:
227
  """
228
  Test the trained tokenizer on sample sentences to verify it works correctly.
229
-
230
  Args:
231
  model_path (str): Path to the trained .model file
232
  test_sentences (list): Optional list of test sentences
233
  """
234
  print(f"\nTesting trained tokenizer...")
235
-
236
  # Load the trained tokenizer
237
  sp = spm.SentencePieceProcessor()
238
  sp.load(model_path)
239
-
240
  # Default test sentences if none provided
241
  if test_sentences is None:
242
  test_sentences = [
@@ -245,63 +245,65 @@ def test_tokenizer(model_path: str, test_sentences: list = None) -> None:
245
  "Machine learning and artificial intelligence are transforming technology.",
246
  "SentencePiece tokenization works well for language models.",
247
  ]
248
-
249
  print(f"Vocabulary size: {sp.vocab_size():,}")
250
- print(f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}")
251
-
 
 
252
  print("\nTokenization examples:")
253
  for i, sentence in enumerate(test_sentences, 1):
254
  # Encode to token IDs and pieces
255
  token_ids = sp.encode(sentence)
256
  token_pieces = sp.encode(sentence, out_type=str)
257
-
258
  print(f"\n{i}. Input: {sentence}")
259
  print(f" Tokens ({len(token_pieces)}): {token_pieces}")
260
  print(f" IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}")
261
-
262
  # Test decoding
263
  decoded = sp.decode(token_ids)
264
  print(f" Decoded: {decoded}")
265
-
266
  # Verify round-trip encoding/decoding
267
  if decoded.strip() != sentence.strip():
268
  print(f" ⚠️ Warning: Decode mismatch!")
269
-
270
  print("βœ“ Tokenizer testing completed")
271
 
272
 
273
  def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None:
274
  """
275
  Save a Hugging Face compatible tokenizer configuration file.
276
-
277
  Args:
278
  output_dir (str): Directory containing the tokenizer files
279
  config (Dict[str, Any]): Tokenizer configuration
280
  """
281
  # Create Hugging Face tokenizer config
282
  hf_config = {
283
- "tokenizer_class": "SentencePieceTokenizer",
284
  "model_type": config["model_type"],
285
  "vocab_size": config["vocab_size"],
286
  "model_file": "tokenizer.model",
287
  "special_tokens": {
288
  "pad_token": "<pad>",
289
- "unk_token": "<unk>",
290
  "bos_token": "<s>",
291
  "eos_token": "</s>",
292
  },
293
  "special_token_ids": {
294
  "pad_token_id": 0,
295
  "unk_token_id": 1,
296
- "bos_token_id": 2,
297
  "eos_token_id": 3,
298
- }
299
  }
300
-
301
  config_path = os.path.join(output_dir, "tokenizer_config.json")
302
- with open(config_path, 'w', encoding='utf-8') as f:
303
  json.dump(hf_config, f, indent=2, ensure_ascii=False)
304
-
305
  print(f"βœ“ Hugging Face config saved: {config_path}")
306
 
307
 
@@ -322,69 +324,67 @@ Examples:
322
  --model_type bpe \\
323
  --output_dir data/tokenizer/ \\
324
  --character_coverage 0.9995
325
- """
326
  )
327
-
328
  # Required arguments
329
  parser.add_argument(
330
- "--input",
331
- required=True,
332
- help="Path to training text file (e.g., data/clean/training_data.txt)"
333
  )
334
-
335
  # Optional arguments with sensible defaults
336
  parser.add_argument(
337
- "--vocab_size",
338
- type=int,
339
  default=32000,
340
- help="Vocabulary size (default: 32000, recommended: 8k-64k)"
341
  )
342
-
343
  parser.add_argument(
344
- "--model_type",
345
- choices=["bpe", "unigram"],
346
  default="bpe",
347
- help="Tokenization algorithm (default: bpe)"
348
  )
349
-
350
  parser.add_argument(
351
- "--output_dir",
352
  default="data/tokenizer/",
353
- help="Output directory for tokenizer files (default: data/tokenizer/)"
354
  )
355
-
356
  parser.add_argument(
357
- "--character_coverage",
358
- type=float,
359
  default=0.9995,
360
- help="Character coverage (default: 0.9995 for English)"
361
  )
362
-
363
  parser.add_argument(
364
- "--max_sentence_length",
365
- type=int,
366
  default=4192,
367
- help="Maximum sentence length in characters (default: 4192)"
368
  )
369
-
370
  parser.add_argument(
371
- "--no_test",
372
- action="store_true",
373
- help="Skip tokenizer testing after training"
374
  )
375
-
376
  args = parser.parse_args()
377
-
378
  print("πŸ”€ SentencePiece Tokenizer Training")
379
  print("=" * 50)
380
-
381
  try:
382
  # Step 1: Validate input file
383
  validate_input_file(args.input)
384
-
385
  # Step 2: Count training data
386
  sentence_count = count_training_sentences(args.input)
387
-
388
  # Step 3: Train tokenizer
389
  config = train_sentencepiece_tokenizer(
390
  input_path=args.input,
@@ -394,36 +394,36 @@ Examples:
394
  character_coverage=args.character_coverage,
395
  max_sentence_length=args.max_sentence_length,
396
  )
397
-
398
  # Step 4: Save Hugging Face compatible config
399
  save_huggingface_config(args.output_dir, config)
400
-
401
  # Step 5: Test tokenizer (unless skipped)
402
  if not args.no_test:
403
  model_path = os.path.join(args.output_dir, "tokenizer.model")
404
  test_tokenizer(model_path)
405
-
406
  # Step 6: Print summary
407
  print(f"\nπŸŽ‰ Tokenizer training completed successfully!")
408
  print(f"πŸ“ Output directory: {args.output_dir}")
409
  print(f"πŸ“Š Vocabulary size: {config['vocab_size']:,}")
410
  print(f"⏱️ Training time: {config['training_time_seconds']:.1f}s")
411
  print(f"πŸ“„ Training sentences: {sentence_count:,}")
412
-
413
  print(f"\nFiles created:")
414
  print(f" β€’ {config['model_file']} - SentencePiece model")
415
- print(f" β€’ {config['vocab_file']} - Vocabulary file")
416
  print(f" β€’ {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config")
417
-
418
  print(f"\nTo use this tokenizer in your language model:")
419
  print(f" import sentencepiece as spm")
420
  print(f" sp = spm.SentencePieceProcessor()")
421
  print(f" sp.load('{config['model_file']}')")
422
-
423
  except Exception as e:
424
  print(f"\n❌ Error: {e}")
425
  exit(1)
426
 
427
 
428
  if __name__ == "__main__":
429
- main()
 
76
  def validate_input_file(input_path: str) -> None:
77
  """
78
  Validate that the input training file exists and is readable.
79
+
80
  Args:
81
  input_path (str): Path to the training text file
82
+
83
  Raises:
84
  FileNotFoundError: If input file doesn't exist
85
  ValueError: If input file is empty or unreadable
86
  """
87
  if not os.path.exists(input_path):
88
  raise FileNotFoundError(f"Training data file not found: {input_path}")
89
+
90
  # Check file size and readability
91
  file_size = os.path.getsize(input_path)
92
  if file_size == 0:
93
  raise ValueError(f"Training data file is empty: {input_path}")
94
+
95
  # Test that we can read the file
96
  try:
97
+ with open(input_path, "r", encoding="utf-8") as f:
98
  first_line = f.readline()
99
  if not first_line.strip():
100
+ raise ValueError(
101
+ f"Training data file appears to be empty or contains only whitespace"
102
+ )
103
  except UnicodeDecodeError as e:
104
  raise ValueError(f"Cannot read training data file as UTF-8: {e}")
105
+
106
  print(f"βœ“ Input file validated: {input_path} ({file_size:,} bytes)")
107
 
108
 
109
  def count_training_sentences(input_path: str) -> int:
110
  """
111
  Count the number of training sentences/lines in the input file.
112
+
113
  Args:
114
  input_path (str): Path to the training text file
115
+
116
  Returns:
117
  int: Number of lines in the file
118
  """
119
  print("Counting training sentences...")
120
+ with open(input_path, "r", encoding="utf-8") as f:
121
  count = sum(1 for line in f if line.strip())
122
  print(f"βœ“ Found {count:,} training sentences")
123
  return count
 
135
  ) -> Dict[str, Any]:
136
  """
137
  Train a SentencePiece tokenizer with the specified parameters.
138
+
139
  Args:
140
  input_path (str): Path to training text file
141
  output_dir (str): Directory to save tokenizer files
 
145
  max_sentence_length (int): Maximum sentence length in characters
146
  input_sentence_size (int): Maximum number of sentences to use for training
147
  shuffle_input_sentence (bool): Whether to shuffle input sentences
148
+
149
  Returns:
150
  Dict[str, Any]: Training statistics and configuration
151
  """
152
  # Ensure output directory exists
153
  os.makedirs(output_dir, exist_ok=True)
154
+
155
  # Define output paths
156
  model_prefix = os.path.join(output_dir, "tokenizer")
157
+
158
  # SentencePiece training parameters
159
  train_params = [
160
  f"--input={input_path}",
 
165
  f"--max_sentence_length={max_sentence_length}",
166
  f"--input_sentence_size={input_sentence_size}",
167
  f"--shuffle_input_sentence={shuffle_input_sentence}",
 
168
  # Special tokens for language modeling
169
+ "--pad_id=0", # Padding token
170
+ "--unk_id=1", # Unknown token
171
+ "--bos_id=2", # Beginning of sequence
172
+ "--eos_id=3", # End of sequence
 
173
  # Additional useful parameters
174
+ "--split_by_unicode_script=true", # Better handling of mixed scripts
175
+ "--split_by_whitespace=true", # Split on whitespace
176
+ "--remove_extra_whitespaces=true", # Clean up whitespace
177
+ "--normalization_rule_name=identity", # Keep original text as-is
178
  ]
179
+
180
  print(f"\nTraining SentencePiece tokenizer...")
181
  print(f" Algorithm: {model_type.upper()}")
182
  print(f" Vocabulary size: {vocab_size:,}")
183
  print(f" Character coverage: {character_coverage}")
184
  print(f" Output directory: {output_dir}")
185
  print(f" Model files: {model_prefix}.model, {model_prefix}.vocab")
186
+
187
  # Record training start time
188
  start_time = time.time()
189
+
190
  # Train the tokenizer
191
  try:
192
  spm.SentencePieceTrainer.train(" ".join(train_params))
 
194
  print(f"βœ“ Tokenizer training completed in {training_time:.1f} seconds")
195
  except Exception as e:
196
  raise RuntimeError(f"SentencePiece training failed: {e}")
197
+
198
  # Verify output files were created
199
  model_file = f"{model_prefix}.model"
200
  vocab_file = f"{model_prefix}.vocab"
201
+
202
  if not os.path.exists(model_file):
203
  raise RuntimeError(f"Expected model file not created: {model_file}")
204
  if not os.path.exists(vocab_file):
205
  raise RuntimeError(f"Expected vocab file not created: {vocab_file}")
206
+
207
  print(f"βœ“ Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)")
208
  print(f"βœ“ Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)")
209
+
210
  # Return training configuration and statistics
211
  config = {
212
  "model_type": model_type,
 
219
  "model_file": model_file,
220
  "vocab_file": vocab_file,
221
  }
222
+
223
  return config


  def test_tokenizer(model_path: str, test_sentences: list = None) -> None:
      """
      Test the trained tokenizer on sample sentences to verify it works correctly.
+
      Args:
          model_path (str): Path to the trained .model file
          test_sentences (list): Optional list of test sentences
      """
      print("\nTesting trained tokenizer...")
+
      # Load the trained tokenizer
      sp = spm.SentencePieceProcessor()
      sp.load(model_path)
+
      # Default test sentences if none provided
      if test_sentences is None:
          test_sentences = [

              "Machine learning and artificial intelligence are transforming technology.",
              "SentencePiece tokenization works well for language models.",
          ]
+
      print(f"Vocabulary size: {sp.vocab_size():,}")
+     print(
+         f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}"
+     )
+
      print("\nTokenization examples:")
      for i, sentence in enumerate(test_sentences, 1):
          # Encode to token IDs and pieces
          token_ids = sp.encode(sentence)
          token_pieces = sp.encode(sentence, out_type=str)
+
          print(f"\n{i}. Input: {sentence}")
          print(f" Tokens ({len(token_pieces)}): {token_pieces}")
          print(f" IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}")
+
          # Test decoding
          decoded = sp.decode(token_ids)
          print(f" Decoded: {decoded}")
+
          # Verify round-trip encoding/decoding
          if decoded.strip() != sentence.strip():
              print(" ⚠️ Warning: Decode mismatch!")
+
      print("✓ Tokenizer testing completed")


  def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None:
      """
      Save a Hugging Face compatible tokenizer configuration file.
+
      Args:
          output_dir (str): Directory containing the tokenizer files
          config (Dict[str, Any]): Tokenizer configuration
      """
      # Create Hugging Face tokenizer config
      hf_config = {
+         "tokenizer_class": "SentencePieceTokenizer",
          "model_type": config["model_type"],
          "vocab_size": config["vocab_size"],
          "model_file": "tokenizer.model",
          "special_tokens": {
              "pad_token": "<pad>",
+             "unk_token": "<unk>",
              "bos_token": "<s>",
              "eos_token": "</s>",
          },
          "special_token_ids": {
              "pad_token_id": 0,
              "unk_token_id": 1,
+             "bos_token_id": 2,
              "eos_token_id": 3,
+         },
      }
+
      config_path = os.path.join(output_dir, "tokenizer_config.json")
+     with open(config_path, "w", encoding="utf-8") as f:
          json.dump(hf_config, f, indent=2, ensure_ascii=False)
+
      print(f"✓ Hugging Face config saved: {config_path}")


          --model_type bpe \\
          --output_dir data/tokenizer/ \\
          --character_coverage 0.9995
+     """,
      )
+
      # Required arguments
      parser.add_argument(
+         "--input",
+         required=True,
+         help="Path to training text file (e.g., data/clean/training_data.txt)",
      )
+
      # Optional arguments with sensible defaults
      parser.add_argument(
+         "--vocab_size",
+         type=int,
          default=32000,
+         help="Vocabulary size (default: 32000, recommended: 8k-64k)",
      )
+
      parser.add_argument(
+         "--model_type",
+         choices=["bpe", "unigram"],
          default="bpe",
+         help="Tokenization algorithm (default: bpe)",
      )
+
      parser.add_argument(
+         "--output_dir",
          default="data/tokenizer/",
+         help="Output directory for tokenizer files (default: data/tokenizer/)",
      )
+
      parser.add_argument(
+         "--character_coverage",
+         type=float,
          default=0.9995,
+         help="Character coverage (default: 0.9995 for English)",
      )
+
      parser.add_argument(
+         "--max_sentence_length",
+         type=int,
          default=4192,
+         help="Maximum sentence length in characters (default: 4192)",
      )
+
      parser.add_argument(
+         "--no_test", action="store_true", help="Skip tokenizer testing after training"
      )
+
      args = parser.parse_args()
+
      print("🔀 SentencePiece Tokenizer Training")
      print("=" * 50)
+
      try:
          # Step 1: Validate input file
          validate_input_file(args.input)
+
          # Step 2: Count training data
          sentence_count = count_training_sentences(args.input)
+
          # Step 3: Train tokenizer
          config = train_sentencepiece_tokenizer(
              input_path=args.input,

              character_coverage=args.character_coverage,
              max_sentence_length=args.max_sentence_length,
          )
+
          # Step 4: Save Hugging Face compatible config
          save_huggingface_config(args.output_dir, config)
+
          # Step 5: Test tokenizer (unless skipped)
          if not args.no_test:
              model_path = os.path.join(args.output_dir, "tokenizer.model")
              test_tokenizer(model_path)
+
          # Step 6: Print summary
          print("\n🎉 Tokenizer training completed successfully!")
          print(f"📁 Output directory: {args.output_dir}")
          print(f"📊 Vocabulary size: {config['vocab_size']:,}")
          print(f"⏱️ Training time: {config['training_time_seconds']:.1f}s")
          print(f"📄 Training sentences: {sentence_count:,}")
+
          print("\nFiles created:")
          print(f" • {config['model_file']} - SentencePiece model")
+         print(f" • {config['vocab_file']} - Vocabulary file")
          print(f" • {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config")
+
          print("\nTo use this tokenizer in your language model:")
          print(" import sentencepiece as spm")
          print(" sp = spm.SentencePieceProcessor()")
          print(f" sp.load('{config['model_file']}')")
+
      except Exception as e:
          print(f"\n❌ Error: {e}")
          exit(1)


  if __name__ == "__main__":
+     main()
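Closing note for consumers of these artifacts: language-model training pipelines typically wrap each example with the BOS/EOS ids configured above before batching. A hedged sketch (model path is illustrative, not part of the diff):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/tokenizer/tokenizer.model")  # illustrative path

# Frame one training example with the ids set via --bos_id=2 / --eos_id=3
text = "Machine learning and artificial intelligence are transforming technology."
ids = [sp.bos_id()] + sp.encode(text) + [sp.eos_id()]
assert ids[0] == 2 and ids[-1] == 3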