#!/usr/bin/env python3
"""
OpenLLM Training Space Application - Fixed with Uploaded Modules

This version imports OpenLLM modules from the uploaded files in the HF Space:
- Imports model.py and data_loader.py that were uploaded to the Space
- Uses OpenLLM's actual custom model architecture
- Compatible with OpenLLM's implementation

Author: Louis Chua Bean Chong
License: GPL-3.0
Version: 2.1.0
Last Updated: 2024
"""

import gradio as gr
import torch
import torch.nn as nn
import os
import shutil
import time
import math
import gc
from typing import Dict, Any, Optional
import threading
from dataclasses import dataclass
from pathlib import Path

# Import OpenLLM's custom model architecture from uploaded files
try:
    # Import from the uploaded files in the HF Space
    from model import GPTModel, GPTConfig, create_model
    from data_loader import TextDataLoader
    OPENLLM_AVAILABLE = True
    print("✅ OpenLLM custom model architecture imported successfully from uploaded files")
except ImportError as e:
    print(f"❌ OpenLLM imports failed: {e}")
    OPENLLM_AVAILABLE = False

# Try to import sentencepiece
try:
    import sentencepiece as spm
    SENTENCEPIECE_AVAILABLE = True
    print(f"✅ SentencePiece available: {spm.__version__}")
except ImportError:
    SENTENCEPIECE_AVAILABLE = False
    print("❌ SentencePiece not available")

# Import other dependencies
try:
    from datasets import load_dataset
    from huggingface_hub import HfApi, hf_hub_download
    DEPENDENCIES_AVAILABLE = True
except ImportError as e:
    print(f"❌ Dependencies not available: {e}")
    DEPENDENCIES_AVAILABLE = False


@dataclass
class TrainingConfig:
    """Configuration class for training parameters."""

    model_size: str
    max_steps: int
    learning_rate: float
    batch_size: int
    output_dir: str = "./openllm-trained"
    save_steps: int = 100
    logging_steps: int = 10
    warmup_steps: int = 50
    gradient_accumulation_steps: int = 4


class OpenLLMTrainer:
    """
    Complete training implementation using OpenLLM's actual architecture.

    This class handles the entire training pipeline including:
    - Model loading using OpenLLM's custom GPTModel
    - Tokenizer loading using sentencepiece.SentencePieceProcessor()
    - Dataset preparation
    - Training execution using OpenLLM's approach
    - Model saving and uploading

    Every public step returns a human-readable status string; failures are
    reported as a string containing "❌" rather than raised, which is what
    the UI code in ``main`` checks for.
    """

    def __init__(self):
        """Initialize the trainer with default settings."""
        self.model = None
        self.tokenizer = None
        # Filesystem path of the downloaded tokenizer.model. Kept separately
        # because the data loader and the save step both need the file path,
        # not the loaded processor object.
        self.tokenizer_path = None
        self.data_loader = None
        self.optimizer = None
        self.scheduler = None
        self.is_training = False
        # Set by stop_training(); polled once per batch by train_model().
        self.stop_requested = False
        self.training_progress = {
            "status": "Ready",
            "current_step": 0,
            "total_steps": 0,
            "loss": 0.0,
            "learning_rate": 0.0
        }

        # Initialize Hugging Face API for model uploading
        try:
            self.hf_api = HfApi()
        except Exception as e:
            # Also covers the NameError raised when huggingface_hub failed to import.
            print(f"Failed to initialize HF API: {e}")
            self.hf_api = None

    def load_model_and_tokenizer(self, model_size: str) -> str:
        """
        Load the pre-trained OpenLLM model and tokenizer using OpenLLM's approach.

        Args:
            model_size: Size of the model to load ("small", "medium", "large")

        Returns:
            Status message indicating success or failure
        """
        try:
            if not OPENLLM_AVAILABLE:
                return "❌ OpenLLM custom model architecture not available"
            # Fail early with a clear message instead of a NameError on
            # `spm` / `hf_hub_download` further down.
            if not SENTENCEPIECE_AVAILABLE:
                return "❌ Failed to load OpenLLM tokenizer: SentencePiece not available"
            if not DEPENDENCIES_AVAILABLE:
                return "❌ Failed to load OpenLLM tokenizer: required dependencies not available"

            print(f"🔄 Loading OpenLLM {model_size} model using custom architecture...")

            # Create model using OpenLLM's factory function
            try:
                self.model = create_model(model_size)
                print(f"✅ OpenLLM {model_size} model created: {type(self.model).__name__}")
                print(f"   Parameters: {self.model.get_num_params():,}")
            except Exception as e:
                print(f"❌ Failed to create model: {e}")
                return f"❌ Failed to create OpenLLM model: {str(e)}"

            # Load tokenizer using sentencepiece
            try:
                print("🔄 Loading tokenizer using sentencepiece.SentencePieceProcessor()...")

                # Download tokenizer.model from HF Hub
                model_name = f"lemms/openllm-{model_size}-extended-7k"
                tokenizer_path = hf_hub_download(
                    repo_id=model_name,
                    filename="tokenizer.model"
                )
                print(f"✅ Tokenizer downloaded to: {tokenizer_path}")

                # Create SentencePieceProcessor
                sp_processor = spm.SentencePieceProcessor()
                sp_processor.load(tokenizer_path)

                # Store tokenizer and its file path for later use
                self.tokenizer = sp_processor
                self.tokenizer_path = tokenizer_path

                print("✅ Tokenizer loaded successfully using SentencePieceProcessor")
                print(f"   Vocabulary size: {sp_processor.vocab_size()}")
            except Exception as e:
                print(f"❌ Failed to load tokenizer: {e}")
                return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"

            return f"✅ Successfully loaded OpenLLM {model_size} model with custom architecture"
        except Exception as e:
            return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"

    def prepare_dataset(self, batch_size: int = 4) -> str:
        """
        Load and prepare the training dataset using OpenLLM's approach.

        The "train" split of ``lemms/openllm-training-data`` is written to a
        local text file (one sample per line) and handed to ``TextDataLoader``
        together with the tokenizer file recorded by
        ``load_model_and_tokenizer``.

        Args:
            batch_size: Batch size passed to ``TextDataLoader`` (default 4,
                the value that was previously hard-coded).

        Returns:
            Status message indicating success or failure
        """
        try:
            if not DEPENDENCIES_AVAILABLE:
                return "❌ Required dependencies not available"
            if self.tokenizer_path is None:
                return "❌ Failed to create data loader: tokenizer has not been loaded"

            print("🔄 Loading training dataset...")

            # Load dataset from HF Hub
            dataset = load_dataset("lemms/openllm-training-data")
            train_split = dataset['train']
            print(f"✅ Dataset loaded: {len(train_split)} samples")

            # Create temporary data file for OpenLLM's TextDataLoader
            temp_data_file = "temp_training_data.txt"
            with open(temp_data_file, 'w', encoding='utf-8') as f:
                f.writelines(item['text'] + '\n' for item in train_split)
            print(f"✅ Temporary data file created: {temp_data_file}")

            # Create OpenLLM's TextDataLoader
            try:
                self.data_loader = TextDataLoader(
                    data_file=temp_data_file,
                    tokenizer_path=self.tokenizer_path,
                    seq_len=512,
                    batch_size=int(batch_size),
                    shuffle=True
                )
                print("✅ OpenLLM TextDataLoader created successfully")
            except Exception as e:
                print(f"❌ Failed to create TextDataLoader: {e}")
                return f"❌ Failed to create data loader: {str(e)}"

            return f"✅ Successfully prepared dataset with {len(train_split)} samples"
        except Exception as e:
            return f"❌ Failed to prepare dataset: {str(e)}"

    def setup_training(self, config: TrainingConfig) -> str:
        """
        Set up the training configuration using OpenLLM's approach.

        Builds an AdamW optimizer (weight decay 0.01 on parameters with more
        than one dimension, none on biases and 1-D parameters) and a linear
        warmup followed by cosine annealing.

        Args:
            config: Training configuration object

        Returns:
            Status message indicating success or failure
        """
        try:
            if self.model is None:
                return "❌ Failed to setup training: model has not been loaded"

            # Create output directory
            os.makedirs(config.output_dir, exist_ok=True)

            # Set up optimizer (AdamW with weight decay)
            decay_params = []
            no_decay_params = []
            for name, param in self.model.named_parameters():
                if not param.requires_grad:
                    continue
                if param.dim() == 1 or name.endswith('.bias'):
                    no_decay_params.append(param)
                else:
                    decay_params.append(param)

            param_groups = [
                {'params': decay_params, 'weight_decay': 0.01},
                {'params': no_decay_params, 'weight_decay': 0.0}
            ]

            self.optimizer = torch.optim.AdamW(
                param_groups,
                lr=config.learning_rate,
                betas=(0.9, 0.95),
                eps=1e-8
            )

            # Set up learning rate scheduler. Step counts are coerced to int
            # because UI sliders may deliver floats; T_max is kept >= 1 so a
            # run with max_steps <= warmup_steps does not divide by zero.
            warmup_steps = int(config.warmup_steps)
            cosine_steps = max(1, int(config.max_steps) - warmup_steps)

            warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
                self.optimizer,
                start_factor=0.01,
                end_factor=1.0,
                total_iters=warmup_steps
            )
            main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                self.optimizer,
                T_max=cosine_steps
            )
            self.scheduler = torch.optim.lr_scheduler.SequentialLR(
                self.optimizer,
                schedulers=[warmup_scheduler, main_scheduler],
                milestones=[warmup_steps]
            )

            print("✅ Training setup completed successfully")
            return "✅ Training setup completed successfully"
        except Exception as e:
            return f"❌ Failed to setup training: {str(e)}"

    def stop_training(self) -> str:
        """
        Ask a running ``train_model`` call to stop.

        Only sets a flag; the training loop checks it at the start of each
        batch, so the stop takes effect after the batch in flight.

        Returns:
            Status message describing what happened
        """
        if not self.is_training:
            return "ℹ️ No training is currently running"
        self.stop_requested = True
        return "⏹️ Stop requested - training will halt after the current batch"

    def train_model(self, config: TrainingConfig, progress_callback=None) -> str:
        """
        Execute the actual model training using OpenLLM's approach.

        Makes a single pass over ``self.data_loader``; training ends when
        ``config.max_steps`` optimizer steps were taken, the loader is
        exhausted, or ``stop_training`` was called, whichever comes first.

        Args:
            config: Training configuration object
            progress_callback: Optional callable invoked after every optimizer
                step with a copy of the progress dictionary

        Returns:
            Status message indicating success or failure
        """
        try:
            self.is_training = True
            self.stop_requested = False

            if self.model is None or self.data_loader is None:
                raise RuntimeError("model and dataset must be prepared before training")
            if self.optimizer is None or self.scheduler is None:
                raise RuntimeError("setup_training must be called before training")

            self.training_progress["status"] = "Training"
            self.training_progress["total_steps"] = config.max_steps

            print(f"🚀 Starting OpenLLM training for {config.max_steps} steps...")

            # Training loop using OpenLLM's approach
            self.model.train()
            accumulated_loss = 0.0
            self.optimizer.zero_grad()

            step = 0
            for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
                if step >= config.max_steps or self.stop_requested:
                    break

                # Forward pass (model computes loss internally when targets provided)
                logits, loss = self.model(input_ids, target_ids)

                # Scale loss for gradient accumulation
                loss = loss / config.gradient_accumulation_steps
                accumulated_loss += loss.item()

                # Backward pass
                loss.backward()

                # Update weights every gradient_accumulation_steps
                if (batch_idx + 1) % config.gradient_accumulation_steps != 0:
                    continue

                # Clip gradients
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                # Update parameters
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()

                # Update step count
                step += 1
                current_lr = self.scheduler.get_last_lr()[0]

                # Update progress
                self.training_progress["current_step"] = step
                self.training_progress["loss"] = accumulated_loss
                self.training_progress["learning_rate"] = current_lr
                if progress_callback is not None:
                    progress_callback(self.get_training_progress())

                # Log progress
                if step % config.logging_steps == 0:
                    print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {current_lr:.2e}")

                # Save checkpoint
                if step % config.save_steps == 0:
                    self._save_checkpoint(config.output_dir, step)

                # Reset accumulated loss
                accumulated_loss = 0.0

                # Clean up memory
                if step % 100 == 0:
                    gc.collect()

            # Final checkpoint
            self._save_checkpoint(config.output_dir, step, is_best=True)

            # Update final progress
            self.training_progress["current_step"] = step
            if self.stop_requested:
                self.training_progress["status"] = "Stopped"
                print(f"⏹️ Training stopped at step {step}")
                return f"⏹️ Training stopped at step {step}"

            self.training_progress["status"] = "Completed"
            print(f"✅ Training completed! Final step: {step}")
            return f"✅ Training completed successfully! Final step: {step}"
        except Exception as e:
            self.training_progress["status"] = "Failed"
            print(f"❌ Training failed: {e}")
            return f"❌ Training failed: {str(e)}"
        finally:
            self.is_training = False

    def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
        """
        Save model checkpoint using OpenLLM's approach.

        Writes ``checkpoint_step_{step}.pt`` into ``output_dir`` and, when
        ``is_best`` is true, the same payload as ``best_model.pt``. Errors are
        printed and swallowed so a failed save does not abort training.

        Args:
            output_dir: Directory the checkpoint files are written to
            step: Optimizer step count recorded in the checkpoint and file name
            is_best: Also write the checkpoint as ``best_model.pt``
        """
        try:
            checkpoint = {
                'step': step,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'scheduler_state_dict': self.scheduler.state_dict(),
                'config': self.model.config.__dict__
            }

            # Save latest checkpoint
            checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
            torch.save(checkpoint, checkpoint_path)

            # Save best checkpoint
            if is_best:
                best_path = os.path.join(output_dir, "best_model.pt")
                torch.save(checkpoint, best_path)
                print(f"💾 Best model saved: {best_path}")

            print(f"💾 Checkpoint saved: {checkpoint_path}")
        except Exception as e:
            print(f"❌ Failed to save checkpoint: {e}")

    def save_and_upload_model(self, config: TrainingConfig) -> str:
        """
        Save the trained model and upload it to Hugging Face Hub.

        Args:
            config: Training configuration object

        Returns:
            Status message indicating success or failure
        """
        try:
            print("🔄 Saving trained model...")

            if self.tokenizer_path is None:
                return "❌ Failed to save/upload model: tokenizer has not been loaded"

            # Save the final model, labelled with the step actually reached
            # (training may end before config.max_steps).
            final_step = self.training_progress.get("current_step", 0)
            self._save_checkpoint(config.output_dir, final_step, is_best=True)

            # Save tokenizer files
            tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
            os.makedirs(tokenizer_dir, exist_ok=True)

            # Copy the tokenizer.model file
            shutil.copy2(self.tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))

            print("✅ Model saved locally")

            # Generate model name for upload
            model_name = f"openllm-{config.model_size}-extended-8k"
            repo_id = f"lemms/{model_name}"

            # Upload to Hugging Face Hub
            if self.hf_api is None:
                return f"✅ Model saved locally to {config.output_dir}"

            print(f"🔄 Uploading model to {repo_id}...")

            # Make sure the target repository exists before uploading into it.
            self.hf_api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

            # Upload model files
            self.hf_api.upload_folder(
                folder_path=config.output_dir,
                repo_id=repo_id,
                repo_type="model",
                commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
            )

            print(f"✅ Model uploaded successfully to {repo_id}")
            return f"✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
        except Exception as e:
            print(f"❌ Failed to save/upload model: {e}")
            return f"❌ Failed to save/upload model: {str(e)}"

    def get_training_progress(self) -> Dict[str, Any]:
        """Get current training progress information."""
        return self.training_progress.copy()


def main():
    """
    Main function that creates the complete Gradio application interface.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    # Initialize the trainer
    trainer = OpenLLMTrainer()

    # Create the main Gradio application interface
    with gr.Blocks(
        title="OpenLLM Training Space - Fixed with Uploaded Modules",
        theme=gr.themes.Soft()
    ) as demo:

        # Application Header
        gr.Markdown("# 🚀 OpenLLM Training Space - Fixed with Uploaded Modules")
        gr.Markdown("### *Uses OpenLLM's Custom Model Architecture from Uploaded Files*")
        gr.Markdown("---")

        # Status Information
        gr.Markdown(f"**OpenLLM Available**: {'✅ Yes' if OPENLLM_AVAILABLE else '❌ No'}")
        gr.Markdown(f"**SentencePiece Available**: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
        gr.Markdown(f"**Dependencies Available**: {'✅ Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
        gr.Markdown("**Architecture**: ✅ OpenLLM Custom GPTModel (From Uploaded Files)")

        # Main Content Area
        with gr.Row():

            # Left Column: Training Configuration
            with gr.Column(scale=1):
                gr.Markdown("## 📊 Training Configuration")

                # Model Size Selection
                model_size = gr.Dropdown(
                    choices=["small", "medium", "large"],
                    value="small",
                    label="Model Size"
                )

                # Training Steps Configuration
                max_steps = gr.Slider(
                    minimum=100,
                    maximum=10000,
                    value=1000,
                    step=100,
                    label="Max Training Steps"
                )

                # Learning Rate Configuration
                learning_rate = gr.Slider(
                    minimum=1e-5,
                    maximum=1e-3,
                    value=3e-4,
                    step=1e-5,
                    label="Learning Rate"
                )

                # Batch Size Configuration
                batch_size = gr.Slider(
                    minimum=1,
                    maximum=16,
                    value=4,
                    step=1,
                    label="Batch Size"
                )

            # Right Column: Training Status and Controls
            with gr.Column(scale=1):
                gr.Markdown("## 🎯 Training Status")

                # Training Status Display
                status_text = gr.Textbox(
                    value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
                    label="Current Status",
                    interactive=False,
                    lines=5
                )

                # Progress Information
                progress_info = gr.JSON(
                    value=trainer.get_training_progress(),
                    label="Training Progress"
                )

                # Training Control Buttons
                with gr.Row():
                    start_btn = gr.Button("🚀 Start Training", variant="primary")
                    stop_btn = gr.Button("⏹️ Stop Training", variant="stop")

        # Instructions Section
        gr.Markdown("## 📋 OpenLLM Training Instructions")
        gr.Markdown("""
        This interface uses **OpenLLM's actual custom model architecture** from uploaded files:

        ### **Step 1: Configure Parameters**
        - **Model Size**: Select the base model to train from (small, medium, large)
        - **Max Steps**: Number of training iterations (100-10,000)
        - **Learning Rate**: Training rate (0.00001-0.001)
        - **Batch Size**: Samples per training batch (1-16)

        ### **Step 2: Start Training**
        - Click "Start Training" to begin the actual training process
        - Uses OpenLLM's custom GPTModel class from uploaded files
        - Uses sentencepiece.SentencePieceProcessor() for tokenization
        - Compatible with OpenLLM's actual implementation

        ### **Step 3: Monitor Progress**
        - Watch the status updates and progress information
        - Training may take several minutes depending on steps
        - The final model will be uploaded to Hugging Face Hub

        ### **Step 4: Access Results**
        - Trained models are automatically pushed to: `lemms/openllm-{size}-extended-8k`
        - Check the model repository for your trained model
        - Use the model for inference or further training
        """)

        # Resource Links Section
        gr.Markdown("## 🔗 Model Resources")
        gr.Markdown("""
        - [📚 7k Small Model](https://huggingface.co/lemms/openllm-small-extended-7k)
        - [🎯 8k Small Model](https://huggingface.co/lemms/openllm-small-extended-8k)
        - [📊 Training Dataset](https://huggingface.co/datasets/lemms/openllm-training-data)
        - [📖 Main Project](https://github.com/louischua/openllm)
        """)

        # Training Function Definition
        def start_complete_training(model_size, max_steps, learning_rate, batch_size):
            """
            Execute the complete training process using OpenLLM's approach.

            Runs load -> prepare dataset -> setup -> train -> save/upload and
            returns the first failing status, or the final status on success.
            """
            if not OPENLLM_AVAILABLE:
                return "❌ OpenLLM custom model architecture not available. Please check the installation."

            try:
                # Create training configuration. Slider values are coerced
                # explicitly because step counts and batch sizes must be ints.
                config = TrainingConfig(
                    model_size=model_size,
                    max_steps=int(max_steps),
                    learning_rate=float(learning_rate),
                    batch_size=int(batch_size)
                )

                # Step 1: Load model and tokenizer using OpenLLM's approach
                status = trainer.load_model_and_tokenizer(model_size)
                if "❌" in status:
                    return status

                # Step 2: Prepare dataset (with the batch size chosen in the UI)
                status = trainer.prepare_dataset(batch_size=config.batch_size)
                if "❌" in status:
                    return status

                # Step 3: Setup training
                status = trainer.setup_training(config)
                if "❌" in status:
                    return status

                # Step 4: Execute training
                status = trainer.train_model(config)
                if "❌" in status:
                    return status

                # A user-requested stop keeps the local checkpoint but skips
                # the upload of a partially trained model.
                if trainer.get_training_progress()["status"] == "Stopped":
                    return status

                # Step 5: Save and upload model
                status = trainer.save_and_upload_model(config)

                return f"🚀 Complete training process finished!\n{status}"
            except Exception as e:
                return f"❌ Training process failed: {str(e)}"

        def stop_current_training():
            """Forward the Stop button to the trainer."""
            return trainer.stop_training()

        def update_progress():
            """Update the progress display."""
            return trainer.get_training_progress()

        # Connect UI Components to Functions
        start_btn.click(
            fn=start_complete_training,
            inputs=[model_size, max_steps, learning_rate, batch_size],
            outputs=[status_text]
        )
        stop_btn.click(
            fn=stop_current_training,
            outputs=[status_text]
        )

        # Populate progress on page load, then auto-refresh every 5 seconds.
        # gr.Timer only exists in newer Gradio releases; on older ones the
        # display falls back to the single refresh at page load.
        demo.load(update_progress, outputs=[progress_info])
        if hasattr(gr, "Timer"):
            refresh_timer = gr.Timer(5)
            refresh_timer.tick(update_progress, outputs=[progress_info])

        # Application Footer
        gr.Markdown("---")
        gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
        gr.Markdown("**Architecture**: OpenLLM Custom GPTModel (From Uploaded Files)")
        gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor()")

    return demo


if __name__ == "__main__":
    demo = main()
    demo.launch()