openllm / app.py
lemms's picture
Fix: Use OpenLLM modules from uploaded files (model.py and data_loader.py)
e02f3cd verified
raw
history blame
25.4 kB
#!/usr/bin/env python3
"""
OpenLLM Training Space Application - Fixed with Uploaded Modules
This version imports OpenLLM modules from the uploaded files in the HF Space:
- Imports model.py and data_loader.py that were uploaded to the Space
- Uses OpenLLM's actual custom model architecture
- Compatible with OpenLLM's implementation
Author: Louis Chua Bean Chong
License: GPL-3.0
Version: 2.1.0
Last Updated: 2024
"""
import gradio as gr
import torch
import torch.nn as nn
import os
import time
import math
import gc
from typing import Dict, Any, Optional
import threading
from dataclasses import dataclass
from pathlib import Path
# Import OpenLLM's custom model architecture from uploaded files
try:
# Import from the uploaded files in the HF Space
from model import GPTModel, GPTConfig, create_model
from data_loader import TextDataLoader
OPENLLM_AVAILABLE = True
print("βœ… OpenLLM custom model architecture imported successfully from uploaded files")
except ImportError as e:
print(f"❌ OpenLLM imports failed: {e}")
OPENLLM_AVAILABLE = False
# Try to import sentencepiece
try:
import sentencepiece as spm
SENTENCEPIECE_AVAILABLE = True
print(f"βœ… SentencePiece available: {spm.__version__}")
except ImportError:
SENTENCEPIECE_AVAILABLE = False
print("❌ SentencePiece not available")
# Import other dependencies
try:
from datasets import load_dataset
from huggingface_hub import HfApi, hf_hub_download
DEPENDENCIES_AVAILABLE = True
except ImportError as e:
print(f"❌ Dependencies not available: {e}")
DEPENDENCIES_AVAILABLE = False
@dataclass
class TrainingConfig:
"""Configuration class for training parameters."""
model_size: str
max_steps: int
learning_rate: float
batch_size: int
output_dir: str = "./openllm-trained"
save_steps: int = 100
logging_steps: int = 10
warmup_steps: int = 50
gradient_accumulation_steps: int = 4
class OpenLLMTrainer:
"""
Complete training implementation using OpenLLM's actual architecture.
This class handles the entire training pipeline including:
- Model loading using OpenLLM's custom GPTModel
- Tokenizer loading using sentencepiece.SentencePieceProcessor()
- Dataset preparation
- Training execution using OpenLLM's approach
- Model saving and uploading
"""
def __init__(self):
"""Initialize the trainer with default settings."""
self.model = None
self.tokenizer = None
self.data_loader = None
self.optimizer = None
self.scheduler = None
self.is_training = False
self.training_progress = {
"status": "Ready",
"current_step": 0,
"total_steps": 0,
"loss": 0.0,
"learning_rate": 0.0
}
# Initialize Hugging Face API for model uploading
try:
self.hf_api = HfApi()
except Exception as e:
print(f"Failed to initialize HF API: {e}")
self.hf_api = None
def load_model_and_tokenizer(self, model_size: str) -> str:
"""
Load the pre-trained OpenLLM model and tokenizer using OpenLLM's approach.
Args:
model_size: Size of the model to load ("small", "medium", "large")
Returns:
Status message indicating success or failure
"""
try:
if not OPENLLM_AVAILABLE:
return "❌ OpenLLM custom model architecture not available"
print(f"πŸ”„ Loading OpenLLM {model_size} model using custom architecture...")
# Create model using OpenLLM's factory function
try:
self.model = create_model(model_size)
print(f"βœ… OpenLLM {model_size} model created: {type(self.model).__name__}")
print(f" Parameters: {self.model.get_num_params():,}")
except Exception as e:
print(f"❌ Failed to create model: {e}")
return f"❌ Failed to create OpenLLM model: {str(e)}"
# Load tokenizer using sentencepiece
try:
print("πŸ”„ Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
# Download tokenizer.model from HF Hub
model_name = f"lemms/openllm-{model_size}-extended-7k"
tokenizer_path = hf_hub_download(
repo_id=model_name,
filename="tokenizer.model"
)
print(f"βœ… Tokenizer downloaded to: {tokenizer_path}")
# Create SentencePieceProcessor
sp_processor = spm.SentencePieceProcessor()
sp_processor.load(tokenizer_path)
# Store tokenizer for later use
self.tokenizer = sp_processor
print(f"βœ… Tokenizer loaded successfully using SentencePieceProcessor")
print(f" Vocabulary size: {sp_processor.vocab_size()}")
except Exception as e:
print(f"❌ Failed to load tokenizer: {e}")
return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"
return f"βœ… Successfully loaded OpenLLM {model_size} model with custom architecture"
except Exception as e:
return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"
def prepare_dataset(self) -> str:
"""
Load and prepare the training dataset using OpenLLM's approach.
Returns:
Status message indicating success or failure
"""
try:
if not DEPENDENCIES_AVAILABLE:
return "❌ Required dependencies not available"
print("πŸ”„ Loading training dataset...")
# Load dataset from HF Hub
dataset = load_dataset("lemms/openllm-training-data")
print(f"βœ… Dataset loaded: {len(dataset['train'])} samples")
# Create temporary data file for OpenLLM's TextDataLoader
temp_data_file = "temp_training_data.txt"
with open(temp_data_file, 'w', encoding='utf-8') as f:
for item in dataset['train']:
f.write(item['text'] + '\n')
print(f"βœ… Temporary data file created: {temp_data_file}")
# Create OpenLLM's TextDataLoader
try:
# Get tokenizer path
tokenizer_path = self.tokenizer.model_file_path
self.data_loader = TextDataLoader(
data_file=temp_data_file,
tokenizer_path=tokenizer_path,
seq_len=512,
batch_size=4, # Will be overridden by training config
shuffle=True
)
print(f"βœ… OpenLLM TextDataLoader created successfully")
except Exception as e:
print(f"❌ Failed to create TextDataLoader: {e}")
return f"❌ Failed to create data loader: {str(e)}"
return f"βœ… Successfully prepared dataset with {len(dataset['train'])} samples"
except Exception as e:
return f"❌ Failed to prepare dataset: {str(e)}"
def setup_training(self, config: TrainingConfig) -> str:
"""
Set up the training configuration using OpenLLM's approach.
Args:
config: Training configuration object
Returns:
Status message indicating success or failure
"""
try:
# Create output directory
os.makedirs(config.output_dir, exist_ok=True)
# Set up optimizer (AdamW with weight decay)
decay_params = []
no_decay_params = []
for name, param in self.model.named_parameters():
if not param.requires_grad:
continue
if len(param.shape) == 1 or name.endswith('.bias'):
no_decay_params.append(param)
else:
decay_params.append(param)
param_groups = [
{'params': decay_params, 'weight_decay': 0.01},
{'params': no_decay_params, 'weight_decay': 0.0}
]
self.optimizer = torch.optim.AdamW(
param_groups,
lr=config.learning_rate,
betas=(0.9, 0.95),
eps=1e-8
)
# Set up learning rate scheduler
warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
self.optimizer,
start_factor=0.01,
end_factor=1.0,
total_iters=config.warmup_steps
)
main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
self.optimizer,
T_max=config.max_steps - config.warmup_steps
)
self.scheduler = torch.optim.lr_scheduler.SequentialLR(
self.optimizer,
schedulers=[warmup_scheduler, main_scheduler],
milestones=[config.warmup_steps]
)
print("βœ… Training setup completed successfully")
return f"βœ… Training setup completed successfully"
except Exception as e:
return f"❌ Failed to setup training: {str(e)}"
def train_model(self, config: TrainingConfig, progress_callback=None) -> str:
"""
Execute the actual model training using OpenLLM's approach.
Args:
config: Training configuration object
progress_callback: Optional callback function for progress updates
Returns:
Status message indicating success or failure
"""
try:
self.is_training = True
self.training_progress["status"] = "Training"
self.training_progress["total_steps"] = config.max_steps
print(f"πŸš€ Starting OpenLLM training for {config.max_steps} steps...")
# Training loop using OpenLLM's approach
self.model.train()
accumulated_loss = 0.0
self.optimizer.zero_grad()
step = 0
for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
if step >= config.max_steps:
break
# Forward pass (model computes loss internally when targets provided)
logits, loss = self.model(input_ids, target_ids)
# Scale loss for gradient accumulation
loss = loss / config.gradient_accumulation_steps
accumulated_loss += loss.item()
# Backward pass
loss.backward()
# Update weights every gradient_accumulation_steps
if (batch_idx + 1) % config.gradient_accumulation_steps == 0:
# Clip gradients
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
# Update parameters
self.optimizer.step()
self.scheduler.step()
self.optimizer.zero_grad()
# Update step count
step += 1
# Update progress
self.training_progress["current_step"] = step
self.training_progress["loss"] = accumulated_loss
self.training_progress["learning_rate"] = self.scheduler.get_last_lr()[0]
# Log progress
if step % config.logging_steps == 0:
print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {self.scheduler.get_last_lr()[0]:.2e}")
# Save checkpoint
if step % config.save_steps == 0:
self._save_checkpoint(config.output_dir, step)
# Reset accumulated loss
accumulated_loss = 0.0
# Clean up memory
if step % 100 == 0:
gc.collect()
# Final checkpoint
self._save_checkpoint(config.output_dir, step, is_best=True)
# Update final progress
self.training_progress["status"] = "Completed"
self.training_progress["current_step"] = step
print(f"βœ… Training completed! Final step: {step}")
return f"βœ… Training completed successfully! Final step: {step}"
except Exception as e:
self.training_progress["status"] = "Failed"
print(f"❌ Training failed: {e}")
return f"❌ Training failed: {str(e)}"
finally:
self.is_training = False
def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
"""Save model checkpoint using OpenLLM's approach."""
try:
checkpoint = {
'step': step,
'model_state_dict': self.model.state_dict(),
'optimizer_state_dict': self.optimizer.state_dict(),
'scheduler_state_dict': self.scheduler.state_dict(),
'config': self.model.config.__dict__
}
# Save latest checkpoint
checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
torch.save(checkpoint, checkpoint_path)
# Save best checkpoint
if is_best:
best_path = os.path.join(output_dir, "best_model.pt")
torch.save(checkpoint, best_path)
print(f"πŸ’Ύ Best model saved: {best_path}")
print(f"πŸ’Ύ Checkpoint saved: {checkpoint_path}")
except Exception as e:
print(f"❌ Failed to save checkpoint: {e}")
def save_and_upload_model(self, config: TrainingConfig) -> str:
"""
Save the trained model and upload it to Hugging Face Hub.
Args:
config: Training configuration object
Returns:
Status message indicating success or failure
"""
try:
print("πŸ”„ Saving trained model...")
# Save the final model
self._save_checkpoint(config.output_dir, config.max_steps, is_best=True)
# Save tokenizer files
tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
os.makedirs(tokenizer_dir, exist_ok=True)
# Copy the tokenizer.model file
import shutil
shutil.copy2(self.tokenizer.model_file_path, os.path.join(tokenizer_dir, "tokenizer.model"))
print("βœ… Model saved locally")
# Generate model name for upload
model_name = f"openllm-{config.model_size}-extended-8k"
repo_id = f"lemms/{model_name}"
# Upload to Hugging Face Hub
if self.hf_api:
print(f"πŸ”„ Uploading model to {repo_id}...")
# Upload model files
self.hf_api.upload_folder(
folder_path=config.output_dir,
repo_id=repo_id,
repo_type="model",
commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
)
print(f"βœ… Model uploaded successfully to {repo_id}")
return f"βœ… Model saved and uploaded to https://huggingface.co/{repo_id}"
else:
return f"βœ… Model saved locally to {config.output_dir}"
except Exception as e:
print(f"❌ Failed to save/upload model: {e}")
return f"❌ Failed to save/upload model: {str(e)}"
def get_training_progress(self) -> Dict[str, Any]:
"""Get current training progress information."""
return self.training_progress.copy()
def main():
"""
Main function that creates the complete Gradio application interface.
"""
# Initialize the trainer
trainer = OpenLLMTrainer()
# Create the main Gradio application interface
with gr.Blocks(
title="OpenLLM Training Space - Fixed with Uploaded Modules",
theme=gr.themes.Soft()
) as demo:
# Application Header
gr.Markdown("# πŸš€ OpenLLM Training Space - Fixed with Uploaded Modules")
gr.Markdown("### *Uses OpenLLM's Custom Model Architecture from Uploaded Files*")
gr.Markdown("---")
# Status Information
gr.Markdown(f"**OpenLLM Available**: {'βœ… Yes' if OPENLLM_AVAILABLE else '❌ No'}")
gr.Markdown(f"**SentencePiece Available**: {'βœ… Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
gr.Markdown(f"**Dependencies Available**: {'βœ… Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
gr.Markdown("**Architecture**: βœ… OpenLLM Custom GPTModel (From Uploaded Files)")
# Main Content Area
with gr.Row():
# Left Column: Training Configuration
with gr.Column(scale=1):
gr.Markdown("## πŸ“Š Training Configuration")
# Model Size Selection
model_size = gr.Dropdown(
choices=["small", "medium", "large"],
value="small",
label="Model Size"
)
# Training Steps Configuration
max_steps = gr.Slider(
minimum=100,
maximum=10000,
value=1000,
step=100,
label="Max Training Steps"
)
# Learning Rate Configuration
learning_rate = gr.Slider(
minimum=1e-5,
maximum=1e-3,
value=3e-4,
step=1e-5,
label="Learning Rate"
)
# Batch Size Configuration
batch_size = gr.Slider(
minimum=1,
maximum=16,
value=4,
step=1,
label="Batch Size"
)
# Right Column: Training Status and Controls
with gr.Column(scale=1):
gr.Markdown("## 🎯 Training Status")
# Training Status Display
status_text = gr.Textbox(
value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
label="Current Status",
interactive=False,
lines=5
)
# Progress Information
progress_info = gr.JSON(
value=trainer.get_training_progress(),
label="Training Progress"
)
# Training Control Buttons
with gr.Row():
start_btn = gr.Button("πŸš€ Start Training", variant="primary")
stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
# Instructions Section
gr.Markdown("## πŸ“‹ OpenLLM Training Instructions")
gr.Markdown("""
This interface uses **OpenLLM's actual custom model architecture** from uploaded files:
### **Step 1: Configure Parameters**
- **Model Size**: Select the base model to train from (small, medium, large)
- **Max Steps**: Number of training iterations (100-10,000)
- **Learning Rate**: Training rate (0.00001-0.001)
- **Batch Size**: Samples per training batch (1-16)
### **Step 2: Start Training**
- Click "Start Training" to begin the actual training process
- Uses OpenLLM's custom GPTModel class from uploaded files
- Uses sentencepiece.SentencePieceProcessor() for tokenization
- Compatible with OpenLLM's actual implementation
### **Step 3: Monitor Progress**
- Watch the status updates and progress information
- Training may take several minutes depending on steps
- The final model will be uploaded to Hugging Face Hub
### **Step 4: Access Results**
- Trained models are automatically pushed to: `lemms/openllm-{size}-extended-8k`
- Check the model repository for your trained model
- Use the model for inference or further training
""")
# Resource Links Section
gr.Markdown("## πŸ”— Model Resources")
gr.Markdown("""
- [πŸ“š 7k Small Model](https://huggingface.co/lemms/openllm-small-extended-7k)
- [🎯 8k Small Model](https://huggingface.co/lemms/openllm-small-extended-8k)
- [πŸ“Š Training Dataset](https://huggingface.co/datasets/lemms/openllm-training-data)
- [πŸ“– Main Project](https://github.com/louischua/openllm)
""")
# Training Function Definition
def start_complete_training(model_size, max_steps, learning_rate, batch_size):
"""
Execute the complete training process using OpenLLM's approach.
"""
if not OPENLLM_AVAILABLE:
return "❌ OpenLLM custom model architecture not available. Please check the installation."
try:
# Create training configuration
config = TrainingConfig(
model_size=model_size,
max_steps=max_steps,
learning_rate=learning_rate,
batch_size=batch_size
)
# Step 1: Load model and tokenizer using OpenLLM's approach
status = trainer.load_model_and_tokenizer(model_size)
if "❌" in status:
return status
# Step 2: Prepare dataset
status = trainer.prepare_dataset()
if "❌" in status:
return status
# Step 3: Setup training
status = trainer.setup_training(config)
if "❌" in status:
return status
# Step 4: Execute training
status = trainer.train_model(config)
if "❌" in status:
return status
# Step 5: Save and upload model
status = trainer.save_and_upload_model(config)
return f"πŸš€ Complete training process finished!\n{status}"
except Exception as e:
return f"❌ Training process failed: {str(e)}"
def update_progress():
"""Update the progress display."""
return trainer.get_training_progress()
# Connect UI Components to Functions
start_btn.click(
fn=start_complete_training,
inputs=[model_size, max_steps, learning_rate, batch_size],
outputs=[status_text]
)
# Auto-refresh progress every 5 seconds during training
demo.load(update_progress, outputs=[progress_info])
# Application Footer
gr.Markdown("---")
gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
gr.Markdown("**Architecture**: OpenLLM Custom GPTModel (From Uploaded Files)")
gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor()")
return demo
if __name__ == "__main__":
demo = main()
demo.launch()