Spaces:

lemms
/

openllm

Runtime error

App Files Files Community

openllm / app.py

lemms

Fix: Use OpenLLM modules from uploaded files (model.py and data_loader.py)

e02f3cd verified 5 months ago

raw

history blame

25.4 kB

	#!/usr/bin/env python3
	"""
	OpenLLM Training Space Application - Fixed with Uploaded Modules

	This version imports OpenLLM modules from the uploaded files in the HF Space:
	- Imports model.py and data_loader.py that were uploaded to the Space
	- Uses OpenLLM's actual custom model architecture
	- Compatible with OpenLLM's implementation

	Author: Louis Chua Bean Chong
	License: GPL-3.0
	Version: 2.1.0
	Last Updated: 2024
	"""

	import gradio as gr
	import torch
	import torch.nn as nn
	import os
	import time
	import math
	import gc
	from typing import Dict, Any, Optional
	import threading
	from dataclasses import dataclass
	from pathlib import Path

	# Import OpenLLM's custom model architecture from uploaded files
	try:
	# Import from the uploaded files in the HF Space
	from model import GPTModel, GPTConfig, create_model
	from data_loader import TextDataLoader
	OPENLLM_AVAILABLE = True
	print("✅ OpenLLM custom model architecture imported successfully from uploaded files")
	except ImportError as e:
	print(f"❌ OpenLLM imports failed: {e}")
	OPENLLM_AVAILABLE = False

	# Try to import sentencepiece
	try:
	import sentencepiece as spm
	SENTENCEPIECE_AVAILABLE = True
	print(f"✅ SentencePiece available: {spm.__version__}")
	except ImportError:
	SENTENCEPIECE_AVAILABLE = False
	print("❌ SentencePiece not available")

	# Import other dependencies
	try:
	from datasets import load_dataset
	from huggingface_hub import HfApi, hf_hub_download
	DEPENDENCIES_AVAILABLE = True
	except ImportError as e:
	print(f"❌ Dependencies not available: {e}")
	DEPENDENCIES_AVAILABLE = False

	@dataclass
	class TrainingConfig:
	    """
	    Hyperparameters and bookkeeping settings for a single training run.

	    Numeric fields are normalised in ``__post_init__`` because UI widgets
	    (e.g. sliders) may hand over floats such as ``1000.0`` where the training
	    code needs true integers for modulo tests, scheduler lengths and
	    checkpoint file names.

	    Raises:
	        ValueError: If a step count, interval or batch size is smaller than 1,
	            if ``warmup_steps`` is negative, or if ``learning_rate`` is not
	            strictly positive.
	    """
	    # Required settings.
	    model_size: str        # model variant name, e.g. "small"
	    max_steps: int         # number of optimizer steps to run
	    learning_rate: float   # peak learning rate handed to the optimizer
	    batch_size: int        # samples per batch
	    # Optional settings.
	    output_dir: str = "./openllm-trained"   # where checkpoints are written
	    save_steps: int = 100                    # checkpoint interval (optimizer steps)
	    logging_steps: int = 10                  # console logging interval (optimizer steps)
	    warmup_steps: int = 50                   # length of the linear warmup phase
	    gradient_accumulation_steps: int = 4     # batches accumulated per optimizer step

	    def __post_init__(self) -> None:
	        """Coerce numeric fields to their declared types and validate ranges."""
	        positive_int_fields = (
	            "max_steps",
	            "batch_size",
	            "save_steps",
	            "logging_steps",
	            "gradient_accumulation_steps",
	        )
	        for field_name in positive_int_fields + ("warmup_steps",):
	            setattr(self, field_name, int(getattr(self, field_name)))
	        self.learning_rate = float(self.learning_rate)

	        # These values are used as divisors / modulo operands during training,
	        # so zero would otherwise surface later as a ZeroDivisionError.
	        for field_name in positive_int_fields:
	            if getattr(self, field_name) < 1:
	                raise ValueError(f"{field_name} must be >= 1, got {getattr(self, field_name)}")
	        if self.warmup_steps < 0:
	            raise ValueError(f"warmup_steps must be >= 0, got {self.warmup_steps}")
	        if self.learning_rate <= 0:
	            raise ValueError(f"learning_rate must be > 0, got {self.learning_rate}")

	class OpenLLMTrainer:
	    """
	    End-to-end training pipeline built on OpenLLM's custom architecture.

	    Responsibilities:
	    - Model creation through OpenLLM's ``create_model`` factory
	    - Tokenizer loading via ``sentencepiece.SentencePieceProcessor``
	    - Dataset preparation for OpenLLM's ``TextDataLoader``
	    - Training with gradient accumulation, linear warmup and cosine decay
	    - Checkpoint saving and upload to the Hugging Face Hub

	    Every public step returns a human-readable status string; failures are
	    reported with a leading "❌" instead of raising, which is what the UI
	    layer checks for.
	    """

	    def __init__(self):
	        """Initialize the trainer with default settings."""
	        self.model = None
	        self.tokenizer = None
	        # Filesystem path of the downloaded ``tokenizer.model``. Tracked here
	        # because the SentencePiece processor has no documented attribute for
	        # the path it was loaded from.
	        self.tokenizer_path = None
	        self.data_loader = None
	        self.optimizer = None
	        self.scheduler = None
	        self.is_training = False
	        self.training_progress = {
	            "status": "Ready",
	            "current_step": 0,
	            "total_steps": 0,
	            "loss": 0.0,
	            "learning_rate": 0.0
	        }

	        # Initialize Hugging Face API for model uploading. Any failure (including
	        # the optional dependency being absent) just disables uploading.
	        try:
	            self.hf_api = HfApi()
	        except Exception as e:
	            print(f"Failed to initialize HF API: {e}")
	            self.hf_api = None

	    def _resolve_tokenizer_path(self) -> Optional[str]:
	        """Return the path of the tokenizer model file, or None if unknown."""
	        stored_path = getattr(self, "tokenizer_path", None)
	        if stored_path:
	            return stored_path
	        # Fallback for tokenizer objects that do carry their own path attribute.
	        return getattr(self.tokenizer, "model_file_path", None)

	    def load_model_and_tokenizer(self, model_size: str) -> str:
	        """
	        Create the OpenLLM model and load its SentencePiece tokenizer.

	        Args:
	            model_size: Size of the model to load ("small", "medium", "large")

	        Returns:
	            Status message indicating success or failure
	        """
	        try:
	            if not OPENLLM_AVAILABLE:
	                return "❌ OpenLLM custom model architecture not available"

	            print(f"🔄 Loading OpenLLM {model_size} model using custom architecture...")

	            # Create model using OpenLLM's factory function
	            try:
	                self.model = create_model(model_size)
	                print(f"✅ OpenLLM {model_size} model created: {type(self.model).__name__}")
	                print(f" Parameters: {self.model.get_num_params():,}")
	            except Exception as e:
	                print(f"❌ Failed to create model: {e}")
	                return f"❌ Failed to create OpenLLM model: {str(e)}"

	            # Load tokenizer using sentencepiece
	            try:
	                print("🔄 Loading tokenizer using sentencepiece.SentencePieceProcessor()...")

	                # Download tokenizer.model from HF Hub
	                model_name = f"lemms/openllm-{model_size}-extended-7k"
	                tokenizer_path = hf_hub_download(
	                    repo_id=model_name,
	                    filename="tokenizer.model"
	                )
	                print(f"✅ Tokenizer downloaded to: {tokenizer_path}")

	                # Create SentencePieceProcessor
	                sp_processor = spm.SentencePieceProcessor()
	                sp_processor.load(tokenizer_path)

	                # Keep both the processor and the file path: the path is needed
	                # later by the data loader and when exporting the tokenizer.
	                self.tokenizer = sp_processor
	                self.tokenizer_path = tokenizer_path

	                print("✅ Tokenizer loaded successfully using SentencePieceProcessor")
	                print(f" Vocabulary size: {sp_processor.vocab_size()}")

	            except Exception as e:
	                print(f"❌ Failed to load tokenizer: {e}")
	                return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"

	            return f"✅ Successfully loaded OpenLLM {model_size} model with custom architecture"

	        except Exception as e:
	            return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"

	    def prepare_dataset(self, batch_size: int = 4) -> str:
	        """
	        Download the training dataset and wrap it in OpenLLM's TextDataLoader.

	        The dataset's ``text`` column is written line by line to a temporary
	        text file, which is then handed to ``TextDataLoader`` together with
	        the tokenizer model path.

	        Args:
	            batch_size: Batch size passed to ``TextDataLoader``. Defaults to 4,
	                the value that was previously hard-coded.

	        Returns:
	            Status message indicating success or failure
	        """
	        try:
	            if not DEPENDENCIES_AVAILABLE:
	                return "❌ Required dependencies not available"

	            # Fail before the (potentially large) dataset download when the
	            # tokenizer has not been loaded yet.
	            tokenizer_path = self._resolve_tokenizer_path()
	            if tokenizer_path is None:
	                return "❌ Failed to create data loader: tokenizer has not been loaded"

	            print("🔄 Loading training dataset...")

	            # Load dataset from HF Hub
	            dataset = load_dataset("lemms/openllm-training-data")
	            train_split = dataset['train']
	            print(f"✅ Dataset loaded: {len(train_split)} samples")

	            # Create temporary data file for OpenLLM's TextDataLoader
	            temp_data_file = "temp_training_data.txt"
	            with open(temp_data_file, 'w', encoding='utf-8') as f:
	                f.writelines(item['text'] + '\n' for item in train_split)

	            print(f"✅ Temporary data file created: {temp_data_file}")

	            # Create OpenLLM's TextDataLoader
	            try:
	                self.data_loader = TextDataLoader(
	                    data_file=temp_data_file,
	                    tokenizer_path=tokenizer_path,
	                    seq_len=512,
	                    batch_size=int(batch_size),
	                    shuffle=True
	                )
	                print("✅ OpenLLM TextDataLoader created successfully")

	            except Exception as e:
	                print(f"❌ Failed to create TextDataLoader: {e}")
	                return f"❌ Failed to create data loader: {str(e)}"

	            return f"✅ Successfully prepared dataset with {len(train_split)} samples"

	        except Exception as e:
	            return f"❌ Failed to prepare dataset: {str(e)}"

	    def setup_training(self, config: "TrainingConfig") -> str:
	        """
	        Create the output directory, the AdamW optimizer and the LR schedule.

	        Parameters with a 1-D shape or a name ending in ``.bias`` are excluded
	        from weight decay. The schedule is a linear warmup followed by cosine
	        annealing.

	        Args:
	            config: Training configuration object

	        Returns:
	            Status message indicating success or failure
	        """
	        try:
	            # Create output directory
	            os.makedirs(config.output_dir, exist_ok=True)

	            # Split trainable parameters into decayed / non-decayed groups.
	            decay_params = []
	            no_decay_params = []
	            for name, param in self.model.named_parameters():
	                if not param.requires_grad:
	                    continue
	                if len(param.shape) == 1 or name.endswith('.bias'):
	                    no_decay_params.append(param)
	                else:
	                    decay_params.append(param)

	            param_groups = [
	                {'params': decay_params, 'weight_decay': 0.01},
	                {'params': no_decay_params, 'weight_decay': 0.0}
	            ]

	            self.optimizer = torch.optim.AdamW(
	                param_groups,
	                lr=config.learning_rate,
	                betas=(0.9, 0.95),
	                eps=1e-8
	            )

	            # Clamp the phase lengths so that short runs (max_steps <= warmup_steps)
	            # never produce a zero or negative cosine period.
	            total_steps = max(1, int(config.max_steps))
	            warmup_steps = max(1, min(int(config.warmup_steps), total_steps - 1))
	            decay_steps = max(1, total_steps - warmup_steps)

	            # Set up learning rate scheduler
	            warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
	                self.optimizer,
	                start_factor=0.01,
	                end_factor=1.0,
	                total_iters=warmup_steps
	            )
	            main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
	                self.optimizer,
	                T_max=decay_steps
	            )
	            self.scheduler = torch.optim.lr_scheduler.SequentialLR(
	                self.optimizer,
	                schedulers=[warmup_scheduler, main_scheduler],
	                milestones=[warmup_steps]
	            )

	            print("✅ Training setup completed successfully")
	            return "✅ Training setup completed successfully"

	        except Exception as e:
	            return f"❌ Failed to setup training: {str(e)}"

	    def train_model(self, config: "TrainingConfig", progress_callback=None) -> str:
	        """
	        Run the training loop with gradient accumulation.

	        One optimizer step is taken every ``gradient_accumulation_steps``
	        batches. The loop makes a single pass over ``self.data_loader`` and
	        stops early once ``max_steps`` optimizer steps have been taken.

	        Args:
	            config: Training configuration object
	            progress_callback: Optional callback function for progress updates
	                (accepted for interface compatibility; currently not invoked)

	        Returns:
	            Status message indicating success or failure
	        """
	        try:
	            self.is_training = True
	            self.training_progress["status"] = "Training"
	            self.training_progress["total_steps"] = config.max_steps

	            print(f"🚀 Starting OpenLLM training for {config.max_steps} steps...")

	            self.model.train()
	            accumulated_loss = 0.0
	            self.optimizer.zero_grad()

	            step = 0
	            for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
	                if step >= config.max_steps:
	                    break

	                # Forward pass (model computes loss internally when targets provided)
	                logits, loss = self.model(input_ids, target_ids)

	                # Scale loss so the accumulated gradient matches one large batch.
	                loss = loss / config.gradient_accumulation_steps
	                accumulated_loss += loss.item()

	                # Backward pass
	                loss.backward()

	                # Only update weights once enough batches have been accumulated.
	                if (batch_idx + 1) % config.gradient_accumulation_steps != 0:
	                    continue

	                # Clip gradients
	                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

	                # Update parameters
	                self.optimizer.step()
	                self.scheduler.step()
	                self.optimizer.zero_grad()
	                step += 1

	                # Update progress
	                current_lr = self.scheduler.get_last_lr()[0]
	                self.training_progress["current_step"] = step
	                self.training_progress["loss"] = accumulated_loss
	                self.training_progress["learning_rate"] = current_lr

	                # Log progress
	                if step % config.logging_steps == 0:
	                    print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {current_lr:.2e}")

	                # Save checkpoint
	                if step % config.save_steps == 0:
	                    self._save_checkpoint(config.output_dir, step)

	                # Reset accumulated loss
	                accumulated_loss = 0.0

	                # Clean up memory (checked once per optimizer step rather than
	                # once per batch, so a collection is not repeated while the
	                # step counter sits on the same multiple of 100).
	                if step % 100 == 0:
	                    gc.collect()

	            if step < config.max_steps:
	                # The data loader ran out before the requested number of steps.
	                print(f"⚠️ Data exhausted after {step} of {config.max_steps} steps")

	            # Final checkpoint
	            self._save_checkpoint(config.output_dir, step, is_best=True)

	            # Update final progress
	            self.training_progress["status"] = "Completed"
	            self.training_progress["current_step"] = step

	            print(f"✅ Training completed! Final step: {step}")

	            return f"✅ Training completed successfully! Final step: {step}"

	        except Exception as e:
	            self.training_progress["status"] = "Failed"
	            print(f"❌ Training failed: {e}")
	            return f"❌ Training failed: {str(e)}"
	        finally:
	            self.is_training = False

	    def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
	        """
	        Write model, optimizer and scheduler state to ``output_dir``.

	        Errors are reported on stdout and otherwise ignored (best effort), so
	        a failed save never aborts training.
	        """
	        try:
	            os.makedirs(output_dir, exist_ok=True)

	            checkpoint = {
	                'step': step,
	                'model_state_dict': self.model.state_dict(),
	                'optimizer_state_dict': self.optimizer.state_dict(),
	                'scheduler_state_dict': self.scheduler.state_dict(),
	                'config': self.model.config.__dict__
	            }

	            # Save latest checkpoint
	            checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
	            torch.save(checkpoint, checkpoint_path)

	            # Save best checkpoint
	            if is_best:
	                best_path = os.path.join(output_dir, "best_model.pt")
	                torch.save(checkpoint, best_path)
	                print(f"💾 Best model saved: {best_path}")

	            print(f"💾 Checkpoint saved: {checkpoint_path}")

	        except Exception as e:
	            print(f"❌ Failed to save checkpoint: {e}")

	    def save_and_upload_model(self, config: "TrainingConfig") -> str:
	        """
	        Save the trained model and upload it to Hugging Face Hub.

	        Args:
	            config: Training configuration object

	        Returns:
	            Status message indicating success or failure
	        """
	        try:
	            print("🔄 Saving trained model...")

	            # Label the final checkpoint with the step actually reached; fall
	            # back to max_steps when no training progress has been recorded.
	            final_step = self.training_progress.get("current_step") or config.max_steps
	            self._save_checkpoint(config.output_dir, final_step, is_best=True)

	            # Save tokenizer files
	            tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
	            os.makedirs(tokenizer_dir, exist_ok=True)

	            tokenizer_path = self._resolve_tokenizer_path()
	            if tokenizer_path is None:
	                raise RuntimeError("tokenizer model path is unknown; load the tokenizer first")

	            # Copy the tokenizer.model file
	            import shutil
	            shutil.copy2(tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))

	            print("✅ Model saved locally")

	            # Without an API client there is nothing to upload.
	            if not self.hf_api:
	                return f"✅ Model saved locally to {config.output_dir}"

	            # Generate model name for upload
	            model_name = f"openllm-{config.model_size}-extended-8k"
	            repo_id = f"lemms/{model_name}"

	            print(f"🔄 Uploading model to {repo_id}...")

	            # Make sure the target repository exists before uploading into it.
	            self.hf_api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

	            # Upload model files
	            self.hf_api.upload_folder(
	                folder_path=config.output_dir,
	                repo_id=repo_id,
	                repo_type="model",
	                commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
	            )

	            print(f"✅ Model uploaded successfully to {repo_id}")
	            return f"✅ Model saved and uploaded to https://huggingface.co/{repo_id}"

	        except Exception as e:
	            print(f"❌ Failed to save/upload model: {e}")
	            return f"❌ Failed to save/upload model: {str(e)}"

	    def get_training_progress(self) -> Dict[str, Any]:
	        """Return a shallow copy of the current training progress information."""
	        return dict(self.training_progress)

	def main():
	    """
	    Build the complete Gradio application interface.

	    Creates one ``OpenLLMTrainer`` shared by all callbacks, lays out the
	    configuration and status widgets, wires the "Start Training" button to
	    the full load → prepare → setup → train → upload pipeline, and refreshes
	    the progress panel periodically.

	    Returns:
	        The assembled ``gr.Blocks`` application (not yet launched).
	    """

	    # Initialize the trainer
	    trainer = OpenLLMTrainer()

	    # Set when the legacy ``every=`` polling path is used, which needs the queue.
	    needs_queue = False

	    # Create the main Gradio application interface
	    with gr.Blocks(
	        title="OpenLLM Training Space - Fixed with Uploaded Modules",
	        theme=gr.themes.Soft()
	    ) as demo:

	        # Application Header
	        gr.Markdown("# 🚀 OpenLLM Training Space - Fixed with Uploaded Modules")
	        gr.Markdown("### Uses OpenLLM's Custom Model Architecture from Uploaded Files")
	        gr.Markdown("---")

	        # Status Information
	        gr.Markdown(f"OpenLLM Available: {'✅ Yes' if OPENLLM_AVAILABLE else '❌ No'}")
	        gr.Markdown(f"SentencePiece Available: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
	        gr.Markdown(f"Dependencies Available: {'✅ Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
	        gr.Markdown("Architecture: ✅ OpenLLM Custom GPTModel (From Uploaded Files)")

	        # Main Content Area
	        with gr.Row():

	            # Left Column: Training Configuration
	            with gr.Column(scale=1):
	                gr.Markdown("## 📊 Training Configuration")

	                # Model Size Selection
	                model_size = gr.Dropdown(
	                    choices=["small", "medium", "large"],
	                    value="small",
	                    label="Model Size"
	                )

	                # Training Steps Configuration
	                max_steps = gr.Slider(
	                    minimum=100,
	                    maximum=10000,
	                    value=1000,
	                    step=100,
	                    label="Max Training Steps"
	                )

	                # Learning Rate Configuration
	                learning_rate = gr.Slider(
	                    minimum=1e-5,
	                    maximum=1e-3,
	                    value=3e-4,
	                    step=1e-5,
	                    label="Learning Rate"
	                )

	                # Batch Size Configuration
	                batch_size = gr.Slider(
	                    minimum=1,
	                    maximum=16,
	                    value=4,
	                    step=1,
	                    label="Batch Size"
	                )

	            # Right Column: Training Status and Controls
	            with gr.Column(scale=1):
	                gr.Markdown("## 🎯 Training Status")

	                # Training Status Display
	                status_text = gr.Textbox(
	                    value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
	                    label="Current Status",
	                    interactive=False,
	                    lines=5
	                )

	                # Progress Information
	                progress_info = gr.JSON(
	                    value=trainer.get_training_progress(),
	                    label="Training Progress"
	                )

	                # Training Control Buttons
	                with gr.Row():
	                    start_btn = gr.Button("🚀 Start Training", variant="primary")
	                    # NOTE(review): stop_btn has no click handler in this function,
	                    # so pressing it currently does nothing.
	                    stop_btn = gr.Button("⏹️ Stop Training", variant="stop")

	        # Instructions Section
	        gr.Markdown("## 📋 OpenLLM Training Instructions")
	        gr.Markdown("""
	        This interface uses OpenLLM's actual custom model architecture from uploaded files:

	        ### Step 1: Configure Parameters
	        - Model Size: Select the base model to train from (small, medium, large)
	        - Max Steps: Number of training iterations (100-10,000)
	        - Learning Rate: Training rate (0.00001-0.001)
	        - Batch Size: Samples per training batch (1-16)

	        ### Step 2: Start Training
	        - Click "Start Training" to begin the actual training process
	        - Uses OpenLLM's custom GPTModel class from uploaded files
	        - Uses sentencepiece.SentencePieceProcessor() for tokenization
	        - Compatible with OpenLLM's actual implementation

	        ### Step 3: Monitor Progress
	        - Watch the status updates and progress information
	        - Training may take several minutes depending on steps
	        - The final model will be uploaded to Hugging Face Hub

	        ### Step 4: Access Results
	        - Trained models are automatically pushed to: `lemms/openllm-{size}-extended-8k`
	        - Check the model repository for your trained model
	        - Use the model for inference or further training
	        """)

	        # Resource Links Section
	        gr.Markdown("## 🔗 Model Resources")
	        gr.Markdown("""
	        - [📚 7k Small Model](https://huggingface.co/lemms/openllm-small-extended-7k)
	        - [🎯 8k Small Model](https://huggingface.co/lemms/openllm-small-extended-8k)
	        - [📊 Training Dataset](https://huggingface.co/datasets/lemms/openllm-training-data)
	        - [📖 Main Project](https://github.com/louischua/openllm)
	        """)

	        # Training Function Definition
	        def start_complete_training(model_size, max_steps, learning_rate, batch_size):
	            """
	            Run the whole pipeline and return the final (or first failing) status.

	            Each trainer step reports failure with a "❌" in its status string;
	            the pipeline stops at the first such status and shows it to the user.
	            """
	            if not OPENLLM_AVAILABLE:
	                return "❌ OpenLLM custom model architecture not available. Please check the installation."

	            try:
	                # Slider values may arrive as floats (e.g. 1000.0); the trainer
	                # needs real integers for step counts and checkpoint names.
	                config = TrainingConfig(
	                    model_size=model_size,
	                    max_steps=int(max_steps),
	                    learning_rate=float(learning_rate),
	                    batch_size=int(batch_size)
	                )

	                # Steps 1-4: load, prepare, set up, train. Abort on first failure.
	                pipeline = (
	                    lambda: trainer.load_model_and_tokenizer(model_size),
	                    trainer.prepare_dataset,
	                    lambda: trainer.setup_training(config),
	                    lambda: trainer.train_model(config),
	                )
	                for run_stage in pipeline:
	                    status = run_stage()
	                    if "❌" in status:
	                        return status

	                # Step 5: Save and upload model
	                status = trainer.save_and_upload_model(config)

	                return f"🚀 Complete training process finished!\n{status}"

	            except Exception as e:
	                return f"❌ Training process failed: {str(e)}"

	        def update_progress():
	            """Return the trainer's current progress for the JSON panel."""
	            return trainer.get_training_progress()

	        # Connect UI Components to Functions
	        start_btn.click(
	            fn=start_complete_training,
	            inputs=[model_size, max_steps, learning_rate, batch_size],
	            outputs=[status_text]
	        )

	        # Auto-refresh progress every 5 seconds during training.
	        # A plain ``demo.load`` only fires once per page load, so add polling:
	        # gr.Timer where the installed Gradio provides it, otherwise the older
	        # ``every=`` argument on the load event.
	        if hasattr(gr, "Timer"):
	            demo.load(update_progress, outputs=[progress_info])
	            gr.Timer(5).tick(update_progress, outputs=[progress_info])
	        else:
	            demo.load(update_progress, outputs=[progress_info], every=5)
	            needs_queue = True

	        # Application Footer
	        gr.Markdown("---")
	        gr.Markdown("Author: Louis Chua Bean Chong | Project: OpenLLM | License: GPL-3.0")
	        gr.Markdown("Architecture: OpenLLM Custom GPTModel (From Uploaded Files)")
	        gr.Markdown("Tokenizer: sentencepiece.SentencePieceProcessor()")

	    if needs_queue:
	        # Older Gradio releases require the queue for ``every=`` polling.
	        demo.queue()

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the Gradio Blocks app, then start serving it.
	    demo = main()
	    demo.launch()