lemms committed on
Commit
fda995e
·
verified ·
1 Parent(s): 3cc9282

feat: Sync training infrastructure from main repository

Browse files
Files changed (2) hide show
  1. app.py +980 -209
  2. requirements.txt +51 -40
app.py CHANGED
@@ -1,253 +1,1024 @@
1
  #!/usr/bin/env python3
2
  """
3
- OpenLLM Training Space - Main Application
4
 
5
- This is the main entry point for the Hugging Face Space.
6
- It provides a web interface for running OpenLLM training with authentication.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  Author: Louis Chua Bean Chong
9
- License: GPLv3
 
 
10
  """
11
 
 
 
 
12
  import os
13
- import sys
 
 
 
 
 
14
  from pathlib import Path
15
 
16
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Import our authentication and training modules
 
19
  try:
20
- from openllm_training_with_auth import OpenLLMTrainingManager
21
- from space_auth_test import test_space_authentication
 
 
 
 
 
 
 
 
22
 
23
- MODULES_AVAILABLE = True
 
 
 
 
 
 
 
24
  except ImportError as e:
25
- MODULES_AVAILABLE = False
26
- print(f"❌ Required modules not available: {e}")
27
-
28
 
29
- def create_space_interface():
30
- """Create the Gradio interface for the Space."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- def run_authentication_test():
33
- """Run the authentication test and return results."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  try:
35
- if not MODULES_AVAILABLE:
36
- return "❌ Required modules not available. Please check deployment."
37
-
38
- # Capture output from authentication test
39
- import contextlib
40
- import io
41
-
42
- output = io.StringIO()
43
- with contextlib.redirect_stdout(output):
44
- success = test_space_authentication()
45
-
46
- result = output.getvalue()
47
-
48
- if success:
49
- return f"βœ… Authentication Test Results:\n\n{result}"
50
- else:
51
- return f"❌ Authentication Test Failed:\n\n{result}"
52
-
53
  except Exception as e:
54
- return f"❌ Error running authentication test: {e}"
55
-
56
- def run_training(model_size, training_steps):
57
- """Run the OpenLLM training with authentication."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  try:
59
- if not MODULES_AVAILABLE:
60
- return "❌ Required modules not available. Please check deployment."
61
-
62
- # Security mitigation: Input validation and sanitization
63
- if not isinstance(model_size, str) or model_size not in ["small", "medium", "large"]:
64
- return "❌ Invalid model size. Must be 'small', 'medium', or 'large'."
65
-
66
- if (
67
- not isinstance(training_steps, (int, float))
68
- or training_steps < 1000
69
- or training_steps > 50000
70
- ):
71
- return "❌ Invalid training steps. Must be between 1000 and 50000."
72
-
73
- # Sanitize inputs
74
- model_size = str(model_size).strip().lower()
75
- training_steps = int(float(training_steps))
76
-
77
- # Capture output from training
78
- import contextlib
79
- import io
80
-
81
- output = io.StringIO()
82
- with contextlib.redirect_stdout(output):
83
- training_manager = OpenLLMTrainingManager()
84
- repo_id = training_manager.run_training(model_size=model_size, steps=training_steps)
85
-
86
- result = output.getvalue()
87
-
88
- return f"βœ… Training Results:\n\n{result}\n\nπŸŽ‰ Model available at: https://huggingface.co/{repo_id}"
89
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  except Exception as e:
91
- return f"❌ Error running training: {e}"
92
-
93
- def check_space_environment():
94
- """Check the Space environment and configuration."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  try:
96
- # Check if we're in a Space
97
- space_vars = ["SPACE_ID", "SPACE_HOST", "SPACE_REPO_ID"]
98
- is_space = any(os.getenv(var) for var in space_vars)
99
-
100
- # Check HF_TOKEN
101
- hf_token = os.getenv("HF_TOKEN")
102
-
103
- result = "πŸ” Space Environment Check:\n\n"
104
-
105
- if is_space:
106
- result += "βœ… Running in Hugging Face Space environment\n"
107
- for var in space_vars:
108
- value = os.getenv(var)
109
- if value:
110
- result += f" - {var}: {value}\n"
111
- else:
112
- result += "ℹ️ Running in local environment\n"
113
-
114
- if hf_token:
115
- result += f"βœ… HF access token found: {hf_token[:8]}...{hf_token[-4:]}\n"
116
- result += " - Source: HF access token in Space settings\n"
117
- else:
118
- result += "❌ HF access token not found\n"
119
- result += " - Please set HF_TOKEN in Space settings with HF access token\n"
120
-
121
- result += f"\nπŸ“ Available modules: {'βœ…' if MODULES_AVAILABLE else '❌'}"
122
-
123
- return result
124
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  except Exception as e:
126
- return f"❌ Error checking environment: {e}"
127
-
128
- # Create the Gradio interface with security mitigations
129
- with gr.Blocks(
130
- title="OpenLLM Training Space",
131
- theme=gr.themes.Soft(),
132
- # Security mitigations
133
- analytics_enabled=False, # Disable analytics
134
- show_error=False, # Don't expose error details
135
- ) as interface:
136
- gr.Markdown(
137
- """
138
- # πŸš€ OpenLLM Training Space
139
-
140
- Welcome to the OpenLLM Training Space! This Space provides a complete environment for training OpenLLM models with automatic Hugging Face authentication and model upload.
141
 
142
- ## πŸ” Authentication
143
-
144
- This Space uses HF access token for secure authentication. The HF_TOKEN is automatically available from your Space settings.
 
 
145
 
146
- ## πŸ“‹ Available Actions
 
 
 
 
147
 
148
- 1. **Environment Check**: Verify Space configuration and authentication
149
- 2. **Authentication Test**: Test Hugging Face authentication
150
- 3. **Run Training**: Start OpenLLM training with automatic upload
 
 
 
 
151
  """
152
- )
153
-
154
- with gr.Tab("πŸ” Environment Check"):
155
- gr.Markdown("Check the Space environment and configuration.")
156
- env_check_btn = gr.Button("Check Environment", variant="primary")
157
- env_output = gr.Textbox(label="Environment Status", lines=10, interactive=False)
158
- env_check_btn.click(check_space_environment, outputs=env_output)
159
-
160
- with gr.Tab("πŸ” Authentication Test"):
161
- gr.Markdown("Test Hugging Face authentication using HF access token.")
162
- auth_test_btn = gr.Button("Run Authentication Test", variant="primary")
163
- auth_output = gr.Textbox(label="Authentication Results", lines=15, interactive=False)
164
- auth_test_btn.click(run_authentication_test, outputs=auth_output)
165
-
166
- with gr.Tab("πŸš€ Run Training"):
167
- gr.Markdown(
168
- """
169
- Start OpenLLM training with automatic model upload.
170
-
171
- **Training Parameters:**
172
- - **Model Size**: Choose the model size (small, medium, large)
173
- - **Training Steps**: Number of training steps (default: 8000)
174
-
175
- **Expected Results:**
176
- - Training will complete successfully
177
- - Model will be uploaded to Hugging Face Hub
178
- - Repository will be created with proper model files
179
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
- with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  model_size = gr.Dropdown(
184
  choices=["small", "medium", "large"],
185
  value="small",
186
  label="Model Size",
187
- info="Choose the model size for training",
188
  )
189
- training_steps = gr.Number(
190
- value=8000,
191
- label="Training Steps",
192
- info="Number of training steps",
193
- minimum=1000,
194
- maximum=50000,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  )
196
-
197
- train_btn = gr.Button("Start Training", variant="primary", size="lg")
198
- train_output = gr.Textbox(label="Training Results", lines=20, interactive=False)
199
-
200
- train_btn.click(run_training, inputs=[model_size, training_steps], outputs=train_output)
201
-
202
- with gr.Tab("πŸ“š Documentation"):
203
- gr.Markdown(
204
- """
205
- ## πŸ“– Available Documentation
206
 
207
- - **HUGGINGFACE_SPACE_SETUP_GUIDE.md**: Complete setup guide
208
- - **SPACE_AUTHENTICATION_SUMMARY.md**: Authentication summary
209
- - **SPACE_READY_SUMMARY.md**: Deployment summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
- ## πŸ”§ Available Scripts
 
 
 
 
 
 
 
212
 
213
- - **space_auth_test.py**: Authentication verification
214
- - **openllm_training_with_auth.py**: Complete training script
215
- - **integrate_auth_into_training.py**: Integration guide
216
- - **setup_hf_space_auth.py**: Space authentication setup
217
- - **verify_space_auth.py**: Space verification script
218
 
219
- ## 🎯 Quick Start
 
 
 
 
 
 
 
 
 
 
 
220
 
221
- 1. Check the environment to verify configuration
222
- 2. Run authentication test to ensure GitHub secrets are working
223
- 3. Start training with your desired parameters
224
- 4. Monitor the training progress and model upload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
- ## πŸ”’ Security
 
 
227
 
228
- - HF_TOKEN is securely stored in GitHub repository secrets
229
- - No hardcoded tokens in any scripts
230
- - Automatic cleanup of test repositories
231
- - Proper error handling and logging
232
  """
233
- )
234
-
235
- return interface
236
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  if __name__ == "__main__":
239
- # Create and launch the interface
240
- interface = create_space_interface()
241
- interface.launch(
242
- server_name="0.0.0.0",
243
- server_port=7860,
244
- share=False,
245
- # Security mitigations for Gradio vulnerabilities
246
- allowed_paths=[], # Restrict file access
247
- auth=None, # Disable authentication to prevent code injection
248
- show_error=False, # Don't expose error details
249
- quiet=True, # Reduce logging
250
- # Disable potentially vulnerable features
251
- enable_queue=False,
252
- max_threads=1, # Limit concurrent requests
253
- )
 
1
  #!/usr/bin/env python3
2
  """
3
+ OpenLLM Training Space Application - Fixed with Uploaded Modules
4
 
5
+ This version imports OpenLLM modules from the uploaded files in the HF Space:
6
+ - Imports model.py and data_loader.py that were uploaded to the Space
7
+ - Uses OpenLLM's actual custom model architecture
8
+ - Compatible with OpenLLM's implementation
9
+
10
+ This application provides a complete training interface for OpenLLM models on Hugging Face Spaces.
11
+ It uses OpenLLM's custom GPTModel architecture instead of Hugging Face Transformers,
12
+ ensuring compatibility with the actual OpenLLM implementation.
13
+
14
+ Key Features:
15
+ - Real model training using OpenLLM's custom architecture
16
+ - SentencePiece tokenization for OpenLLM models
17
+ - Complete training pipeline with progress monitoring
18
+ - Automatic model saving and uploading to Hugging Face Hub
19
+ - Gradio 4.44.1 compatible user interface
20
+
21
+ Technical Architecture:
22
+ - Uses OpenLLM's GPTModel class (not Hugging Face Transformers)
23
+ - Imports custom modules from uploaded files in the Space
24
+ - Uses sentencepiece.SentencePieceProcessor() for tokenization
25
+ - Implements OpenLLM's training loop and optimization strategy
26
+ - Saves checkpoints in OpenLLM's format
27
 
28
  Author: Louis Chua Bean Chong
29
+ License: GPL-3.0
30
+ Version: 2.1.1
31
+ Last Updated: 2024
32
  """
33
 
34
+ import gradio as gr
35
+ import torch
36
+ import torch.nn as nn
37
  import os
38
+ import time
39
+ import math
40
+ import gc
41
+ from typing import Dict, Any, Optional
42
+ import threading
43
+ from dataclasses import dataclass
44
  from pathlib import Path
45
 
46
+ # Import OpenLLM's custom model architecture from uploaded files
47
+ # These files were uploaded to the HF Space and contain OpenLLM's actual implementation
48
+ try:
49
+ # Import from the uploaded files in the HF Space
50
+ # model.py contains GPTModel, GPTConfig, and create_model factory function
51
+ from model import GPTModel, GPTConfig, create_model
52
+ # data_loader.py contains TextDataLoader for OpenLLM's data loading approach
53
+ from data_loader import TextDataLoader
54
+ OPENLLM_AVAILABLE = True
55
+ print("βœ… OpenLLM custom model architecture imported successfully from uploaded files")
56
+ print(" - GPTModel: Custom PyTorch model architecture")
57
+ print(" - GPTConfig: Model configuration dataclass")
58
+ print(" - create_model: Factory function for model creation")
59
+ print(" - TextDataLoader: Custom data loading implementation")
60
+ except ImportError as e:
61
+ print(f"❌ OpenLLM imports failed: {e}")
62
+ print(" This indicates the uploaded OpenLLM source files are not available")
63
+ print(" The training functionality will be disabled")
64
+ OPENLLM_AVAILABLE = False
65
 
66
+ # Try to import sentencepiece - CRITICAL for OpenLLM tokenization
67
+ # OpenLLM uses SentencePiece for tokenization, not Hugging Face tokenizers
68
  try:
69
+ import sentencepiece as spm
70
+ SENTENCEPIECE_AVAILABLE = True
71
+ print(f"βœ… SentencePiece available: {spm.__version__}")
72
+ print(" - Required for OpenLLM tokenization")
73
+ print(" - Used for loading tokenizer.model files")
74
+ except ImportError:
75
+ SENTENCEPIECE_AVAILABLE = False
76
+ print("❌ SentencePiece not available")
77
+ print(" - This will prevent tokenizer loading")
78
+ print(" - Training functionality will be limited")
79
 
80
+ # Import other dependencies for the complete training pipeline
81
+ try:
82
+ from datasets import load_dataset # For loading training data from HF Hub
83
+ from huggingface_hub import HfApi, hf_hub_download # For model uploads and downloads
84
+ DEPENDENCIES_AVAILABLE = True
85
+ print("βœ… Training dependencies available")
86
+ print(" - datasets: For loading training data")
87
+ print(" - huggingface_hub: For model uploads/downloads")
88
  except ImportError as e:
89
+ print(f"❌ Dependencies not available: {e}")
90
+ print(" - This will prevent dataset loading and model uploading")
91
+ DEPENDENCIES_AVAILABLE = False
92
 
93
@dataclass
class TrainingConfig:
    """
    Hyperparameters and output settings for a single OpenLLM training run.

    The first four fields have no default and must be supplied by the caller
    (positionally or by keyword); the remaining fields fall back to the
    defaults declared below.

    Attributes:
        model_size: Size of the model to train ("small", "medium", "large").
        max_steps: Maximum number of training iterations.
        learning_rate: Learning rate handed to the optimizer.
        batch_size: Number of samples per training batch.
        output_dir: Directory where trained models and checkpoints are saved.
        save_steps: Checkpoint frequency (every N steps).
        logging_steps: Progress-logging frequency (every N steps).
        warmup_steps: Number of learning-rate warmup steps.
        gradient_accumulation_steps: Number of batches whose gradients are
            accumulated before one optimizer update.
    """

    # Required settings.
    model_size: str
    max_steps: int
    learning_rate: float
    batch_size: int

    # Optional settings with defaults.
    output_dir: str = "./openllm-trained"
    save_steps: int = 100
    logging_steps: int = 10
    warmup_steps: int = 50
    gradient_accumulation_steps: int = 4
122
 
123
+ class OpenLLMTrainer:
124
+ """
125
+ Complete training implementation using OpenLLM's actual architecture.
126
+
127
+ This class handles the entire training pipeline including:
128
+ - Model loading using OpenLLM's custom GPTModel
129
+ - Tokenizer loading using sentencepiece.SentencePieceProcessor()
130
+ - Dataset preparation using OpenLLM's TextDataLoader
131
+ - Training execution using OpenLLM's approach
132
+ - Model saving and uploading to Hugging Face Hub
133
+
134
+ The trainer implements OpenLLM's actual training methodology rather than
135
+ using Hugging Face Transformers, ensuring compatibility with the real
136
+ OpenLLM implementation.
137
+
138
+ Key Features:
139
+ - Custom model architecture (GPTModel, not PreTrainedModel)
140
+ - SentencePiece tokenization (not Hugging Face tokenizers)
141
+ - OpenLLM's training loop and optimization strategy
142
+ - Gradient accumulation for memory efficiency
143
+ - Learning rate scheduling with warmup
144
+ - Automatic checkpoint saving and model uploading
145
+ """
146
+
147
def __init__(self):
    """
    Create an idle trainer.

    Every pipeline component starts as ``None`` and is filled in later by the
    loading/setup methods. The progress dictionary is reset to its "Ready"
    state, and a Hugging Face ``HfApi`` client is created for uploads; if
    that fails for any reason, ``hf_api`` is left as ``None``.
    """
    # Pipeline components, populated by the load/prepare/setup methods.
    self.model = None        # model instance
    self.tokenizer = None    # SentencePieceProcessor instance
    self.data_loader = None  # TextDataLoader instance
    self.optimizer = None    # optimizer
    self.scheduler = None    # learning-rate scheduler

    # Run state.
    self.is_training = False
    self.tokenizer_path = None  # filesystem path of tokenizer.model

    # Snapshot of progress, read by the UI.
    self.training_progress = dict(
        status="Ready",
        current_step=0,
        total_steps=0,
        loss=0.0,
        learning_rate=0.0,
    )

    # Uploading is optional: any failure here simply disables it.
    try:
        api_client = HfApi()
        print("βœ… Hugging Face API initialized for model uploading")
    except Exception as e:
        print(f"Failed to initialize HF API: {e}")
        print(" - Model uploading will be disabled")
        api_client = None
    self.hf_api = api_client
184
+
185
def load_model_and_tokenizer(self, model_size: str) -> str:
    """
    Build the OpenLLM model and load its SentencePiece tokenizer.

    Steps:
    1. Check that every optional import this method relies on succeeded
       (OpenLLM modules, sentencepiece, huggingface_hub). This is done
       before any work so a missing package is reported clearly instead
       of surfacing later as a ``NameError`` message.
    2. Build a model with ``create_model(model_size)``. No checkpoint
       weights are downloaded by this method.
    3. Download ``tokenizer.model`` from the Hub repository
       ``lemms/openllm-{model_size}-extended-7k``.
    4. Load that file into a ``SentencePieceProcessor`` and store both the
       processor (``self.tokenizer``) and the file path
       (``self.tokenizer_path``) on the instance.

    Args:
        model_size: Size of the model to build ("small", "medium", "large");
            also selects the Hub repository the tokenizer is fetched from.

    Returns:
        A human-readable status message. Failures are reported through the
        message (prefixed with the failure marker) rather than raised.
    """
    try:
        # Fail fast on missing optional imports, before allocating a model
        # or touching the network.
        if not OPENLLM_AVAILABLE:
            return "❌ OpenLLM custom model architecture not available"
        if not SENTENCEPIECE_AVAILABLE:
            return "❌ SentencePiece not available - cannot load tokenizer"
        if not DEPENDENCIES_AVAILABLE:
            return "❌ Required dependencies not available"

        print(f"πŸ”„ Loading OpenLLM {model_size} model using custom architecture...")
        print(" - Using OpenLLM's create_model factory function")
        print(" - Not using Hugging Face Transformers")

        # Step 1: build the model through the factory function.
        try:
            self.model = create_model(model_size)
            print(f"βœ… OpenLLM {model_size} model created: {type(self.model).__name__}")
            print(f" - Model type: {type(self.model).__name__}")
            print(f" - Parameters: {self.model.get_num_params():,}")
            print(" - Architecture: Custom GPTModel (not PreTrainedModel)")
        except Exception as e:
            print(f"❌ Failed to create model: {e}")
            return f"❌ Failed to create OpenLLM model: {e}"

        # Step 2: fetch and load the SentencePiece tokenizer.
        try:
            print("πŸ”„ Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
            print(" - Using SentencePiece directly (not AutoTokenizer)")
            print(" - Downloading tokenizer.model from Hugging Face Hub")

            model_name = f"lemms/openllm-{model_size}-extended-7k"
            tokenizer_path = hf_hub_download(
                repo_id=model_name,
                filename="tokenizer.model",
            )

            print(f"βœ… Tokenizer downloaded to: {tokenizer_path}")
            print(f" - Source: {model_name}")
            print(" - File: tokenizer.model")

            sp_processor = spm.SentencePieceProcessor()
            sp_processor.load(tokenizer_path)

            # The path is kept next to the processor because
            # prepare_dataset() hands it to TextDataLoader.
            self.tokenizer = sp_processor
            self.tokenizer_path = tokenizer_path

            print("βœ… Tokenizer loaded successfully using SentencePieceProcessor")
            print(f" - Vocabulary size: {sp_processor.vocab_size()}")
            print(f" - Tokenizer path: {tokenizer_path}")
            print(f" - Tokenizer type: {type(sp_processor).__name__}")

        except Exception as e:
            print(f"❌ Failed to load tokenizer: {e}")
            return f"❌ Failed to load OpenLLM tokenizer: {e}"

        return f"βœ… Successfully loaded OpenLLM {model_size} model with custom architecture"

    except Exception as e:
        return f"❌ Failed to load OpenLLM model and tokenizer: {e}"
272
+
273
def prepare_dataset(self) -> str:
    """
    Download the training data and build the ``TextDataLoader``.

    Steps:
    1. Verify that a tokenizer path has been stored on the instance (it is
       set by ``load_model_and_tokenizer``) and that the dataset
       dependencies were imported. Both checks run before anything is
       downloaded.
    2. Load the ``lemms/openllm-training-data`` dataset from the Hub.
    3. Write the ``text`` field of every ``train`` row to
       ``temp_training_data.txt`` in the current working directory, each
       followed by a newline.
    4. Create ``TextDataLoader`` over that file with the stored tokenizer
       path, ``seq_len=512``, ``batch_size=4`` and ``shuffle=True``.

    Returns:
        A human-readable status message. Failures are reported through the
        message (prefixed with the failure marker) rather than raised.
    """
    try:
        # Without a tokenizer path the loader cannot be built, so stop
        # before downloading and writing the whole dataset.
        tokenizer_path = self.tokenizer_path
        if tokenizer_path is None:
            return "❌ Tokenizer not loaded. Call load_model_and_tokenizer() first."

        if not DEPENDENCIES_AVAILABLE:
            return "❌ Required dependencies not available"

        print("πŸ”„ Loading training dataset...")
        print(" - Loading from Hugging Face Hub dataset")
        print(" - Using OpenLLM's data preparation approach")

        dataset = load_dataset("lemms/openllm-training-data")
        train_split = dataset['train']
        sample_count = len(train_split)
        print(f"βœ… Dataset loaded: {sample_count} samples")
        print(" - Dataset: lemms/openllm-training-data")
        print(f" - Samples: {sample_count}")

        # NOTE(review): a sample whose text contains newlines will span
        # several lines of this file - confirm TextDataLoader tolerates it.
        temp_data_file = "temp_training_data.txt"
        with open(temp_data_file, 'w', encoding='utf-8') as f:
            f.writelines(item['text'] + '\n' for item in train_split)

        print(f"βœ… Temporary data file created: {temp_data_file}")
        print(" - Format: One text sample per line")
        print(" - Encoding: UTF-8")

        try:
            print("πŸ”„ Creating OpenLLM TextDataLoader...")
            print(f" - Data file: {temp_data_file}")
            print(f" - Tokenizer path: {tokenizer_path}")
            print(" - Sequence length: 512")
            print(" - Batch size: 4 (will be overridden by training config)")

            self.data_loader = TextDataLoader(
                data_file=temp_data_file,
                tokenizer_path=tokenizer_path,
                seq_len=512,
                batch_size=4,
                shuffle=True,
            )

            print("βœ… OpenLLM TextDataLoader created successfully")
            print(f" - DataLoader type: {type(self.data_loader).__name__}")
            print(" - Uses OpenLLM's custom implementation")

        except Exception as e:
            print(f"❌ Failed to create TextDataLoader: {e}")
            return f"❌ Failed to create data loader: {e}"

        return f"βœ… Successfully prepared dataset with {sample_count} samples"

    except Exception as e:
        return f"❌ Failed to prepare dataset: {e}"
353
+
354
def setup_training(self, config: "TrainingConfig") -> str:
    """
    Create the output directory, optimizer and learning-rate scheduler.

    Steps:
    1. Verify a model is present on the instance.
    2. Create ``config.output_dir`` (no error if it already exists).
    3. Split trainable parameters into two AdamW groups: 1-D parameters
       and names ending in ``.bias`` get no weight decay, everything else
       gets ``weight_decay=0.01``.
    4. Build AdamW with ``lr=config.learning_rate``, ``betas=(0.9, 0.95)``
       and ``eps=1e-8``.
    5. Build a ``SequentialLR`` that runs a linear warmup (1% -> 100% of
       the learning rate over ``config.warmup_steps``) followed by cosine
       annealing over the remaining steps.

    The cosine phase length is clamped to at least one step: a
    ``max_steps`` equal to ``warmup_steps`` would otherwise give
    ``T_max=0`` and make the cosine scheduler divide by zero once training
    steps it.

    Args:
        config: Training configuration; this method reads ``output_dir``,
            ``learning_rate``, ``max_steps`` and ``warmup_steps``.

    Returns:
        A human-readable status message. Failures are reported through the
        message (prefixed with the failure marker) rather than raised.
    """
    try:
        if self.model is None:
            return "❌ Model not loaded. Call load_model_and_tokenizer() first."

        print("πŸ”„ Setting up training configuration...")
        print(f" - Output directory: {config.output_dir}")
        print(f" - Learning rate: {config.learning_rate}")
        print(f" - Max steps: {config.max_steps}")

        os.makedirs(config.output_dir, exist_ok=True)
        print(f"βœ… Output directory created: {config.output_dir}")

        print("πŸ”„ Setting up AdamW optimizer with weight decay...")

        decay_params = []     # receive weight decay
        no_decay_params = []  # biases and other 1-D parameters

        for name, param in self.model.named_parameters():
            if not param.requires_grad:
                continue
            if len(param.shape) == 1 or name.endswith('.bias'):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        param_groups = [
            {'params': decay_params, 'weight_decay': 0.01},
            {'params': no_decay_params, 'weight_decay': 0.0},
        ]

        print(f" - Decay parameters: {len(decay_params)}")
        print(f" - No-decay parameters: {len(no_decay_params)}")

        self.optimizer = torch.optim.AdamW(
            param_groups,
            lr=config.learning_rate,
            betas=(0.9, 0.95),
            eps=1e-8,
        )

        print("βœ… AdamW optimizer configured")
        print(f" - Learning rate: {config.learning_rate}")
        print(" - Betas: (0.9, 0.95)")
        print(" - Epsilon: 1e-8")

        print("πŸ”„ Setting up learning rate scheduler...")

        warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
            self.optimizer,
            start_factor=0.01,
            end_factor=1.0,
            total_iters=config.warmup_steps,
        )

        # Clamp so the cosine period is never zero or negative when
        # max_steps <= warmup_steps.
        cosine_steps = max(1, config.max_steps - config.warmup_steps)
        main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer,
            T_max=cosine_steps,
        )

        # Switch from warmup to cosine annealing at the warmup boundary.
        self.scheduler = torch.optim.lr_scheduler.SequentialLR(
            self.optimizer,
            schedulers=[warmup_scheduler, main_scheduler],
            milestones=[config.warmup_steps],
        )

        print("βœ… Learning rate scheduler configured")
        print(f" - Warmup steps: {config.warmup_steps}")
        print(f" - Total steps: {config.max_steps}")
        print(" - Schedule: Linear warmup β†’ Cosine annealing")

        print("βœ… Training setup completed successfully")
        return "βœ… Training setup completed successfully"

    except Exception as e:
        return f"❌ Failed to setup training: {e}"
464
+
465
def train_model(self, config: TrainingConfig, progress_callback=None) -> str:
    """
    Run the training loop over ``self.data_loader`` with gradient accumulation.

    For every batch the model is called with inputs and targets and returns a
    ``(logits, loss)`` pair. The loss is divided by
    ``config.gradient_accumulation_steps`` and back-propagated. After every
    ``gradient_accumulation_steps`` batches the gradients are clipped to a
    norm of 1.0, the optimizer and scheduler are stepped, the step counter is
    incremented and ``self.training_progress`` is refreshed (``current_step``,
    ``loss``, ``learning_rate``). Checkpoints are written through
    ``self._save_checkpoint`` every ``config.save_steps`` steps and once more
    after the loop ends.

    The loop makes a single pass over the data loader. If the loader runs out
    before ``config.max_steps`` optimizer steps were taken, a warning is
    printed so the shortfall is visible in the logs; the run is still reported
    as completed with the number of steps actually performed.

    Args:
        config: Training configuration; this method reads ``max_steps``,
            ``gradient_accumulation_steps``, ``logging_steps``, ``save_steps``
            and ``output_dir``.
        progress_callback: Accepted for interface compatibility; it is not
            invoked by this implementation.

    Returns:
        A status string. On success it reports the final step and starts with
        the success marker; on any exception it starts with "❌ Training failed".
    """
    try:
        # Publish the running state before any work starts so the UI can see it.
        self.is_training = True
        self.training_progress["status"] = "Training"
        self.training_progress["total_steps"] = config.max_steps

        print(f"πŸš€ Starting OpenLLM training for {config.max_steps} steps...")
        print(f" - Model: {type(self.model).__name__}")
        print(f" - DataLoader: {type(self.data_loader).__name__}")
        print(f" - Optimizer: {type(self.optimizer).__name__}")
        print(f" - Gradient accumulation: {config.gradient_accumulation_steps}")

        self.model.train()
        accumulated_loss = 0.0  # Sum of the scaled losses in the current cycle
        self.optimizer.zero_grad()

        step = 0  # Number of optimizer updates performed so far
        for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
            if step >= config.max_steps:
                break

            # The model computes the loss itself when targets are supplied;
            # the logits are not needed here.
            _, loss = self.model(input_ids, target_ids)

            # Scale so that the accumulated gradient matches one large batch.
            loss = loss / config.gradient_accumulation_steps
            accumulated_loss += loss.item()
            loss.backward()

            # Keep accumulating until a full cycle of micro-batches is done.
            if (batch_idx + 1) % config.gradient_accumulation_steps != 0:
                continue

            # Clip before the update to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()
            step += 1

            # Look the learning rate up once and reuse it for progress and logs.
            current_lr = self.scheduler.get_last_lr()[0]
            self.training_progress["current_step"] = step
            self.training_progress["loss"] = accumulated_loss
            self.training_progress["learning_rate"] = current_lr

            if step % config.logging_steps == 0:
                print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {current_lr:.2e}")

            if step % config.save_steps == 0:
                self._save_checkpoint(config.output_dir, step)
                print(f"πŸ’Ύ Checkpoint saved at step {step}")

            # Start the next accumulation cycle from zero.
            accumulated_loss = 0.0

            # Runs inside the update branch so cleanup happens once per 100
            # optimizer steps instead of on every micro-batch.
            if step % 100 == 0:
                gc.collect()
                print(f"🧹 Memory cleanup at step {step}")

        # The loop ended because the loader ran dry, not because the step
        # budget was reached: make that visible instead of staying silent.
        if step < config.max_steps:
            print(f"⚠️ Data loader exhausted after {step} of {config.max_steps} steps")

        self._save_checkpoint(config.output_dir, step, is_best=True)
        print(f"πŸ’Ύ Final checkpoint saved at step {step}")

        self.training_progress["status"] = "Completed"
        self.training_progress["current_step"] = step

        print(f"βœ… Training completed! Final step: {step}")
        print(f" - Total steps completed: {step}")
        print(f" - Final loss: {self.training_progress['loss']:.4f}")
        print(f" - Final learning rate: {self.training_progress['learning_rate']:.2e}")

        return f"βœ… Training completed successfully! Final step: {step}"

    except Exception as e:
        # Boundary handler: the UI expects a status string, not an exception.
        self.training_progress["status"] = "Failed"
        print(f"❌ Training failed: {e}")
        print(" - Error occurred during training")
        print(f" - Training state: {self.training_progress['status']}")
        return f"❌ Training failed: {str(e)}"
    finally:
        # Always clear the flag, whether training finished or failed.
        self.is_training = False
593
+
594
def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
    """
    Write a training checkpoint to ``output_dir``.

    The checkpoint is a dictionary holding the step number, the state
    dictionaries of ``self.model``, ``self.optimizer`` and ``self.scheduler``,
    and a snapshot of the attributes of ``self.model.config``. It is saved
    with ``torch.save`` as ``checkpoint_step_{step}.pt``; when ``is_best`` is
    true the same dictionary is also saved as ``best_model.pt``.

    Saving is best-effort: any exception is reported on stdout and swallowed
    so that a failed write does not abort the training run.

    Args:
        output_dir: Directory to save the checkpoint in; created if missing.
        step: Current training step number, used in the file name.
        is_best: Whether to additionally write ``best_model.pt``.
    """
    try:
        # torch.save does not create missing parent directories, so make sure
        # the target exists before writing.
        os.makedirs(output_dir, exist_ok=True)

        checkpoint = {
            'step': step,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            # Copy so the stored config is a snapshot, not the live __dict__.
            'config': dict(vars(self.model.config)),
        }

        checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
        torch.save(checkpoint, checkpoint_path)

        if is_best:
            best_path = os.path.join(output_dir, "best_model.pt")
            torch.save(checkpoint, best_path)
            print(f"πŸ’Ύ Best model saved: {best_path}")

        print(f"πŸ’Ύ Checkpoint saved: {checkpoint_path}")

    except Exception as e:
        # Deliberately non-fatal; see the docstring.
        print(f"❌ Failed to save checkpoint: {e}")
637
+
638
def save_and_upload_model(self, config: TrainingConfig) -> str:
    """
    Save the trained model locally and, when possible, upload it to the Hub.

    Steps performed:
    1. Write a final checkpoint (also stored as ``best_model.pt``) through
       ``self._save_checkpoint``, labelled with the step recorded in
       ``self.training_progress["current_step"]`` (falling back to
       ``config.max_steps`` when no step was recorded).
    2. Verify that ``best_model.pt`` exists, because ``_save_checkpoint``
       reports failures on stdout instead of raising.
    3. Copy the file at ``self.tokenizer_path`` to
       ``{output_dir}/tokenizer/tokenizer.model``.
    4. If ``self.hf_api`` is set, create the repository
       ``lemms/openllm-{model_size}-extended-8k`` (best-effort) and upload the
       whole output directory to it.

    Args:
        config: Training configuration; this method reads ``output_dir``,
            ``model_size`` and ``max_steps``.

    Returns:
        A status string: the repository URL after a successful upload, the
        local directory when no Hub API is available, or a message starting
        with "❌ Failed to save/upload model" on any exception.
    """
    try:
        print("πŸ”„ Saving trained model...")
        print(f" - Output directory: {config.output_dir}")
        print(f" - Model size: {config.model_size}")

        # Label the checkpoint with the step training actually reached; it can
        # be lower than config.max_steps when the data ran out early.
        final_step = self.training_progress.get("current_step", config.max_steps)
        self._save_checkpoint(config.output_dir, final_step, is_best=True)

        # _save_checkpoint swallows its own errors, so confirm the file is
        # really there before reporting success or uploading the folder.
        best_model_path = os.path.join(config.output_dir, "best_model.pt")
        if not os.path.isfile(best_model_path):
            raise FileNotFoundError(f"checkpoint was not written: {best_model_path}")

        # Ship the tokenizer next to the weights.
        tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
        os.makedirs(tokenizer_dir, exist_ok=True)

        import shutil
        shutil.copy2(self.tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))

        print("βœ… Model saved locally")
        print(f" - Model checkpoint: {config.output_dir}/best_model.pt")
        print(f" - Tokenizer: {tokenizer_dir}/tokenizer.model")

        # Naming convention: openllm-{size}-extended-8k
        model_name = f"openllm-{config.model_size}-extended-8k"
        repo_id = f"lemms/{model_name}"

        if not self.hf_api:
            print("⚠️ Hugging Face API not available - model saved locally only")
            return f"βœ… Model saved locally to {config.output_dir}"

        print(f"πŸ”„ Uploading model to {repo_id}...")
        print(f" - Repository: {repo_id}")
        print(" - Type: model")
        print(f" - Source: {config.output_dir}")

        # Repository creation is best-effort: the upload below is still
        # attempted if this step fails.
        try:
            from huggingface_hub import create_repo
            create_repo(
                repo_id=repo_id,
                repo_type="model",
                exist_ok=True,
                private=False
            )
            print(f"βœ… Repository {repo_id} ready for upload")
        except Exception as create_error:
            print(f"⚠️ Repository creation warning: {create_error}")
            print(" Continuing with upload attempt...")

        # Uploads everything under output_dir, including the tokenizer folder
        # and any intermediate checkpoints stored there.
        self.hf_api.upload_folder(
            folder_path=config.output_dir,
            repo_id=repo_id,
            repo_type="model",
            commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
        )

        print(f"βœ… Model uploaded successfully to {repo_id}")
        print(f" - Available at: https://huggingface.co/{repo_id}")
        return f"βœ… Model saved and uploaded to https://huggingface.co/{repo_id}"

    except Exception as e:
        # Boundary handler: the UI expects a status string, not an exception.
        print(f"❌ Failed to save/upload model: {e}")
        return f"❌ Failed to save/upload model: {str(e)}"
726
+
727
def get_training_progress(self) -> Dict[str, Any]:
    """
    Return a snapshot of the trainer's progress record.

    The result is a shallow copy of ``self.training_progress``, so the caller
    (for example the Gradio JSON panel) can hold or modify it without touching
    the trainer's live state. The training loop stores entries such as
    ``status``, ``current_step``, ``total_steps``, ``loss`` and
    ``learning_rate`` in that record.

    Returns:
        A copy of the current training progress dictionary.
    """
    snapshot = self.training_progress.copy()
    return snapshot
742
 
743
def main():
    """
    Build the Gradio Blocks interface for the OpenLLM training application.

    The interface consists of:
    1. A header and availability flags for OpenLLM, SentencePiece and the
       remaining dependencies
    2. Training configuration controls (model size, steps, learning rate,
       batch size)
    3. A status box and a JSON panel showing the trainer's progress
    4. Start and stop buttons
    5. Instructions and resource links
    6. The callback that runs the complete training pipeline

    The progress panel is filled once when the page loads and then refreshed
    every 5 seconds by a ``gr.Timer``.

    Returns:
        The Gradio Blocks object; the caller is responsible for launching it.
    """

    # One trainer instance handles every training operation started from the UI.
    trainer = OpenLLMTrainer()

    with gr.Blocks(
        title="OpenLLM Training Space - Fixed with Uploaded Modules",
        theme=gr.themes.Soft()
    ) as demo:

        # Application header
        gr.Markdown("# πŸš€ OpenLLM Training Space - Fixed with Uploaded Modules")
        gr.Markdown("### *Uses OpenLLM's Custom Model Architecture from Uploaded Files*")
        gr.Markdown("---")

        # Availability of the key components and dependencies
        gr.Markdown(f"**OpenLLM Available**: {'βœ… Yes' if OPENLLM_AVAILABLE else '❌ No'}")
        gr.Markdown(f"**SentencePiece Available**: {'βœ… Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
        gr.Markdown(f"**Dependencies Available**: {'βœ… Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
        gr.Markdown("**Architecture**: βœ… OpenLLM Custom GPTModel (From Uploaded Files)")

        # Two-column layout: configuration on the left, status on the right
        with gr.Row():

            # Left column: training hyperparameters
            with gr.Column(scale=1):
                gr.Markdown("## πŸ“Š Training Configuration")

                model_size = gr.Dropdown(
                    choices=["small", "medium", "large"],
                    value="small",
                    label="Model Size",
                    info="Select the base model size to train from"
                )

                max_steps = gr.Slider(
                    minimum=100,
                    maximum=10000,
                    value=1000,
                    step=100,
                    label="Max Training Steps",
                    info="Number of training iterations (100-10,000)"
                )

                learning_rate = gr.Slider(
                    minimum=1e-5,
                    maximum=1e-3,
                    value=3e-4,
                    step=1e-5,
                    label="Learning Rate",
                    info="Training rate (0.00001-0.001)"
                )

                batch_size = gr.Slider(
                    minimum=1,
                    maximum=16,
                    value=4,
                    step=1,
                    label="Batch Size",
                    info="Samples per training batch (1-16)"
                )

            # Right column: status display and control buttons
            with gr.Column(scale=1):
                gr.Markdown("## 🎯 Training Status")

                status_text = gr.Textbox(
                    value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
                    label="Current Status",
                    interactive=False,
                    lines=5,
                    info="Shows current training status and progress updates"
                )

                progress_info = gr.JSON(
                    value=trainer.get_training_progress(),
                    label="Training Progress"
                )

                with gr.Row():
                    start_btn = gr.Button("πŸš€ Start Training", variant="primary")
                    # NOTE: no handler is attached to this button in this
                    # function, so clicking it currently has no effect.
                    stop_btn = gr.Button("⏹️ Stop Training", variant="stop")

        # Instructions section
        gr.Markdown("## πŸ“‹ OpenLLM Training Instructions")
        gr.Markdown("""
        This interface uses **OpenLLM's actual custom model architecture** from uploaded files:

        ### **Step 1: Configure Parameters**
        - **Model Size**: Select the base model to train from (small, medium, large)
        - **Max Steps**: Number of training iterations (100-10,000)
        - **Learning Rate**: Training rate (0.00001-0.001)
        - **Batch Size**: Samples per training batch (1-16)

        ### **Step 2: Start Training**
        - Click "Start Training" to begin the actual training process
        - Uses OpenLLM's custom GPTModel class from uploaded files
        - Uses sentencepiece.SentencePieceProcessor() for tokenization
        - Compatible with OpenLLM's actual implementation

        ### **Step 3: Monitor Progress**
        - Watch the status updates and progress information
        - Training may take several minutes depending on steps
        - The final model will be uploaded to Hugging Face Hub

        ### **Step 4: Access Results**
        - Trained models are automatically pushed to: `lemms/openllm-{size}-extended-8k`
        - Check the model repository for your trained model
        - Use the model for inference or further training
        """)

        # Resource links section
        gr.Markdown("## πŸ”— Model Resources")
        gr.Markdown("""
        - [πŸ“š 7k Small Model](https://huggingface.co/lemms/openllm-small-extended-7k)
        - [🎯 8k Small Model](https://huggingface.co/lemms/openllm-small-extended-8k)
        - [πŸ“Š Training Dataset](https://huggingface.co/datasets/lemms/openllm-training-data)
        - [πŸ“– Main Project](https://github.com/louischua/openllm)
        """)

        def start_complete_training(model_size, max_steps, learning_rate, batch_size):
            """
            Run the complete training pipeline for one click of "Start Training".

            The pipeline is: load model and tokenizer, prepare the dataset,
            set up training, train, then save and upload the model. Every
            stage returns a status string; as soon as one of the first four
            contains the failure marker "❌" that string is returned and the
            remaining stages are skipped.

            Args:
                model_size: Size of the model to train ("small", "medium", "large")
                max_steps: Maximum number of training steps
                learning_rate: Learning rate for the optimizer
                batch_size: Batch size for training

            Returns:
                Status message describing the result of the training process
            """
            if not OPENLLM_AVAILABLE:
                return "❌ OpenLLM custom model architecture not available. Please check the installation."

            try:
                # Sliders deliver plain numbers; steps and batch size are
                # counts, so normalise them to int before they reach the
                # training code.
                max_steps = int(max_steps)
                batch_size = int(batch_size)

                print("πŸš€ Starting complete training process...")
                print(f" - Model size: {model_size}")
                print(f" - Max steps: {max_steps}")
                print(f" - Learning rate: {learning_rate}")
                print(f" - Batch size: {batch_size}")

                config = TrainingConfig(
                    model_size=model_size,
                    max_steps=max_steps,
                    learning_rate=learning_rate,
                    batch_size=batch_size
                )

                # Step 1: Load model and tokenizer
                print("πŸ”„ Step 1: Loading model and tokenizer...")
                status = trainer.load_model_and_tokenizer(model_size)
                if "❌" in status:
                    return status

                # Step 2: Prepare dataset
                print("πŸ”„ Step 2: Preparing dataset...")
                status = trainer.prepare_dataset()
                if "❌" in status:
                    return status

                # Step 3: Setup training
                print("πŸ”„ Step 3: Setting up training...")
                status = trainer.setup_training(config)
                if "❌" in status:
                    return status

                # Step 4: Execute training
                print("πŸ”„ Step 4: Executing training...")
                status = trainer.train_model(config)
                if "❌" in status:
                    return status

                # Step 5: Save and upload model; its status is reported as-is
                print("πŸ”„ Step 5: Saving and uploading model...")
                status = trainer.save_and_upload_model(config)

                print("πŸŽ‰ Complete training process finished!")
                return f"πŸš€ Complete training process finished!\n{status}"

            except Exception as e:
                # Boundary handler: show the error in the status box.
                print(f"❌ Training process failed: {str(e)}")
                return f"❌ Training process failed: {str(e)}"

        def update_progress():
            """
            Return the trainer's current progress for the JSON panel.

            Returns:
                Current training progress dictionary
            """
            return trainer.get_training_progress()

        # Start button runs the whole pipeline and writes the final status.
        start_btn.click(
            fn=start_complete_training,
            inputs=[model_size, max_steps, learning_rate, batch_size],
            outputs=[status_text]
        )

        # Fill the progress panel when the page loads ...
        demo.load(update_progress, outputs=[progress_info])

        # ... and keep it current: a load event fires only once per page
        # load, so the 5-second refresh needs a timer.
        progress_timer = gr.Timer(5)
        progress_timer.tick(update_progress, outputs=[progress_info])

        # Application footer
        gr.Markdown("---")
        gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
        gr.Markdown("**Architecture**: OpenLLM Custom GPTModel (From Uploaded Files)")
        gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor()")

    return demo
1019
 
1020
if __name__ == "__main__":
    # Script entry point: build the Blocks interface and start serving it.
    main().launch()
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,40 +1,51 @@
1
- # OpenLLM Training Space Requirements
2
- # Core dependencies for Space deployment
3
-
4
- # Hugging Face Hub for authentication and model upload
5
- huggingface_hub>=0.19.0
6
-
7
- # Gradio for web interface (latest stable version with security fixes)
8
- gradio>=5.31.0
9
-
10
- # PyTorch for model training
11
- torch>=2.0.0
12
- torchvision>=0.15.0
13
-
14
- # Transformers for model handling
15
- transformers>=4.35.0
16
-
17
- # SentencePiece for tokenization
18
- sentencepiece>=0.1.99
19
-
20
- # NumPy and other utilities
21
- numpy>=1.24.0
22
- pandas>=2.0.0
23
-
24
- # Additional utilities
25
- requests>=2.31.0
26
- tqdm>=4.65.0
27
-
28
- # Testing dependencies
29
- pytest>=7.0.0
30
- pytest-cov>=4.0.0
31
-
32
- # Development dependencies
33
- black>=23.0.0
34
- isort>=5.12.0
35
- bandit>=1.7.7
36
- safety>=2.3.0
37
-
38
- # FastAPI for inference server
39
- fastapi>=0.100.0
40
- uvicorn>=0.23.0
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Complete Training Dependencies for OpenLLM Space - Updated for Gradio 4.44.1
2
+ # This file includes all necessary packages for real model training
3
+
4
+ # Core Machine Learning Framework
5
+ torch>=2.0.0 # PyTorch deep learning framework
6
+ torchvision>=0.15.0 # Computer vision utilities
7
+ torchaudio>=2.0.0 # Audio processing utilities
8
+
9
+ # Hugging Face Ecosystem - Complete Training Stack
10
+ transformers>=4.30.0 # Pre-trained models and training utilities
11
+ datasets>=2.12.0 # Dataset loading and processing
12
+ tokenizers>=0.13.0 # Fast tokenization library
13
+ sentencepiece>=0.1.99 # SentencePiece tokenization (CRITICAL for OpenLLM models)
14
+ huggingface_hub>=0.34.0 # Hugging Face Hub integration
15
+ accelerate>=0.20.0 # Distributed training acceleration
16
+
17
+ # User Interface Framework - Updated to 4.44.1
18
+ gradio==4.44.1 # Web UI framework for ML applications (fixed version)
19
+
20
+ # Data Processing and Scientific Computing
21
+ numpy>=1.24.0 # Numerical computing library
22
+ pandas>=2.0.0 # Data manipulation and analysis
23
+ scipy>=1.10.0 # Scientific computing utilities
24
+
25
+ # Progress and Monitoring
26
+ tqdm>=4.65.0 # Progress bars for long-running operations
27
+ psutil>=5.9.0 # System and process utilities
28
+
29
+ # Memory and Performance Optimization
30
+ bitsandbytes>=0.41.0 # Quantization utilities for memory efficiency
31
+ peft>=0.4.0 # Parameter-Efficient Fine-Tuning
32
+
33
+ # Logging and Debugging
34
+ wandb>=0.15.0 # Experiment tracking (optional)
35
+ tensorboard>=2.13.0 # Training visualization (optional)
36
+
37
+ # Additional Utilities
38
+ requests>=2.31.0 # HTTP library for API calls
39
+ pillow>=9.5.0 # Image processing (if needed)
40
+ matplotlib>=3.7.0 # Plotting and visualization
41
+ seaborn>=0.12.0 # Statistical data visualization
42
+
43
+ # Development and Testing (optional)
44
+ pytest>=7.4.0 # Testing framework
45
+ black>=23.0.0 # Code formatting
46
+ flake8>=6.0.0 # Code linting
47
+
48
+ # Note: These versions are compatible with Hugging Face Spaces
49
+ # and provide stable training performance for OpenLLM models
50
+ # Gradio 4.44.1 fixes compatibility issues with JSON components
51
+ # SentencePiece is CRITICAL for OpenLLM model tokenization