lemms committed on
Commit 22ac115 · verified · 1 Parent(s): 417d22e

fix: Resolve Gradio 4.44.1 compatibility issue - Remove unsupported 'info' parameter from gr.JSON()
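Per the commit message, the fix drops the unsupported `info` argument from the gr.JSON() progress component in app.py. Below is a minimal sketch of the likely before/after, assuming Gradio 4.44.1 where gr.JSON() does not accept an `info` keyword (unlike input components such as gr.Textbox or gr.Slider); the exact replacement line is inferred from the commit message, and `get_training_progress` is a hypothetical stand-in for the trainer method used in app.py.

```python
import gradio as gr


def get_training_progress() -> dict:
    # Hypothetical stand-in for trainer.get_training_progress() in app.py
    return {"status": "Ready", "current_step": 0, "total_steps": 0, "loss": 0.0}


with gr.Blocks() as demo:
    # Before: gr.JSON() received an `info` argument, which gr.JSON() does not
    # support in Gradio 4.44.1 (per the commit message):
    # progress_info = gr.JSON(
    #     value=get_training_progress(),
    #     label="Training Progress",
    #     info="Real-time training progress information",
    # )

    # After: the unsupported `info` parameter is removed; `value` and `label` remain.
    progress_info = gr.JSON(
        value=get_training_progress(),
        label="Training Progress",
    )
```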

Files changed (1)
  1. app.py +1010 -1011
app.py CHANGED
@@ -1,1011 +1,1010 @@
1
- #!/usr/bin/env python3
2
- """
3
- OpenLLM Training Space Application - Fixed with Uploaded Modules
4
-
5
- This version imports OpenLLM modules from the uploaded files in the HF Space:
6
- - Imports model.py and data_loader.py that were uploaded to the Space
7
- - Uses OpenLLM's actual custom model architecture
8
- - Compatible with OpenLLM's implementation
9
-
10
- This application provides a complete training interface for OpenLLM models on Hugging Face Spaces.
11
- It uses OpenLLM's custom GPTModel architecture instead of Hugging Face Transformers,
12
- ensuring compatibility with the actual OpenLLM implementation.
13
-
14
- Key Features:
15
- - Real model training using OpenLLM's custom architecture
16
- - SentencePiece tokenization for OpenLLM models
17
- - Complete training pipeline with progress monitoring
18
- - Automatic model saving and uploading to Hugging Face Hub
19
- - Gradio 4.44.1 compatible user interface
20
-
21
- Technical Architecture:
22
- - Uses OpenLLM's GPTModel class (not Hugging Face Transformers)
23
- - Imports custom modules from uploaded files in the Space
24
- - Uses sentencepiece.SentencePieceProcessor() for tokenization
25
- - Implements OpenLLM's training loop and optimization strategy
26
- - Saves checkpoints in OpenLLM's format
27
-
28
- Author: Louis Chua Bean Chong
29
- License: GPL-3.0
30
- Version: 2.1.1
31
- Last Updated: 2024
32
- """
33
-
34
- import gradio as gr
35
- import torch
36
- import torch.nn as nn
37
- import os
38
- import time
39
- import math
40
- import gc
41
- from typing import Dict, Any, Optional
42
- import threading
43
- from dataclasses import dataclass
44
- from pathlib import Path
45
-
46
- # Import OpenLLM's custom model architecture from uploaded files
47
- # These files were uploaded to the HF Space and contain OpenLLM's actual implementation
48
- try:
49
- # Import from the uploaded files in the HF Space
50
- # model.py contains GPTModel, GPTConfig, and create_model factory function
51
- from model import GPTModel, GPTConfig, create_model
52
- # data_loader.py contains TextDataLoader for OpenLLM's data loading approach
53
- from data_loader import TextDataLoader
54
- OPENLLM_AVAILABLE = True
55
- print("βœ… OpenLLM custom model architecture imported successfully from uploaded files")
56
- print(" - GPTModel: Custom PyTorch model architecture")
57
- print(" - GPTConfig: Model configuration dataclass")
58
- print(" - create_model: Factory function for model creation")
59
- print(" - TextDataLoader: Custom data loading implementation")
60
- except ImportError as e:
61
- print(f"❌ OpenLLM imports failed: {e}")
62
- print(" This indicates the uploaded OpenLLM source files are not available")
63
- print(" The training functionality will be disabled")
64
- OPENLLM_AVAILABLE = False
65
-
66
- # Try to import sentencepiece - CRITICAL for OpenLLM tokenization
67
- # OpenLLM uses SentencePiece for tokenization, not Hugging Face tokenizers
68
- try:
69
- import sentencepiece as spm
70
- SENTENCEPIECE_AVAILABLE = True
71
- print(f"βœ… SentencePiece available: {spm.__version__}")
72
- print(" - Required for OpenLLM tokenization")
73
- print(" - Used for loading tokenizer.model files")
74
- except ImportError:
75
- SENTENCEPIECE_AVAILABLE = False
76
- print("❌ SentencePiece not available")
77
- print(" - This will prevent tokenizer loading")
78
- print(" - Training functionality will be limited")
79
-
80
- # Import other dependencies for the complete training pipeline
81
- try:
82
- from datasets import load_dataset # For loading training data from HF Hub
83
- from huggingface_hub import HfApi, hf_hub_download # For model uploads and downloads
84
- DEPENDENCIES_AVAILABLE = True
85
- print("βœ… Training dependencies available")
86
- print(" - datasets: For loading training data")
87
- print(" - huggingface_hub: For model uploads/downloads")
88
- except ImportError as e:
89
- print(f"❌ Dependencies not available: {e}")
90
- print(" - This will prevent dataset loading and model uploading")
91
- DEPENDENCIES_AVAILABLE = False
92
-
93
- @dataclass
94
- class TrainingConfig:
95
- """
96
- Configuration class for training parameters.
97
-
98
- This dataclass encapsulates all the training hyperparameters and settings
99
- that control the OpenLLM training process. It provides a clean interface
100
- for passing configuration between different components of the training pipeline.
101
-
102
- Attributes:
103
- model_size: Size of the model to train ("small", "medium", "large")
104
- max_steps: Maximum number of training iterations
105
- learning_rate: Learning rate for the optimizer
106
- batch_size: Number of samples per training batch
107
- output_dir: Directory to save trained models and checkpoints
108
- save_steps: Frequency of checkpoint saving (every N steps)
109
- logging_steps: Frequency of progress logging (every N steps)
110
- warmup_steps: Number of warmup steps for learning rate scheduling
111
- gradient_accumulation_steps: Number of steps to accumulate gradients
112
- """
113
- model_size: str
114
- max_steps: int
115
- learning_rate: float
116
- batch_size: int
117
- output_dir: str = "./openllm-trained"
118
- save_steps: int = 100
119
- logging_steps: int = 10
120
- warmup_steps: int = 50
121
- gradient_accumulation_steps: int = 4
122
-
123
- class OpenLLMTrainer:
124
- """
125
- Complete training implementation using OpenLLM's actual architecture.
126
-
127
- This class handles the entire training pipeline including:
128
- - Model loading using OpenLLM's custom GPTModel
129
- - Tokenizer loading using sentencepiece.SentencePieceProcessor()
130
- - Dataset preparation using OpenLLM's TextDataLoader
131
- - Training execution using OpenLLM's approach
132
- - Model saving and uploading to Hugging Face Hub
133
-
134
- The trainer implements OpenLLM's actual training methodology rather than
135
- using Hugging Face Transformers, ensuring compatibility with the real
136
- OpenLLM implementation.
137
-
138
- Key Features:
139
- - Custom model architecture (GPTModel, not PreTrainedModel)
140
- - SentencePiece tokenization (not Hugging Face tokenizers)
141
- - OpenLLM's training loop and optimization strategy
142
- - Gradient accumulation for memory efficiency
143
- - Learning rate scheduling with warmup
144
- - Automatic checkpoint saving and model uploading
145
- """
146
-
147
- def __init__(self):
148
- """
149
- Initialize the trainer with default settings.
150
-
151
- Sets up the trainer with default values and initializes the Hugging Face
152
- API for model uploading. All components start as None and are initialized
153
- during the training process.
154
- """
155
- # Core training components - initialized during training
156
- self.model = None # OpenLLM's GPTModel instance
157
- self.tokenizer = None # SentencePieceProcessor instance
158
- self.data_loader = None # OpenLLM's TextDataLoader instance
159
- self.optimizer = None # PyTorch optimizer (AdamW)
160
- self.scheduler = None # Learning rate scheduler
161
-
162
- # Training state management
163
- self.is_training = False # Flag to track training status
164
- self.tokenizer_path = None # Path to the tokenizer.model file
165
-
166
- # Progress tracking for UI updates
167
- self.training_progress = {
168
- "status": "Ready", # Current training status
169
- "current_step": 0, # Current training step
170
- "total_steps": 0, # Total steps to complete
171
- "loss": 0.0, # Current training loss
172
- "learning_rate": 0.0 # Current learning rate
173
- }
174
-
175
- # Initialize Hugging Face API for model uploading
176
- # This allows the trained model to be automatically uploaded to HF Hub
177
- try:
178
- self.hf_api = HfApi()
179
- print("βœ… Hugging Face API initialized for model uploading")
180
- except Exception as e:
181
- print(f"Failed to initialize HF API: {e}")
182
- print(" - Model uploading will be disabled")
183
- self.hf_api = None
184
-
185
- def load_model_and_tokenizer(self, model_size: str) -> str:
186
- """
187
- Load the pre-trained OpenLLM model and tokenizer using OpenLLM's approach.
188
-
189
- This method implements OpenLLM's actual model loading strategy:
190
- 1. Creates a new GPTModel using OpenLLM's factory function
191
- 2. Downloads the tokenizer.model file from Hugging Face Hub
192
- 3. Loads the tokenizer using SentencePieceProcessor
193
- 4. Stores both components for use in training
194
-
195
- This approach differs from Hugging Face Transformers because:
196
- - Uses OpenLLM's custom GPTModel (not AutoModelForCausalLM)
197
- - Uses SentencePiece directly (not AutoTokenizer)
198
- - Downloads specific files rather than using from_pretrained()
199
-
200
- Args:
201
- model_size: Size of the model to load ("small", "medium", "large")
202
- Determines which pre-trained model to download
203
-
204
- Returns:
205
- Status message indicating success or failure
206
- Success: "βœ… Successfully loaded OpenLLM {model_size} model with custom architecture"
207
- Failure: "❌ Failed to load OpenLLM model and tokenizer: {error details}"
208
- """
209
- try:
210
- # Verify OpenLLM modules are available
211
- if not OPENLLM_AVAILABLE:
212
- return "❌ OpenLLM custom model architecture not available"
213
-
214
- print(f"πŸ”„ Loading OpenLLM {model_size} model using custom architecture...")
215
- print(f" - Using OpenLLM's create_model factory function")
216
- print(f" - Not using Hugging Face Transformers")
217
-
218
- # Step 1: Create model using OpenLLM's factory function
219
- # This creates a fresh GPTModel instance with the specified size
220
- try:
221
- self.model = create_model(model_size)
222
- print(f"βœ… OpenLLM {model_size} model created: {type(self.model).__name__}")
223
- print(f" - Model type: {type(self.model).__name__}")
224
- print(f" - Parameters: {self.model.get_num_params():,}")
225
- print(f" - Architecture: Custom GPTModel (not PreTrainedModel)")
226
- except Exception as e:
227
- print(f"❌ Failed to create model: {e}")
228
- return f"❌ Failed to create OpenLLM model: {str(e)}"
229
-
230
- # Step 2: Load tokenizer using sentencepiece
231
- # OpenLLM uses SentencePiece directly, not Hugging Face tokenizers
232
- try:
233
- print("πŸ”„ Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
234
- print(" - Using SentencePiece directly (not AutoTokenizer)")
235
- print(" - Downloading tokenizer.model from Hugging Face Hub")
236
-
237
- # Download tokenizer.model from HF Hub
238
- # This is the actual tokenizer file used by OpenLLM models
239
- model_name = f"lemms/openllm-{model_size}-extended-7k"
240
- tokenizer_path = hf_hub_download(
241
- repo_id=model_name,
242
- filename="tokenizer.model" # Specific file name for OpenLLM
243
- )
244
-
245
- print(f"βœ… Tokenizer downloaded to: {tokenizer_path}")
246
- print(f" - Source: {model_name}")
247
- print(f" - File: tokenizer.model")
248
-
249
- # Create SentencePieceProcessor and load the tokenizer
250
- # This is OpenLLM's actual tokenization approach
251
- sp_processor = spm.SentencePieceProcessor()
252
- sp_processor.load(tokenizer_path)
253
-
254
- # Store tokenizer and its path separately
255
- # We need the path for the TextDataLoader later
256
- self.tokenizer = sp_processor
257
- self.tokenizer_path = tokenizer_path # Store the path separately
258
-
259
- print(f"βœ… Tokenizer loaded successfully using SentencePieceProcessor")
260
- print(f" - Vocabulary size: {sp_processor.vocab_size()}")
261
- print(f" - Tokenizer path: {tokenizer_path}")
262
- print(f" - Tokenizer type: {type(sp_processor).__name__}")
263
-
264
- except Exception as e:
265
- print(f"❌ Failed to load tokenizer: {e}")
266
- return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"
267
-
268
- return f"βœ… Successfully loaded OpenLLM {model_size} model with custom architecture"
269
-
270
- except Exception as e:
271
- return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"
272
-
273
- def prepare_dataset(self) -> str:
274
- """
275
- Load and prepare the training dataset using OpenLLM's approach.
276
-
277
- This method implements OpenLLM's data preparation strategy:
278
- 1. Loads training data from Hugging Face Hub dataset
279
- 2. Creates a temporary text file for OpenLLM's TextDataLoader
280
- 3. Initializes OpenLLM's TextDataLoader with the tokenizer
281
- 4. Prepares the data for training
282
-
283
- OpenLLM's approach differs from Hugging Face because:
284
- - Uses a simple text file format (not tokenized datasets)
285
- - Uses OpenLLM's TextDataLoader (not Hugging Face datasets)
286
- - Tokenization happens on-the-fly during training
287
-
288
- Returns:
289
- Status message indicating success or failure
290
- Success: "βœ… Successfully prepared dataset with {count} samples"
291
- Failure: "❌ Failed to prepare dataset: {error details}"
292
- """
293
- try:
294
- # Verify dependencies are available
295
- if not DEPENDENCIES_AVAILABLE:
296
- return "❌ Required dependencies not available"
297
-
298
- print("πŸ”„ Loading training dataset...")
299
- print(" - Loading from Hugging Face Hub dataset")
300
- print(" - Using OpenLLM's data preparation approach")
301
-
302
- # Load dataset from HF Hub
303
- # This contains the training text data for continuing model training
304
- dataset = load_dataset("lemms/openllm-training-data")
305
- print(f"βœ… Dataset loaded: {len(dataset['train'])} samples")
306
- print(f" - Dataset: lemms/openllm-training-data")
307
- print(f" - Samples: {len(dataset['train'])}")
308
-
309
- # Create temporary data file for OpenLLM's TextDataLoader
310
- # OpenLLM expects a simple text file with one text sample per line
311
- temp_data_file = "temp_training_data.txt"
312
- with open(temp_data_file, 'w', encoding='utf-8') as f:
313
- for item in dataset['train']:
314
- f.write(item['text'] + '\n')
315
-
316
- print(f"βœ… Temporary data file created: {temp_data_file}")
317
- print(f" - Format: One text sample per line")
318
- print(f" - Encoding: UTF-8")
319
-
320
- # Create OpenLLM's TextDataLoader
321
- # This is OpenLLM's custom data loading implementation
322
- try:
323
- # Use the stored tokenizer path instead of trying to access model_file_path
324
- # SentencePieceProcessor doesn't have a model_file_path attribute
325
- tokenizer_path = self.tokenizer_path # Use the stored path
326
-
327
- print(f"πŸ”„ Creating OpenLLM TextDataLoader...")
328
- print(f" - Data file: {temp_data_file}")
329
- print(f" - Tokenizer path: {tokenizer_path}")
330
- print(f" - Sequence length: 512")
331
- print(f" - Batch size: 4 (will be overridden by training config)")
332
-
333
- self.data_loader = TextDataLoader(
334
- data_file=temp_data_file,
335
- tokenizer_path=tokenizer_path,
336
- seq_len=512, # Maximum sequence length for training
337
- batch_size=4, # Will be overridden by training config
338
- shuffle=True # Shuffle data for better training
339
- )
340
-
341
- print(f"βœ… OpenLLM TextDataLoader created successfully")
342
- print(f" - DataLoader type: {type(self.data_loader).__name__}")
343
- print(f" - Uses OpenLLM's custom implementation")
344
-
345
- except Exception as e:
346
- print(f"❌ Failed to create TextDataLoader: {e}")
347
- return f"❌ Failed to create data loader: {str(e)}"
348
-
349
- return f"βœ… Successfully prepared dataset with {len(dataset['train'])} samples"
350
-
351
- except Exception as e:
352
- return f"❌ Failed to prepare dataset: {str(e)}"
353
-
354
- def setup_training(self, config: TrainingConfig) -> str:
355
- """
356
- Set up the training configuration using OpenLLM's approach.
357
-
358
- This method configures the training environment with:
359
- 1. Output directory creation
360
- 2. Optimizer setup with weight decay groups
361
- 3. Learning rate scheduler with warmup
362
- 4. Training hyperparameters
363
-
364
- The setup follows OpenLLM's training methodology:
365
- - Uses AdamW optimizer with weight decay
366
- - Implements learning rate warmup followed by cosine annealing
367
- - Separates parameters for different weight decay rates
368
- - Uses gradient clipping for stability
369
-
370
- Args:
371
- config: Training configuration object containing all hyperparameters
372
-
373
- Returns:
374
- Status message indicating success or failure
375
- Success: "βœ… Training setup completed successfully"
376
- Failure: "❌ Failed to setup training: {error details}"
377
- """
378
- try:
379
- print("πŸ”„ Setting up training configuration...")
380
- print(f" - Output directory: {config.output_dir}")
381
- print(f" - Learning rate: {config.learning_rate}")
382
- print(f" - Max steps: {config.max_steps}")
383
-
384
- # Create output directory for saving models and checkpoints
385
- os.makedirs(config.output_dir, exist_ok=True)
386
- print(f"βœ… Output directory created: {config.output_dir}")
387
-
388
- # Set up optimizer (AdamW with weight decay)
389
- # This follows OpenLLM's optimization strategy
390
- print("πŸ”„ Setting up AdamW optimizer with weight decay...")
391
-
392
- # Separate parameters for different weight decay rates
393
- # This is a common practice for transformer training
394
- decay_params = [] # Parameters that should have weight decay
395
- no_decay_params = [] # Parameters that should not have weight decay
396
-
397
- for name, param in self.model.named_parameters():
398
- if not param.requires_grad:
399
- continue
400
-
401
- # Apply weight decay to all parameters except biases and layer norm weights
402
- if len(param.shape) == 1 or name.endswith('.bias'):
403
- no_decay_params.append(param)
404
- else:
405
- decay_params.append(param)
406
-
407
- # Create parameter groups with different weight decay rates
408
- param_groups = [
409
- {'params': decay_params, 'weight_decay': 0.01}, # 1% weight decay
410
- {'params': no_decay_params, 'weight_decay': 0.0} # No weight decay
411
- ]
412
-
413
- print(f" - Decay parameters: {len(decay_params)}")
414
- print(f" - No-decay parameters: {len(no_decay_params)}")
415
-
416
- # Initialize AdamW optimizer with OpenLLM's recommended settings
417
- self.optimizer = torch.optim.AdamW(
418
- param_groups,
419
- lr=config.learning_rate,
420
- betas=(0.9, 0.95), # Beta values for momentum
421
- eps=1e-8 # Epsilon for numerical stability
422
- )
423
-
424
- print(f"βœ… AdamW optimizer configured")
425
- print(f" - Learning rate: {config.learning_rate}")
426
- print(f" - Betas: (0.9, 0.95)")
427
- print(f" - Epsilon: 1e-8")
428
-
429
- # Set up learning rate scheduler
430
- # OpenLLM uses a warmup followed by cosine annealing
431
- print("πŸ”„ Setting up learning rate scheduler...")
432
-
433
- # Warmup scheduler: linearly increase LR from 1% to 100%
434
- warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
435
- self.optimizer,
436
- start_factor=0.01, # Start at 1% of target LR
437
- end_factor=1.0, # End at 100% of target LR
438
- total_iters=config.warmup_steps
439
- )
440
-
441
- # Main scheduler: cosine annealing after warmup
442
- main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
443
- self.optimizer,
444
- T_max=config.max_steps - config.warmup_steps # Duration of cosine annealing
445
- )
446
-
447
- # Combine warmup and main schedulers
448
- self.scheduler = torch.optim.lr_scheduler.SequentialLR(
449
- self.optimizer,
450
- schedulers=[warmup_scheduler, main_scheduler],
451
- milestones=[config.warmup_steps] # Switch to main scheduler after warmup
452
- )
453
-
454
- print(f"βœ… Learning rate scheduler configured")
455
- print(f" - Warmup steps: {config.warmup_steps}")
456
- print(f" - Total steps: {config.max_steps}")
457
- print(f" - Schedule: Linear warmup β†’ Cosine annealing")
458
-
459
- print("βœ… Training setup completed successfully")
460
- return f"βœ… Training setup completed successfully"
461
-
462
- except Exception as e:
463
- return f"❌ Failed to setup training: {str(e)}"
464
-
465
- def train_model(self, config: TrainingConfig, progress_callback=None) -> str:
466
- """
467
- Execute the actual model training using OpenLLM's approach.
468
-
469
- This method implements OpenLLM's training loop:
470
- 1. Sets up training mode and progress tracking
471
- 2. Iterates through data batches using OpenLLM's TextDataLoader
472
- 3. Performs forward pass, loss computation, and backward pass
473
- 4. Implements gradient accumulation for memory efficiency
474
- 5. Updates model parameters and learning rate
475
- 6. Saves checkpoints and logs progress
476
-
477
- The training loop follows OpenLLM's methodology:
478
- - Uses OpenLLM's GPTModel forward pass (returns logits and loss)
479
- - Implements gradient accumulation for effective larger batch sizes
480
- - Uses gradient clipping for training stability
481
- - Saves checkpoints in OpenLLM's format
482
- - Updates progress for UI monitoring
483
-
484
- Args:
485
- config: Training configuration object containing hyperparameters
486
- progress_callback: Optional callback function for progress updates
487
- (Not used in current implementation)
488
-
489
- Returns:
490
- Status message indicating success or failure
491
- Success: "βœ… Training completed successfully! Final step: {step}"
492
- Failure: "❌ Training failed: {error details}"
493
- """
494
- try:
495
- # Set training state
496
- self.is_training = True
497
- self.training_progress["status"] = "Training"
498
- self.training_progress["total_steps"] = config.max_steps
499
-
500
- print(f"πŸš€ Starting OpenLLM training for {config.max_steps} steps...")
501
- print(f" - Model: {type(self.model).__name__}")
502
- print(f" - DataLoader: {type(self.data_loader).__name__}")
503
- print(f" - Optimizer: {type(self.optimizer).__name__}")
504
- print(f" - Gradient accumulation: {config.gradient_accumulation_steps}")
505
-
506
- # Training loop using OpenLLM's approach
507
- self.model.train() # Set model to training mode
508
- accumulated_loss = 0.0 # Track loss across accumulation steps
509
- self.optimizer.zero_grad() # Clear gradients
510
-
511
- step = 0 # Current training step
512
- for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
513
- # Check if we've reached the maximum number of steps
514
- if step >= config.max_steps:
515
- break
516
-
517
- # Forward pass (model computes loss internally when targets provided)
518
- # OpenLLM's GPTModel returns both logits and loss
519
- logits, loss = self.model(input_ids, target_ids)
520
-
521
- # Scale loss for gradient accumulation
522
- # This allows us to simulate larger batch sizes
523
- loss = loss / config.gradient_accumulation_steps
524
- accumulated_loss += loss.item()
525
-
526
- # Backward pass - compute gradients
527
- loss.backward()
528
-
529
- # Update weights every gradient_accumulation_steps
530
- if (batch_idx + 1) % config.gradient_accumulation_steps == 0:
531
- # Clip gradients for training stability
532
- # This prevents exploding gradients
533
- torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
534
-
535
- # Update parameters using the optimizer
536
- self.optimizer.step()
537
-
538
- # Update learning rate using the scheduler
539
- self.scheduler.step()
540
-
541
- # Clear gradients for the next accumulation cycle
542
- self.optimizer.zero_grad()
543
-
544
- # Update step count
545
- step += 1
546
-
547
- # Update progress for UI monitoring
548
- self.training_progress["current_step"] = step
549
- self.training_progress["loss"] = accumulated_loss
550
- self.training_progress["learning_rate"] = self.scheduler.get_last_lr()[0]
551
-
552
- # Log progress at specified intervals
553
- if step % config.logging_steps == 0:
554
- current_lr = self.scheduler.get_last_lr()[0]
555
- print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {current_lr:.2e}")
556
-
557
- # Save checkpoint at specified intervals
558
- if step % config.save_steps == 0:
559
- self._save_checkpoint(config.output_dir, step)
560
- print(f"πŸ’Ύ Checkpoint saved at step {step}")
561
-
562
- # Reset accumulated loss for the next accumulation cycle
563
- accumulated_loss = 0.0
564
-
565
- # Clean up memory periodically
566
- if step % 100 == 0:
567
- gc.collect()
568
- print(f"🧹 Memory cleanup at step {step}")
569
-
570
- # Save final checkpoint
571
- self._save_checkpoint(config.output_dir, step, is_best=True)
572
- print(f"πŸ’Ύ Final checkpoint saved at step {step}")
573
-
574
- # Update final progress
575
- self.training_progress["status"] = "Completed"
576
- self.training_progress["current_step"] = step
577
-
578
- print(f"βœ… Training completed! Final step: {step}")
579
- print(f" - Total steps completed: {step}")
580
- print(f" - Final loss: {self.training_progress['loss']:.4f}")
581
- print(f" - Final learning rate: {self.training_progress['learning_rate']:.2e}")
582
-
583
- return f"βœ… Training completed successfully! Final step: {step}"
584
-
585
- except Exception as e:
586
- self.training_progress["status"] = "Failed"
587
- print(f"❌ Training failed: {e}")
588
- print(f" - Error occurred during training")
589
- print(f" - Training state: {self.training_progress['status']}")
590
- return f"❌ Training failed: {str(e)}"
591
- finally:
592
- self.is_training = False
593
-
594
- def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
595
- """
596
- Save model checkpoint using OpenLLM's approach.
597
-
598
- This method saves the model state in OpenLLM's checkpoint format:
599
- - Model state dictionary
600
- - Optimizer state dictionary
601
- - Scheduler state dictionary
602
- - Model configuration
603
- - Training step information
604
-
605
- The checkpoint format is compatible with OpenLLM's loading mechanism
606
- and can be used to resume training or load the model for inference.
607
-
608
- Args:
609
- output_dir: Directory to save the checkpoint
610
- step: Current training step number
611
- is_best: Whether this is the best model so far
612
- """
613
- try:
614
- # Create checkpoint dictionary with all necessary components
615
- checkpoint = {
616
- 'step': step, # Current training step
617
- 'model_state_dict': self.model.state_dict(), # Model parameters
618
- 'optimizer_state_dict': self.optimizer.state_dict(), # Optimizer state
619
- 'scheduler_state_dict': self.scheduler.state_dict(), # Scheduler state
620
- 'config': self.model.config.__dict__ # Model configuration
621
- }
622
-
623
- # Save latest checkpoint
624
- checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
625
- torch.save(checkpoint, checkpoint_path)
626
-
627
- # Save best checkpoint if this is the best model
628
- if is_best:
629
- best_path = os.path.join(output_dir, "best_model.pt")
630
- torch.save(checkpoint, best_path)
631
- print(f"πŸ’Ύ Best model saved: {best_path}")
632
-
633
- print(f"πŸ’Ύ Checkpoint saved: {checkpoint_path}")
634
-
635
- except Exception as e:
636
- print(f"❌ Failed to save checkpoint: {e}")
637
-
638
- def save_and_upload_model(self, config: TrainingConfig) -> str:
639
- """
640
- Save the trained model and upload it to Hugging Face Hub.
641
-
642
- This method completes the training pipeline by:
643
- 1. Saving the final model checkpoint
644
- 2. Copying the tokenizer files
645
- 3. Uploading the complete model to Hugging Face Hub
646
- 4. Creating a new model repository for the trained model
647
-
648
- The uploaded model will be available at:
649
- https://huggingface.co/lemms/openllm-{size}-extended-8k
650
-
651
- Args:
652
- config: Training configuration object
653
-
654
- Returns:
655
- Status message indicating success or failure
656
- Success: "βœ… Model saved and uploaded to https://huggingface.co/{repo_id}"
657
- Failure: "❌ Failed to save/upload model: {error details}"
658
- """
659
- try:
660
- print("πŸ”„ Saving trained model...")
661
- print(f" - Output directory: {config.output_dir}")
662
- print(f" - Model size: {config.model_size}")
663
-
664
- # Save the final model checkpoint
665
- self._save_checkpoint(config.output_dir, config.max_steps, is_best=True)
666
-
667
- # Save tokenizer files
668
- # Create a tokenizer directory within the output directory
669
- tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
670
- os.makedirs(tokenizer_dir, exist_ok=True)
671
-
672
- # Copy the tokenizer.model file using the stored path
673
- # This ensures the tokenizer is included with the model
674
- import shutil
675
- shutil.copy2(self.tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))
676
-
677
- print("βœ… Model saved locally")
678
- print(f" - Model checkpoint: {config.output_dir}/best_model.pt")
679
- print(f" - Tokenizer: {tokenizer_dir}/tokenizer.model")
680
-
681
- # Generate model name for upload
682
- # The naming convention follows: openllm-{size}-extended-8k
683
- model_name = f"openllm-{config.model_size}-extended-8k"
684
- repo_id = f"lemms/{model_name}"
685
-
686
- # Upload to Hugging Face Hub
687
- if self.hf_api:
688
- print(f"πŸ”„ Uploading model to {repo_id}...")
689
- print(f" - Repository: {repo_id}")
690
- print(f" - Type: model")
691
- print(f" - Source: {config.output_dir}")
692
-
693
- # Upload model files to Hugging Face Hub
694
- # This creates a new model repository with all the files
695
- self.hf_api.upload_folder(
696
- folder_path=config.output_dir,
697
- repo_id=repo_id,
698
- repo_type="model",
699
- commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
700
- )
701
-
702
- print(f"βœ… Model uploaded successfully to {repo_id}")
703
- print(f" - Available at: https://huggingface.co/{repo_id}")
704
- return f"βœ… Model saved and uploaded to https://huggingface.co/{repo_id}"
705
- else:
706
- print("⚠️ Hugging Face API not available - model saved locally only")
707
- return f"βœ… Model saved locally to {config.output_dir}"
708
-
709
- except Exception as e:
710
- print(f"❌ Failed to save/upload model: {e}")
711
- return f"❌ Failed to save/upload model: {str(e)}"
712
-
713
- def get_training_progress(self) -> Dict[str, Any]:
714
- """
715
- Get current training progress information.
716
-
717
- This method returns a copy of the current training progress
718
- for display in the Gradio UI. The progress information includes:
719
- - Current training status
720
- - Current step and total steps
721
- - Current loss value
722
- - Current learning rate
723
-
724
- Returns:
725
- Dictionary containing current training progress information
726
- """
727
- return self.training_progress.copy()
728
-
729
- def main():
730
- """
731
- Main function that creates the complete Gradio application interface.
732
-
733
- This function sets up the entire Gradio application with:
734
- 1. Application header and status information
735
- 2. Training configuration controls
736
- 3. Training status and progress display
737
- 4. Training control buttons
738
- 5. Instructions and resource links
739
- 6. Training function implementation
740
-
741
- The interface provides a complete training experience for OpenLLM models
742
- with real-time progress monitoring and comprehensive configuration options.
743
-
744
- Returns:
745
- Gradio Blocks interface for the training application
746
- """
747
-
748
- # Initialize the trainer
749
- # This creates the OpenLLMTrainer instance that will handle all training operations
750
- trainer = OpenLLMTrainer()
751
-
752
- # Create the main Gradio application interface
753
- # Using Gradio 4.44.1 with Soft theme for modern appearance
754
- with gr.Blocks(
755
- title="OpenLLM Training Space - Fixed with Uploaded Modules",
756
- theme=gr.themes.Soft()
757
- ) as demo:
758
-
759
- # Application Header
760
- # Provides clear identification and description of the application
761
- gr.Markdown("# πŸš€ OpenLLM Training Space - Fixed with Uploaded Modules")
762
- gr.Markdown("### *Uses OpenLLM's Custom Model Architecture from Uploaded Files*")
763
- gr.Markdown("---")
764
-
765
- # Status Information
766
- # Shows the availability of key components and dependencies
767
- gr.Markdown(f"**OpenLLM Available**: {'βœ… Yes' if OPENLLM_AVAILABLE else '❌ No'}")
768
- gr.Markdown(f"**SentencePiece Available**: {'βœ… Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
769
- gr.Markdown(f"**Dependencies Available**: {'βœ… Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
770
- gr.Markdown("**Architecture**: βœ… OpenLLM Custom GPTModel (From Uploaded Files)")
771
-
772
- # Main Content Area
773
- # Two-column layout for configuration and status
774
- with gr.Row():
775
-
776
- # Left Column: Training Configuration
777
- # Contains all the training hyperparameters and settings
778
- with gr.Column(scale=1):
779
- gr.Markdown("## πŸ“Š Training Configuration")
780
-
781
- # Model Size Selection
782
- # Allows users to choose which base model to train from
783
- model_size = gr.Dropdown(
784
- choices=["small", "medium", "large"],
785
- value="small",
786
- label="Model Size",
787
- info="Select the base model size to train from"
788
- )
789
-
790
- # Training Steps Configuration
791
- # Controls the number of training iterations
792
- max_steps = gr.Slider(
793
- minimum=100,
794
- maximum=10000,
795
- value=1000,
796
- step=100,
797
- label="Max Training Steps",
798
- info="Number of training iterations (100-10,000)"
799
- )
800
-
801
- # Learning Rate Configuration
802
- # Controls the learning rate for the optimizer
803
- learning_rate = gr.Slider(
804
- minimum=1e-5,
805
- maximum=1e-3,
806
- value=3e-4,
807
- step=1e-5,
808
- label="Learning Rate",
809
- info="Training rate (0.00001-0.001)"
810
- )
811
-
812
- # Batch Size Configuration
813
- # Controls the number of samples per training batch
814
- batch_size = gr.Slider(
815
- minimum=1,
816
- maximum=16,
817
- value=4,
818
- step=1,
819
- label="Batch Size",
820
- info="Samples per training batch (1-16)"
821
- )
822
-
823
- # Right Column: Training Status and Controls
824
- # Contains status display and control buttons
825
- with gr.Column(scale=1):
826
- gr.Markdown("## 🎯 Training Status")
827
-
828
- # Training Status Display
829
- # Shows current training status and any error messages
830
- status_text = gr.Textbox(
831
- value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
832
- label="Current Status",
833
- interactive=False,
834
- lines=5,
835
- info="Shows current training status and progress updates"
836
- )
837
-
838
- # Progress Information
839
- # Displays detailed training progress in JSON format
840
- progress_info = gr.JSON(
841
- value=trainer.get_training_progress(),
842
- label="Training Progress",
843
- info="Real-time training progress information"
844
- )
845
-
846
- # Training Control Buttons
847
- # Buttons to start and stop training
848
- with gr.Row():
849
- start_btn = gr.Button("πŸš€ Start Training", variant="primary")
850
- stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
851
-
852
- # Instructions Section
853
- # Provides detailed instructions for using the training interface
854
- gr.Markdown("## πŸ“‹ OpenLLM Training Instructions")
855
- gr.Markdown("""
856
- This interface uses **OpenLLM's actual custom model architecture** from uploaded files:
857
-
858
- ### **Step 1: Configure Parameters**
859
- - **Model Size**: Select the base model to train from (small, medium, large)
860
- - **Max Steps**: Number of training iterations (100-10,000)
861
- - **Learning Rate**: Training rate (0.00001-0.001)
862
- - **Batch Size**: Samples per training batch (1-16)
863
-
864
- ### **Step 2: Start Training**
865
- - Click "Start Training" to begin the actual training process
866
- - Uses OpenLLM's custom GPTModel class from uploaded files
867
- - Uses sentencepiece.SentencePieceProcessor() for tokenization
868
- - Compatible with OpenLLM's actual implementation
869
-
870
- ### **Step 3: Monitor Progress**
871
- - Watch the status updates and progress information
872
- - Training may take several minutes depending on steps
873
- - The final model will be uploaded to Hugging Face Hub
874
-
875
- ### **Step 4: Access Results**
876
- - Trained models are automatically pushed to: `lemms/openllm-{size}-extended-8k`
877
- - Check the model repository for your trained model
878
- - Use the model for inference or further training
879
- """)
880
-
881
- # Resource Links Section
882
- # Provides links to related models and resources
883
- gr.Markdown("## πŸ”— Model Resources")
884
- gr.Markdown("""
885
- - [πŸ“š 7k Small Model](https://huggingface.co/lemms/openllm-small-extended-7k)
886
- - [🎯 8k Small Model](https://huggingface.co/lemms/openllm-small-extended-8k)
887
- - [πŸ“Š Training Dataset](https://huggingface.co/datasets/lemms/openllm-training-data)
888
- - [πŸ“– Main Project](https://github.com/louischua/openllm)
889
- """)
890
-
891
- # Training Function Definition
892
- # This function is called when the Start Training button is clicked
893
- def start_complete_training(model_size, max_steps, learning_rate, batch_size):
894
- """
895
- Execute the complete training process using OpenLLM's approach.
896
-
897
- This function orchestrates the entire training pipeline:
898
- 1. Validates OpenLLM availability
899
- 2. Creates training configuration
900
- 3. Loads model and tokenizer
901
- 4. Prepares dataset
902
- 5. Sets up training environment
903
- 6. Executes training
904
- 7. Saves and uploads the trained model
905
-
906
- The function provides comprehensive error handling and status updates
907
- throughout the training process.
908
-
909
- Args:
910
- model_size: Size of the model to train ("small", "medium", "large")
911
- max_steps: Maximum number of training steps
912
- learning_rate: Learning rate for the optimizer
913
- batch_size: Batch size for training
914
-
915
- Returns:
916
- Status message indicating the result of the training process
917
- """
918
- # Validate OpenLLM availability
919
- if not OPENLLM_AVAILABLE:
920
- return "❌ OpenLLM custom model architecture not available. Please check the installation."
921
-
922
- try:
923
- print(f"πŸš€ Starting complete training process...")
924
- print(f" - Model size: {model_size}")
925
- print(f" - Max steps: {max_steps}")
926
- print(f" - Learning rate: {learning_rate}")
927
- print(f" - Batch size: {batch_size}")
928
-
929
- # Create training configuration
930
- # This encapsulates all training parameters
931
- config = TrainingConfig(
932
- model_size=model_size,
933
- max_steps=max_steps,
934
- learning_rate=learning_rate,
935
- batch_size=batch_size
936
- )
937
-
938
- # Step 1: Load model and tokenizer using OpenLLM's approach
939
- print("πŸ”„ Step 1: Loading model and tokenizer...")
940
- status = trainer.load_model_and_tokenizer(model_size)
941
- if "❌" in status:
942
- return status
943
-
944
- # Step 2: Prepare dataset
945
- print("πŸ”„ Step 2: Preparing dataset...")
946
- status = trainer.prepare_dataset()
947
- if "❌" in status:
948
- return status
949
-
950
- # Step 3: Setup training
951
- print("πŸ”„ Step 3: Setting up training...")
952
- status = trainer.setup_training(config)
953
- if "❌" in status:
954
- return status
955
-
956
- # Step 4: Execute training
957
- print("πŸ”„ Step 4: Executing training...")
958
- status = trainer.train_model(config)
959
- if "❌" in status:
960
- return status
961
-
962
- # Step 5: Save and upload model
963
- print("πŸ”„ Step 5: Saving and uploading model...")
964
- status = trainer.save_and_upload_model(config)
965
-
966
- print("πŸŽ‰ Complete training process finished!")
967
- return f"πŸš€ Complete training process finished!\n{status}"
968
-
969
- except Exception as e:
970
- print(f"❌ Training process failed: {str(e)}")
971
- return f"❌ Training process failed: {str(e)}"
972
-
973
- def update_progress():
974
- """
975
- Update the progress display.
976
-
977
- This function is called periodically to update the progress
978
- information displayed in the Gradio interface. It returns the
979
- current training progress from the trainer.
980
-
981
- Returns:
982
- Current training progress dictionary
983
- """
984
- return trainer.get_training_progress()
985
-
986
- # Connect UI Components to Functions
987
- # This connects the Start Training button to the training function
988
- start_btn.click(
989
- fn=start_complete_training,
990
- inputs=[model_size, max_steps, learning_rate, batch_size],
991
- outputs=[status_text]
992
- )
993
-
994
- # Auto-refresh progress every 5 seconds during training
995
- # This ensures the progress display stays up to date
996
- demo.load(update_progress, outputs=[progress_info])
997
-
998
- # Application Footer
999
- # Provides attribution and technical information
1000
- gr.Markdown("---")
1001
- gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
1002
- gr.Markdown("**Architecture**: OpenLLM Custom GPTModel (From Uploaded Files)")
1003
- gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor()")
1004
-
1005
- return demo
1006
-
1007
- if __name__ == "__main__":
1008
- # Launch the Gradio application
1009
- # This starts the web interface for the training application
1010
- demo = main()
1011
- demo.launch()
 
452
+ )
453
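+ # Illustrative LR trajectory (example values only): with learning_rate=3e-4 and
+ # warmup_steps=100, the LR ramps linearly from ~3e-6 (1%) to 3e-4 over the first
+ # 100 steps, then follows a cosine curve down toward 0 by max_steps.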
+
454
+ print(f"βœ… Learning rate scheduler configured")
455
+ print(f" - Warmup steps: {config.warmup_steps}")
456
+ print(f" - Total steps: {config.max_steps}")
457
+ print(f" - Schedule: Linear warmup β†’ Cosine annealing")
458
+
459
+ print("βœ… Training setup completed successfully")
460
+ return f"βœ… Training setup completed successfully"
461
+
462
+ except Exception as e:
463
+ return f"❌ Failed to setup training: {str(e)}"
464
+
465
+ def train_model(self, config: TrainingConfig, progress_callback=None) -> str:
466
+ """
467
+ Execute the actual model training using OpenLLM's approach.
468
+
469
+ This method implements OpenLLM's training loop:
470
+ 1. Sets up training mode and progress tracking
471
+ 2. Iterates through data batches using OpenLLM's TextDataLoader
472
+ 3. Performs forward pass, loss computation, and backward pass
473
+ 4. Implements gradient accumulation for memory efficiency
474
+ 5. Updates model parameters and learning rate
475
+ 6. Saves checkpoints and logs progress
476
+
477
+ The training loop follows OpenLLM's methodology:
478
+ - Uses OpenLLM's GPTModel forward pass (returns logits and loss)
479
+ - Implements gradient accumulation for effective larger batch sizes
480
+ - Uses gradient clipping for training stability
481
+ - Saves checkpoints in OpenLLM's format
482
+ - Updates progress for UI monitoring
483
+
484
+ Args:
485
+ config: Training configuration object containing hyperparameters
486
+ progress_callback: Optional callback function for progress updates
487
+ (Not used in current implementation)
488
+
489
+ Returns:
490
+ Status message indicating success or failure
491
+ Success: "βœ… Training completed successfully! Final step: {step}"
492
+ Failure: "❌ Training failed: {error details}"
493
+ """
494
+ try:
495
+ # Set training state
496
+ self.is_training = True
497
+ self.training_progress["status"] = "Training"
498
+ self.training_progress["total_steps"] = config.max_steps
499
+
500
+ print(f"πŸš€ Starting OpenLLM training for {config.max_steps} steps...")
501
+ print(f" - Model: {type(self.model).__name__}")
502
+ print(f" - DataLoader: {type(self.data_loader).__name__}")
503
+ print(f" - Optimizer: {type(self.optimizer).__name__}")
504
+ print(f" - Gradient accumulation: {config.gradient_accumulation_steps}")
505
+
506
+ # Training loop using OpenLLM's approach
507
+ self.model.train() # Set model to training mode
508
+ accumulated_loss = 0.0 # Track loss across accumulation steps
509
+ self.optimizer.zero_grad() # Clear gradients
510
+
511
+ step = 0 # Current training step
512
+ for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
513
+ # Check if we've reached the maximum number of steps
514
+ if step >= config.max_steps:
515
+ break
516
+
517
+ # Forward pass (model computes loss internally when targets provided)
518
+ # OpenLLM's GPTModel returns both logits and loss
519
+ logits, loss = self.model(input_ids, target_ids)
520
+
521
+ # Scale loss for gradient accumulation
522
+ # This allows us to simulate larger batch sizes
523
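+ # Worked example (illustrative numbers): with 4 samples per batch and
+ # gradient_accumulation_steps=4, gradients from 4 batches are accumulated before
+ # each optimizer step, giving an effective batch size of 16 samples.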
+ loss = loss / config.gradient_accumulation_steps
524
+ accumulated_loss += loss.item()
525
+
526
+ # Backward pass - compute gradients
527
+ loss.backward()
528
+
529
+ # Update weights every gradient_accumulation_steps
530
+ if (batch_idx + 1) % config.gradient_accumulation_steps == 0:
531
+ # Clip gradients for training stability
532
+ # This prevents exploding gradients
533
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
534
+
535
+ # Update parameters using the optimizer
536
+ self.optimizer.step()
537
+
538
+ # Update learning rate using the scheduler
539
+ self.scheduler.step()
540
+
541
+ # Clear gradients for the next accumulation cycle
542
+ self.optimizer.zero_grad()
543
+
544
+ # Update step count
545
+ step += 1
546
+
547
+ # Update progress for UI monitoring
548
+ self.training_progress["current_step"] = step
549
+ self.training_progress["loss"] = accumulated_loss
550
+ self.training_progress["learning_rate"] = self.scheduler.get_last_lr()[0]
551
+
552
+ # Log progress at specified intervals
553
+ if step % config.logging_steps == 0:
554
+ current_lr = self.scheduler.get_last_lr()[0]
555
+ print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {current_lr:.2e}")
556
+
557
+ # Save checkpoint at specified intervals
558
+ if step % config.save_steps == 0:
559
+ self._save_checkpoint(config.output_dir, step)
560
+ print(f"πŸ’Ύ Checkpoint saved at step {step}")
561
+
562
+ # Reset accumulated loss for the next accumulation cycle
563
+ accumulated_loss = 0.0
564
+
565
+ # Clean up memory periodically
566
+ if step % 100 == 0:
567
+ gc.collect()
568
+ print(f"🧹 Memory cleanup at step {step}")
569
+
570
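+ # Note: the loop above is a single pass over the data loader; if the loader yields
+ # fewer batches than max_steps requires, training stops early at the last batch.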
+ # Save final checkpoint
571
+ self._save_checkpoint(config.output_dir, step, is_best=True)
572
+ print(f"πŸ’Ύ Final checkpoint saved at step {step}")
573
+
574
+ # Update final progress
575
+ self.training_progress["status"] = "Completed"
576
+ self.training_progress["current_step"] = step
577
+
578
+ print(f"βœ… Training completed! Final step: {step}")
579
+ print(f" - Total steps completed: {step}")
580
+ print(f" - Final loss: {self.training_progress['loss']:.4f}")
581
+ print(f" - Final learning rate: {self.training_progress['learning_rate']:.2e}")
582
+
583
+ return f"βœ… Training completed successfully! Final step: {step}"
584
+
585
+ except Exception as e:
586
+ self.training_progress["status"] = "Failed"
587
+ print(f"❌ Training failed: {e}")
588
+ print(f" - Error occurred during training")
589
+ print(f" - Training state: {self.training_progress['status']}")
590
+ return f"❌ Training failed: {str(e)}"
591
+ finally:
592
+ self.is_training = False
593
+
594
+ def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
595
+ """
596
+ Save model checkpoint using OpenLLM's approach.
597
+
598
+ This method saves the model state in OpenLLM's checkpoint format:
599
+ - Model state dictionary
600
+ - Optimizer state dictionary
601
+ - Scheduler state dictionary
602
+ - Model configuration
603
+ - Training step information
604
+
605
+ The checkpoint format is compatible with OpenLLM's loading mechanism
606
+ and can be used to resume training or load the model for inference.
607
+
608
+ Args:
609
+ output_dir: Directory to save the checkpoint
610
+ step: Current training step number
611
+ is_best: Whether this is the best model so far
612
+ """
613
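+ # Illustrative reload sketch (comments only, not executed here): given a model and
+ # optimizer built with a matching configuration, a saved checkpoint could be restored
+ # roughly as follows:
+ #   ckpt = torch.load("checkpoint_step_1000.pt", map_location="cpu")  # hypothetical filename
+ #   model.load_state_dict(ckpt["model_state_dict"])
+ #   optimizer.load_state_dict(ckpt["optimizer_state_dict"])  # when resuming training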
+ try:
614
+ # Create checkpoint dictionary with all necessary components
615
+ checkpoint = {
616
+ 'step': step, # Current training step
617
+ 'model_state_dict': self.model.state_dict(), # Model parameters
618
+ 'optimizer_state_dict': self.optimizer.state_dict(), # Optimizer state
619
+ 'scheduler_state_dict': self.scheduler.state_dict(), # Scheduler state
620
+ 'config': self.model.config.__dict__ # Model configuration
621
+ }
622
+
623
+ # Save latest checkpoint
624
+ checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
625
+ torch.save(checkpoint, checkpoint_path)
626
+
627
+ # Save best checkpoint if this is the best model
628
+ if is_best:
629
+ best_path = os.path.join(output_dir, "best_model.pt")
630
+ torch.save(checkpoint, best_path)
631
+ print(f"πŸ’Ύ Best model saved: {best_path}")
632
+
633
+ print(f"πŸ’Ύ Checkpoint saved: {checkpoint_path}")
634
+
635
+ except Exception as e:
636
+ print(f"❌ Failed to save checkpoint: {e}")
637
+
638
+ def save_and_upload_model(self, config: TrainingConfig) -> str:
639
+ """
640
+ Save the trained model and upload it to Hugging Face Hub.
641
+
642
+ This method completes the training pipeline by:
643
+ 1. Saving the final model checkpoint
644
+ 2. Copying the tokenizer files
645
+ 3. Uploading the complete model to Hugging Face Hub
646
+ 4. Creating a new model repository for the trained model
647
+
648
+ The uploaded model will be available at:
649
+ https://huggingface.co/lemms/openllm-{size}-extended-8k
650
+
651
+ Args:
652
+ config: Training configuration object
653
+
654
+ Returns:
655
+ Status message indicating success or failure
656
+ Success: "βœ… Model saved and uploaded to https://huggingface.co/{repo_id}"
657
+ Failure: "❌ Failed to save/upload model: {error details}"
658
+ """
659
+ try:
660
+ print("πŸ”„ Saving trained model...")
661
+ print(f" - Output directory: {config.output_dir}")
662
+ print(f" - Model size: {config.model_size}")
663
+
664
+ # Save the final model checkpoint
665
+ self._save_checkpoint(config.output_dir, config.max_steps, is_best=True)
666
+
667
+ # Save tokenizer files
668
+ # Create a tokenizer directory within the output directory
669
+ tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
670
+ os.makedirs(tokenizer_dir, exist_ok=True)
671
+
672
+ # Copy the tokenizer.model file using the stored path
673
+ # This ensures the tokenizer is included with the model
674
+ import shutil
675
+ shutil.copy2(self.tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))
676
+
677
+ print("βœ… Model saved locally")
678
+ print(f" - Model checkpoint: {config.output_dir}/best_model.pt")
679
+ print(f" - Tokenizer: {tokenizer_dir}/tokenizer.model")
680
+
681
+ # Generate model name for upload
682
+ # The naming convention follows: openllm-{size}-extended-8k
683
+ model_name = f"openllm-{config.model_size}-extended-8k"
684
+ repo_id = f"lemms/{model_name}"
685
+
686
+ # Upload to Hugging Face Hub
687
+ if self.hf_api:
688
+ print(f"πŸ”„ Uploading model to {repo_id}...")
689
+ print(f" - Repository: {repo_id}")
690
+ print(f" - Type: model")
691
+ print(f" - Source: {config.output_dir}")
692
+
693
+ # Upload model files to Hugging Face Hub
694
+ # This creates a new model repository with all the files
695
+ self.hf_api.upload_folder(
696
+ folder_path=config.output_dir,
697
+ repo_id=repo_id,
698
+ repo_type="model",
699
+ commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
700
+ )
701
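+ # Note: upload_folder assumes the target repository already exists; if it does not,
+ # it would typically need to be created first (e.g. HfApi.create_repo(repo_id, exist_ok=True)).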
+
702
+ print(f"βœ… Model uploaded successfully to {repo_id}")
703
+ print(f" - Available at: https://huggingface.co/{repo_id}")
704
+ return f"βœ… Model saved and uploaded to https://huggingface.co/{repo_id}"
705
+ else:
706
+ print("⚠️ Hugging Face API not available - model saved locally only")
707
+ return f"βœ… Model saved locally to {config.output_dir}"
708
+
709
+ except Exception as e:
710
+ print(f"❌ Failed to save/upload model: {e}")
711
+ return f"❌ Failed to save/upload model: {str(e)}"
712
+
713
+ def get_training_progress(self) -> Dict[str, Any]:
714
+ """
715
+ Get current training progress information.
716
+
717
+ This method returns a copy of the current training progress
718
+ for display in the Gradio UI. The progress information includes:
719
+ - Current training status
720
+ - Current step and total steps
721
+ - Current loss value
722
+ - Current learning rate
723
+
724
+ Returns:
725
+ Dictionary containing current training progress information
726
+ """
727
+ return self.training_progress.copy()
728
+
729
+ def main():
730
+ """
731
+ Main function that creates the complete Gradio application interface.
732
+
733
+ This function sets up the entire Gradio application with:
734
+ 1. Application header and status information
735
+ 2. Training configuration controls
736
+ 3. Training status and progress display
737
+ 4. Training control buttons
738
+ 5. Instructions and resource links
739
+ 6. Training function implementation
740
+
741
+ The interface provides a complete training experience for OpenLLM models
742
+ with real-time progress monitoring and comprehensive configuration options.
743
+
744
+ Returns:
745
+ Gradio Blocks interface for the training application
746
+ """
747
+
748
+ # Initialize the trainer
749
+ # This creates the OpenLLMTrainer instance that will handle all training operations
750
+ trainer = OpenLLMTrainer()
751
+
752
+ # Create the main Gradio application interface
753
+ # Using Gradio 4.44.1 with Soft theme for modern appearance
754
+ with gr.Blocks(
755
+ title="OpenLLM Training Space - Fixed with Uploaded Modules",
756
+ theme=gr.themes.Soft()
757
+ ) as demo:
758
+
759
+ # Application Header
760
+ # Provides clear identification and description of the application
761
+ gr.Markdown("# πŸš€ OpenLLM Training Space - Fixed with Uploaded Modules")
762
+ gr.Markdown("### *Uses OpenLLM's Custom Model Architecture from Uploaded Files*")
763
+ gr.Markdown("---")
764
+
765
+ # Status Information
766
+ # Shows the availability of key components and dependencies
767
+ gr.Markdown(f"**OpenLLM Available**: {'βœ… Yes' if OPENLLM_AVAILABLE else '❌ No'}")
768
+ gr.Markdown(f"**SentencePiece Available**: {'βœ… Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
769
+ gr.Markdown(f"**Dependencies Available**: {'βœ… Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
770
+ gr.Markdown("**Architecture**: βœ… OpenLLM Custom GPTModel (From Uploaded Files)")
771
+
772
+ # Main Content Area
773
+ # Two-column layout for configuration and status
774
+ with gr.Row():
775
+
776
+ # Left Column: Training Configuration
777
+ # Contains all the training hyperparameters and settings
778
+ with gr.Column(scale=1):
779
+ gr.Markdown("## πŸ“Š Training Configuration")
780
+
781
+ # Model Size Selection
782
+ # Allows users to choose which base model to train from
783
+ model_size = gr.Dropdown(
784
+ choices=["small", "medium", "large"],
785
+ value="small",
786
+ label="Model Size",
787
+ info="Select the base model size to train from"
788
+ )
789
+
790
+ # Training Steps Configuration
791
+ # Controls the number of training iterations
792
+ max_steps = gr.Slider(
793
+ minimum=100,
794
+ maximum=10000,
795
+ value=1000,
796
+ step=100,
797
+ label="Max Training Steps",
798
+ info="Number of training iterations (100-10,000)"
799
+ )
800
+
801
+ # Learning Rate Configuration
802
+ # Controls the learning rate for the optimizer
803
+ learning_rate = gr.Slider(
804
+ minimum=1e-5,
805
+ maximum=1e-3,
806
+ value=3e-4,
807
+ step=1e-5,
808
+ label="Learning Rate",
809
+ info="Training rate (0.00001-0.001)"
810
+ )
811
+
812
+ # Batch Size Configuration
813
+ # Controls the number of samples per training batch
814
+ batch_size = gr.Slider(
815
+ minimum=1,
816
+ maximum=16,
817
+ value=4,
818
+ step=1,
819
+ label="Batch Size",
820
+ info="Samples per training batch (1-16)"
821
+ )
822
+
823
+ # Right Column: Training Status and Controls
824
+ # Contains status display and control buttons
825
+ with gr.Column(scale=1):
826
+ gr.Markdown("## 🎯 Training Status")
827
+
828
+ # Training Status Display
829
+ # Shows current training status and any error messages
830
+ status_text = gr.Textbox(
831
+ value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
832
+ label="Current Status",
833
+ interactive=False,
834
+ lines=5,
835
+ info="Shows current training status and progress updates"
836
+ )
837
+
838
+ # Progress Information
839
+ # Displays detailed training progress in JSON format
840
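+ # Note: unlike the sliders above, gr.JSON does not accept an 'info' argument in
+ # Gradio 4.44.1, so only value and label are passed here.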
+ progress_info = gr.JSON(
841
+ value=trainer.get_training_progress(),
842
+ label="Training Progress"
843
+ )
844
+
845
+ # Training Control Buttons
846
+ # Buttons to start and stop training
847
+ with gr.Row():
848
+ start_btn = gr.Button("πŸš€ Start Training", variant="primary")
849
+ stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
850
+
851
+ # Instructions Section
852
+ # Provides detailed instructions for using the training interface
853
+ gr.Markdown("## πŸ“‹ OpenLLM Training Instructions")
854
+ gr.Markdown("""
855
+ This interface uses **OpenLLM's actual custom model architecture** from uploaded files:
856
+
857
+ ### **Step 1: Configure Parameters**
858
+ - **Model Size**: Select the base model to train from (small, medium, large)
859
+ - **Max Steps**: Number of training iterations (100-10,000)
860
+ - **Learning Rate**: Training rate (0.00001-0.001)
861
+ - **Batch Size**: Samples per training batch (1-16)
862
+
863
+ ### **Step 2: Start Training**
864
+ - Click "Start Training" to begin the actual training process
865
+ - Uses OpenLLM's custom GPTModel class from uploaded files
866
+ - Uses sentencepiece.SentencePieceProcessor() for tokenization
867
+ - Compatible with OpenLLM's actual implementation
868
+
869
+ ### **Step 3: Monitor Progress**
870
+ - Watch the status updates and progress information
871
+ - Training may take several minutes depending on steps
872
+ - The final model will be uploaded to Hugging Face Hub
873
+
874
+ ### **Step 4: Access Results**
875
+ - Trained models are automatically pushed to: `lemms/openllm-{size}-extended-8k`
876
+ - Check the model repository for your trained model
877
+ - Use the model for inference or further training
878
+ """)
879
+
880
+ # Resource Links Section
881
+ # Provides links to related models and resources
882
+ gr.Markdown("## πŸ”— Model Resources")
883
+ gr.Markdown("""
884
+ - [πŸ“š 7k Small Model](https://huggingface.co/lemms/openllm-small-extended-7k)
885
+ - [🎯 8k Small Model](https://huggingface.co/lemms/openllm-small-extended-8k)
886
+ - [πŸ“Š Training Dataset](https://huggingface.co/datasets/lemms/openllm-training-data)
887
+ - [πŸ“– Main Project](https://github.com/louischua/openllm)
888
+ """)
889
+
890
+ # Training Function Definition
891
+ # This function is called when the Start Training button is clicked
892
+ def start_complete_training(model_size, max_steps, learning_rate, batch_size):
893
+ """
894
+ Execute the complete training process using OpenLLM's approach.
895
+
896
+ This function orchestrates the entire training pipeline:
897
+ 1. Validates OpenLLM availability
898
+ 2. Creates training configuration
899
+ 3. Loads model and tokenizer
900
+ 4. Prepares dataset
901
+ 5. Sets up training environment
902
+ 6. Executes training
903
+ 7. Saves and uploads the trained model
904
+
905
+ The function provides comprehensive error handling and status updates
906
+ throughout the training process.
907
+
908
+ Args:
909
+ model_size: Size of the model to train ("small", "medium", "large")
910
+ max_steps: Maximum number of training steps
911
+ learning_rate: Learning rate for the optimizer
912
+ batch_size: Batch size for training
913
+
914
+ Returns:
915
+ Status message indicating the result of the training process
916
+ """
917
+ # Validate OpenLLM availability
918
+ if not OPENLLM_AVAILABLE:
919
+ return "❌ OpenLLM custom model architecture not available. Please check the installation."
920
+
921
+ try:
922
+ print(f"πŸš€ Starting complete training process...")
923
+ print(f" - Model size: {model_size}")
924
+ print(f" - Max steps: {max_steps}")
925
+ print(f" - Learning rate: {learning_rate}")
926
+ print(f" - Batch size: {batch_size}")
927
+
928
+ # Create training configuration
929
+ # This encapsulates all training parameters
930
+ config = TrainingConfig(
931
+ model_size=model_size,
932
+ max_steps=max_steps,
933
+ learning_rate=learning_rate,
934
+ batch_size=batch_size
935
+ )
936
+
937
+ # Step 1: Load model and tokenizer using OpenLLM's approach
938
+ print("πŸ”„ Step 1: Loading model and tokenizer...")
939
+ status = trainer.load_model_and_tokenizer(model_size)
940
+ if "❌" in status:
941
+ return status
942
+
943
+ # Step 2: Prepare dataset
944
+ print("πŸ”„ Step 2: Preparing dataset...")
945
+ status = trainer.prepare_dataset()
946
+ if "❌" in status:
947
+ return status
948
+
949
+ # Step 3: Setup training
950
+ print("πŸ”„ Step 3: Setting up training...")
951
+ status = trainer.setup_training(config)
952
+ if "❌" in status:
953
+ return status
954
+
955
+ # Step 4: Execute training
956
+ print("πŸ”„ Step 4: Executing training...")
957
+ status = trainer.train_model(config)
958
+ if "❌" in status:
959
+ return status
960
+
961
+ # Step 5: Save and upload model
962
+ print("πŸ”„ Step 5: Saving and uploading model...")
963
+ status = trainer.save_and_upload_model(config)
964
+
965
+ print("πŸŽ‰ Complete training process finished!")
966
+ return f"πŸš€ Complete training process finished!\n{status}"
967
+
968
+ except Exception as e:
969
+ print(f"❌ Training process failed: {str(e)}")
970
+ return f"❌ Training process failed: {str(e)}"
971
+
972
+ def update_progress():
973
+ """
974
+ Update the progress display.
975
+
976
+ This function is called periodically to update the progress
977
+ information displayed in the Gradio interface. It returns the
978
+ current training progress from the trainer.
979
+
980
+ Returns:
981
+ Current training progress dictionary
982
+ """
983
+ return trainer.get_training_progress()
984
+
985
+ # Connect UI Components to Functions
986
+ # This connects the Start Training button to the training function
987
+ start_btn.click(
988
+ fn=start_complete_training,
989
+ inputs=[model_size, max_steps, learning_rate, batch_size],
990
+ outputs=[status_text]
991
+ )
992
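+ # Note: stop_btn is not connected to a handler in this version, so a training run
+ # cannot be interrupted from the UI once it has started.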
+
993
+ # Refresh the progress display when the page loads
994
+ # (demo.load runs update_progress once per page load; it does not poll on a timer)
995
+ demo.load(update_progress, outputs=[progress_info])
996
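+ # If periodic refresh during training is desired, Gradio event listeners generally
+ # accept an `every` argument (e.g. demo.load(update_progress, outputs=[progress_info], every=5)
+ # would poll every 5 seconds); that change is not made here.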
+
997
+ # Application Footer
998
+ # Provides attribution and technical information
999
+ gr.Markdown("---")
1000
+ gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
1001
+ gr.Markdown("**Architecture**: OpenLLM Custom GPTModel (From Uploaded Files)")
1002
+ gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor()")
1003
+
1004
+ return demo
1005
+
1006
+ if __name__ == "__main__":
1007
+ # Launch the Gradio application
1008
+ # This starts the web interface for the training application
1009
+ demo = main()
1010
+ demo.launch()