lemms committed (verified)
Commit 417d22e · Parent: 4672bf6

Update: Comprehensive documentation and verbose comments throughout the codebase

Files changed (1): app.py (+442, −73)

app.py CHANGED
@@ -7,9 +7,27 @@ This version imports OpenLLM modules from the uploaded files in the HF Space:
  - Uses OpenLLM's actual custom model architecture
  - Compatible with OpenLLM's implementation

+ This application provides a complete training interface for OpenLLM models on Hugging Face Spaces.
+ It uses OpenLLM's custom GPTModel architecture instead of Hugging Face Transformers,
+ ensuring compatibility with the actual OpenLLM implementation.
+
+ Key Features:
+ - Real model training using OpenLLM's custom architecture
+ - SentencePiece tokenization for OpenLLM models
+ - Complete training pipeline with progress monitoring
+ - Automatic model saving and uploading to Hugging Face Hub
+ - Gradio 4.44.1 compatible user interface
+
+ Technical Architecture:
+ - Uses OpenLLM's GPTModel class (not Hugging Face Transformers)
+ - Imports custom modules from uploaded files in the Space
+ - Uses sentencepiece.SentencePieceProcessor() for tokenization
+ - Implements OpenLLM's training loop and optimization strategy
+ - Saves checkpoints in OpenLLM's format
+
  Author: Louis Chua Bean Chong
  License: GPL-3.0
- Version: 2.1.0
+ Version: 2.1.1
  Last Updated: 2024
  """

@@ -26,37 +44,72 @@ from dataclasses import dataclass
  from pathlib import Path

  # Import OpenLLM's custom model architecture from uploaded files
+ # These files were uploaded to the HF Space and contain OpenLLM's actual implementation
  try:
      # Import from the uploaded files in the HF Space
+     # model.py contains GPTModel, GPTConfig, and create_model factory function
      from model import GPTModel, GPTConfig, create_model
+     # data_loader.py contains TextDataLoader for OpenLLM's data loading approach
      from data_loader import TextDataLoader
      OPENLLM_AVAILABLE = True
      print("✅ OpenLLM custom model architecture imported successfully from uploaded files")
+     print("   - GPTModel: Custom PyTorch model architecture")
+     print("   - GPTConfig: Model configuration dataclass")
+     print("   - create_model: Factory function for model creation")
+     print("   - TextDataLoader: Custom data loading implementation")
  except ImportError as e:
      print(f"❌ OpenLLM imports failed: {e}")
+     print("   This indicates the uploaded OpenLLM source files are not available")
+     print("   The training functionality will be disabled")
      OPENLLM_AVAILABLE = False

- # Try to import sentencepiece
+ # Try to import sentencepiece - CRITICAL for OpenLLM tokenization
+ # OpenLLM uses SentencePiece for tokenization, not Hugging Face tokenizers
  try:
      import sentencepiece as spm
      SENTENCEPIECE_AVAILABLE = True
      print(f"✅ SentencePiece available: {spm.__version__}")
+     print("   - Required for OpenLLM tokenization")
+     print("   - Used for loading tokenizer.model files")
  except ImportError:
      SENTENCEPIECE_AVAILABLE = False
      print("❌ SentencePiece not available")
+     print("   - This will prevent tokenizer loading")
+     print("   - Training functionality will be limited")

- # Import other dependencies
+ # Import other dependencies for the complete training pipeline
  try:
-     from datasets import load_dataset
-     from huggingface_hub import HfApi, hf_hub_download
+     from datasets import load_dataset  # For loading training data from HF Hub
+     from huggingface_hub import HfApi, hf_hub_download  # For model uploads and downloads
      DEPENDENCIES_AVAILABLE = True
+     print("✅ Training dependencies available")
+     print("   - datasets: For loading training data")
+     print("   - huggingface_hub: For model uploads/downloads")
  except ImportError as e:
      print(f"❌ Dependencies not available: {e}")
+     print("   - This will prevent dataset loading and model uploading")
      DEPENDENCIES_AVAILABLE = False

  @dataclass
  class TrainingConfig:
-     """Configuration class for training parameters."""
+     """
+     Configuration class for training parameters.
+
+     This dataclass encapsulates all the training hyperparameters and settings
+     that control the OpenLLM training process. It provides a clean interface
+     for passing configuration between different components of the training pipeline.
+
+     Attributes:
+         model_size: Size of the model to train ("small", "medium", "large")
+         max_steps: Maximum number of training iterations
+         learning_rate: Learning rate for the optimizer
+         batch_size: Number of samples per training batch
+         output_dir: Directory to save trained models and checkpoints
+         save_steps: Frequency of checkpoint saving (every N steps)
+         logging_steps: Frequency of progress logging (every N steps)
+         warmup_steps: Number of warmup steps for learning rate scheduling
+         gradient_accumulation_steps: Number of steps to accumulate gradients
+     """
      model_size: str
      max_steps: int
      learning_rate: float
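For reference, a minimal sketch of how this dataclass gets constructed later in the diff. The UI passes only the first four fields, so the sketch assumes the remaining fields declare defaults; the default values shown here are hypothetical, not taken from the commit:

```python
# Hedged sketch of TrainingConfig construction; defaults are hypothetical.
from dataclasses import dataclass

@dataclass
class TrainingConfig:
    model_size: str
    max_steps: int
    learning_rate: float
    batch_size: int
    output_dir: str = "./openllm_output"   # hypothetical default
    save_steps: int = 500                  # hypothetical default
    logging_steps: int = 10                # hypothetical default
    warmup_steps: int = 100                # hypothetical default
    gradient_accumulation_steps: int = 4   # hypothetical default

config = TrainingConfig(model_size="small", max_steps=1000,
                        learning_rate=3e-4, batch_size=4)
print(config)  # dataclass repr shows all resolved values
```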
@@ -74,83 +127,139 @@ class OpenLLMTrainer:
      This class handles the entire training pipeline including:
      - Model loading using OpenLLM's custom GPTModel
      - Tokenizer loading using sentencepiece.SentencePieceProcessor()
-     - Dataset preparation
+     - Dataset preparation using OpenLLM's TextDataLoader
      - Training execution using OpenLLM's approach
-     - Model saving and uploading
+     - Model saving and uploading to Hugging Face Hub
+
+     The trainer implements OpenLLM's actual training methodology rather than
+     using Hugging Face Transformers, ensuring compatibility with the real
+     OpenLLM implementation.
+
+     Key Features:
+     - Custom model architecture (GPTModel, not PreTrainedModel)
+     - SentencePiece tokenization (not Hugging Face tokenizers)
+     - OpenLLM's training loop and optimization strategy
+     - Gradient accumulation for memory efficiency
+     - Learning rate scheduling with warmup
+     - Automatic checkpoint saving and model uploading
      """

      def __init__(self):
-         """Initialize the trainer with default settings."""
-         self.model = None
-         self.tokenizer = None
-         self.data_loader = None
-         self.optimizer = None
-         self.scheduler = None
-         self.is_training = False
+         """
+         Initialize the trainer with default settings.
+
+         Sets up the trainer with default values and initializes the Hugging Face
+         API for model uploading. All components start as None and are initialized
+         during the training process.
+         """
+         # Core training components - initialized during training
+         self.model = None           # OpenLLM's GPTModel instance
+         self.tokenizer = None       # SentencePieceProcessor instance
+         self.data_loader = None     # OpenLLM's TextDataLoader instance
+         self.optimizer = None       # PyTorch optimizer (AdamW)
+         self.scheduler = None       # Learning rate scheduler
+
+         # Training state management
+         self.is_training = False    # Flag to track training status
+         self.tokenizer_path = None  # Path to the tokenizer.model file
+
+         # Progress tracking for UI updates
          self.training_progress = {
-             "status": "Ready",
-             "current_step": 0,
-             "total_steps": 0,
-             "loss": 0.0,
-             "learning_rate": 0.0
+             "status": "Ready",      # Current training status
+             "current_step": 0,      # Current training step
+             "total_steps": 0,       # Total steps to complete
+             "loss": 0.0,            # Current training loss
+             "learning_rate": 0.0    # Current learning rate
          }

          # Initialize Hugging Face API for model uploading
+         # This allows the trained model to be automatically uploaded to HF Hub
          try:
              self.hf_api = HfApi()
+             print("✅ Hugging Face API initialized for model uploading")
          except Exception as e:
              print(f"Failed to initialize HF API: {e}")
+             print("   - Model uploading will be disabled")
              self.hf_api = None

      def load_model_and_tokenizer(self, model_size: str) -> str:
          """
          Load the pre-trained OpenLLM model and tokenizer using OpenLLM's approach.

+         This method implements OpenLLM's actual model loading strategy:
+         1. Creates a new GPTModel using OpenLLM's factory function
+         2. Downloads the tokenizer.model file from Hugging Face Hub
+         3. Loads the tokenizer using SentencePieceProcessor
+         4. Stores both components for use in training
+
+         This approach differs from Hugging Face Transformers because:
+         - Uses OpenLLM's custom GPTModel (not AutoModelForCausalLM)
+         - Uses SentencePiece directly (not AutoTokenizer)
+         - Downloads specific files rather than using from_pretrained()
+
          Args:
              model_size: Size of the model to load ("small", "medium", "large")
+                         Determines which pre-trained model to download

          Returns:
              Status message indicating success or failure
+             Success: "✅ Successfully loaded OpenLLM {model_size} model with custom architecture"
+             Failure: "❌ Failed to load OpenLLM model and tokenizer: {error details}"
          """
          try:
+             # Verify OpenLLM modules are available
              if not OPENLLM_AVAILABLE:
                  return "❌ OpenLLM custom model architecture not available"

              print(f"🔄 Loading OpenLLM {model_size} model using custom architecture...")
+             print(f"   - Using OpenLLM's create_model factory function")
+             print(f"   - Not using Hugging Face Transformers")

-             # Create model using OpenLLM's factory function
+             # Step 1: Create model using OpenLLM's factory function
+             # This creates a fresh GPTModel instance with the specified size
              try:
                  self.model = create_model(model_size)
                  print(f"✅ OpenLLM {model_size} model created: {type(self.model).__name__}")
-                 print(f"   Parameters: {self.model.get_num_params():,}")
+                 print(f"   - Model type: {type(self.model).__name__}")
+                 print(f"   - Parameters: {self.model.get_num_params():,}")
+                 print(f"   - Architecture: Custom GPTModel (not PreTrainedModel)")
              except Exception as e:
                  print(f"❌ Failed to create model: {e}")
                  return f"❌ Failed to create OpenLLM model: {str(e)}"

-             # Load tokenizer using sentencepiece
+             # Step 2: Load tokenizer using sentencepiece
+             # OpenLLM uses SentencePiece directly, not Hugging Face tokenizers
              try:
                  print("🔄 Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
+                 print("   - Using SentencePiece directly (not AutoTokenizer)")
+                 print("   - Downloading tokenizer.model from Hugging Face Hub")

                  # Download tokenizer.model from HF Hub
+                 # This is the actual tokenizer file used by OpenLLM models
                  model_name = f"lemms/openllm-{model_size}-extended-7k"
                  tokenizer_path = hf_hub_download(
                      repo_id=model_name,
-                     filename="tokenizer.model"
+                     filename="tokenizer.model"  # Specific file name for OpenLLM
                  )

                  print(f"✅ Tokenizer downloaded to: {tokenizer_path}")
+                 print(f"   - Source: {model_name}")
+                 print(f"   - File: tokenizer.model")

-                 # Create SentencePieceProcessor
+                 # Create SentencePieceProcessor and load the tokenizer
+                 # This is OpenLLM's actual tokenization approach
                  sp_processor = spm.SentencePieceProcessor()
                  sp_processor.load(tokenizer_path)

                  # Store tokenizer and its path separately
+                 # We need the path for the TextDataLoader later
                  self.tokenizer = sp_processor
                  self.tokenizer_path = tokenizer_path  # Store the path separately

                  print(f"✅ Tokenizer loaded successfully using SentencePieceProcessor")
-                 print(f"   Vocabulary size: {sp_processor.vocab_size()}")
-                 print(f"   Tokenizer path: {tokenizer_path}")
+                 print(f"   - Vocabulary size: {sp_processor.vocab_size()}")
+                 print(f"   - Tokenizer path: {tokenizer_path}")
+                 print(f"   - Tokenizer type: {type(sp_processor).__name__}")

              except Exception as e:
                  print(f"❌ Failed to load tokenizer: {e}")
@@ -165,41 +274,73 @@ class OpenLLMTrainer:
          """
          Load and prepare the training dataset using OpenLLM's approach.

+         This method implements OpenLLM's data preparation strategy:
+         1. Loads training data from Hugging Face Hub dataset
+         2. Creates a temporary text file for OpenLLM's TextDataLoader
+         3. Initializes OpenLLM's TextDataLoader with the tokenizer
+         4. Prepares the data for training
+
+         OpenLLM's approach differs from Hugging Face because:
+         - Uses a simple text file format (not tokenized datasets)
+         - Uses OpenLLM's TextDataLoader (not Hugging Face datasets)
+         - Tokenization happens on-the-fly during training
+
          Returns:
              Status message indicating success or failure
+             Success: "✅ Successfully prepared dataset with {count} samples"
+             Failure: "❌ Failed to prepare dataset: {error details}"
          """
          try:
+             # Verify dependencies are available
              if not DEPENDENCIES_AVAILABLE:
                  return "❌ Required dependencies not available"

              print("🔄 Loading training dataset...")
+             print("   - Loading from Hugging Face Hub dataset")
+             print("   - Using OpenLLM's data preparation approach")

              # Load dataset from HF Hub
+             # This contains the training text data for continuing model training
              dataset = load_dataset("lemms/openllm-training-data")
              print(f"✅ Dataset loaded: {len(dataset['train'])} samples")
+             print(f"   - Dataset: lemms/openllm-training-data")
+             print(f"   - Samples: {len(dataset['train'])}")

              # Create temporary data file for OpenLLM's TextDataLoader
+             # OpenLLM expects a simple text file with one text sample per line
              temp_data_file = "temp_training_data.txt"
              with open(temp_data_file, 'w', encoding='utf-8') as f:
                  for item in dataset['train']:
                      f.write(item['text'] + '\n')

              print(f"✅ Temporary data file created: {temp_data_file}")
+             print(f"   - Format: One text sample per line")
+             print(f"   - Encoding: UTF-8")

              # Create OpenLLM's TextDataLoader
+             # This is OpenLLM's custom data loading implementation
              try:
                  # Use the stored tokenizer path instead of trying to access model_file_path
+                 # SentencePieceProcessor doesn't have a model_file_path attribute
                  tokenizer_path = self.tokenizer_path  # Use the stored path

+                 print(f"🔄 Creating OpenLLM TextDataLoader...")
+                 print(f"   - Data file: {temp_data_file}")
+                 print(f"   - Tokenizer path: {tokenizer_path}")
+                 print(f"   - Sequence length: 512")
+                 print(f"   - Batch size: 4 (will be overridden by training config)")
+
                  self.data_loader = TextDataLoader(
                      data_file=temp_data_file,
                      tokenizer_path=tokenizer_path,
-                     seq_len=512,
+                     seq_len=512,   # Maximum sequence length for training
                      batch_size=4,  # Will be overridden by training config
-                     shuffle=True
+                     shuffle=True   # Shuffle data for better training
                  )

                  print(f"✅ OpenLLM TextDataLoader created successfully")
+                 print(f"   - DataLoader type: {type(self.data_loader).__name__}")
+                 print(f"   - Uses OpenLLM's custom implementation")

              except Exception as e:
                  print(f"❌ Failed to create TextDataLoader: {e}")
@@ -214,60 +355,107 @@ class OpenLLMTrainer:
          """
          Set up the training configuration using OpenLLM's approach.

+         This method configures the training environment with:
+         1. Output directory creation
+         2. Optimizer setup with weight decay groups
+         3. Learning rate scheduler with warmup
+         4. Training hyperparameters
+
+         The setup follows OpenLLM's training methodology:
+         - Uses AdamW optimizer with weight decay
+         - Implements learning rate warmup followed by cosine annealing
+         - Separates parameters for different weight decay rates
+         - Uses gradient clipping for stability
+
          Args:
-             config: Training configuration object
+             config: Training configuration object containing all hyperparameters

          Returns:
              Status message indicating success or failure
+             Success: "✅ Training setup completed successfully"
+             Failure: "❌ Failed to setup training: {error details}"
          """
          try:
-             # Create output directory
+             print("🔄 Setting up training configuration...")
+             print(f"   - Output directory: {config.output_dir}")
+             print(f"   - Learning rate: {config.learning_rate}")
+             print(f"   - Max steps: {config.max_steps}")
+
+             # Create output directory for saving models and checkpoints
              os.makedirs(config.output_dir, exist_ok=True)
+             print(f"✅ Output directory created: {config.output_dir}")

              # Set up optimizer (AdamW with weight decay)
-             decay_params = []
-             no_decay_params = []
+             # This follows OpenLLM's optimization strategy
+             print("🔄 Setting up AdamW optimizer with weight decay...")
+
+             # Separate parameters for different weight decay rates
+             # This is a common practice for transformer training
+             decay_params = []     # Parameters that should have weight decay
+             no_decay_params = []  # Parameters that should not have weight decay

              for name, param in self.model.named_parameters():
                  if not param.requires_grad:
                      continue

+                 # Apply weight decay to all parameters except biases and layer norm weights
                  if len(param.shape) == 1 or name.endswith('.bias'):
                      no_decay_params.append(param)
                  else:
                      decay_params.append(param)

+             # Create parameter groups with different weight decay rates
              param_groups = [
-                 {'params': decay_params, 'weight_decay': 0.01},
-                 {'params': no_decay_params, 'weight_decay': 0.0}
+                 {'params': decay_params, 'weight_decay': 0.01},   # 1% weight decay
+                 {'params': no_decay_params, 'weight_decay': 0.0}  # No weight decay
              ]

+             print(f"   - Decay parameters: {len(decay_params)}")
+             print(f"   - No-decay parameters: {len(no_decay_params)}")
+
+             # Initialize AdamW optimizer with OpenLLM's recommended settings
              self.optimizer = torch.optim.AdamW(
                  param_groups,
                  lr=config.learning_rate,
-                 betas=(0.9, 0.95),
-                 eps=1e-8
+                 betas=(0.9, 0.95),  # Beta values for momentum
+                 eps=1e-8            # Epsilon for numerical stability
              )

+             print(f"✅ AdamW optimizer configured")
+             print(f"   - Learning rate: {config.learning_rate}")
+             print(f"   - Betas: (0.9, 0.95)")
+             print(f"   - Epsilon: 1e-8")
+
              # Set up learning rate scheduler
+             # OpenLLM uses a warmup followed by cosine annealing
+             print("🔄 Setting up learning rate scheduler...")
+
+             # Warmup scheduler: linearly increase LR from 1% to 100%
              warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
                  self.optimizer,
-                 start_factor=0.01,
-                 end_factor=1.0,
+                 start_factor=0.01,  # Start at 1% of target LR
+                 end_factor=1.0,     # End at 100% of target LR
                  total_iters=config.warmup_steps
              )

+             # Main scheduler: cosine annealing after warmup
              main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                  self.optimizer,
-                 T_max=config.max_steps - config.warmup_steps
+                 T_max=config.max_steps - config.warmup_steps  # Duration of cosine annealing
              )

+             # Combine warmup and main schedulers
              self.scheduler = torch.optim.lr_scheduler.SequentialLR(
                  self.optimizer,
                  schedulers=[warmup_scheduler, main_scheduler],
-                 milestones=[config.warmup_steps]
+                 milestones=[config.warmup_steps]  # Switch to main scheduler after warmup
              )

+             print(f"✅ Learning rate scheduler configured")
+             print(f"   - Warmup steps: {config.warmup_steps}")
+             print(f"   - Total steps: {config.max_steps}")
+             print(f"   - Schedule: Linear warmup → Cosine annealing")
+
              print("✅ Training setup completed successfully")
              return f"✅ Training setup completed successfully"
@@ -278,107 +466,165 @@ class OpenLLMTrainer:
          """
          Execute the actual model training using OpenLLM's approach.

+         This method implements OpenLLM's training loop:
+         1. Sets up training mode and progress tracking
+         2. Iterates through data batches using OpenLLM's TextDataLoader
+         3. Performs forward pass, loss computation, and backward pass
+         4. Implements gradient accumulation for memory efficiency
+         5. Updates model parameters and learning rate
+         6. Saves checkpoints and logs progress
+
+         The training loop follows OpenLLM's methodology:
+         - Uses OpenLLM's GPTModel forward pass (returns logits and loss)
+         - Implements gradient accumulation for effective larger batch sizes
+         - Uses gradient clipping for training stability
+         - Saves checkpoints in OpenLLM's format
+         - Updates progress for UI monitoring
+
          Args:
-             config: Training configuration object
+             config: Training configuration object containing hyperparameters
              progress_callback: Optional callback function for progress updates
+                                (Not used in current implementation)

          Returns:
              Status message indicating success or failure
+             Success: "✅ Training completed successfully! Final step: {step}"
+             Failure: "❌ Training failed: {error details}"
          """
          try:
+             # Set training state
              self.is_training = True
              self.training_progress["status"] = "Training"
              self.training_progress["total_steps"] = config.max_steps

              print(f"🚀 Starting OpenLLM training for {config.max_steps} steps...")
+             print(f"   - Model: {type(self.model).__name__}")
+             print(f"   - DataLoader: {type(self.data_loader).__name__}")
+             print(f"   - Optimizer: {type(self.optimizer).__name__}")
+             print(f"   - Gradient accumulation: {config.gradient_accumulation_steps}")

              # Training loop using OpenLLM's approach
-             self.model.train()
-             accumulated_loss = 0.0
-             self.optimizer.zero_grad()
+             self.model.train()          # Set model to training mode
+             accumulated_loss = 0.0      # Track loss across accumulation steps
+             self.optimizer.zero_grad()  # Clear gradients

-             step = 0
+             step = 0  # Current training step
              for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
+                 # Check if we've reached the maximum number of steps
                  if step >= config.max_steps:
                      break

                  # Forward pass (model computes loss internally when targets provided)
+                 # OpenLLM's GPTModel returns both logits and loss
                  logits, loss = self.model(input_ids, target_ids)

                  # Scale loss for gradient accumulation
+                 # This allows us to simulate larger batch sizes
                  loss = loss / config.gradient_accumulation_steps
                  accumulated_loss += loss.item()

-                 # Backward pass
+                 # Backward pass - compute gradients
                  loss.backward()

                  # Update weights every gradient_accumulation_steps
                  if (batch_idx + 1) % config.gradient_accumulation_steps == 0:
-                     # Clip gradients
+                     # Clip gradients for training stability
+                     # This prevents exploding gradients
                      torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

-                     # Update parameters
+                     # Update parameters using the optimizer
                      self.optimizer.step()
+
+                     # Update learning rate using the scheduler
                      self.scheduler.step()
+
+                     # Clear gradients for the next accumulation cycle
                      self.optimizer.zero_grad()

                      # Update step count
                      step += 1

-                     # Update progress
+                     # Update progress for UI monitoring
                      self.training_progress["current_step"] = step
                      self.training_progress["loss"] = accumulated_loss
                      self.training_progress["learning_rate"] = self.scheduler.get_last_lr()[0]

-                     # Log progress
+                     # Log progress at specified intervals
                      if step % config.logging_steps == 0:
-                         print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {self.scheduler.get_last_lr()[0]:.2e}")
+                         current_lr = self.scheduler.get_last_lr()[0]
+                         print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {current_lr:.2e}")

-                     # Save checkpoint
+                     # Save checkpoint at specified intervals
                      if step % config.save_steps == 0:
                          self._save_checkpoint(config.output_dir, step)
+                         print(f"💾 Checkpoint saved at step {step}")

-                     # Reset accumulated loss
+                     # Reset accumulated loss for the next accumulation cycle
                      accumulated_loss = 0.0

-                     # Clean up memory
+                     # Clean up memory periodically
                      if step % 100 == 0:
                          gc.collect()
+                         print(f"🧹 Memory cleanup at step {step}")

-             # Final checkpoint
+             # Save final checkpoint
              self._save_checkpoint(config.output_dir, step, is_best=True)
+             print(f"💾 Final checkpoint saved at step {step}")

              # Update final progress
              self.training_progress["status"] = "Completed"
              self.training_progress["current_step"] = step

              print(f"✅ Training completed! Final step: {step}")
+             print(f"   - Total steps completed: {step}")
+             print(f"   - Final loss: {self.training_progress['loss']:.4f}")
+             print(f"   - Final learning rate: {self.training_progress['learning_rate']:.2e}")

              return f"✅ Training completed successfully! Final step: {step}"

          except Exception as e:
              self.training_progress["status"] = "Failed"
              print(f"❌ Training failed: {e}")
+             print(f"   - Error occurred during training")
+             print(f"   - Training state: {self.training_progress['status']}")
              return f"❌ Training failed: {str(e)}"
          finally:
              self.is_training = False

      def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
-         """Save model checkpoint using OpenLLM's approach."""
+         """
+         Save model checkpoint using OpenLLM's approach.
+
+         This method saves the model state in OpenLLM's checkpoint format:
+         - Model state dictionary
+         - Optimizer state dictionary
+         - Scheduler state dictionary
+         - Model configuration
+         - Training step information
+
+         The checkpoint format is compatible with OpenLLM's loading mechanism
+         and can be used to resume training or load the model for inference.
+
+         Args:
+             output_dir: Directory to save the checkpoint
+             step: Current training step number
+             is_best: Whether this is the best model so far
+         """
          try:
+             # Create checkpoint dictionary with all necessary components
              checkpoint = {
-                 'step': step,
-                 'model_state_dict': self.model.state_dict(),
-                 'optimizer_state_dict': self.optimizer.state_dict(),
-                 'scheduler_state_dict': self.scheduler.state_dict(),
-                 'config': self.model.config.__dict__
+                 'step': step,                                         # Current training step
+                 'model_state_dict': self.model.state_dict(),          # Model parameters
+                 'optimizer_state_dict': self.optimizer.state_dict(),  # Optimizer state
+                 'scheduler_state_dict': self.scheduler.state_dict(),  # Scheduler state
+                 'config': self.model.config.__dict__                  # Model configuration
              }

              # Save latest checkpoint
              checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
              torch.save(checkpoint, checkpoint_path)

-             # Save best checkpoint
+             # Save best checkpoint if this is the best model
              if is_best:
                  best_path = os.path.join(output_dir, "best_model.pt")
                  torch.save(checkpoint, best_path)
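The accumulation pattern in this hunk can be reduced to a toy example. A sketch with a stand-in linear model (not OpenLLM's GPTModel): gradients from N micro-batches are summed before one optimizer step, giving an effective batch size of batch_size × accum_steps:

```python
# Toy gradient-accumulation loop mirroring the structure above.
import torch

model = torch.nn.Linear(8, 1)
opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
accum_steps = 4

opt.zero_grad()
for batch_idx in range(16):
    x, y = torch.randn(4, 8), torch.randn(4, 1)
    loss = torch.nn.functional.mse_loss(model(x), y) / accum_steps
    loss.backward()  # gradients accumulate across micro-batches
    if (batch_idx + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        opt.zero_grad()
```

And a sketch of reading such a checkpoint back, assuming create_model from the uploaded model.py and a hypothetical output path:

```python
# Restore a checkpoint written by _save_checkpoint above.
import torch
from model import create_model  # uploaded module in the Space

ckpt = torch.load("openllm_output/best_model.pt", map_location="cpu")
model = create_model("small")                    # same size used in training
model.load_state_dict(ckpt["model_state_dict"])  # restore trained weights
print(f"Checkpoint was saved at step {ckpt['step']}")
```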
@@ -393,37 +639,59 @@ class OpenLLMTrainer:
          """
          Save the trained model and upload it to Hugging Face Hub.

+         This method completes the training pipeline by:
+         1. Saving the final model checkpoint
+         2. Copying the tokenizer files
+         3. Uploading the complete model to Hugging Face Hub
+         4. Creating a new model repository for the trained model
+
+         The uploaded model will be available at:
+         https://huggingface.co/lemms/openllm-{size}-extended-8k
+
          Args:
              config: Training configuration object

          Returns:
              Status message indicating success or failure
+             Success: "✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
+             Failure: "❌ Failed to save/upload model: {error details}"
          """
          try:
              print("🔄 Saving trained model...")
+             print(f"   - Output directory: {config.output_dir}")
+             print(f"   - Model size: {config.model_size}")

-             # Save the final model
+             # Save the final model checkpoint
              self._save_checkpoint(config.output_dir, config.max_steps, is_best=True)

              # Save tokenizer files
+             # Create a tokenizer directory within the output directory
              tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
              os.makedirs(tokenizer_dir, exist_ok=True)

              # Copy the tokenizer.model file using the stored path
+             # This ensures the tokenizer is included with the model
              import shutil
              shutil.copy2(self.tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))

              print("✅ Model saved locally")
+             print(f"   - Model checkpoint: {config.output_dir}/best_model.pt")
+             print(f"   - Tokenizer: {tokenizer_dir}/tokenizer.model")

              # Generate model name for upload
+             # The naming convention follows: openllm-{size}-extended-8k
              model_name = f"openllm-{config.model_size}-extended-8k"
              repo_id = f"lemms/{model_name}"

              # Upload to Hugging Face Hub
              if self.hf_api:
                  print(f"🔄 Uploading model to {repo_id}...")
+                 print(f"   - Repository: {repo_id}")
+                 print(f"   - Type: model")
+                 print(f"   - Source: {config.output_dir}")

-                 # Upload model files
+                 # Upload model files to Hugging Face Hub
+                 # This creates a new model repository with all the files
                  self.hf_api.upload_folder(
                      folder_path=config.output_dir,
                      repo_id=repo_id,
@@ -432,8 +700,10 @@ class OpenLLMTrainer:
                  )

                  print(f"✅ Model uploaded successfully to {repo_id}")
+                 print(f"   - Available at: https://huggingface.co/{repo_id}")
                  return f"✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
              else:
+                 print("⚠️ Hugging Face API not available - model saved locally only")
                  return f"✅ Model saved locally to {config.output_dir}"

          except Exception as e:
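A minimal sketch of the upload path in isolation, assuming a write-scoped HF token is configured for the Space and that the target repository may not exist yet (upload_folder does not create it, so the sketch adds an idempotent create_repo call the diff itself does not show):

```python
# Standalone upload sketch; folder path is hypothetical.
from huggingface_hub import HfApi

api = HfApi()
repo_id = "lemms/openllm-small-extended-8k"

api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
api.upload_folder(
    folder_path="./openllm_output",  # hypothetical local output directory
    repo_id=repo_id,
    repo_type="model",
    commit_message="Upload trained OpenLLM checkpoint",
)
```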
@@ -441,99 +711,146 @@ class OpenLLMTrainer:
              return f"❌ Failed to save/upload model: {str(e)}"

      def get_training_progress(self) -> Dict[str, Any]:
-         """Get current training progress information."""
+         """
+         Get current training progress information.
+
+         This method returns a copy of the current training progress
+         for display in the Gradio UI. The progress information includes:
+         - Current training status
+         - Current step and total steps
+         - Current loss value
+         - Current learning rate
+
+         Returns:
+             Dictionary containing current training progress information
+         """
          return self.training_progress.copy()

  def main():
      """
      Main function that creates the complete Gradio application interface.
+
+     This function sets up the entire Gradio application with:
+     1. Application header and status information
+     2. Training configuration controls
+     3. Training status and progress display
+     4. Training control buttons
+     5. Instructions and resource links
+     6. Training function implementation
+
+     The interface provides a complete training experience for OpenLLM models
+     with real-time progress monitoring and comprehensive configuration options.
+
+     Returns:
+         Gradio Blocks interface for the training application
      """

      # Initialize the trainer
+     # This creates the OpenLLMTrainer instance that will handle all training operations
      trainer = OpenLLMTrainer()

      # Create the main Gradio application interface
+     # Using Gradio 4.44.1 with Soft theme for modern appearance
      with gr.Blocks(
          title="OpenLLM Training Space - Fixed with Uploaded Modules",
          theme=gr.themes.Soft()
      ) as demo:

          # Application Header
+         # Provides clear identification and description of the application
          gr.Markdown("# 🚀 OpenLLM Training Space - Fixed with Uploaded Modules")
          gr.Markdown("### *Uses OpenLLM's Custom Model Architecture from Uploaded Files*")
          gr.Markdown("---")

          # Status Information
+         # Shows the availability of key components and dependencies
          gr.Markdown(f"**OpenLLM Available**: {'✅ Yes' if OPENLLM_AVAILABLE else '❌ No'}")
          gr.Markdown(f"**SentencePiece Available**: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
          gr.Markdown(f"**Dependencies Available**: {'✅ Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
          gr.Markdown("**Architecture**: ✅ OpenLLM Custom GPTModel (From Uploaded Files)")

          # Main Content Area
+         # Two-column layout for configuration and status
          with gr.Row():

              # Left Column: Training Configuration
+             # Contains all the training hyperparameters and settings
              with gr.Column(scale=1):
                  gr.Markdown("## 📊 Training Configuration")

                  # Model Size Selection
+                 # Allows users to choose which base model to train from
                  model_size = gr.Dropdown(
                      choices=["small", "medium", "large"],
                      value="small",
-                     label="Model Size"
+                     label="Model Size",
+                     info="Select the base model size to train from"
                  )

                  # Training Steps Configuration
+                 # Controls the number of training iterations
                  max_steps = gr.Slider(
                      minimum=100,
                      maximum=10000,
                      value=1000,
                      step=100,
-                     label="Max Training Steps"
+                     label="Max Training Steps",
+                     info="Number of training iterations (100-10,000)"
                  )

                  # Learning Rate Configuration
+                 # Controls the learning rate for the optimizer
                  learning_rate = gr.Slider(
                      minimum=1e-5,
                      maximum=1e-3,
                      value=3e-4,
                      step=1e-5,
-                     label="Learning Rate"
+                     label="Learning Rate",
+                     info="Training rate (0.00001-0.001)"
                  )

                  # Batch Size Configuration
+                 # Controls the number of samples per training batch
                  batch_size = gr.Slider(
                      minimum=1,
                      maximum=16,
                      value=4,
                      step=1,
-                     label="Batch Size"
+                     label="Batch Size",
+                     info="Samples per training batch (1-16)"
                  )

              # Right Column: Training Status and Controls
+             # Contains status display and control buttons
              with gr.Column(scale=1):
                  gr.Markdown("## 🎯 Training Status")

                  # Training Status Display
+                 # Shows current training status and any error messages
                  status_text = gr.Textbox(
                      value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
                      label="Current Status",
                      interactive=False,
-                     lines=5
+                     lines=5,
+                     info="Shows current training status and progress updates"
                  )

                  # Progress Information
+                 # Displays detailed training progress in JSON format
                  progress_info = gr.JSON(
                      value=trainer.get_training_progress(),
-                     label="Training Progress"
+                     label="Training Progress",
+                     info="Real-time training progress information"
                  )

                  # Training Control Buttons
+                 # Buttons to start and stop training
                  with gr.Row():
                      start_btn = gr.Button("🚀 Start Training", variant="primary")
                      stop_btn = gr.Button("⏹️ Stop Training", variant="stop")

          # Instructions Section
+         # Provides detailed instructions for using the training interface
          gr.Markdown("## 📋 OpenLLM Training Instructions")
          gr.Markdown("""
          This interface uses **OpenLLM's actual custom model architecture** from uploaded files:
@@ -562,6 +879,7 @@ def main():
          """)

          # Resource Links Section
+         # Provides links to related models and resources
          gr.Markdown("## 🔗 Model Resources")
          gr.Markdown("""
          - [📚 7k Small Model](https://huggingface.co/lemms/openllm-small-extended-7k)
@@ -571,15 +889,45 @@ def main():
          """)

          # Training Function Definition
+         # This function is called when the Start Training button is clicked
          def start_complete_training(model_size, max_steps, learning_rate, batch_size):
              """
              Execute the complete training process using OpenLLM's approach.
+
+             This function orchestrates the entire training pipeline:
+             1. Validates OpenLLM availability
+             2. Creates training configuration
+             3. Loads model and tokenizer
+             4. Prepares dataset
+             5. Sets up training environment
+             6. Executes training
+             7. Saves and uploads the trained model
+
+             The function provides comprehensive error handling and status updates
+             throughout the training process.
+
+             Args:
+                 model_size: Size of the model to train ("small", "medium", "large")
+                 max_steps: Maximum number of training steps
+                 learning_rate: Learning rate for the optimizer
+                 batch_size: Batch size for training
+
+             Returns:
+                 Status message indicating the result of the training process
              """
+             # Validate OpenLLM availability
              if not OPENLLM_AVAILABLE:
                  return "❌ OpenLLM custom model architecture not available. Please check the installation."

              try:
+                 print(f"🚀 Starting complete training process...")
+                 print(f"   - Model size: {model_size}")
+                 print(f"   - Max steps: {max_steps}")
+                 print(f"   - Learning rate: {learning_rate}")
+                 print(f"   - Batch size: {batch_size}")
+
                  # Create training configuration
+                 # This encapsulates all training parameters
                  config = TrainingConfig(
                      model_size=model_size,
                      max_steps=max_steps,
588
  )
589
 
590
  # Step 1: Load model and tokenizer using OpenLLM's approach
 
591
  status = trainer.load_model_and_tokenizer(model_size)
592
  if "❌" in status:
593
  return status
594
 
595
  # Step 2: Prepare dataset
 
596
  status = trainer.prepare_dataset()
597
  if "❌" in status:
598
  return status
599
 
600
  # Step 3: Setup training
 
601
  status = trainer.setup_training(config)
602
  if "❌" in status:
603
  return status
604
 
605
  # Step 4: Execute training
 
606
  status = trainer.train_model(config)
607
  if "❌" in status:
608
  return status
609
 
610
  # Step 5: Save and upload model
 
611
  status = trainer.save_and_upload_model(config)
612
 
 
613
  return f"πŸš€ Complete training process finished!\n{status}"
614
 
615
  except Exception as e:
 
616
  return f"❌ Training process failed: {str(e)}"
617
 
618
  def update_progress():
619
- """Update the progress display."""
 
 
 
 
 
 
 
 
 
620
  return trainer.get_training_progress()
621
 
622
  # Connect UI Components to Functions
 
623
  start_btn.click(
624
  fn=start_complete_training,
625
  inputs=[model_size, max_steps, learning_rate, batch_size],
@@ -627,9 +992,11 @@ def main():
627
  )
628
 
629
  # Auto-refresh progress every 5 seconds during training
 
630
  demo.load(update_progress, outputs=[progress_info])
631
 
632
  # Application Footer
 
633
  gr.Markdown("---")
634
  gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
635
  gr.Markdown("**Architecture**: OpenLLM Custom GPTModel (From Uploaded Files)")
@@ -638,5 +1005,7 @@ def main():
      return demo

  if __name__ == "__main__":
+     # Launch the Gradio application
+     # This starts the web interface for the training application
      demo = main()
      demo.launch()