lemms committed on
Commit
4daff55
·
verified ·
1 Parent(s): fda995e

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +209 -980
app.py CHANGED
@@ -1,1024 +1,253 @@
1
  #!/usr/bin/env python3
2
  """
3
- OpenLLM Training Space Application - Fixed with Uploaded Modules
4
 
5
- This version imports OpenLLM modules from the uploaded files in the HF Space:
6
- - Imports model.py and data_loader.py that were uploaded to the Space
7
- - Uses OpenLLM's actual custom model architecture
8
- - Compatible with OpenLLM's implementation
9
-
10
- This application provides a complete training interface for OpenLLM models on Hugging Face Spaces.
11
- It uses OpenLLM's custom GPTModel architecture instead of Hugging Face Transformers,
12
- ensuring compatibility with the actual OpenLLM implementation.
13
-
14
- Key Features:
15
- - Real model training using OpenLLM's custom architecture
16
- - SentencePiece tokenization for OpenLLM models
17
- - Complete training pipeline with progress monitoring
18
- - Automatic model saving and uploading to Hugging Face Hub
19
- - Gradio 4.44.1 compatible user interface
20
-
21
- Technical Architecture:
22
- - Uses OpenLLM's GPTModel class (not Hugging Face Transformers)
23
- - Imports custom modules from uploaded files in the Space
24
- - Uses sentencepiece.SentencePieceProcessor() for tokenization
25
- - Implements OpenLLM's training loop and optimization strategy
26
- - Saves checkpoints in OpenLLM's format
27
 
28
  Author: Louis Chua Bean Chong
29
- License: GPL-3.0
30
- Version: 2.1.1
31
- Last Updated: 2024
32
  """
33
 
34
- import gradio as gr
35
- import torch
36
- import torch.nn as nn
37
  import os
38
- import time
39
- import math
40
- import gc
41
- from typing import Dict, Any, Optional
42
- import threading
43
- from dataclasses import dataclass
44
  from pathlib import Path
45
 
46
- # Import OpenLLM's custom model architecture from uploaded files
47
- # These files were uploaded to the HF Space and contain OpenLLM's actual implementation
48
- try:
49
- # Import from the uploaded files in the HF Space
50
- # model.py contains GPTModel, GPTConfig, and create_model factory function
51
- from model import GPTModel, GPTConfig, create_model
52
- # data_loader.py contains TextDataLoader for OpenLLM's data loading approach
53
- from data_loader import TextDataLoader
54
- OPENLLM_AVAILABLE = True
55
- print("✅ OpenLLM custom model architecture imported successfully from uploaded files")
56
- print(" - GPTModel: Custom PyTorch model architecture")
57
- print(" - GPTConfig: Model configuration dataclass")
58
- print(" - create_model: Factory function for model creation")
59
- print(" - TextDataLoader: Custom data loading implementation")
60
- except ImportError as e:
61
- print(f"❌ OpenLLM imports failed: {e}")
62
- print(" This indicates the uploaded OpenLLM source files are not available")
63
- print(" The training functionality will be disabled")
64
- OPENLLM_AVAILABLE = False
65
 
66
- # Try to import sentencepiece - CRITICAL for OpenLLM tokenization
67
- # OpenLLM uses SentencePiece for tokenization, not Hugging Face tokenizers
68
  try:
69
- import sentencepiece as spm
70
- SENTENCEPIECE_AVAILABLE = True
71
- print(f"✅ SentencePiece available: {spm.__version__}")
72
- print(" - Required for OpenLLM tokenization")
73
- print(" - Used for loading tokenizer.model files")
74
- except ImportError:
75
- SENTENCEPIECE_AVAILABLE = False
76
- print("❌ SentencePiece not available")
77
- print(" - This will prevent tokenizer loading")
78
- print(" - Training functionality will be limited")
79
 
80
- # Import other dependencies for the complete training pipeline
81
- try:
82
- from datasets import load_dataset # For loading training data from HF Hub
83
- from huggingface_hub import HfApi, hf_hub_download # For model uploads and downloads
84
- DEPENDENCIES_AVAILABLE = True
85
- print("✅ Training dependencies available")
86
- print(" - datasets: For loading training data")
87
- print(" - huggingface_hub: For model uploads/downloads")
88
  except ImportError as e:
89
- print(f"❌ Dependencies not available: {e}")
90
- print(" - This will prevent dataset loading and model uploading")
91
- DEPENDENCIES_AVAILABLE = False
92
 
93
- @dataclass
94
- class TrainingConfig:
95
- """
96
- Configuration class for training parameters.
97
-
98
- This dataclass encapsulates all the training hyperparameters and settings
99
- that control the OpenLLM training process. It provides a clean interface
100
- for passing configuration between different components of the training pipeline.
101
-
102
- Attributes:
103
- model_size: Size of the model to train ("small", "medium", "large")
104
- max_steps: Maximum number of training iterations
105
- learning_rate: Learning rate for the optimizer
106
- batch_size: Number of samples per training batch
107
- output_dir: Directory to save trained models and checkpoints
108
- save_steps: Frequency of checkpoint saving (every N steps)
109
- logging_steps: Frequency of progress logging (every N steps)
110
- warmup_steps: Number of warmup steps for learning rate scheduling
111
- gradient_accumulation_steps: Number of steps to accumulate gradients
112
- """
113
- model_size: str
114
- max_steps: int
115
- learning_rate: float
116
- batch_size: int
117
- output_dir: str = "./openllm-trained"
118
- save_steps: int = 100
119
- logging_steps: int = 10
120
- warmup_steps: int = 50
121
- gradient_accumulation_steps: int = 4
122
 
123
- class OpenLLMTrainer:
124
- """
125
- Complete training implementation using OpenLLM's actual architecture.
126
-
127
- This class handles the entire training pipeline including:
128
- - Model loading using OpenLLM's custom GPTModel
129
- - Tokenizer loading using sentencepiece.SentencePieceProcessor()
130
- - Dataset preparation using OpenLLM's TextDataLoader
131
- - Training execution using OpenLLM's approach
132
- - Model saving and uploading to Hugging Face Hub
133
-
134
- The trainer implements OpenLLM's actual training methodology rather than
135
- using Hugging Face Transformers, ensuring compatibility with the real
136
- OpenLLM implementation.
137
-
138
- Key Features:
139
- - Custom model architecture (GPTModel, not PreTrainedModel)
140
- - SentencePiece tokenization (not Hugging Face tokenizers)
141
- - OpenLLM's training loop and optimization strategy
142
- - Gradient accumulation for memory efficiency
143
- - Learning rate scheduling with warmup
144
- - Automatic checkpoint saving and model uploading
145
- """
146
-
147
- def __init__(self):
148
- """
149
- Initialize the trainer with default settings.
150
-
151
- Sets up the trainer with default values and initializes the Hugging Face
152
- API for model uploading. All components start as None and are initialized
153
- during the training process.
154
- """
155
- # Core training components - initialized during training
156
- self.model = None # OpenLLM's GPTModel instance
157
- self.tokenizer = None # SentencePieceProcessor instance
158
- self.data_loader = None # OpenLLM's TextDataLoader instance
159
- self.optimizer = None # PyTorch optimizer (AdamW)
160
- self.scheduler = None # Learning rate scheduler
161
-
162
- # Training state management
163
- self.is_training = False # Flag to track training status
164
- self.tokenizer_path = None # Path to the tokenizer.model file
165
-
166
- # Progress tracking for UI updates
167
- self.training_progress = {
168
- "status": "Ready", # Current training status
169
- "current_step": 0, # Current training step
170
- "total_steps": 0, # Total steps to complete
171
- "loss": 0.0, # Current training loss
172
- "learning_rate": 0.0 # Current learning rate
173
- }
174
-
175
- # Initialize Hugging Face API for model uploading
176
- # This allows the trained model to be automatically uploaded to HF Hub
177
- try:
178
- self.hf_api = HfApi()
179
- print("✅ Hugging Face API initialized for model uploading")
180
- except Exception as e:
181
- print(f"Failed to initialize HF API: {e}")
182
- print(" - Model uploading will be disabled")
183
- self.hf_api = None
184
-
185
- def load_model_and_tokenizer(self, model_size: str) -> str:
186
- """
187
- Load the pre-trained OpenLLM model and tokenizer using OpenLLM's approach.
188
-
189
- This method implements OpenLLM's actual model loading strategy:
190
- 1. Creates a new GPTModel using OpenLLM's factory function
191
- 2. Downloads the tokenizer.model file from Hugging Face Hub
192
- 3. Loads the tokenizer using SentencePieceProcessor
193
- 4. Stores both components for use in training
194
-
195
- This approach differs from Hugging Face Transformers because:
196
- - Uses OpenLLM's custom GPTModel (not AutoModelForCausalLM)
197
- - Uses SentencePiece directly (not AutoTokenizer)
198
- - Downloads specific files rather than using from_pretrained()
199
-
200
- Args:
201
- model_size: Size of the model to load ("small", "medium", "large")
202
- Determines which pre-trained model to download
203
-
204
- Returns:
205
- Status message indicating success or failure
206
- Success: "✅ Successfully loaded OpenLLM {model_size} model with custom architecture"
207
- Failure: "❌ Failed to load OpenLLM model and tokenizer: {error details}"
208
- """
209
- try:
210
- # Verify OpenLLM modules are available
211
- if not OPENLLM_AVAILABLE:
212
- return "❌ OpenLLM custom model architecture not available"
213
-
214
- print(f"🔄 Loading OpenLLM {model_size} model using custom architecture...")
215
- print(f" - Using OpenLLM's create_model factory function")
216
- print(f" - Not using Hugging Face Transformers")
217
-
218
- # Step 1: Create model using OpenLLM's factory function
219
- # This creates a fresh GPTModel instance with the specified size
220
- try:
221
- self.model = create_model(model_size)
222
- print(f"✅ OpenLLM {model_size} model created: {type(self.model).__name__}")
223
- print(f" - Model type: {type(self.model).__name__}")
224
- print(f" - Parameters: {self.model.get_num_params():,}")
225
- print(f" - Architecture: Custom GPTModel (not PreTrainedModel)")
226
- except Exception as e:
227
- print(f"❌ Failed to create model: {e}")
228
- return f"❌ Failed to create OpenLLM model: {str(e)}"
229
-
230
- # Step 2: Load tokenizer using sentencepiece
231
- # OpenLLM uses SentencePiece directly, not Hugging Face tokenizers
232
- try:
233
- print("🔄 Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
234
- print(" - Using SentencePiece directly (not AutoTokenizer)")
235
- print(" - Downloading tokenizer.model from Hugging Face Hub")
236
-
237
- # Download tokenizer.model from HF Hub
238
- # This is the actual tokenizer file used by OpenLLM models
239
- model_name = f"lemms/openllm-{model_size}-extended-7k"
240
- tokenizer_path = hf_hub_download(
241
- repo_id=model_name,
242
- filename="tokenizer.model" # Specific file name for OpenLLM
243
- )
244
-
245
- print(f"✅ Tokenizer downloaded to: {tokenizer_path}")
246
- print(f" - Source: {model_name}")
247
- print(f" - File: tokenizer.model")
248
-
249
- # Create SentencePieceProcessor and load the tokenizer
250
- # This is OpenLLM's actual tokenization approach
251
- sp_processor = spm.SentencePieceProcessor()
252
- sp_processor.load(tokenizer_path)
253
-
254
- # Store tokenizer and its path separately
255
- # We need the path for the TextDataLoader later
256
- self.tokenizer = sp_processor
257
- self.tokenizer_path = tokenizer_path # Store the path separately
258
-
259
- print(f"✅ Tokenizer loaded successfully using SentencePieceProcessor")
260
- print(f" - Vocabulary size: {sp_processor.vocab_size()}")
261
- print(f" - Tokenizer path: {tokenizer_path}")
262
- print(f" - Tokenizer type: {type(sp_processor).__name__}")
263
-
264
- except Exception as e:
265
- print(f"❌ Failed to load tokenizer: {e}")
266
- return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"
267
-
268
- return f"✅ Successfully loaded OpenLLM {model_size} model with custom architecture"
269
-
270
- except Exception as e:
271
- return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"
272
-
273
- def prepare_dataset(self) -> str:
274
- """
275
- Load and prepare the training dataset using OpenLLM's approach.
276
-
277
- This method implements OpenLLM's data preparation strategy:
278
- 1. Loads training data from Hugging Face Hub dataset
279
- 2. Creates a temporary text file for OpenLLM's TextDataLoader
280
- 3. Initializes OpenLLM's TextDataLoader with the tokenizer
281
- 4. Prepares the data for training
282
-
283
- OpenLLM's approach differs from Hugging Face because:
284
- - Uses a simple text file format (not tokenized datasets)
285
- - Uses OpenLLM's TextDataLoader (not Hugging Face datasets)
286
- - Tokenization happens on-the-fly during training
287
-
288
- Returns:
289
- Status message indicating success or failure
290
- Success: "✅ Successfully prepared dataset with {count} samples"
291
- Failure: "❌ Failed to prepare dataset: {error details}"
292
- """
293
- try:
294
- # Verify dependencies are available
295
- if not DEPENDENCIES_AVAILABLE:
296
- return "❌ Required dependencies not available"
297
-
298
- print("🔄 Loading training dataset...")
299
- print(" - Loading from Hugging Face Hub dataset")
300
- print(" - Using OpenLLM's data preparation approach")
301
-
302
- # Load dataset from HF Hub
303
- # This contains the training text data for continuing model training
304
- dataset = load_dataset("lemms/openllm-training-data")
305
- print(f"✅ Dataset loaded: {len(dataset['train'])} samples")
306
- print(f" - Dataset: lemms/openllm-training-data")
307
- print(f" - Samples: {len(dataset['train'])}")
308
-
309
- # Create temporary data file for OpenLLM's TextDataLoader
310
- # OpenLLM expects a simple text file with one text sample per line
311
- temp_data_file = "temp_training_data.txt"
312
- with open(temp_data_file, 'w', encoding='utf-8') as f:
313
- for item in dataset['train']:
314
- f.write(item['text'] + '\n')
315
-
316
- print(f"✅ Temporary data file created: {temp_data_file}")
317
- print(f" - Format: One text sample per line")
318
- print(f" - Encoding: UTF-8")
319
-
320
- # Create OpenLLM's TextDataLoader
321
- # This is OpenLLM's custom data loading implementation
322
- try:
323
- # Use the stored tokenizer path instead of trying to access model_file_path
324
- # SentencePieceProcessor doesn't have a model_file_path attribute
325
- tokenizer_path = self.tokenizer_path # Use the stored path
326
-
327
- print(f"🔄 Creating OpenLLM TextDataLoader...")
328
- print(f" - Data file: {temp_data_file}")
329
- print(f" - Tokenizer path: {tokenizer_path}")
330
- print(f" - Sequence length: 512")
331
- print(f" - Batch size: 4 (will be overridden by training config)")
332
-
333
- self.data_loader = TextDataLoader(
334
- data_file=temp_data_file,
335
- tokenizer_path=tokenizer_path,
336
- seq_len=512, # Maximum sequence length for training
337
- batch_size=4, # Will be overridden by training config
338
- shuffle=True # Shuffle data for better training
339
- )
340
-
341
- print(f"✅ OpenLLM TextDataLoader created successfully")
342
- print(f" - DataLoader type: {type(self.data_loader).__name__}")
343
- print(f" - Uses OpenLLM's custom implementation")
344
-
345
- except Exception as e:
346
- print(f"❌ Failed to create TextDataLoader: {e}")
347
- return f"❌ Failed to create data loader: {str(e)}"
348
-
349
- return f"✅ Successfully prepared dataset with {len(dataset['train'])} samples"
350
-
351
- except Exception as e:
352
- return f"❌ Failed to prepare dataset: {str(e)}"
353
-
354
- def setup_training(self, config: TrainingConfig) -> str:
355
- """
356
- Set up the training configuration using OpenLLM's approach.
357
-
358
- This method configures the training environment with:
359
- 1. Output directory creation
360
- 2. Optimizer setup with weight decay groups
361
- 3. Learning rate scheduler with warmup
362
- 4. Training hyperparameters
363
-
364
- The setup follows OpenLLM's training methodology:
365
- - Uses AdamW optimizer with weight decay
366
- - Implements learning rate warmup followed by cosine annealing
367
- - Separates parameters for different weight decay rates
368
- - Uses gradient clipping for stability
369
-
370
- Args:
371
- config: Training configuration object containing all hyperparameters
372
-
373
- Returns:
374
- Status message indicating success or failure
375
- Success: "✅ Training setup completed successfully"
376
- Failure: "❌ Failed to setup training: {error details}"
377
- """
378
- try:
379
- print("🔄 Setting up training configuration...")
380
- print(f" - Output directory: {config.output_dir}")
381
- print(f" - Learning rate: {config.learning_rate}")
382
- print(f" - Max steps: {config.max_steps}")
383
-
384
- # Create output directory for saving models and checkpoints
385
- os.makedirs(config.output_dir, exist_ok=True)
386
- print(f"✅ Output directory created: {config.output_dir}")
387
-
388
- # Set up optimizer (AdamW with weight decay)
389
- # This follows OpenLLM's optimization strategy
390
- print("🔄 Setting up AdamW optimizer with weight decay...")
391
-
392
- # Separate parameters for different weight decay rates
393
- # This is a common practice for transformer training
394
- decay_params = [] # Parameters that should have weight decay
395
- no_decay_params = [] # Parameters that should not have weight decay
396
-
397
- for name, param in self.model.named_parameters():
398
- if not param.requires_grad:
399
- continue
400
-
401
- # Apply weight decay to all parameters except biases and layer norm weights
402
- if len(param.shape) == 1 or name.endswith('.bias'):
403
- no_decay_params.append(param)
404
- else:
405
- decay_params.append(param)
406
-
407
- # Create parameter groups with different weight decay rates
408
- param_groups = [
409
- {'params': decay_params, 'weight_decay': 0.01}, # 1% weight decay
410
- {'params': no_decay_params, 'weight_decay': 0.0} # No weight decay
411
- ]
412
-
413
- print(f" - Decay parameters: {len(decay_params)}")
414
- print(f" - No-decay parameters: {len(no_decay_params)}")
415
-
416
- # Initialize AdamW optimizer with OpenLLM's recommended settings
417
- self.optimizer = torch.optim.AdamW(
418
- param_groups,
419
- lr=config.learning_rate,
420
- betas=(0.9, 0.95), # Beta values for momentum
421
- eps=1e-8 # Epsilon for numerical stability
422
- )
423
-
424
- print(f"✅ AdamW optimizer configured")
425
- print(f" - Learning rate: {config.learning_rate}")
426
- print(f" - Betas: (0.9, 0.95)")
427
- print(f" - Epsilon: 1e-8")
428
-
429
- # Set up learning rate scheduler
430
- # OpenLLM uses a warmup followed by cosine annealing
431
- print("🔄 Setting up learning rate scheduler...")
432
-
433
- # Warmup scheduler: linearly increase LR from 1% to 100%
434
- warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
435
- self.optimizer,
436
- start_factor=0.01, # Start at 1% of target LR
437
- end_factor=1.0, # End at 100% of target LR
438
- total_iters=config.warmup_steps
439
- )
440
-
441
- # Main scheduler: cosine annealing after warmup
442
- main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
443
- self.optimizer,
444
- T_max=config.max_steps - config.warmup_steps # Duration of cosine annealing
445
- )
446
-
447
- # Combine warmup and main schedulers
448
- self.scheduler = torch.optim.lr_scheduler.SequentialLR(
449
- self.optimizer,
450
- schedulers=[warmup_scheduler, main_scheduler],
451
- milestones=[config.warmup_steps] # Switch to main scheduler after warmup
452
- )
453
-
454
- print(f"✅ Learning rate scheduler configured")
455
- print(f" - Warmup steps: {config.warmup_steps}")
456
- print(f" - Total steps: {config.max_steps}")
457
- print(f" - Schedule: Linear warmup → Cosine annealing")
458
-
459
- print("✅ Training setup completed successfully")
460
- return f"✅ Training setup completed successfully"
461
-
462
- except Exception as e:
463
- return f"❌ Failed to setup training: {str(e)}"
464
-
465
- def train_model(self, config: TrainingConfig, progress_callback=None) -> str:
466
- """
467
- Execute the actual model training using OpenLLM's approach.
468
-
469
- This method implements OpenLLM's training loop:
470
- 1. Sets up training mode and progress tracking
471
- 2. Iterates through data batches using OpenLLM's TextDataLoader
472
- 3. Performs forward pass, loss computation, and backward pass
473
- 4. Implements gradient accumulation for memory efficiency
474
- 5. Updates model parameters and learning rate
475
- 6. Saves checkpoints and logs progress
476
-
477
- The training loop follows OpenLLM's methodology:
478
- - Uses OpenLLM's GPTModel forward pass (returns logits and loss)
479
- - Implements gradient accumulation for effective larger batch sizes
480
- - Uses gradient clipping for training stability
481
- - Saves checkpoints in OpenLLM's format
482
- - Updates progress for UI monitoring
483
-
484
- Args:
485
- config: Training configuration object containing hyperparameters
486
- progress_callback: Optional callback function for progress updates
487
- (Not used in current implementation)
488
-
489
- Returns:
490
- Status message indicating success or failure
491
- Success: "✅ Training completed successfully! Final step: {step}"
492
- Failure: "❌ Training failed: {error details}"
493
- """
494
  try:
495
- # Set training state
496
- self.is_training = True
497
- self.training_progress["status"] = "Training"
498
- self.training_progress["total_steps"] = config.max_steps
499
-
500
- print(f"🚀 Starting OpenLLM training for {config.max_steps} steps...")
501
- print(f" - Model: {type(self.model).__name__}")
502
- print(f" - DataLoader: {type(self.data_loader).__name__}")
503
- print(f" - Optimizer: {type(self.optimizer).__name__}")
504
- print(f" - Gradient accumulation: {config.gradient_accumulation_steps}")
505
-
506
- # Training loop using OpenLLM's approach
507
- self.model.train() # Set model to training mode
508
- accumulated_loss = 0.0 # Track loss across accumulation steps
509
- self.optimizer.zero_grad() # Clear gradients
510
-
511
- step = 0 # Current training step
512
- for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
513
- # Check if we've reached the maximum number of steps
514
- if step >= config.max_steps:
515
- break
516
-
517
- # Forward pass (model computes loss internally when targets provided)
518
- # OpenLLM's GPTModel returns both logits and loss
519
- logits, loss = self.model(input_ids, target_ids)
520
-
521
- # Scale loss for gradient accumulation
522
- # This allows us to simulate larger batch sizes
523
- loss = loss / config.gradient_accumulation_steps
524
- accumulated_loss += loss.item()
525
-
526
- # Backward pass - compute gradients
527
- loss.backward()
528
-
529
- # Update weights every gradient_accumulation_steps
530
- if (batch_idx + 1) % config.gradient_accumulation_steps == 0:
531
- # Clip gradients for training stability
532
- # This prevents exploding gradients
533
- torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
534
-
535
- # Update parameters using the optimizer
536
- self.optimizer.step()
537
-
538
- # Update learning rate using the scheduler
539
- self.scheduler.step()
540
-
541
- # Clear gradients for the next accumulation cycle
542
- self.optimizer.zero_grad()
543
-
544
- # Update step count
545
- step += 1
546
-
547
- # Update progress for UI monitoring
548
- self.training_progress["current_step"] = step
549
- self.training_progress["loss"] = accumulated_loss
550
- self.training_progress["learning_rate"] = self.scheduler.get_last_lr()[0]
551
-
552
- # Log progress at specified intervals
553
- if step % config.logging_steps == 0:
554
- current_lr = self.scheduler.get_last_lr()[0]
555
- print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {current_lr:.2e}")
556
-
557
- # Save checkpoint at specified intervals
558
- if step % config.save_steps == 0:
559
- self._save_checkpoint(config.output_dir, step)
560
- print(f"💾 Checkpoint saved at step {step}")
561
-
562
- # Reset accumulated loss for the next accumulation cycle
563
- accumulated_loss = 0.0
564
-
565
- # Clean up memory periodically
566
- if step % 100 == 0:
567
- gc.collect()
568
- print(f"🧹 Memory cleanup at step {step}")
569
-
570
- # Save final checkpoint
571
- self._save_checkpoint(config.output_dir, step, is_best=True)
572
- print(f"💾 Final checkpoint saved at step {step}")
573
-
574
- # Update final progress
575
- self.training_progress["status"] = "Completed"
576
- self.training_progress["current_step"] = step
577
-
578
- print(f"✅ Training completed! Final step: {step}")
579
- print(f" - Total steps completed: {step}")
580
- print(f" - Final loss: {self.training_progress['loss']:.4f}")
581
- print(f" - Final learning rate: {self.training_progress['learning_rate']:.2e}")
582
-
583
- return f"✅ Training completed successfully! Final step: {step}"
584
-
585
  except Exception as e:
586
- self.training_progress["status"] = "Failed"
587
- print(f"❌ Training failed: {e}")
588
- print(f" - Error occurred during training")
589
- print(f" - Training state: {self.training_progress['status']}")
590
- return f"❌ Training failed: {str(e)}"
591
- finally:
592
- self.is_training = False
593
-
594
- def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
595
- """
596
- Save model checkpoint using OpenLLM's approach.
597
-
598
- This method saves the model state in OpenLLM's checkpoint format:
599
- - Model state dictionary
600
- - Optimizer state dictionary
601
- - Scheduler state dictionary
602
- - Model configuration
603
- - Training step information
604
-
605
- The checkpoint format is compatible with OpenLLM's loading mechanism
606
- and can be used to resume training or load the model for inference.
607
-
608
- Args:
609
- output_dir: Directory to save the checkpoint
610
- step: Current training step number
611
- is_best: Whether this is the best model so far
612
- """
613
  try:
614
- # Create checkpoint dictionary with all necessary components
615
- checkpoint = {
616
- 'step': step, # Current training step
617
- 'model_state_dict': self.model.state_dict(), # Model parameters
618
- 'optimizer_state_dict': self.optimizer.state_dict(), # Optimizer state
619
- 'scheduler_state_dict': self.scheduler.state_dict(), # Scheduler state
620
- 'config': self.model.config.__dict__ # Model configuration
621
- }
622
-
623
- # Save latest checkpoint
624
- checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
625
- torch.save(checkpoint, checkpoint_path)
626
-
627
- # Save best checkpoint if this is the best model
628
- if is_best:
629
- best_path = os.path.join(output_dir, "best_model.pt")
630
- torch.save(checkpoint, best_path)
631
- print(f"💾 Best model saved: {best_path}")
632
-
633
- print(f"💾 Checkpoint saved: {checkpoint_path}")
634
-
 
 
 
 
 
 
 
 
 
 
635
  except Exception as e:
636
- print(f"❌ Failed to save checkpoint: {e}")
637
-
638
- def save_and_upload_model(self, config: TrainingConfig) -> str:
639
- """
640
- Save the trained model and upload it to Hugging Face Hub.
641
-
642
- This method completes the training pipeline by:
643
- 1. Saving the final model checkpoint
644
- 2. Copying the tokenizer files
645
- 3. Uploading the complete model to Hugging Face Hub
646
- 4. Creating a new model repository for the trained model
647
-
648
- The uploaded model will be available at:
649
- https://huggingface.co/lemms/openllm-{size}-extended-8k
650
-
651
- Args:
652
- config: Training configuration object
653
-
654
- Returns:
655
- Status message indicating success or failure
656
- Success: "✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
657
- Failure: "❌ Failed to save/upload model: {error details}"
658
- """
659
  try:
660
- print("🔄 Saving trained model...")
661
- print(f" - Output directory: {config.output_dir}")
662
- print(f" - Model size: {config.model_size}")
663
-
664
- # Save the final model checkpoint
665
- self._save_checkpoint(config.output_dir, config.max_steps, is_best=True)
666
-
667
- # Save tokenizer files
668
- # Create a tokenizer directory within the output directory
669
- tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
670
- os.makedirs(tokenizer_dir, exist_ok=True)
671
-
672
- # Copy the tokenizer.model file using the stored path
673
- # This ensures the tokenizer is included with the model
674
- import shutil
675
- shutil.copy2(self.tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))
676
-
677
- print("✅ Model saved locally")
678
- print(f" - Model checkpoint: {config.output_dir}/best_model.pt")
679
- print(f" - Tokenizer: {tokenizer_dir}/tokenizer.model")
680
-
681
- # Generate model name for upload
682
- # The naming convention follows: openllm-{size}-extended-8k
683
- model_name = f"openllm-{config.model_size}-extended-8k"
684
- repo_id = f"lemms/{model_name}"
685
-
686
- # Upload to Hugging Face Hub
687
- if self.hf_api:
688
- print(f"🔄 Uploading model to {repo_id}...")
689
- print(f" - Repository: {repo_id}")
690
- print(f" - Type: model")
691
- print(f" - Source: {config.output_dir}")
692
-
693
- # Create the repository first if it doesn't exist
694
- try:
695
- from huggingface_hub import create_repo
696
- create_repo(
697
- repo_id=repo_id,
698
- repo_type="model",
699
- exist_ok=True,
700
- private=False
701
- )
702
- print(f"✅ Repository {repo_id} ready for upload")
703
- except Exception as create_error:
704
- print(f"⚠️ Repository creation warning: {create_error}")
705
- print(" Continuing with upload attempt...")
706
-
707
- # Upload model files to Hugging Face Hub
708
- # This creates a new model repository with all the files
709
- self.hf_api.upload_folder(
710
- folder_path=config.output_dir,
711
- repo_id=repo_id,
712
- repo_type="model",
713
- commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
714
- )
715
-
716
- print(f"✅ Model uploaded successfully to {repo_id}")
717
- print(f" - Available at: https://huggingface.co/{repo_id}")
718
- return f"✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
719
  else:
720
- print("⚠️ Hugging Face API not available - model saved locally only")
721
- return f"✅ Model saved locally to {config.output_dir}"
722
-
 
 
 
 
 
 
 
 
 
 
723
  except Exception as e:
724
- print(f"❌ Failed to save/upload model: {e}")
725
- return f"❌ Failed to save/upload model: {str(e)}"
726
-
727
- def get_training_progress(self) -> Dict[str, Any]:
728
- """
729
- Get current training progress information.
730
-
731
- This method returns a copy of the current training progress
732
- for display in the Gradio UI. The progress information includes:
733
- - Current training status
734
- - Current step and total steps
735
- - Current loss value
736
- - Current learning rate
737
-
738
- Returns:
739
- Dictionary containing current training progress information
740
- """
741
- return self.training_progress.copy()
742
 
743
def main():
    """
    Build the complete Gradio application interface for OpenLLM training.

    The interface contains, in order:
    1. Application header and component-availability status
    2. Training configuration controls (model size, steps, learning rate, batch size)
    3. Training status text and a JSON progress panel
    4. Start/Stop buttons
    5. Usage instructions and resource links
    6. The callbacks that run the training pipeline and refresh the progress panel

    Returns:
        Gradio Blocks interface for the training application (not yet launched).
    """
    # The trainer instance handles every training operation triggered from the UI.
    trainer = OpenLLMTrainer()

    with gr.Blocks(
        title="OpenLLM Training Space - Fixed with Uploaded Modules",
        theme=gr.themes.Soft()
    ) as demo:

        # Application header
        gr.Markdown("# 🚀 OpenLLM Training Space - Fixed with Uploaded Modules")
        gr.Markdown("### *Uses OpenLLM's Custom Model Architecture from Uploaded Files*")
        gr.Markdown("---")

        # Availability of key components and dependencies
        gr.Markdown(f"**OpenLLM Available**: {'✅ Yes' if OPENLLM_AVAILABLE else '❌ No'}")
        gr.Markdown(f"**SentencePiece Available**: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
        gr.Markdown(f"**Dependencies Available**: {'✅ Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
        gr.Markdown("**Architecture**: ✅ OpenLLM Custom GPTModel (From Uploaded Files)")

        # Main content: two columns, configuration on the left and status on the right
        with gr.Row():

            # Left column: training hyperparameters
            with gr.Column(scale=1):
                gr.Markdown("## 📊 Training Configuration")

                model_size = gr.Dropdown(
                    choices=["small", "medium", "large"],
                    value="small",
                    label="Model Size",
                    info="Select the base model size to train from"
                )

                max_steps = gr.Slider(
                    minimum=100,
                    maximum=10000,
                    value=1000,
                    step=100,
                    label="Max Training Steps",
                    info="Number of training iterations (100-10,000)"
                )

                learning_rate = gr.Slider(
                    minimum=1e-5,
                    maximum=1e-3,
                    value=3e-4,
                    step=1e-5,
                    label="Learning Rate",
                    info="Training rate (0.00001-0.001)"
                )

                batch_size = gr.Slider(
                    minimum=1,
                    maximum=16,
                    value=4,
                    step=1,
                    label="Batch Size",
                    info="Samples per training batch (1-16)"
                )

            # Right column: status display and control buttons
            with gr.Column(scale=1):
                gr.Markdown("## 🎯 Training Status")

                status_text = gr.Textbox(
                    value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
                    label="Current Status",
                    interactive=False,
                    lines=5,
                    info="Shows current training status and progress updates"
                )

                progress_info = gr.JSON(
                    value=trainer.get_training_progress(),
                    label="Training Progress"
                )

                with gr.Row():
                    start_btn = gr.Button("🚀 Start Training", variant="primary")
                    # NOTE(review): no handler is attached to the stop button in this
                    # function, and no stop method on the trainer is visible here, so
                    # clicking it currently does nothing — confirm and wire it up.
                    stop_btn = gr.Button("⏹️ Stop Training", variant="stop")

        # Instructions section
        gr.Markdown("## 📋 OpenLLM Training Instructions")
        gr.Markdown("""
        This interface uses **OpenLLM's actual custom model architecture** from uploaded files:

        ### **Step 1: Configure Parameters**
        - **Model Size**: Select the base model to train from (small, medium, large)
        - **Max Steps**: Number of training iterations (100-10,000)
        - **Learning Rate**: Training rate (0.00001-0.001)
        - **Batch Size**: Samples per training batch (1-16)

        ### **Step 2: Start Training**
        - Click "Start Training" to begin the actual training process
        - Uses OpenLLM's custom GPTModel class from uploaded files
        - Uses sentencepiece.SentencePieceProcessor() for tokenization
        - Compatible with OpenLLM's actual implementation

        ### **Step 3: Monitor Progress**
        - Watch the status updates and progress information
        - Training may take several minutes depending on steps
        - The final model will be uploaded to Hugging Face Hub

        ### **Step 4: Access Results**
        - Trained models are automatically pushed to: `lemms/openllm-{size}-extended-8k`
        - Check the model repository for your trained model
        - Use the model for inference or further training
        """)

        # Resource links section
        gr.Markdown("## 🔗 Model Resources")
        gr.Markdown("""
        - [📚 7k Small Model](https://huggingface.co/lemms/openllm-small-extended-7k)
        - [🎯 8k Small Model](https://huggingface.co/lemms/openllm-small-extended-8k)
        - [📊 Training Dataset](https://huggingface.co/datasets/lemms/openllm-training-data)
        - [📖 Main Project](https://github.com/louischua/openllm)
        """)

        def start_complete_training(model_size, max_steps, learning_rate, batch_size):
            """
            Run the whole training pipeline and return a status message.

            Steps: load model and tokenizer, prepare the dataset, set up
            training, train, then save and upload. Each trainer step returns a
            status string; a string containing "❌" aborts the pipeline and is
            returned to the UI unchanged.

            Args:
                model_size: Size of the model to train ("small", "medium", "large")
                max_steps: Maximum number of training steps
                learning_rate: Learning rate for the optimizer
                batch_size: Batch size for training

            Returns:
                Status message describing the outcome of the training process.
            """
            if not OPENLLM_AVAILABLE:
                return "❌ OpenLLM custom model architecture not available. Please check the installation."

            try:
                print("🚀 Starting complete training process...")
                print(f" - Model size: {model_size}")
                print(f" - Max steps: {max_steps}")
                print(f" - Learning rate: {learning_rate}")
                print(f" - Batch size: {batch_size}")

                # Cast defensively: slider values arrive as JSON numbers and may
                # not be ints, while step and batch counts must be whole numbers.
                config = TrainingConfig(
                    model_size=model_size,
                    max_steps=int(max_steps),
                    learning_rate=learning_rate,
                    batch_size=int(batch_size)
                )

                print("🔄 Step 1: Loading model and tokenizer...")
                status = trainer.load_model_and_tokenizer(model_size)
                if "❌" in status:
                    return status

                print("🔄 Step 2: Preparing dataset...")
                status = trainer.prepare_dataset()
                if "❌" in status:
                    return status

                print("🔄 Step 3: Setting up training...")
                status = trainer.setup_training(config)
                if "❌" in status:
                    return status

                print("🔄 Step 4: Executing training...")
                status = trainer.train_model(config)
                if "❌" in status:
                    return status

                print("🔄 Step 5: Saving and uploading model...")
                status = trainer.save_and_upload_model(config)

                print("🎉 Complete training process finished!")
                return f"🚀 Complete training process finished!\n{status}"

            except Exception as e:
                # UI boundary: report the failure in the status box instead of crashing.
                print(f"❌ Training process failed: {str(e)}")
                return f"❌ Training process failed: {str(e)}"

        def update_progress():
            """
            Return the trainer's current progress dictionary for the JSON panel.

            Returns:
                Current training progress dictionary
            """
            return trainer.get_training_progress()

        # Connect the Start Training button to the training pipeline
        start_btn.click(
            fn=start_complete_training,
            inputs=[model_size, max_steps, learning_rate, batch_size],
            outputs=[status_text]
        )

        # Auto-refresh progress every 5 seconds while the page is open.
        # Without `every`, the load event fires only once when the page opens,
        # so the panel would never update during training.
        demo.load(update_progress, outputs=[progress_info], every=5)

        # Application footer
        gr.Markdown("---")
        gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
        gr.Markdown("**Architecture**: OpenLLM Custom GPTModel (From Uploaded Files)")
        gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor()")

    return demo
1019
 
1020
if __name__ == "__main__":
    # Script entry point: build the Blocks app, keep it bound to `demo`,
    # and serve it with Gradio's default launch settings.
    demo = main()
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ OpenLLM Training Space - Main Application
4
 
5
+ This is the main entry point for the Hugging Face Space.
6
+ It provides a web interface for running OpenLLM training with authentication.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  Author: Louis Chua Bean Chong
9
+ License: GPLv3
 
 
10
  """
11
 
 
 
 
12
  import os
13
+ import sys
 
 
 
 
 
14
  from pathlib import Path
15
 
16
+ import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
# Import our authentication and training modules.
# These are project-local modules; if either import fails, the app still starts
# and MODULES_AVAILABLE lets each UI action report the problem instead of
# crashing at import time.
try:
    from openllm_training_with_auth import OpenLLMTrainingManager
    from space_auth_test import test_space_authentication

    MODULES_AVAILABLE = True
except ImportError as e:
    MODULES_AVAILABLE = False
    # Same "❌" failure marker as the other error messages in this file.
    print(f"❌ Required modules not available: {e}")
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
def create_space_interface():
    """Create the Gradio interface for the Space.

    Builds a tabbed ``gr.Blocks`` app with four tabs: an environment check,
    an authentication test, a training launcher, and a documentation page.
    The three callbacks are closures defined below; each returns a plain
    string that is shown in the corresponding output textbox.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    valid_model_sizes = ("small", "medium", "large")

    def _append_log(message, log):
        """Return *message*, followed by the captured *log* when there is one."""
        return f"{message}\n\n{log}" if log else message

    def run_authentication_test():
        """Run the authentication test and return its captured output."""
        if not MODULES_AVAILABLE:
            return "❌ Required modules not available. Please check deployment."

        import contextlib
        import io

        # The test reports through print(); capture stdout so the UI can show it.
        output = io.StringIO()
        try:
            with contextlib.redirect_stdout(output):
                success = test_space_authentication()
        except Exception as e:
            # Keep whatever was printed before the failure; it is usually the
            # most useful part of the report.
            return _append_log(f"❌ Error running authentication test: {e}", output.getvalue())

        result = output.getvalue()
        if success:
            return f"✅ Authentication Test Results:\n\n{result}"
        return f"❌ Authentication Test Failed:\n\n{result}"

    def run_training(model_size, training_steps):
        """Validate the inputs, run OpenLLM training, and return the captured log.

        Args:
            model_size: One of "small", "medium", "large" (surrounding
                whitespace and letter case are ignored).
            training_steps: Number of training steps, 1000 to 50000 inclusive.

        Returns:
            A status string: the training log plus the model URL on success,
            or a "❌ ..." message describing the problem.
        """
        if not MODULES_AVAILABLE:
            return "❌ Required modules not available. Please check deployment."

        # Normalize BEFORE validating, so the check runs on the value that is
        # actually passed to the trainer.
        if isinstance(model_size, str):
            model_size = model_size.strip().lower()
        if model_size not in valid_model_sizes:
            return "❌ Invalid model size. Must be 'small', 'medium', or 'large'."

        # The chained comparison is False for NaN and infinities, so those are
        # rejected here instead of failing later in int().
        if not isinstance(training_steps, (int, float)) or not (1000 <= training_steps <= 50000):
            return "❌ Invalid training steps. Must be between 1000 and 50000."
        training_steps = int(training_steps)

        import contextlib
        import io

        # The trainer reports through print(); capture stdout so the UI can show it.
        output = io.StringIO()
        try:
            with contextlib.redirect_stdout(output):
                training_manager = OpenLLMTrainingManager()
                repo_id = training_manager.run_training(model_size=model_size, steps=training_steps)
        except Exception as e:
            # Do not throw away the log of a run that failed part-way through.
            return _append_log(f"❌ Error running training: {e}", output.getvalue())

        result = output.getvalue()
        return f"✅ Training Results:\n\n{result}\n\n🎉 Model available at: https://huggingface.co/{repo_id}"

    def check_space_environment():
        """Report whether the app runs in a Space and whether HF_TOKEN is set."""
        try:
            # Any of these variables being set is taken as "running in a Space".
            space_vars = ["SPACE_ID", "SPACE_HOST", "SPACE_REPO_ID"]
            is_space = any(os.getenv(var) for var in space_vars)

            hf_token = os.getenv("HF_TOKEN")

            lines = ["🔍 Space Environment Check:\n\n"]

            if is_space:
                lines.append("✅ Running in Hugging Face Space environment\n")
                for var in space_vars:
                    value = os.getenv(var)
                    if value:
                        lines.append(f" - {var}: {value}\n")
            else:
                lines.append("ℹ️ Running in local environment\n")

            if hf_token:
                # Never echo any part of the token: this text is shown to every
                # visitor of the Space.
                lines.append("✅ HF access token found (value hidden)\n")
                lines.append(" - Source: HF access token in Space settings\n")
            else:
                lines.append("❌ HF access token not found\n")
                lines.append(" - Please set HF_TOKEN in Space settings with HF access token\n")

            lines.append(f"\n📁 Available modules: {'✅' if MODULES_AVAILABLE else '❌'}")

            return "".join(lines)

        except Exception as e:
            return f"❌ Error checking environment: {e}"

    # Create the Gradio interface.
    # `show_error` is a launch() option, not a Blocks() option, so it is set
    # only where the app is launched.
    with gr.Blocks(
        title="OpenLLM Training Space",
        theme=gr.themes.Soft(),
        analytics_enabled=False,  # Disable analytics
    ) as interface:
        gr.Markdown(
            """
            # 🚀 OpenLLM Training Space

            Welcome to the OpenLLM Training Space! This Space provides a complete environment for training OpenLLM models with automatic Hugging Face authentication and model upload.

            ## 🔐 Authentication

            This Space uses HF access token for secure authentication. The HF_TOKEN is automatically available from your Space settings.

            ## 📋 Available Actions

            1. **Environment Check**: Verify Space configuration and authentication
            2. **Authentication Test**: Test Hugging Face authentication
            3. **Run Training**: Start OpenLLM training with automatic upload
            """
        )

        with gr.Tab("🔍 Environment Check"):
            gr.Markdown("Check the Space environment and configuration.")
            env_check_btn = gr.Button("Check Environment", variant="primary")
            env_output = gr.Textbox(label="Environment Status", lines=10, interactive=False)
            env_check_btn.click(check_space_environment, outputs=env_output)

        with gr.Tab("🔐 Authentication Test"):
            gr.Markdown("Test Hugging Face authentication using HF access token.")
            auth_test_btn = gr.Button("Run Authentication Test", variant="primary")
            auth_output = gr.Textbox(label="Authentication Results", lines=15, interactive=False)
            auth_test_btn.click(run_authentication_test, outputs=auth_output)

        with gr.Tab("🚀 Run Training"):
            gr.Markdown(
                """
                Start OpenLLM training with automatic model upload.

                **Training Parameters:**
                - **Model Size**: Choose the model size (small, medium, large)
                - **Training Steps**: Number of training steps (default: 8000)

                **Expected Results:**
                - Training will complete successfully
                - Model will be uploaded to Hugging Face Hub
                - Repository will be created with proper model files
                """
            )

            with gr.Row():
                model_size = gr.Dropdown(
                    choices=["small", "medium", "large"],
                    value="small",
                    label="Model Size",
                    info="Choose the model size for training",
                )
                training_steps = gr.Number(
                    value=8000,
                    label="Training Steps",
                    info="Number of training steps",
                    minimum=1000,
                    maximum=50000,
                )

            train_btn = gr.Button("Start Training", variant="primary", size="lg")
            train_output = gr.Textbox(label="Training Results", lines=20, interactive=False)

            train_btn.click(run_training, inputs=[model_size, training_steps], outputs=train_output)

        with gr.Tab("📚 Documentation"):
            gr.Markdown(
                """
                ## 📖 Available Documentation

                - **HUGGINGFACE_SPACE_SETUP_GUIDE.md**: Complete setup guide
                - **SPACE_AUTHENTICATION_SUMMARY.md**: Authentication summary
                - **SPACE_READY_SUMMARY.md**: Deployment summary

                ## 🔧 Available Scripts

                - **space_auth_test.py**: Authentication verification
                - **openllm_training_with_auth.py**: Complete training script
                - **integrate_auth_into_training.py**: Integration guide
                - **setup_hf_space_auth.py**: Space authentication setup
                - **verify_space_auth.py**: Space verification script

                ## 🎯 Quick Start

                1. Check the environment to verify configuration
                2. Run authentication test to ensure GitHub secrets are working
                3. Start training with your desired parameters
                4. Monitor the training progress and model upload

                ## 🔒 Security

                - HF_TOKEN is securely stored in GitHub repository secrets
                - No hardcoded tokens in any scripts
                - Automatic cleanup of test repositories
                - Proper error handling and logging
                """
            )

    return interface
236
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
if __name__ == "__main__":
    # Create and launch the interface
    interface = create_space_interface()
    interface.launch(
        server_name="0.0.0.0",  # listen on all interfaces so the Space proxy can reach the app
        server_port=7860,
        share=False,
        allowed_paths=[],  # Restrict file access: no extra directories are served
        auth=None,  # No Gradio login page is shown
        show_error=False,  # Don't expose error details
        quiet=True,  # Reduce logging
        # `enable_queue` is intentionally not passed: launch() no longer accepts
        # it in Gradio 4.x and raises TypeError at startup.
        max_threads=1,  # Limit concurrent requests
    )