lemms committed on
Commit f1b5f6b Β· verified Β· 1 parent: cd40de7

Fix: Use OpenLLM custom GPTModel architecture instead of Hugging Face Transformers
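
In short: instead of forcing the checkpoint through `AutoModelForCausalLM.from_pretrained()` (which fails on the custom architecture), the Space now rebuilds the network with OpenLLM's own `create_model()` factory and restores weights with `torch.load()` + `load_state_dict()`. A minimal sketch of that loading pattern (hypothetical helper, assuming the checkpoint layout written by `_save_checkpoint()` in this commit):

```python
# Hypothetical helper, not part of the commit. Assumes the checkpoint dict
# written by _save_checkpoint() ('model_state_dict', 'config', ...) and
# OpenLLM's create_model() factory from core/src/model.py.
import torch
from model import GPTModel, create_model  # OpenLLM's custom classes

def load_openllm_checkpoint(path: str, model_size: str = "small") -> GPTModel:
    """Rebuild the custom GPT and restore weights with a plain state-dict load."""
    model = create_model(model_size)           # no AutoModelForCausalLM involved
    checkpoint = torch.load(path, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    return model
```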

Files changed (1):
  1. app.py +244 -283
app.py CHANGED
@@ -1,59 +1,61 @@
  #!/usr/bin/env python3
  """
- OpenLLM Training Space Application - Custom Model Architecture Fix
 
- This version handles the custom GPT model architecture by:
- - Updating transformers to latest version
- - Using alternative model loading approaches
- - Handling custom model architectures properly
 
  Author: Louis Chua Bean Chong
  License: GPL-3.0
- Version: 2.0.8
  Last Updated: 2024
  """
 
  import gradio as gr
  import torch
  import os
  import time
  from typing import Dict, Any, Optional
  import threading
  from dataclasses import dataclass
 
- # First, try to update transformers to latest version
  try:
-     import subprocess
-     print("πŸ”„ Updating transformers to latest version...")
-     subprocess.run(["pip", "install", "--upgrade", "transformers"], check=True)
-     print("βœ… Transformers updated successfully")
- except Exception as e:
-     print(f"⚠️ Could not update transformers: {e}")
-
- # Import training dependencies with robust error handling
- try:
-     from transformers import (
-         AutoModelForCausalLM,
-         TrainingArguments,
-         Trainer,
-         DataCollatorForLanguageModeling
-     )
-     from datasets import load_dataset
-     from huggingface_hub import HfApi
-     TRAINING_AVAILABLE = True
-     print("βœ… Transformers imported successfully")
  except ImportError as e:
-     print(f"Training dependencies not available: {e}")
-     TRAINING_AVAILABLE = False
 
- # Try to import sentencepiece with fallback
  try:
      import sentencepiece as spm
      SENTENCEPIECE_AVAILABLE = True
      print(f"βœ… SentencePiece available: {spm.__version__}")
  except ImportError:
      SENTENCEPIECE_AVAILABLE = False
-     print("❌ SentencePiece not available - will use fallback methods")
 
  @dataclass
  class TrainingConfig:
@@ -70,13 +72,13 @@ class TrainingConfig:
 
  class OpenLLMTrainer:
      """
-     Complete training implementation for OpenLLM models with custom architecture handling.
 
      This class handles the entire training pipeline including:
-     - Model loading with custom architecture support
      - Tokenizer loading using sentencepiece.SentencePieceProcessor()
      - Dataset preparation
-     - Training execution
      - Model saving and uploading
      """
 
@@ -84,8 +86,9 @@ class OpenLLMTrainer:
          """Initialize the trainer with default settings."""
          self.model = None
          self.tokenizer = None
-         self.trainer = None
-         self.training_thread = None
          self.is_training = False
          self.training_progress = {
              "status": "Ready",
@@ -104,7 +107,7 @@ class OpenLLMTrainer:
 
      def load_model_and_tokenizer(self, model_size: str) -> str:
          """
-         Load the pre-trained OpenLLM model and tokenizer with custom architecture handling.
 
          Args:
              model_size: Size of the model to load ("small", "medium", "large")
@@ -113,129 +116,26 @@ class OpenLLMTrainer:
              Status message indicating success or failure
          """
          try:
-             # Map model size to actual model repository
-             model_mapping = {
-                 "small": "lemms/openllm-small-extended-7k",
-                 "medium": "lemms/openllm-medium-extended-7k",  # Placeholder
-                 "large": "lemms/openllm-large-extended-7k"  # Placeholder
-             }
-
-             model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
 
-             print(f"πŸ”„ Loading OpenLLM model: {model_name}")
-             print("πŸ“ Handling custom GPT architecture...")
 
-             # Try multiple approaches to load the model
-             model_loaded = False
-
-             # Approach 1: Try with latest transformers and trust_remote_code
              try:
-                 print("πŸ”„ Attempting to load model with latest transformers...")
-                 self.model = AutoModelForCausalLM.from_pretrained(
-                     model_name,
-                     torch_dtype=torch.float16,
-                     device_map="auto" if torch.cuda.is_available() else None,
-                     trust_remote_code=True,
-                     revision="main"  # Use main branch for latest code
-                 )
-                 model_loaded = True
-                 print(f"βœ… Model loaded successfully with latest transformers: {type(self.model).__name__}")
-
-             except Exception as e1:
-                 print(f"❌ Approach 1 failed: {e1}")
-
-                 # Approach 2: Try installing transformers from source
-                 try:
-                     print("πŸ”„ Installing transformers from source...")
-                     subprocess.run(["pip", "install", "git+https://github.com/huggingface/transformers.git"], check=True)
-
-                     # Reload transformers
-                     import importlib
-                     import transformers
-                     importlib.reload(transformers)
-                     from transformers import AutoModelForCausalLM
-
-                     print("πŸ”„ Attempting to load model with source transformers...")
-                     self.model = AutoModelForCausalLM.from_pretrained(
-                         model_name,
-                         torch_dtype=torch.float16,
-                         device_map="auto" if torch.cuda.is_available() else None,
-                         trust_remote_code=True
-                     )
-                     model_loaded = True
-                     print(f"βœ… Model loaded successfully with source transformers: {type(self.model).__name__}")
-
-                 except Exception as e2:
-                     print(f"❌ Approach 2 failed: {e2}")
-
-                     # Approach 3: Try loading as a generic model
-                     try:
-                         print("πŸ”„ Attempting to load as generic model...")
-                         from transformers import AutoModel
-
-                         self.model = AutoModel.from_pretrained(
-                             model_name,
-                             torch_dtype=torch.float16,
-                             device_map="auto" if torch.cuda.is_available() else None,
-                             trust_remote_code=True
-                         )
-                         model_loaded = True
-                         print(f"βœ… Model loaded as generic model: {type(self.model).__name__}")
-
-                     except Exception as e3:
-                         print(f"❌ Approach 3 failed: {e3}")
-                         return f"❌ Failed to load OpenLLM model: All approaches failed. Latest error: {str(e3)}"
-
-             # Load tokenizer using the same approach as local training code
              try:
                  print("πŸ”„ Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
 
-                 # Create a custom tokenizer class that wraps SentencePieceProcessor
-                 class OpenLLMTokenizer:
-                     def __init__(self, sp_processor):
-                         self.sp_processor = sp_processor
-                         self.pad_token = "<pad>"
-                         self.eos_token = "</s>"
-                         self.bos_token = "<s>"
-                         self.unk_token = "<unk>"
-
-                     def __call__(self, texts, **kwargs):
-                         """Tokenize texts using SentencePieceProcessor."""
-                         if isinstance(texts, str):
-                             texts = [texts]
-
-                         results = []
-                         for text in texts:
-                             # Encode text to token IDs
-                             token_ids = self.sp_processor.encode(text)
-
-                             # Create attention mask (all tokens are attended to)
-                             attention_mask = [1] * len(token_ids)
-
-                             results.append({
-                                 'input_ids': token_ids,
-                                 'attention_mask': attention_mask
-                             })
-
-                         return results
-
-                     def encode(self, text, **kwargs):
-                         """Encode text to token IDs."""
-                         return self.sp_processor.encode(text)
-
-                     def decode(self, token_ids, **kwargs):
-                         """Decode token IDs to text."""
-                         return self.sp_processor.decode(token_ids)
-
-                     def save_pretrained(self, path):
-                         """Save tokenizer files."""
-                         # The SentencePieceProcessor is already saved as tokenizer.model
-                         pass
-
-                 # Download and load the tokenizer.model file
-                 from huggingface_hub import hf_hub_download
-
-                 print("πŸ”„ Downloading tokenizer.model from HF Hub...")
                  tokenizer_path = hf_hub_download(
                      repo_id=model_name,
                      filename="tokenizer.model"
@@ -243,12 +143,12 @@ class OpenLLMTrainer:
 
                  print(f"βœ… Tokenizer downloaded to: {tokenizer_path}")
 
-                 # Load using SentencePieceProcessor (same as local code)
                  sp_processor = spm.SentencePieceProcessor()
                  sp_processor.load(tokenizer_path)
 
-                 # Wrap in our custom tokenizer class for HF Trainer compatibility
-                 self.tokenizer = OpenLLMTokenizer(sp_processor)
 
                  print(f"βœ… Tokenizer loaded successfully using SentencePieceProcessor")
                  print(f"   Vocabulary size: {sp_processor.vocab_size()}")
@@ -257,80 +157,63 @@ class OpenLLMTrainer:
                  print(f"❌ Failed to load tokenizer: {e}")
                  return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"
 
-             return f"βœ… Successfully loaded OpenLLM {model_size} model from {model_name}"
 
          except Exception as e:
              return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"
 
      def prepare_dataset(self) -> str:
          """
-         Load and prepare the training dataset.
 
          Returns:
              Status message indicating success or failure
          """
          try:
-             # Load the training dataset
              print("πŸ”„ Loading training dataset...")
              dataset = load_dataset("lemms/openllm-training-data")
              print(f"βœ… Dataset loaded: {len(dataset['train'])} samples")
 
-             # Tokenize the dataset using our custom tokenizer
-             def tokenize_function(examples):
-                 try:
-                     # Use our custom tokenizer
-                     tokenized = self.tokenizer(examples["text"])
-
-                     # Extract input_ids and attention_mask
-                     input_ids = [item['input_ids'] for item in tokenized]
-                     attention_mask = [item['attention_mask'] for item in tokenized]
-
-                     # Pad sequences to max_length
-                     max_length = 512
-                     padded_input_ids = []
-                     padded_attention_mask = []
-
-                     for ids, mask in zip(input_ids, attention_mask):
-                         if len(ids) > max_length:
-                             ids = ids[:max_length]
-                             mask = mask[:max_length]
-                         else:
-                             # Pad with pad_token_id
-                             pad_length = max_length - len(ids)
-                             ids = ids + [0] * pad_length  # 0 is pad_token_id
-                             mask = mask + [0] * pad_length
-
-                         padded_input_ids.append(ids)
-                         padded_attention_mask.append(mask)
-
-                     return {
-                         "input_ids": padded_input_ids,
-                         "attention_mask": padded_attention_mask
-                     }
-
-                 except Exception as e:
-                     print(f"Tokenization error: {e}")
-                     # Fallback: return empty tensors
-                     return {"input_ids": [], "attention_mask": []}
-
-             print("πŸ”„ Tokenizing dataset...")
-             tokenized_dataset = dataset["train"].map(
-                 tokenize_function,
-                 batched=True,
-                 remove_columns=dataset["train"].column_names
-             )
 
-             self.dataset = tokenized_dataset
-             print(f"βœ… Dataset tokenized successfully: {len(tokenized_dataset)} samples")
 
-             return f"βœ… Successfully prepared dataset with {len(tokenized_dataset)} samples"
 
          except Exception as e:
              return f"❌ Failed to prepare dataset: {str(e)}"
 
      def setup_training(self, config: TrainingConfig) -> str:
          """
-         Set up the training configuration and trainer.
 
          Args:
              config: Training configuration object
@@ -342,43 +225,51 @@ class OpenLLMTrainer:
              # Create output directory
              os.makedirs(config.output_dir, exist_ok=True)
 
-             # Set up training arguments
-             training_args = TrainingArguments(
-                 output_dir=config.output_dir,
-                 num_train_epochs=1,
-                 per_device_train_batch_size=config.batch_size,
-                 per_device_eval_batch_size=config.batch_size,
-                 learning_rate=config.learning_rate,
-                 max_steps=config.max_steps,
-                 save_steps=config.save_steps,
-                 logging_steps=config.logging_steps,
-                 warmup_steps=config.warmup_steps,
-                 gradient_accumulation_steps=config.gradient_accumulation_steps,
-                 evaluation_strategy="no",  # Disable evaluation for faster training
-                 save_strategy="steps",
-                 logging_dir=f"{config.output_dir}/logs",
-                 report_to=None,  # Disable wandb/tensorboard reporting
-                 remove_unused_columns=False,
-                 dataloader_pin_memory=False,
-                 fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
-                 dataloader_num_workers=0,  # Reduce memory usage
              )
 
-             # Set up data collator
-             data_collator = DataCollatorForLanguageModeling(
-                 tokenizer=self.tokenizer,
-                 mlm=False,  # We're doing causal language modeling, not masked
              )
 
-             # Initialize trainer
-             self.trainer = Trainer(
-                 model=self.model,
-                 args=training_args,
-                 train_dataset=self.dataset,
-                 tokenizer=self.tokenizer,
-                 data_collator=data_collator,
              )
 
              return f"βœ… Training setup completed successfully"
 
          except Exception as e:
@@ -386,7 +277,7 @@ class OpenLLMTrainer:
 
      def train_model(self, config: TrainingConfig, progress_callback=None) -> str:
          """
-         Execute the actual model training.
 
          Args:
              config: Training configuration object
@@ -402,17 +293,69 @@ class OpenLLMTrainer:
 
              print(f"πŸš€ Starting OpenLLM training for {config.max_steps} steps...")
 
-             # Start training
-             train_result = self.trainer.train()
 
              # Update final progress
              self.training_progress["status"] = "Completed"
-             self.training_progress["current_step"] = config.max_steps
-             self.training_progress["loss"] = train_result.training_loss
 
-             print(f"βœ… Training completed! Final loss: {train_result.training_loss:.4f}")
 
-             return f"βœ… Training completed successfully! Final loss: {train_result.training_loss:.4f}"
 
          except Exception as e:
              self.training_progress["status"] = "Failed"
@@ -421,6 +364,32 @@ class OpenLLMTrainer:
          finally:
              self.is_training = False
 
      def save_and_upload_model(self, config: TrainingConfig) -> str:
          """
          Save the trained model and upload it to Hugging Face Hub.
@@ -434,25 +403,16 @@ class OpenLLMTrainer:
          try:
              print("πŸ”„ Saving trained model...")
 
-             # Save the model locally
-             self.trainer.save_model()
 
              # Save tokenizer files
-             if hasattr(self.tokenizer, 'sp_processor'):
-                 # Save the SentencePieceProcessor files
-                 tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
-                 os.makedirs(tokenizer_dir, exist_ok=True)
-
-                 # Copy the original tokenizer.model file
-                 import shutil
-                 from huggingface_hub import hf_hub_download
-
-                 model_name = f"lemms/openllm-{config.model_size}-extended-7k"
-                 tokenizer_path = hf_hub_download(
-                     repo_id=model_name,
-                     filename="tokenizer.model"
-                 )
-                 shutil.copy2(tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))
 
              print("βœ… Model saved locally")
 
@@ -495,19 +455,20 @@ def main():
 
      # Create the main Gradio application interface
      with gr.Blocks(
-         title="OpenLLM Training Space - Custom Architecture Fix",
          theme=gr.themes.Soft()
      ) as demo:
 
          # Application Header
-         gr.Markdown("# πŸš€ OpenLLM Training Space - Custom Architecture Fix")
-         gr.Markdown("### *Handles Custom GPT Model Architecture*")
          gr.Markdown("---")
 
          # Status Information
-         gr.Markdown(f"**Training Available**: {'βœ… Yes' if TRAINING_AVAILABLE else '❌ No'}")
-         gr.Markdown(f"**SentencePiece Available**: {'βœ… Yes' if SENTENCEPIECE_AVAILABLE else '❌ No (using fallback methods)'}")
-         gr.Markdown("**Custom Architecture**: βœ… Multiple loading approaches")
 
          # Main Content Area
          with gr.Row():
@@ -556,39 +517,39 @@ def main():
 
                  # Training Status Display
                  status_text = gr.Textbox(
-                     value="Ready to start training" if TRAINING_AVAILABLE else "Training dependencies not available",
                      label="Current Status",
                      interactive=False,
                      lines=5
                  )
 
-                 # Progress Information - Simplified for maximum compatibility
                  progress_info = gr.JSON(
                      value=trainer.get_training_progress(),
                      label="Training Progress"
                  )
 
-                 # Training Control Buttons - Removed disabled parameter for compatibility
                  with gr.Row():
                      start_btn = gr.Button("πŸš€ Start Training", variant="primary")
                      stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
 
          # Instructions Section
-         gr.Markdown("## πŸ“‹ Custom Architecture Training Instructions")
          gr.Markdown("""
-         This interface handles **OpenLLM's custom GPT architecture**:
 
          ### **Step 1: Configure Parameters**
-         - **Model Size**: Select the base model to train from (7k models)
          - **Max Steps**: Number of training iterations (100-10,000)
          - **Learning Rate**: Training rate (0.00001-0.001)
          - **Batch Size**: Samples per training batch (1-16)
 
          ### **Step 2: Start Training**
          - Click "Start Training" to begin the actual training process
-         - Automatically updates transformers to latest version
-         - Uses multiple approaches to load custom GPT architecture
-         - Handles custom model types properly
 
          ### **Step 3: Monitor Progress**
          - Watch the status updates and progress information
@@ -613,10 +574,10 @@ def main():
          # Training Function Definition
          def start_complete_training(model_size, max_steps, learning_rate, batch_size):
              """
-             Execute the complete training process with custom architecture handling.
              """
-             if not TRAINING_AVAILABLE:
-                 return "❌ Training dependencies not available. Please check the installation."
 
              try:
                  # Create training configuration
@@ -627,7 +588,7 @@ def main():
                      batch_size=batch_size
                  )
 
-                 # Step 1: Load model and tokenizer with custom architecture handling
                  status = trainer.load_model_and_tokenizer(model_size)
                  if "❌" in status:
                      return status
@@ -672,8 +633,8 @@ def main():
          # Application Footer
          gr.Markdown("---")
          gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
-         gr.Markdown("**Gradio Version**: 4.44.1 (Fully Compatible)")
-         gr.Markdown("**Custom Architecture**: Multiple loading approaches for GPT model")
 
      return demo
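The `Trainer`-based pipeline removed above is replaced in the new version below by a hand-rolled PyTorch loop with gradient accumulation. Reduced to a hedged sketch (assuming, as in this commit, that calling the model with targets returns a `(logits, loss)` pair):

```python
# Condensed sketch of the replacement training loop; illustrative only.
import torch

def train_steps(model, data_loader, optimizer, scheduler, max_steps, accum_steps=4):
    model.train()
    step = 0
    optimizer.zero_grad()
    for batch_idx, (input_ids, target_ids) in enumerate(data_loader):
        if step >= max_steps:
            break
        _, loss = model(input_ids, target_ids)   # model computes loss internally
        (loss / accum_steps).backward()          # scale for gradient accumulation
        if (batch_idx + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()                     # one update per accumulation window
            scheduler.step()
            optimizer.zero_grad()
            step += 1
    return step
```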
@@ -1,59 +1,61 @@
  #!/usr/bin/env python3
  """
+ OpenLLM Training Space Application - OpenLLM Compatible
 
+ This version uses OpenLLM's actual custom model architecture and loading approach:
+ - Uses custom GPTModel class (not Hugging Face Transformers)
+ - Loads models using torch.load() and load_state_dict()
+ - Uses sentencepiece.SentencePieceProcessor() for tokenization
+ - Compatible with OpenLLM's actual implementation
 
  Author: Louis Chua Bean Chong
  License: GPL-3.0
+ Version: 2.0.9
  Last Updated: 2024
  """
 
  import gradio as gr
  import torch
+ import torch.nn as nn
  import os
  import time
+ import math
+ import gc
  from typing import Dict, Any, Optional
  import threading
  from dataclasses import dataclass
+ from pathlib import Path
 
+ # Import OpenLLM's custom model architecture
  try:
+     # Try to import from local OpenLLM code
+     import sys
+     sys.path.append('core/src')
+     from model import GPTModel, GPTConfig, create_model
+     from data_loader import TextDataLoader
+     OPENLLM_AVAILABLE = True
+     print("βœ… OpenLLM custom model architecture imported successfully")
  except ImportError as e:
+     print(f"❌ OpenLLM imports failed: {e}")
+     OPENLLM_AVAILABLE = False
 
+ # Try to import sentencepiece
  try:
      import sentencepiece as spm
      SENTENCEPIECE_AVAILABLE = True
      print(f"βœ… SentencePiece available: {spm.__version__}")
  except ImportError:
      SENTENCEPIECE_AVAILABLE = False
+     print("❌ SentencePiece not available")
+
+ # Import other dependencies
+ try:
+     from datasets import load_dataset
+     from huggingface_hub import HfApi, hf_hub_download
+     DEPENDENCIES_AVAILABLE = True
+ except ImportError as e:
+     print(f"❌ Dependencies not available: {e}")
+     DEPENDENCIES_AVAILABLE = False
 
  @dataclass
  class TrainingConfig:
@@ -70,13 +72,13 @@ class TrainingConfig:
 
  class OpenLLMTrainer:
      """
+     Complete training implementation using OpenLLM's actual architecture.
 
      This class handles the entire training pipeline including:
+     - Model loading using OpenLLM's custom GPTModel
      - Tokenizer loading using sentencepiece.SentencePieceProcessor()
      - Dataset preparation
+     - Training execution using OpenLLM's approach
      - Model saving and uploading
      """
 
@@ -84,8 +86,9 @@ class OpenLLMTrainer:
          """Initialize the trainer with default settings."""
          self.model = None
          self.tokenizer = None
+         self.data_loader = None
+         self.optimizer = None
+         self.scheduler = None
          self.is_training = False
          self.training_progress = {
              "status": "Ready",
@@ -104,7 +107,7 @@ class OpenLLMTrainer:
 
      def load_model_and_tokenizer(self, model_size: str) -> str:
          """
+         Load the pre-trained OpenLLM model and tokenizer using OpenLLM's approach.
 
          Args:
              model_size: Size of the model to load ("small", "medium", "large")
@@ -113,129 +116,26 @@ class OpenLLMTrainer:
              Status message indicating success or failure
          """
          try:
+             if not OPENLLM_AVAILABLE:
+                 return "❌ OpenLLM custom model architecture not available"
 
+             print(f"πŸ”„ Loading OpenLLM {model_size} model using custom architecture...")
 
+             # Create model using OpenLLM's factory function
              try:
+                 self.model = create_model(model_size)
+                 print(f"βœ… OpenLLM {model_size} model created: {type(self.model).__name__}")
+                 print(f"   Parameters: {self.model.get_num_params():,}")
+             except Exception as e:
+                 print(f"❌ Failed to create model: {e}")
+                 return f"❌ Failed to create OpenLLM model: {str(e)}"
+
+             # Load tokenizer using sentencepiece
              try:
                  print("πŸ”„ Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
 
+                 # Download tokenizer.model from HF Hub
+                 model_name = f"lemms/openllm-{model_size}-extended-7k"
                  tokenizer_path = hf_hub_download(
                      repo_id=model_name,
                      filename="tokenizer.model"
@@ -243,12 +143,12 @@ class OpenLLMTrainer:
 
                  print(f"βœ… Tokenizer downloaded to: {tokenizer_path}")
 
+                 # Create SentencePieceProcessor
                  sp_processor = spm.SentencePieceProcessor()
                  sp_processor.load(tokenizer_path)
+                 # Record where the model file came from; prepare_dataset() and
+                 # save_and_upload_model() later read self.tokenizer.model_file_path,
+                 # which SentencePieceProcessor does not define on its own.
+                 sp_processor.model_file_path = tokenizer_path
 
+                 # Store tokenizer for later use
+                 self.tokenizer = sp_processor
 
                  print(f"βœ… Tokenizer loaded successfully using SentencePieceProcessor")
                  print(f"   Vocabulary size: {sp_processor.vocab_size()}")
@@ -257,80 +157,63 @@ class OpenLLMTrainer:
                  print(f"❌ Failed to load tokenizer: {e}")
                  return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"
 
+             return f"βœ… Successfully loaded OpenLLM {model_size} model with custom architecture"
 
          except Exception as e:
              return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"
 
      def prepare_dataset(self) -> str:
          """
+         Load and prepare the training dataset using OpenLLM's approach.
 
          Returns:
              Status message indicating success or failure
          """
          try:
+             if not DEPENDENCIES_AVAILABLE:
+                 return "❌ Required dependencies not available"
+
              print("πŸ”„ Loading training dataset...")
+
+             # Load dataset from HF Hub
              dataset = load_dataset("lemms/openllm-training-data")
             print(f"βœ… Dataset loaded: {len(dataset['train'])} samples")
 
+             # Create temporary data file for OpenLLM's TextDataLoader
+             temp_data_file = "temp_training_data.txt"
+             with open(temp_data_file, 'w', encoding='utf-8') as f:
+                 for item in dataset['train']:
+                     f.write(item['text'] + '\n')
+
+             print(f"βœ… Temporary data file created: {temp_data_file}")
 
+             # Create OpenLLM's TextDataLoader
+             try:
+                 # Get tokenizer path (recorded when the tokenizer was loaded)
+                 tokenizer_path = self.tokenizer.model_file_path
+
+                 self.data_loader = TextDataLoader(
+                     data_file=temp_data_file,
+                     tokenizer_path=tokenizer_path,
+                     seq_len=512,
+                     batch_size=4,  # Will be overridden by training config
+                     shuffle=True
+                 )
+
+                 print(f"βœ… OpenLLM TextDataLoader created successfully")
+
+             except Exception as e:
+                 print(f"❌ Failed to create TextDataLoader: {e}")
+                 return f"❌ Failed to create data loader: {str(e)}"
 
+             return f"βœ… Successfully prepared dataset with {len(dataset['train'])} samples"
 
          except Exception as e:
              return f"❌ Failed to prepare dataset: {str(e)}"
 
      def setup_training(self, config: TrainingConfig) -> str:
          """
+         Set up the training configuration using OpenLLM's approach.
 
          Args:
              config: Training configuration object
@@ -342,43 +225,51 @@ class OpenLLMTrainer:
              # Create output directory
              os.makedirs(config.output_dir, exist_ok=True)
 
+             # Set up optimizer (AdamW with weight decay)
+             decay_params = []
+             no_decay_params = []
+
+             for name, param in self.model.named_parameters():
+                 if not param.requires_grad:
+                     continue
+
+                 if len(param.shape) == 1 or name.endswith('.bias'):
+                     no_decay_params.append(param)
+                 else:
+                     decay_params.append(param)
+
+             param_groups = [
+                 {'params': decay_params, 'weight_decay': 0.01},
+                 {'params': no_decay_params, 'weight_decay': 0.0}
+             ]
+
+             self.optimizer = torch.optim.AdamW(
+                 param_groups,
+                 lr=config.learning_rate,
+                 betas=(0.9, 0.95),
+                 eps=1e-8
+             )
+
+             # Set up learning rate scheduler
+             warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
+                 self.optimizer,
+                 start_factor=0.01,
+                 end_factor=1.0,
+                 total_iters=config.warmup_steps
              )
 
+             main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+                 self.optimizer,
+                 T_max=config.max_steps - config.warmup_steps
              )
 
+             self.scheduler = torch.optim.lr_scheduler.SequentialLR(
+                 self.optimizer,
+                 schedulers=[warmup_scheduler, main_scheduler],
+                 milestones=[config.warmup_steps]
              )
 
+             print("βœ… Training setup completed successfully")
              return f"βœ… Training setup completed successfully"
 
          except Exception as e:
@@ -386,7 +277,7 @@ class OpenLLMTrainer:
 
      def train_model(self, config: TrainingConfig, progress_callback=None) -> str:
          """
+         Execute the actual model training using OpenLLM's approach.
 
          Args:
              config: Training configuration object
@@ -402,17 +293,69 @@ class OpenLLMTrainer:
 
              print(f"πŸš€ Starting OpenLLM training for {config.max_steps} steps...")
 
+             # Training loop using OpenLLM's approach
+             self.model.train()
+             accumulated_loss = 0.0
+             self.optimizer.zero_grad()
+
+             step = 0
+             for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
+                 if step >= config.max_steps:
+                     break
+
+                 # Forward pass (model computes loss internally when targets provided)
+                 logits, loss = self.model(input_ids, target_ids)
+
+                 # Scale loss for gradient accumulation
+                 loss = loss / config.gradient_accumulation_steps
+                 accumulated_loss += loss.item()
+
+                 # Backward pass
+                 loss.backward()
+
+                 # Update weights every gradient_accumulation_steps
+                 if (batch_idx + 1) % config.gradient_accumulation_steps == 0:
+                     # Clip gradients
+                     torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+
+                     # Update parameters
+                     self.optimizer.step()
+                     self.scheduler.step()
+                     self.optimizer.zero_grad()
+
+                     # Update step count
+                     step += 1
+
+                     # Update progress
+                     self.training_progress["current_step"] = step
+                     self.training_progress["loss"] = accumulated_loss
+                     self.training_progress["learning_rate"] = self.scheduler.get_last_lr()[0]
+
+                     # Log progress
+                     if step % config.logging_steps == 0:
+                         print(f"Step {step}/{config.max_steps} | Loss: {accumulated_loss:.4f} | LR: {self.scheduler.get_last_lr()[0]:.2e}")
+
+                     # Save checkpoint
+                     if step % config.save_steps == 0:
+                         self._save_checkpoint(config.output_dir, step)
+
+                     # Reset accumulated loss
+                     accumulated_loss = 0.0
+
+                     # Clean up memory
+                     if step % 100 == 0:
+                         gc.collect()
+
+             # Final checkpoint
+             self._save_checkpoint(config.output_dir, step, is_best=True)
 
              # Update final progress
              self.training_progress["status"] = "Completed"
+             self.training_progress["current_step"] = step
 
+             print(f"βœ… Training completed! Final step: {step}")
 
+             return f"βœ… Training completed successfully! Final step: {step}"
 
          except Exception as e:
              self.training_progress["status"] = "Failed"
@@ -421,6 +364,32 @@ class OpenLLMTrainer:
          finally:
              self.is_training = False
 
+     def _save_checkpoint(self, output_dir: str, step: int, is_best: bool = False) -> None:
+         """Save model checkpoint using OpenLLM's approach."""
+         try:
+             checkpoint = {
+                 'step': step,
+                 'model_state_dict': self.model.state_dict(),
+                 'optimizer_state_dict': self.optimizer.state_dict(),
+                 'scheduler_state_dict': self.scheduler.state_dict(),
+                 'config': self.model.config.__dict__
+             }
+
+             # Save latest checkpoint
+             checkpoint_path = os.path.join(output_dir, f"checkpoint_step_{step}.pt")
+             torch.save(checkpoint, checkpoint_path)
+
+             # Save best checkpoint
+             if is_best:
+                 best_path = os.path.join(output_dir, "best_model.pt")
+                 torch.save(checkpoint, best_path)
+                 print(f"πŸ’Ύ Best model saved: {best_path}")
+
+             print(f"πŸ’Ύ Checkpoint saved: {checkpoint_path}")
+
+         except Exception as e:
+             print(f"❌ Failed to save checkpoint: {e}")
+
      def save_and_upload_model(self, config: TrainingConfig) -> str:
          """
          Save the trained model and upload it to Hugging Face Hub.
@@ -434,25 +403,16 @@ class OpenLLMTrainer:
          try:
              print("πŸ”„ Saving trained model...")
 
+             # Save the final model
+             self._save_checkpoint(config.output_dir, config.max_steps, is_best=True)
 
              # Save tokenizer files
+             tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
+             os.makedirs(tokenizer_dir, exist_ok=True)
+
+             # Copy the tokenizer.model file
+             import shutil
+             shutil.copy2(self.tokenizer.model_file_path, os.path.join(tokenizer_dir, "tokenizer.model"))
 
              print("βœ… Model saved locally")
 
@@ -495,19 +455,20 @@ def main():
 
      # Create the main Gradio application interface
      with gr.Blocks(
+         title="OpenLLM Training Space - OpenLLM Compatible",
          theme=gr.themes.Soft()
      ) as demo:
 
          # Application Header
+         gr.Markdown("# πŸš€ OpenLLM Training Space - OpenLLM Compatible")
+         gr.Markdown("### *Uses OpenLLM's Custom Model Architecture*")
          gr.Markdown("---")
 
          # Status Information
+         gr.Markdown(f"**OpenLLM Available**: {'βœ… Yes' if OPENLLM_AVAILABLE else '❌ No'}")
+         gr.Markdown(f"**SentencePiece Available**: {'βœ… Yes' if SENTENCEPIECE_AVAILABLE else '❌ No'}")
+         gr.Markdown(f"**Dependencies Available**: {'βœ… Yes' if DEPENDENCIES_AVAILABLE else '❌ No'}")
+         gr.Markdown("**Architecture**: βœ… OpenLLM Custom GPTModel (Not Hugging Face)")
 
          # Main Content Area
          with gr.Row():
@@ -556,39 +517,39 @@ def main():
 
                  # Training Status Display
                  status_text = gr.Textbox(
+                     value="Ready to start training" if OPENLLM_AVAILABLE else "OpenLLM not available",
                      label="Current Status",
                      interactive=False,
                      lines=5
                  )
 
+                 # Progress Information
                  progress_info = gr.JSON(
                      value=trainer.get_training_progress(),
                      label="Training Progress"
                  )
 
+                 # Training Control Buttons
                  with gr.Row():
                      start_btn = gr.Button("πŸš€ Start Training", variant="primary")
                      stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
 
          # Instructions Section
+         gr.Markdown("## πŸ“‹ OpenLLM Compatible Training Instructions")
          gr.Markdown("""
+         This interface uses **OpenLLM's actual custom model architecture**:
 
          ### **Step 1: Configure Parameters**
+         - **Model Size**: Select the base model to train from (small, medium, large)
          - **Max Steps**: Number of training iterations (100-10,000)
          - **Learning Rate**: Training rate (0.00001-0.001)
          - **Batch Size**: Samples per training batch (1-16)
 
          ### **Step 2: Start Training**
          - Click "Start Training" to begin the actual training process
+         - Uses OpenLLM's custom GPTModel class (not Hugging Face Transformers)
+         - Uses sentencepiece.SentencePieceProcessor() for tokenization
+         - Compatible with OpenLLM's actual implementation
 
          ### **Step 3: Monitor Progress**
          - Watch the status updates and progress information
@@ -613,10 +574,10 @@ def main():
          # Training Function Definition
          def start_complete_training(model_size, max_steps, learning_rate, batch_size):
              """
+             Execute the complete training process using OpenLLM's approach.
              """
+             if not OPENLLM_AVAILABLE:
+                 return "❌ OpenLLM custom model architecture not available. Please check the installation."
 
              try:
                  # Create training configuration
@@ -627,7 +588,7 @@ def main():
                      batch_size=batch_size
                  )
 
+                 # Step 1: Load model and tokenizer using OpenLLM's approach
                  status = trainer.load_model_and_tokenizer(model_size)
                  if "❌" in status:
                      return status
@@ -672,8 +633,8 @@ def main():
          # Application Footer
          gr.Markdown("---")
          gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
+         gr.Markdown("**Architecture**: OpenLLM Custom GPTModel (Not Hugging Face Transformers)")
+         gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor()")
 
      return demo
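
For reference, the end-to-end flow that the "Start Training" button wires up, written as a hypothetical driver (method names follow the diff; the exact `TrainingConfig` fields and values here are illustrative assumptions):

```python
# Hypothetical usage of the classes defined in app.py; not part of the commit.
trainer = OpenLLMTrainer()
config = TrainingConfig(model_size="small", max_steps=1000,
                        learning_rate=3e-5, batch_size=4)

# Each stage returns a human-readable status string; "❌" marks failure.
for stage in (lambda: trainer.load_model_and_tokenizer(config.model_size),
              lambda: trainer.prepare_dataset(),
              lambda: trainer.setup_training(config)):
    status = stage()
    print(status)
    if "❌" in status:
        break
else:
    print(trainer.train_model(config))
```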