lemms committed on
Commit
d6ce9fb
·
verified ·
1 Parent(s): 4744625

Fix: Use sentencepiece.SentencePieceProcessor() like local training code instead of AutoTokenizer

Browse files
Files changed (1) hide show
  1. app.py +171 -58
app.py CHANGED
@@ -1,13 +1,15 @@
1
  #!/usr/bin/env python3
2
  """
3
- OpenLLM Training Space Application - Fixed Version for Tokenizer Issues
4
 
5
- This version includes robust error handling and alternative tokenizer loading
6
- methods to resolve the SentencePieceTokenizer import issue.
 
 
7
 
8
  Author: Louis Chua Bean Chong
9
  License: GPL-3.0
10
- Version: 2.0.5
11
  Last Updated: 2024
12
  """
13
 
@@ -23,7 +25,6 @@ from dataclasses import dataclass
23
  try:
24
  from transformers import (
25
  AutoModelForCausalLM,
26
- AutoTokenizer,
27
  TrainingArguments,
28
  Trainer,
29
  DataCollatorForLanguageModeling
@@ -37,9 +38,9 @@ except ImportError as e:
37
 
38
  # Try to import sentencepiece with fallback
39
  try:
40
- import sentencepiece
41
  SENTENCEPIECE_AVAILABLE = True
42
- print(f"✅ SentencePiece available: {sentencepiece.__version__}")
43
  except ImportError:
44
  SENTENCEPIECE_AVAILABLE = False
45
  print("❌ SentencePiece not available - will use fallback methods")
@@ -59,10 +60,11 @@ class TrainingConfig:
59
 
60
  class OpenLLMTrainer:
61
  """
62
- Complete training implementation for OpenLLM models with robust tokenizer handling.
63
 
64
  This class handles the entire training pipeline including:
65
- - Model and tokenizer loading with fallback methods
 
66
  - Dataset preparation
67
  - Training execution
68
  - Model saving and uploading
@@ -92,7 +94,7 @@ class OpenLLMTrainer:
92
 
93
  def load_model_and_tokenizer(self, model_size: str) -> str:
94
  """
95
- Load the pre-trained OpenLLM model and tokenizer with robust error handling.
96
 
97
  Args:
98
  model_size: Size of the model to load ("small", "medium", "large")
@@ -110,47 +112,100 @@ class OpenLLMTrainer:
110
 
111
  model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
112
 
113
- # Load OpenLLM custom tokenizer with trust_remote_code
 
 
 
114
  try:
115
- print("🔄 Loading OpenLLM custom tokenizer...")
116
- self.tokenizer = AutoTokenizer.from_pretrained(
117
  model_name,
118
- trust_remote_code=True, # CRITICAL for OpenLLM custom tokenizer classes
119
- use_fast=False # Use slow tokenizer for compatibility
 
120
  )
121
- print(f"✅ OpenLLM custom tokenizer loaded: {type(self.tokenizer).__name__}")
122
 
123
- # Add padding token if not present
124
- if self.tokenizer.pad_token is None:
125
- self.tokenizer.pad_token = self.tokenizer.eos_token
126
- print("✅ Added padding token")
127
-
128
  except Exception as e:
129
- print(f"❌ Failed to load OpenLLM custom tokenizer: {e}")
130
- return f"❌ Failed to load OpenLLM custom tokenizer: {str(e)}"
131
 
132
- # Add padding token if not present
133
- if self.tokenizer.pad_token is None:
134
- self.tokenizer.pad_token = self.tokenizer.eos_token
135
-
136
- # Load model with robust error handling
137
  try:
138
- print("🔄 Loading model...")
139
- self.model = AutoModelForCausalLM.from_pretrained(
140
- model_name,
141
- torch_dtype=torch.float16, # Use half precision for memory efficiency
142
- device_map="auto" if torch.cuda.is_available() else None,
143
- trust_remote_code=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  )
145
- print("✅ Model loaded successfully")
 
 
 
 
 
 
 
 
 
 
 
 
146
  except Exception as e:
147
- print(f"❌ Model loading failed: {e}")
148
- return f"❌ Failed to load model: {str(e)}"
149
 
150
- return f"✅ Successfully loaded {model_size} model from {model_name}"
151
 
152
  except Exception as e:
153
- return f"❌ Failed to load model and tokenizer: {str(e)}"
154
 
155
  def prepare_dataset(self) -> str:
156
  """
@@ -161,23 +216,49 @@ class OpenLLMTrainer:
161
  """
162
  try:
163
  # Load the training dataset
 
164
  dataset = load_dataset("lemms/openllm-training-data")
 
165
 
166
- # Tokenize the dataset with robust error handling
167
  def tokenize_function(examples):
168
  try:
169
- return self.tokenizer(
170
- examples["text"],
171
- truncation=True,
172
- padding="max_length",
173
- max_length=512,
174
- return_tensors="pt"
175
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  except Exception as e:
177
  print(f"Tokenization error: {e}")
178
  # Fallback: return empty tensors
179
  return {"input_ids": [], "attention_mask": []}
180
 
 
181
  tokenized_dataset = dataset["train"].map(
182
  tokenize_function,
183
  batched=True,
@@ -185,6 +266,7 @@ class OpenLLMTrainer:
185
  )
186
 
187
  self.dataset = tokenized_dataset
 
188
 
189
  return f"✅ Successfully prepared dataset with {len(tokenized_dataset)} samples"
190
 
@@ -263,6 +345,8 @@ class OpenLLMTrainer:
263
  self.training_progress["status"] = "Training"
264
  self.training_progress["total_steps"] = config.max_steps
265
 
 
 
266
  # Start training
267
  train_result = self.trainer.train()
268
 
@@ -271,10 +355,13 @@ class OpenLLMTrainer:
271
  self.training_progress["current_step"] = config.max_steps
272
  self.training_progress["loss"] = train_result.training_loss
273
 
 
 
274
  return f"✅ Training completed successfully! Final loss: {train_result.training_loss:.4f}"
275
 
276
  except Exception as e:
277
  self.training_progress["status"] = "Failed"
 
278
  return f"❌ Training failed: {str(e)}"
279
  finally:
280
  self.is_training = False
@@ -290,9 +377,29 @@ class OpenLLMTrainer:
290
  Status message indicating success or failure
291
  """
292
  try:
 
 
293
  # Save the model locally
294
  self.trainer.save_model()
295
- self.tokenizer.save_pretrained(config.output_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
  # Generate model name for upload
298
  model_name = f"openllm-{config.model_size}-extended-8k"
@@ -300,6 +407,8 @@ class OpenLLMTrainer:
300
 
301
  # Upload to Hugging Face Hub
302
  if self.hf_api:
 
 
303
  # Upload model files
304
  self.hf_api.upload_folder(
305
  folder_path=config.output_dir,
@@ -308,11 +417,13 @@ class OpenLLMTrainer:
308
  commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
309
  )
310
 
 
311
  return f"✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
312
  else:
313
  return f"✅ Model saved locally to {config.output_dir}"
314
 
315
  except Exception as e:
 
316
  return f"❌ Failed to save/upload model: {str(e)}"
317
 
318
  def get_training_progress(self) -> Dict[str, Any]:
@@ -329,18 +440,19 @@ def main():
329
 
330
  # Create the main Gradio application interface
331
  with gr.Blocks(
332
- title="OpenLLM Training Space - Fixed Version",
333
  theme=gr.themes.Soft()
334
  ) as demo:
335
 
336
  # Application Header
337
- gr.Markdown("# 🚀 OpenLLM Training Space - Fixed Implementation")
338
- gr.Markdown("### *Robust Tokenizer Handling - Gradio 4.44.1 Compatible*")
339
  gr.Markdown("---")
340
 
341
  # Status Information
342
  gr.Markdown(f"**Training Available**: {'✅ Yes' if TRAINING_AVAILABLE else '❌ No'}")
343
  gr.Markdown(f"**SentencePiece Available**: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No (using fallback methods)'}")
 
344
 
345
  # Main Content Area
346
  with gr.Row():
@@ -407,9 +519,9 @@ def main():
407
  stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
408
 
409
  # Instructions Section
410
- gr.Markdown("## 📋 Fixed Training Instructions")
411
  gr.Markdown("""
412
- This interface provides **robust model training** with enhanced error handling:
413
 
414
  ### **Step 1: Configure Parameters**
415
  - **Model Size**: Select the base model to train from (7k models)
@@ -419,8 +531,9 @@ def main():
419
 
420
  ### **Step 2: Start Training**
421
  - Click "Start Training" to begin the actual training process
422
- - The system will use multiple fallback methods for tokenizer loading
423
- - Enhanced error handling for dependency issues
 
424
 
425
  ### **Step 3: Monitor Progress**
426
  - Watch the status updates and progress information
@@ -445,7 +558,7 @@ def main():
445
  # Training Function Definition
446
  def start_complete_training(model_size, max_steps, learning_rate, batch_size):
447
  """
448
- Execute the complete training process with robust error handling.
449
  """
450
  if not TRAINING_AVAILABLE:
451
  return "❌ Training dependencies not available. Please check the installation."
@@ -459,7 +572,7 @@ def main():
459
  batch_size=batch_size
460
  )
461
 
462
- # Step 1: Load model and tokenizer
463
  status = trainer.load_model_and_tokenizer(model_size)
464
  if "❌" in status:
465
  return status
@@ -505,7 +618,7 @@ def main():
505
  gr.Markdown("---")
506
  gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
507
  gr.Markdown("**Gradio Version**: 4.44.1 (Fully Compatible)")
508
- gr.Markdown("**Enhanced Error Handling**: Multiple tokenizer loading methods")
509
 
510
  return demo
511
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ OpenLLM Training Space Application - Local Training Code Compatible
4
 
5
+ This version uses the same tokenizer loading approach as the local OpenLLM training code:
6
+ - Uses sentencepiece.SentencePieceProcessor() directly
7
+ - Loads tokenizer from tokenizer.model file
8
+ - Compatible with OpenLLM's actual implementation
9
 
10
  Author: Louis Chua Bean Chong
11
  License: GPL-3.0
12
+ Version: 2.0.7
13
  Last Updated: 2024
14
  """
15
 
 
25
  try:
26
  from transformers import (
27
  AutoModelForCausalLM,
 
28
  TrainingArguments,
29
  Trainer,
30
  DataCollatorForLanguageModeling
 
38
 
39
  # Try to import sentencepiece with fallback
40
  try:
41
+ import sentencepiece as spm
42
  SENTENCEPIECE_AVAILABLE = True
43
+ print(f"✅ SentencePiece available: {spm.__version__}")
44
  except ImportError:
45
  SENTENCEPIECE_AVAILABLE = False
46
  print("❌ SentencePiece not available - will use fallback methods")
 
60
 
61
  class OpenLLMTrainer:
62
  """
63
+ Complete training implementation for OpenLLM models using local training approach.
64
 
65
  This class handles the entire training pipeline including:
66
+ - Model loading with trust_remote_code for custom model classes
67
+ - Tokenizer loading using sentencepiece.SentencePieceProcessor() (same as local code)
68
  - Dataset preparation
69
  - Training execution
70
  - Model saving and uploading
 
94
 
95
  def load_model_and_tokenizer(self, model_size: str) -> str:
96
  """
97
+ Load the pre-trained OpenLLM model and tokenizer using local training approach.
98
 
99
  Args:
100
  model_size: Size of the model to load ("small", "medium", "large")
 
112
 
113
  model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
114
 
115
+ print(f"🔄 Loading OpenLLM model: {model_name}")
116
+ print("πŸ“ Using local training approach: sentencepiece.SentencePieceProcessor()")
117
+
118
+ # Load model with trust_remote_code for custom model classes
119
  try:
120
+ print("🔄 Loading OpenLLM model...")
121
+ self.model = AutoModelForCausalLM.from_pretrained(
122
  model_name,
123
+ torch_dtype=torch.float16, # Use half precision for memory efficiency
124
+ device_map="auto" if torch.cuda.is_available() else None,
125
+ trust_remote_code=True # CRITICAL for custom model classes
126
  )
127
+ print(f"✅ OpenLLM model loaded successfully: {type(self.model).__name__}")
128
 
 
 
 
 
 
129
  except Exception as e:
130
+ print(f"❌ Failed to load model: {e}")
131
+ return f"❌ Failed to load OpenLLM model: {str(e)}"
132
 
133
+ # Load tokenizer using the same approach as local training code
 
 
 
 
134
  try:
135
+ print("🔄 Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
136
+
137
+ # Create a custom tokenizer class that wraps SentencePieceProcessor
138
+ # This is needed for Hugging Face Trainer compatibility
139
+ class OpenLLMTokenizer:
140
+ def __init__(self, sp_processor):
141
+ self.sp_processor = sp_processor
142
+ self.pad_token = "<pad>"
143
+ self.eos_token = "</s>"
144
+ self.bos_token = "<s>"
145
+ self.unk_token = "<unk>"
146
+
147
+ def __call__(self, texts, **kwargs):
148
+ """Tokenize texts using SentencePieceProcessor."""
149
+ if isinstance(texts, str):
150
+ texts = [texts]
151
+
152
+ results = []
153
+ for text in texts:
154
+ # Encode text to token IDs
155
+ token_ids = self.sp_processor.encode(text)
156
+
157
+ # Create attention mask (all tokens are attended to)
158
+ attention_mask = [1] * len(token_ids)
159
+
160
+ results.append({
161
+ 'input_ids': token_ids,
162
+ 'attention_mask': attention_mask
163
+ })
164
+
165
+ return results
166
+
167
+ def encode(self, text, **kwargs):
168
+ """Encode text to token IDs."""
169
+ return self.sp_processor.encode(text)
170
+
171
+ def decode(self, token_ids, **kwargs):
172
+ """Decode token IDs to text."""
173
+ return self.sp_processor.decode(token_ids)
174
+
175
+ def save_pretrained(self, path):
176
+ """Save tokenizer files."""
177
+ # The SentencePieceProcessor is already saved as tokenizer.model
178
+ pass
179
+
180
+ # Download and load the tokenizer.model file
181
+ from huggingface_hub import hf_hub_download
182
+
183
+ print("🔄 Downloading tokenizer.model from HF Hub...")
184
+ tokenizer_path = hf_hub_download(
185
+ repo_id=model_name,
186
+ filename="tokenizer.model"
187
  )
188
+
189
+ print(f"✅ Tokenizer downloaded to: {tokenizer_path}")
190
+
191
+ # Load using SentencePieceProcessor (same as local code)
192
+ sp_processor = spm.SentencePieceProcessor()
193
+ sp_processor.load(tokenizer_path)
194
+
195
+ # Wrap in our custom tokenizer class for HF Trainer compatibility
196
+ self.tokenizer = OpenLLMTokenizer(sp_processor)
197
+
198
+ print(f"✅ Tokenizer loaded successfully using SentencePieceProcessor")
199
+ print(f" Vocabulary size: {sp_processor.vocab_size()}")
200
+
201
  except Exception as e:
202
+ print(f"❌ Failed to load tokenizer: {e}")
203
+ return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"
204
 
205
+ return f"✅ Successfully loaded OpenLLM {model_size} model from {model_name}"
206
 
207
  except Exception as e:
208
+ return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"
209
 
210
  def prepare_dataset(self) -> str:
211
  """
 
216
  """
217
  try:
218
  # Load the training dataset
219
+ print("🔄 Loading training dataset...")
220
  dataset = load_dataset("lemms/openllm-training-data")
221
+ print(f"✅ Dataset loaded: {len(dataset['train'])} samples")
222
 
223
+ # Tokenize the dataset using our custom tokenizer
224
  def tokenize_function(examples):
225
  try:
226
+ # Use our custom tokenizer
227
+ tokenized = self.tokenizer(examples["text"])
228
+
229
+ # Extract input_ids and attention_mask
230
+ input_ids = [item['input_ids'] for item in tokenized]
231
+ attention_mask = [item['attention_mask'] for item in tokenized]
232
+
233
+ # Pad sequences to max_length
234
+ max_length = 512
235
+ padded_input_ids = []
236
+ padded_attention_mask = []
237
+
238
+ for ids, mask in zip(input_ids, attention_mask):
239
+ if len(ids) > max_length:
240
+ ids = ids[:max_length]
241
+ mask = mask[:max_length]
242
+ else:
243
+ # Pad with pad_token_id
244
+ pad_length = max_length - len(ids)
245
+ ids = ids + [0] * pad_length # 0 is pad_token_id
246
+ mask = mask + [0] * pad_length
247
+
248
+ padded_input_ids.append(ids)
249
+ padded_attention_mask.append(mask)
250
+
251
+ return {
252
+ "input_ids": padded_input_ids,
253
+ "attention_mask": padded_attention_mask
254
+ }
255
+
256
  except Exception as e:
257
  print(f"Tokenization error: {e}")
258
  # Fallback: return empty tensors
259
  return {"input_ids": [], "attention_mask": []}
260
 
261
+ print("🔄 Tokenizing dataset...")
262
  tokenized_dataset = dataset["train"].map(
263
  tokenize_function,
264
  batched=True,
 
266
  )
267
 
268
  self.dataset = tokenized_dataset
269
+ print(f"✅ Dataset tokenized successfully: {len(tokenized_dataset)} samples")
270
 
271
  return f"✅ Successfully prepared dataset with {len(tokenized_dataset)} samples"
272
 
 
345
  self.training_progress["status"] = "Training"
346
  self.training_progress["total_steps"] = config.max_steps
347
 
348
+ print(f"🚀 Starting OpenLLM training for {config.max_steps} steps...")
349
+
350
  # Start training
351
  train_result = self.trainer.train()
352
 
 
355
  self.training_progress["current_step"] = config.max_steps
356
  self.training_progress["loss"] = train_result.training_loss
357
 
358
+ print(f"✅ Training completed! Final loss: {train_result.training_loss:.4f}")
359
+
360
  return f"✅ Training completed successfully! Final loss: {train_result.training_loss:.4f}"
361
 
362
  except Exception as e:
363
  self.training_progress["status"] = "Failed"
364
+ print(f"❌ Training failed: {e}")
365
  return f"❌ Training failed: {str(e)}"
366
  finally:
367
  self.is_training = False
 
377
  Status message indicating success or failure
378
  """
379
  try:
380
+ print("🔄 Saving trained model...")
381
+
382
  # Save the model locally
383
  self.trainer.save_model()
384
+
385
+ # Save tokenizer files
386
+ if hasattr(self.tokenizer, 'sp_processor'):
387
+ # Save the SentencePieceProcessor files
388
+ tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
389
+ os.makedirs(tokenizer_dir, exist_ok=True)
390
+
391
+ # Copy the original tokenizer.model file
392
+ import shutil
393
+ from huggingface_hub import hf_hub_download
394
+
395
+ model_name = f"lemms/openllm-{config.model_size}-extended-7k"
396
+ tokenizer_path = hf_hub_download(
397
+ repo_id=model_name,
398
+ filename="tokenizer.model"
399
+ )
400
+ shutil.copy2(tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))
401
+
402
+ print("✅ Model saved locally")
403
 
404
  # Generate model name for upload
405
  model_name = f"openllm-{config.model_size}-extended-8k"
 
407
 
408
  # Upload to Hugging Face Hub
409
  if self.hf_api:
410
+ print(f"🔄 Uploading model to {repo_id}...")
411
+
412
  # Upload model files
413
  self.hf_api.upload_folder(
414
  folder_path=config.output_dir,
 
417
  commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
418
  )
419
 
420
+ print(f"✅ Model uploaded successfully to {repo_id}")
421
  return f"✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
422
  else:
423
  return f"✅ Model saved locally to {config.output_dir}"
424
 
425
  except Exception as e:
426
+ print(f"❌ Failed to save/upload model: {e}")
427
  return f"❌ Failed to save/upload model: {str(e)}"
428
 
429
  def get_training_progress(self) -> Dict[str, Any]:
 
440
 
441
  # Create the main Gradio application interface
442
  with gr.Blocks(
443
+ title="OpenLLM Training Space - Local Code Compatible",
444
  theme=gr.themes.Soft()
445
  ) as demo:
446
 
447
  # Application Header
448
+ gr.Markdown("# 🚀 OpenLLM Training Space - Local Code Compatible")
449
+ gr.Markdown("### *Uses sentencepiece.SentencePieceProcessor() Like Local Training*")
450
  gr.Markdown("---")
451
 
452
  # Status Information
453
  gr.Markdown(f"**Training Available**: {'✅ Yes' if TRAINING_AVAILABLE else '❌ No'}")
454
  gr.Markdown(f"**SentencePiece Available**: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No (using fallback methods)'}")
455
+ gr.Markdown("**Tokenizer Approach**: ✅ sentencepiece.SentencePieceProcessor() (Local Code Compatible)")
456
 
457
  # Main Content Area
458
  with gr.Row():
 
519
  stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
520
 
521
  # Instructions Section
522
+ gr.Markdown("## 📋 Local Code Compatible Training Instructions")
523
  gr.Markdown("""
524
+ This interface uses the **same tokenizer approach as local OpenLLM training**:
525
 
526
  ### **Step 1: Configure Parameters**
527
  - **Model Size**: Select the base model to train from (7k models)
 
531
 
532
  ### **Step 2: Start Training**
533
  - Click "Start Training" to begin the actual training process
534
+ - Uses `sentencepiece.SentencePieceProcessor()` directly (like local code)
535
+ - Downloads tokenizer.model from HF Hub and loads with SentencePieceProcessor
536
+ - Compatible with OpenLLM's actual implementation
537
 
538
  ### **Step 3: Monitor Progress**
539
  - Watch the status updates and progress information
 
558
  # Training Function Definition
559
  def start_complete_training(model_size, max_steps, learning_rate, batch_size):
560
  """
561
+ Execute the complete training process with local code compatible approach.
562
  """
563
  if not TRAINING_AVAILABLE:
564
  return "❌ Training dependencies not available. Please check the installation."
 
572
  batch_size=batch_size
573
  )
574
 
575
+ # Step 1: Load model and tokenizer using local approach
576
  status = trainer.load_model_and_tokenizer(model_size)
577
  if "❌" in status:
578
  return status
 
618
  gr.Markdown("---")
619
  gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
620
  gr.Markdown("**Gradio Version**: 4.44.1 (Fully Compatible)")
621
+ gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor() (Local Code Compatible)")
622
 
623
  return demo
624