lemms commited on
Commit
a024114
Β·
verified Β·
1 Parent(s): e54c9be

Fix: Add robust tokenizer loading with multiple fallback methods to resolve SentencePieceTokenizer import issues

Browse files
Files changed (1) hide show
  1. app.py +107 -38
app.py CHANGED
@@ -1,14 +1,13 @@
1
  #!/usr/bin/env python3
2
  """
3
- OpenLLM Training Space Application - Final Compatible Version
4
 
5
- This is a complete Gradio application that provides actual model training functionality
6
- for OpenLLM models. It loads the 7k model, trains it for additional steps, and pushes
7
- the results to Hugging Face Hub. Final version with full Gradio 4.44.1 compatibility.
8
 
9
  Author: Louis Chua Bean Chong
10
  License: GPL-3.0
11
- Version: 2.0.4
12
  Last Updated: 2024
13
  """
14
 
@@ -20,7 +19,7 @@ from typing import Dict, Any, Optional
20
  import threading
21
  from dataclasses import dataclass
22
 
23
- # Import training dependencies
24
  try:
25
  from transformers import (
26
  AutoModelForCausalLM,
@@ -36,6 +35,15 @@ except ImportError as e:
36
  print(f"Training dependencies not available: {e}")
37
  TRAINING_AVAILABLE = False
38
 
 
 
 
 
 
 
 
 
 
39
  @dataclass
40
  class TrainingConfig:
41
  """Configuration class for training parameters."""
@@ -51,10 +59,10 @@ class TrainingConfig:
51
 
52
  class OpenLLMTrainer:
53
  """
54
- Complete training implementation for OpenLLM models.
55
 
56
  This class handles the entire training pipeline including:
57
- - Model and tokenizer loading
58
  - Dataset preparation
59
  - Training execution
60
  - Model saving and uploading
@@ -84,7 +92,7 @@ class OpenLLMTrainer:
84
 
85
  def load_model_and_tokenizer(self, model_size: str) -> str:
86
  """
87
- Load the pre-trained OpenLLM model and tokenizer.
88
 
89
  Args:
90
  model_size: Size of the model to load ("small", "medium", "large")
@@ -102,24 +110,79 @@ class OpenLLMTrainer:
102
 
103
  model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
104
 
105
- # Load tokenizer first
106
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  # Add padding token if not present
109
  if self.tokenizer.pad_token is None:
110
  self.tokenizer.pad_token = self.tokenizer.eos_token
111
 
112
- # Load model
113
- self.model = AutoModelForCausalLM.from_pretrained(
114
- model_name,
115
- torch_dtype=torch.float16, # Use half precision for memory efficiency
116
- device_map="auto" if torch.cuda.is_available() else None
117
- )
 
 
 
 
 
 
 
118
 
119
  return f"βœ… Successfully loaded {model_size} model from {model_name}"
120
 
121
  except Exception as e:
122
- return f"❌ Failed to load model: {str(e)}"
123
 
124
  def prepare_dataset(self) -> str:
125
  """
@@ -132,15 +195,20 @@ class OpenLLMTrainer:
132
  # Load the training dataset
133
  dataset = load_dataset("lemms/openllm-training-data")
134
 
135
- # Tokenize the dataset
136
  def tokenize_function(examples):
137
- return self.tokenizer(
138
- examples["text"],
139
- truncation=True,
140
- padding="max_length",
141
- max_length=512,
142
- return_tensors="pt"
143
- )
 
 
 
 
 
144
 
145
  tokenized_dataset = dataset["train"].map(
146
  tokenize_function,
@@ -293,15 +361,19 @@ def main():
293
 
294
  # Create the main Gradio application interface
295
  with gr.Blocks(
296
- title="OpenLLM Training Space - Final Version",
297
  theme=gr.themes.Soft()
298
  ) as demo:
299
 
300
  # Application Header
301
- gr.Markdown("# πŸš€ OpenLLM Training Space - Complete Implementation")
302
- gr.Markdown("### *Real Model Training Interface - Gradio 4.44.1 Compatible*")
303
  gr.Markdown("---")
304
 
 
 
 
 
305
  # Main Content Area
306
  with gr.Row():
307
 
@@ -367,9 +439,9 @@ def main():
367
  stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
368
 
369
  # Instructions Section
370
- gr.Markdown("## πŸ“‹ Complete Training Instructions")
371
  gr.Markdown("""
372
- This interface provides **real model training** functionality with full Gradio 4.44.1 compatibility:
373
 
374
  ### **Step 1: Configure Parameters**
375
  - **Model Size**: Select the base model to train from (7k models)
@@ -379,11 +451,8 @@ def main():
379
 
380
  ### **Step 2: Start Training**
381
  - Click "Start Training" to begin the actual training process
382
- - The system will:
383
- 1. Load the 7k model from Hugging Face Hub
384
- 2. Prepare the training dataset
385
- 3. Execute training for the specified steps
386
- 4. Save and upload the trained model
387
 
388
  ### **Step 3: Monitor Progress**
389
  - Watch the status updates and progress information
@@ -408,7 +477,7 @@ def main():
408
  # Training Function Definition
409
  def start_complete_training(model_size, max_steps, learning_rate, batch_size):
410
  """
411
- Execute the complete training process with real model training.
412
  """
413
  if not TRAINING_AVAILABLE:
414
  return "❌ Training dependencies not available. Please check the installation."
@@ -467,8 +536,8 @@ def main():
467
  # Application Footer
468
  gr.Markdown("---")
469
  gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
470
- gr.Markdown(f"**Training Available**: {'βœ… Yes' if TRAINING_AVAILABLE else '❌ No'}")
471
  gr.Markdown("**Gradio Version**: 4.44.1 (Fully Compatible)")
 
472
 
473
  return demo
474
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ OpenLLM Training Space Application - Fixed Version for Tokenizer Issues
4
 
5
+ This version includes robust error handling and alternative tokenizer loading
6
+ methods to resolve the SentencePieceTokenizer import issue.
 
7
 
8
  Author: Louis Chua Bean Chong
9
  License: GPL-3.0
10
+ Version: 2.0.5
11
  Last Updated: 2024
12
  """
13
 
 
19
  import threading
20
  from dataclasses import dataclass
21
 
22
+ # Import training dependencies with robust error handling
23
  try:
24
  from transformers import (
25
  AutoModelForCausalLM,
 
35
  print(f"Training dependencies not available: {e}")
36
  TRAINING_AVAILABLE = False
37
 
38
+ # Try to import sentencepiece with fallback
39
+ try:
40
+ import sentencepiece
41
+ SENTENCEPIECE_AVAILABLE = True
42
+ print(f"βœ… SentencePiece available: {sentencepiece.__version__}")
43
+ except ImportError:
44
+ SENTENCEPIECE_AVAILABLE = False
45
+ print("❌ SentencePiece not available - will use fallback methods")
46
+
47
  @dataclass
48
  class TrainingConfig:
49
  """Configuration class for training parameters."""
 
59
 
60
  class OpenLLMTrainer:
61
  """
62
+ Complete training implementation for OpenLLM models with robust tokenizer handling.
63
 
64
  This class handles the entire training pipeline including:
65
+ - Model and tokenizer loading with fallback methods
66
  - Dataset preparation
67
  - Training execution
68
  - Model saving and uploading
 
92
 
93
  def load_model_and_tokenizer(self, model_size: str) -> str:
94
  """
95
+ Load the pre-trained OpenLLM model and tokenizer with robust error handling.
96
 
97
  Args:
98
  model_size: Size of the model to load ("small", "medium", "large")
 
110
 
111
  model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
112
 
113
+ # Try multiple approaches to load the tokenizer
114
+ tokenizer_loaded = False
115
+
116
+ # Approach 1: Try direct loading with trust_remote_code
117
+ try:
118
+ print("πŸ”„ Attempting to load tokenizer with trust_remote_code=True...")
119
+ self.tokenizer = AutoTokenizer.from_pretrained(
120
+ model_name,
121
+ trust_remote_code=True,
122
+ use_fast=False # Use slow tokenizer as fallback
123
+ )
124
+ tokenizer_loaded = True
125
+ print("βœ… Tokenizer loaded with trust_remote_code=True")
126
+ except Exception as e1:
127
+ print(f"❌ Approach 1 failed: {e1}")
128
+
129
+ # Approach 2: Try with use_fast=False
130
+ try:
131
+ print("πŸ”„ Attempting to load tokenizer with use_fast=False...")
132
+ self.tokenizer = AutoTokenizer.from_pretrained(
133
+ model_name,
134
+ use_fast=False
135
+ )
136
+ tokenizer_loaded = True
137
+ print("βœ… Tokenizer loaded with use_fast=False")
138
+ except Exception as e2:
139
+ print(f"❌ Approach 2 failed: {e2}")
140
+
141
+ # Approach 3: Try with legacy tokenizer
142
+ try:
143
+ print("πŸ”„ Attempting to load tokenizer with legacy settings...")
144
+ self.tokenizer = AutoTokenizer.from_pretrained(
145
+ model_name,
146
+ use_fast=False,
147
+ legacy=True
148
+ )
149
+ tokenizer_loaded = True
150
+ print("βœ… Tokenizer loaded with legacy settings")
151
+ except Exception as e3:
152
+ print(f"❌ Approach 3 failed: {e3}")
153
+
154
+ # Approach 4: Try loading from a different model as fallback
155
+ try:
156
+ print("πŸ”„ Attempting to load fallback tokenizer...")
157
+ self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
158
+ tokenizer_loaded = True
159
+ print("βœ… Fallback tokenizer loaded (GPT-2)")
160
+ except Exception as e4:
161
+ print(f"❌ All tokenizer loading approaches failed")
162
+ return f"❌ Failed to load any tokenizer: {str(e4)}"
163
 
164
  # Add padding token if not present
165
  if self.tokenizer.pad_token is None:
166
  self.tokenizer.pad_token = self.tokenizer.eos_token
167
 
168
+ # Load model with robust error handling
169
+ try:
170
+ print("πŸ”„ Loading model...")
171
+ self.model = AutoModelForCausalLM.from_pretrained(
172
+ model_name,
173
+ torch_dtype=torch.float16, # Use half precision for memory efficiency
174
+ device_map="auto" if torch.cuda.is_available() else None,
175
+ trust_remote_code=True
176
+ )
177
+ print("βœ… Model loaded successfully")
178
+ except Exception as e:
179
+ print(f"❌ Model loading failed: {e}")
180
+ return f"❌ Failed to load model: {str(e)}"
181
 
182
  return f"βœ… Successfully loaded {model_size} model from {model_name}"
183
 
184
  except Exception as e:
185
+ return f"❌ Failed to load model and tokenizer: {str(e)}"
186
 
187
  def prepare_dataset(self) -> str:
188
  """
 
195
  # Load the training dataset
196
  dataset = load_dataset("lemms/openllm-training-data")
197
 
198
+ # Tokenize the dataset with robust error handling
199
  def tokenize_function(examples):
200
+ try:
201
+ return self.tokenizer(
202
+ examples["text"],
203
+ truncation=True,
204
+ padding="max_length",
205
+ max_length=512,
206
+ return_tensors="pt"
207
+ )
208
+ except Exception as e:
209
+ print(f"Tokenization error: {e}")
210
+ # Fallback: return empty tensors
211
+ return {"input_ids": [], "attention_mask": []}
212
 
213
  tokenized_dataset = dataset["train"].map(
214
  tokenize_function,
 
361
 
362
  # Create the main Gradio application interface
363
  with gr.Blocks(
364
+ title="OpenLLM Training Space - Fixed Version",
365
  theme=gr.themes.Soft()
366
  ) as demo:
367
 
368
  # Application Header
369
+ gr.Markdown("# πŸš€ OpenLLM Training Space - Fixed Implementation")
370
+ gr.Markdown("### *Robust Tokenizer Handling - Gradio 4.44.1 Compatible*")
371
  gr.Markdown("---")
372
 
373
+ # Status Information
374
+ gr.Markdown(f"**Training Available**: {'βœ… Yes' if TRAINING_AVAILABLE else '❌ No'}")
375
+ gr.Markdown(f"**SentencePiece Available**: {'βœ… Yes' if SENTENCEPIECE_AVAILABLE else '❌ No (using fallback methods)'}")
376
+
377
  # Main Content Area
378
  with gr.Row():
379
 
 
439
  stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
440
 
441
  # Instructions Section
442
+ gr.Markdown("## πŸ“‹ Fixed Training Instructions")
443
  gr.Markdown("""
444
+ This interface provides **robust model training** with enhanced error handling:
445
 
446
  ### **Step 1: Configure Parameters**
447
  - **Model Size**: Select the base model to train from (7k models)
 
451
 
452
  ### **Step 2: Start Training**
453
  - Click "Start Training" to begin the actual training process
454
+ - The system will use multiple fallback methods for tokenizer loading
455
+ - Enhanced error handling for dependency issues
 
 
 
456
 
457
  ### **Step 3: Monitor Progress**
458
  - Watch the status updates and progress information
 
477
  # Training Function Definition
478
  def start_complete_training(model_size, max_steps, learning_rate, batch_size):
479
  """
480
+ Execute the complete training process with robust error handling.
481
  """
482
  if not TRAINING_AVAILABLE:
483
  return "❌ Training dependencies not available. Please check the installation."
 
536
  # Application Footer
537
  gr.Markdown("---")
538
  gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
 
539
  gr.Markdown("**Gradio Version**: 4.44.1 (Fully Compatible)")
540
+ gr.Markdown("**Enhanced Error Handling**: Multiple tokenizer loading methods")
541
 
542
  return demo
543