lemms committed on
Commit
4672bf6
·
verified ·
1 Parent(s): e02f3cd

Fix: Store tokenizer path separately to avoid SentencePieceProcessor attribute error

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -144,11 +144,13 @@ class OpenLLMTrainer:
144
  sp_processor = spm.SentencePieceProcessor()
145
  sp_processor.load(tokenizer_path)
146
 
147
- # Store tokenizer for later use
148
  self.tokenizer = sp_processor
 
149
 
150
  print(f"✅ Tokenizer loaded successfully using SentencePieceProcessor")
151
  print(f" Vocabulary size: {sp_processor.vocab_size()}")
 
152
 
153
  except Exception as e:
154
  print(f"❌ Failed to load tokenizer: {e}")
@@ -186,8 +188,8 @@ class OpenLLMTrainer:
186
 
187
  # Create OpenLLM's TextDataLoader
188
  try:
189
- # Get tokenizer path
190
- tokenizer_path = self.tokenizer.model_file_path
191
 
192
  self.data_loader = TextDataLoader(
193
  data_file=temp_data_file,
@@ -407,9 +409,9 @@ class OpenLLMTrainer:
407
  tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
408
  os.makedirs(tokenizer_dir, exist_ok=True)
409
 
410
- # Copy the tokenizer.model file
411
  import shutil
412
- shutil.copy2(self.tokenizer.model_file_path, os.path.join(tokenizer_dir, "tokenizer.model"))
413
 
414
  print("✅ Model saved locally")
415
 
 
144
  sp_processor = spm.SentencePieceProcessor()
145
  sp_processor.load(tokenizer_path)
146
 
147
+ # Store tokenizer and its path separately
148
  self.tokenizer = sp_processor
149
+ self.tokenizer_path = tokenizer_path # Store the path separately
150
 
151
  print(f"✅ Tokenizer loaded successfully using SentencePieceProcessor")
152
  print(f" Vocabulary size: {sp_processor.vocab_size()}")
153
+ print(f" Tokenizer path: {tokenizer_path}")
154
 
155
  except Exception as e:
156
  print(f"❌ Failed to load tokenizer: {e}")
 
188
 
189
  # Create OpenLLM's TextDataLoader
190
  try:
191
+ # Use the stored tokenizer path instead of trying to access model_file_path
192
+ tokenizer_path = self.tokenizer_path # Use the stored path
193
 
194
  self.data_loader = TextDataLoader(
195
  data_file=temp_data_file,
 
409
  tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
410
  os.makedirs(tokenizer_dir, exist_ok=True)
411
 
412
+ # Copy the tokenizer.model file using the stored path
413
  import shutil
414
+ shutil.copy2(self.tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))
415
 
416
  print("✅ Model saved locally")
417