Fix: Store tokenizer path separately to avoid SentencePieceProcessor attribute error
Browse files
app.py
CHANGED
|
@@ -144,11 +144,13 @@ class OpenLLMTrainer:
|
|
| 144 |
sp_processor = spm.SentencePieceProcessor()
|
| 145 |
sp_processor.load(tokenizer_path)
|
| 146 |
|
| 147 |
-
# Store tokenizer
|
| 148 |
self.tokenizer = sp_processor
|
|
|
|
| 149 |
|
| 150 |
print(f"✅ Tokenizer loaded successfully using SentencePieceProcessor")
|
| 151 |
print(f" Vocabulary size: {sp_processor.vocab_size()}")
|
|
|
|
| 152 |
|
| 153 |
except Exception as e:
|
| 154 |
print(f"❌ Failed to load tokenizer: {e}")
|
|
@@ -186,8 +188,8 @@ class OpenLLMTrainer:
|
|
| 186 |
|
| 187 |
# Create OpenLLM's TextDataLoader
|
| 188 |
try:
|
| 189 |
-
#
|
| 190 |
-
tokenizer_path = self.tokenizer.model_file_path  # [truncated in capture; reconstructed — the replaced comment below confirms the old code accessed `model_file_path`, an attribute SentencePieceProcessor does not have]
|
| 191 |
|
| 192 |
self.data_loader = TextDataLoader(
|
| 193 |
data_file=temp_data_file,
|
|
@@ -407,9 +409,9 @@ class OpenLLMTrainer:
|
|
| 407 |
tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
|
| 408 |
os.makedirs(tokenizer_dir, exist_ok=True)
|
| 409 |
|
| 410 |
-
# Copy the tokenizer.model file
|
| 411 |
import shutil
|
| 412 |
-
shutil.copy2(self.tokenizer.model_file_path, os.path.join(tokenizer_dir, "tokenizer.model"))  # [truncated in capture; reconstructed from the replacement line — old code read the path off the SentencePieceProcessor object, which fails]
|
| 413 |
|
| 414 |
print("✅ Model saved locally")
|
| 415 |
|
|
|
|
| 144 |
sp_processor = spm.SentencePieceProcessor()
|
| 145 |
sp_processor.load(tokenizer_path)
|
| 146 |
|
| 147 |
+
# Store tokenizer and its path separately
|
| 148 |
self.tokenizer = sp_processor
|
| 149 |
+
self.tokenizer_path = tokenizer_path # Store the path separately
|
| 150 |
|
| 151 |
print(f"✅ Tokenizer loaded successfully using SentencePieceProcessor")
|
| 152 |
print(f" Vocabulary size: {sp_processor.vocab_size()}")
|
| 153 |
+
print(f" Tokenizer path: {tokenizer_path}")
|
| 154 |
|
| 155 |
except Exception as e:
|
| 156 |
print(f"❌ Failed to load tokenizer: {e}")
|
|
|
|
| 188 |
|
| 189 |
# Create OpenLLM's TextDataLoader
|
| 190 |
try:
|
| 191 |
+
# Use the stored tokenizer path instead of trying to access model_file_path
|
| 192 |
+
tokenizer_path = self.tokenizer_path # Use the stored path
|
| 193 |
|
| 194 |
self.data_loader = TextDataLoader(
|
| 195 |
data_file=temp_data_file,
|
|
|
|
| 409 |
tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
|
| 410 |
os.makedirs(tokenizer_dir, exist_ok=True)
|
| 411 |
|
| 412 |
+
# Copy the tokenizer.model file using the stored path
|
| 413 |
import shutil
|
| 414 |
+
shutil.copy2(self.tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))
|
| 415 |
|
| 416 |
print("✅ Model saved locally")
|
| 417 |
|