Fix: Use trust_remote_code=True for OpenLLM custom tokenizer classes
Browse files
app.py
CHANGED
|
@@ -110,56 +110,24 @@ class OpenLLMTrainer:
|
|
| 110 |
|
| 111 |
model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
|
| 112 |
|
| 113 |
-
#
|
| 114 |
-
tokenizer_loaded = False
|
| 115 |
-
|
| 116 |
-
# Approach 1: Try direct loading with trust_remote_code
|
| 117 |
try:
|
| 118 |
-
print("π
|
| 119 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 120 |
model_name,
|
| 121 |
-
trust_remote_code=True,
|
| 122 |
-
use_fast=False
|
| 123 |
)
|
| 124 |
-
|
| 125 |
-
print("✅ Tokenizer loaded with trust_remote_code=True")
|
| 126 |
-
except Exception as e1:
|
| 127 |
-
print(f"❌ Approach 1 failed: {e1}")
|
| 128 |
|
| 129 |
-
#
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
model_name,
|
| 134 |
-
use_fast=False
|
| 135 |
-
)
|
| 136 |
-
tokenizer_loaded = True
|
| 137 |
-
print("✅ Tokenizer loaded with use_fast=False")
|
| 138 |
-
except Exception as e2:
|
| 139 |
-
print(f"❌ Approach 2 failed: {e2}")
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 145 |
-
model_name,
|
| 146 |
-
use_fast=False,
|
| 147 |
-
legacy=True
|
| 148 |
-
)
|
| 149 |
-
tokenizer_loaded = True
|
| 150 |
-
print("✅ Tokenizer loaded with legacy settings")
|
| 151 |
-
except Exception as e3:
|
| 152 |
-
print(f"❌ Approach 3 failed: {e3}")
|
| 153 |
-
|
| 154 |
-
# Approach 4: Try loading from a different model as fallback
|
| 155 |
-
try:
|
| 156 |
-
print("π Attempting to load fallback tokenizer...")
|
| 157 |
-
self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
| 158 |
-
tokenizer_loaded = True
|
| 159 |
-
print("✅ Fallback tokenizer loaded (GPT-2)")
|
| 160 |
-
except Exception as e4:
|
| 161 |
-
print(f"❌ All tokenizer loading approaches failed")
|
| 162 |
-
return f"❌ Failed to load any tokenizer: {str(e4)}"
|
| 163 |
|
| 164 |
# Add padding token if not present
|
| 165 |
if self.tokenizer.pad_token is None:
|
|
|
|
| 110 |
|
| 111 |
model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
|
| 112 |
|
| 113 |
+
# Load OpenLLM custom tokenizer with trust_remote_code
|
|
|
|
|
|
|
|
|
|
| 114 |
try:
|
| 115 |
+
print("π Loading OpenLLM custom tokenizer...")
|
| 116 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 117 |
model_name,
|
| 118 |
+
trust_remote_code=True, # CRITICAL for OpenLLM custom tokenizer classes
|
| 119 |
+
use_fast=False # Use slow tokenizer for compatibility
|
| 120 |
)
|
| 121 |
+
print(f"✅ OpenLLM custom tokenizer loaded: {type(self.tokenizer).__name__}")
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
+
# Add padding token if not present
|
| 124 |
+
if self.tokenizer.pad_token is None:
|
| 125 |
+
self.tokenizer.pad_token = self.tokenizer.eos_token
|
| 126 |
+
print("✅ Added padding token")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
+
except Exception as e:
|
| 129 |
+
print(f"❌ Failed to load OpenLLM custom tokenizer: {e}")
|
| 130 |
+
return f"❌ Failed to load OpenLLM custom tokenizer: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
# Add padding token if not present
|
| 133 |
if self.tokenizer.pad_token is None:
|