lemms committed on
Commit
4744625
·
verified ·
1 Parent(s): e60435f

Fix: Use trust_remote_code=True for OpenLLM custom tokenizer classes

Browse files
Files changed (1) hide show
  1. app.py +12 -44
app.py CHANGED
@@ -110,56 +110,24 @@ class OpenLLMTrainer:
110
 
111
  model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
112
 
113
- # Try multiple approaches to load the tokenizer
114
- tokenizer_loaded = False
115
-
116
- # Approach 1: Try direct loading with trust_remote_code
117
  try:
118
- print("πŸ”„ Attempting to load tokenizer with trust_remote_code=True...")
119
  self.tokenizer = AutoTokenizer.from_pretrained(
120
  model_name,
121
- trust_remote_code=True,
122
- use_fast=False # Use slow tokenizer as fallback
123
  )
124
- tokenizer_loaded = True
125
- print("βœ… Tokenizer loaded with trust_remote_code=True")
126
- except Exception as e1:
127
- print(f"❌ Approach 1 failed: {e1}")
128
 
129
- # Approach 2: Try with use_fast=False
130
- try:
131
- print("πŸ”„ Attempting to load tokenizer with use_fast=False...")
132
- self.tokenizer = AutoTokenizer.from_pretrained(
133
- model_name,
134
- use_fast=False
135
- )
136
- tokenizer_loaded = True
137
- print("βœ… Tokenizer loaded with use_fast=False")
138
- except Exception as e2:
139
- print(f"❌ Approach 2 failed: {e2}")
140
 
141
- # Approach 3: Try with legacy tokenizer
142
- try:
143
- print("πŸ”„ Attempting to load tokenizer with legacy settings...")
144
- self.tokenizer = AutoTokenizer.from_pretrained(
145
- model_name,
146
- use_fast=False,
147
- legacy=True
148
- )
149
- tokenizer_loaded = True
150
- print("βœ… Tokenizer loaded with legacy settings")
151
- except Exception as e3:
152
- print(f"❌ Approach 3 failed: {e3}")
153
-
154
- # Approach 4: Try loading from a different model as fallback
155
- try:
156
- print("πŸ”„ Attempting to load fallback tokenizer...")
157
- self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
158
- tokenizer_loaded = True
159
- print("βœ… Fallback tokenizer loaded (GPT-2)")
160
- except Exception as e4:
161
- print(f"❌ All tokenizer loading approaches failed")
162
- return f"❌ Failed to load any tokenizer: {str(e4)}"
163
 
164
  # Add padding token if not present
165
  if self.tokenizer.pad_token is None:
 
110
 
111
  model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
112
 
113
+ # Load OpenLLM custom tokenizer with trust_remote_code
 
 
 
114
  try:
115
+ print("πŸ”„ Loading OpenLLM custom tokenizer...")
116
  self.tokenizer = AutoTokenizer.from_pretrained(
117
  model_name,
118
+ trust_remote_code=True, # CRITICAL for OpenLLM custom tokenizer classes
119
+ use_fast=False # Use slow tokenizer for compatibility
120
  )
121
+ print(f"βœ… OpenLLM custom tokenizer loaded: {type(self.tokenizer).__name__}")
 
 
 
122
 
123
+ # Add padding token if not present
124
+ if self.tokenizer.pad_token is None:
125
+ self.tokenizer.pad_token = self.tokenizer.eos_token
126
+ print("βœ… Added padding token")
 
 
 
 
 
 
 
127
 
128
+ except Exception as e:
129
+ print(f"❌ Failed to load OpenLLM custom tokenizer: {e}")
130
+ return f"❌ Failed to load OpenLLM custom tokenizer: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  # Add padding token if not present
133
  if self.tokenizer.pad_token is None: