jdesiree committed on
Commit
bdff161
·
verified ·
1 Parent(s): df75c85

Update model_manager.py

Browse files
Files changed (1) hide show
  1. model_manager.py +104 -4
model_manager.py CHANGED
@@ -58,6 +58,7 @@ class LazyLlamaModel:
58
 
59
  def __init__(self):
60
  """Initialize only once"""
 
61
  if hasattr(self, '_initialized') and self._initialized:
62
  return
63
 
@@ -96,6 +97,11 @@ class LazyLlamaModel:
96
  trust_remote_code=True,
97
  )
98
 
 
 
 
 
 
99
  # Load model
100
  self.model = AutoModelForCausalLM.from_pretrained(
101
  LLAMA_MODEL_ID,
@@ -106,8 +112,14 @@ class LazyLlamaModel:
106
  torch_dtype=torch.bfloat16,
107
  )
108
 
 
 
 
 
 
109
  # Create pipeline
110
- self.pipe = pipeline(
 
111
  "text-generation",
112
  model=self.model,
113
  tokenizer=self.tokenizer,
@@ -115,15 +127,37 @@ class LazyLlamaModel:
115
  device_map="auto",
116
  )
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  logger.info("="*60)
119
  logger.info("✅ MODEL LOADED & CACHED")
120
  logger.info(f" Model: {LLAMA_MODEL_ID}")
 
 
121
  logger.info(f" Memory: ~1GB VRAM")
122
  logger.info(f" Context: 128K tokens")
123
  logger.info("="*60)
124
 
125
  except Exception as e:
126
  logger.error(f"Failed to load model: {e}")
 
 
 
 
127
  raise
128
 
129
  def generate(
@@ -151,12 +185,34 @@ class LazyLlamaModel:
151
  if self.model is None:
152
  self._load_model()
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  messages = [
155
  {"role": "system", "content": system_prompt},
156
  {"role": "user", "content": user_message},
157
  ]
158
 
159
  try:
 
 
 
160
  outputs = self.pipe(
161
  messages,
162
  max_new_tokens=max_tokens,
@@ -167,11 +223,39 @@ class LazyLlamaModel:
167
  repetition_penalty=1.15,
168
  )
169
 
170
- result = outputs[0]["generated_text"][-1]["content"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  return result.strip()
172
 
173
  except Exception as e:
174
  logger.error(f"Generation error: {e}")
 
 
 
 
 
175
  return ""
176
 
177
  def generate_streaming(
@@ -191,6 +275,17 @@ class LazyLlamaModel:
191
  if self.model is None:
192
  self._load_model()
193
 
 
 
 
 
 
 
 
 
 
 
 
194
  messages = [
195
  {"role": "system", "content": system_prompt},
196
  {"role": "user", "content": user_message},
@@ -231,24 +326,29 @@ class LazyLlamaModel:
231
 
232
  except Exception as e:
233
  logger.error(f"Streaming error: {e}")
 
 
234
  yield ""
235
 
236
  def is_loaded(self) -> bool:
237
  """Check if model is loaded"""
238
- return self.model is not None
239
 
240
  def get_model_info(self) -> dict:
241
  """Get model information"""
242
  return {
243
  "model_id": LLAMA_MODEL_ID,
244
  "loaded": self.is_loaded(),
 
 
 
 
245
  "quantization": "4-bit NF4",
246
  "size_gb": 1.0,
247
  "context_length": 128000,
248
  "lazy_loading": True,
249
  }
250
 
251
-
252
  # Global instance - model loads on first use
253
  _model_instance = None
254
 
 
58
 
59
  def __init__(self):
60
  """Initialize only once"""
61
+ # Added hasattr check to prevent re-initialization
62
  if hasattr(self, '_initialized') and self._initialized:
63
  return
64
 
 
97
  trust_remote_code=True,
98
  )
99
 
100
+ # Add validation after tokenizer load
101
+ if self.tokenizer is None:
102
+ raise RuntimeError("Tokenizer failed to load")
103
+ logger.info(f"✓ Tokenizer loaded: {type(self.tokenizer).__name__}")
104
+
105
  # Load model
106
  self.model = AutoModelForCausalLM.from_pretrained(
107
  LLAMA_MODEL_ID,
 
112
  torch_dtype=torch.bfloat16,
113
  )
114
 
115
+ # Add validation after model load
116
+ if self.model is None:
117
+ raise RuntimeError("Model failed to load")
118
+ logger.info(f"✓ Model loaded: {type(self.model).__name__}")
119
+
120
  # Create pipeline
121
+ # Store pipeline reference explicitly
122
+ pipeline_obj = pipeline(
123
  "text-generation",
124
  model=self.model,
125
  tokenizer=self.tokenizer,
 
127
  device_map="auto",
128
  )
129
 
130
+ # FIXED: Validate pipeline before assignment
131
+ if pipeline_obj is None:
132
+ raise RuntimeError("Pipeline creation returned None")
133
+
134
+ if not callable(pipeline_obj):
135
+ raise RuntimeError(f"Pipeline is not callable: {type(pipeline_obj)}")
136
+
137
+ # Assign to instance
138
+ self.pipe = pipeline_obj
139
+
140
+ # FIXED: Double-check assignment succeeded
141
+ if self.pipe is None:
142
+ raise RuntimeError("Pipeline assignment failed - pipe is still None")
143
+
144
+ logger.info(f"✓ Pipeline created and verified: {type(self.pipe).__name__}")
145
+
146
  logger.info("="*60)
147
  logger.info("✅ MODEL LOADED & CACHED")
148
  logger.info(f" Model: {LLAMA_MODEL_ID}")
149
+ logger.info(f" Tokenizer: {type(self.tokenizer).__name__}")
150
+ logger.info(f" Pipeline: {type(self.pipe).__name__}")
151
  logger.info(f" Memory: ~1GB VRAM")
152
  logger.info(f" Context: 128K tokens")
153
  logger.info("="*60)
154
 
155
  except Exception as e:
156
  logger.error(f"Failed to load model: {e}")
157
+ # Clean up partial state
158
+ self.model = None
159
+ self.tokenizer = None
160
+ self.pipe = None
161
  raise
162
 
163
  def generate(
 
185
  if self.model is None:
186
  self._load_model()
187
 
188
+ # Add comprehensive validation after loading
189
+ if self.pipe is None:
190
+ # Try reloading if pipe is None but model exists
191
+ if self.model is not None:
192
+ logger.warning("Pipeline is None but model exists - attempting reload")
193
+ self.model = None # Force full reload
194
+ self._load_model()
195
+
196
+ # If still None, fail with clear error
197
+ if self.pipe is None:
198
+ raise RuntimeError(
199
+ "Pipeline is None after loading. This may be a ZeroGPU context issue. "
200
+ "Check that _load_model() completed successfully."
201
+ )
202
+
203
+ # Verify pipeline is callable
204
+ if not callable(self.pipe):
205
+ raise RuntimeError(f"Pipeline exists but is not callable: {type(self.pipe)}")
206
+
207
  messages = [
208
  {"role": "system", "content": system_prompt},
209
  {"role": "user", "content": user_message},
210
  ]
211
 
212
  try:
213
+ # FIXED: Add logging for debugging
214
+ logger.debug(f"Calling pipeline with {len(messages)} messages, max_tokens={max_tokens}")
215
+
216
  outputs = self.pipe(
217
  messages,
218
  max_new_tokens=max_tokens,
 
223
  repetition_penalty=1.15,
224
  )
225
 
226
+ # FIXED: Validate output structure
227
+ if not outputs or len(outputs) == 0:
228
+ logger.error("Pipeline returned empty output")
229
+ return ""
230
+
231
+ if not isinstance(outputs[0], dict):
232
+ logger.error(f"Unexpected output format: {type(outputs[0])}")
233
+ return ""
234
+
235
+ if "generated_text" not in outputs[0]:
236
+ logger.error(f"No 'generated_text' in output: {outputs[0].keys()}")
237
+ return ""
238
+
239
+ generated = outputs[0]["generated_text"]
240
+
241
+ if not generated or len(generated) == 0:
242
+ logger.error("Generated text is empty")
243
+ return ""
244
+
245
+ # Extract final message content
246
+ result = generated[-1]["content"]
247
+
248
+ logger.debug(f"Generated {len(result)} characters successfully")
249
+
250
  return result.strip()
251
 
252
  except Exception as e:
253
  logger.error(f"Generation error: {e}")
254
+ logger.error(f"Error type: {type(e).__name__}")
255
+ logger.error(f"Pipeline type: {type(self.pipe)}")
256
+ logger.error(f"Pipeline callable: {callable(self.pipe)}")
257
+ import traceback
258
+ logger.error(traceback.format_exc())
259
  return ""
260
 
261
  def generate_streaming(
 
275
  if self.model is None:
276
  self._load_model()
277
 
278
+ # FIXED: Add validation for streaming
279
+ if self.model is None:
280
+ logger.error("Model is None in generate_streaming")
281
+ yield ""
282
+ return
283
+
284
+ if self.tokenizer is None:
285
+ logger.error("Tokenizer is None in generate_streaming")
286
+ yield ""
287
+ return
288
+
289
  messages = [
290
  {"role": "system", "content": system_prompt},
291
  {"role": "user", "content": user_message},
 
326
 
327
  except Exception as e:
328
  logger.error(f"Streaming error: {e}")
329
+ import traceback
330
+ logger.error(traceback.format_exc())
331
  yield ""
332
 
333
  def is_loaded(self) -> bool:
334
  """Check if model is loaded"""
335
+ return self.model is not None and self.pipe is not None
336
 
337
  def get_model_info(self) -> dict:
338
  """Get model information"""
339
  return {
340
  "model_id": LLAMA_MODEL_ID,
341
  "loaded": self.is_loaded(),
342
+ "model_exists": self.model is not None,
343
+ "tokenizer_exists": self.tokenizer is not None,
344
+ "pipe_exists": self.pipe is not None,
345
+ "pipe_callable": callable(self.pipe) if self.pipe else False,
346
  "quantization": "4-bit NF4",
347
  "size_gb": 1.0,
348
  "context_length": 128000,
349
  "lazy_loading": True,
350
  }
351
 
 
352
  # Global instance - model loads on first use
353
  _model_instance = None
354