Siddharth Ravikumar committed
Commit 1f69fb6 · 1 Parent(s): f9d2cd6

fix: make Chat Agent robust with detailed logging and GPU-context loading

Files changed (2)
  1. app.py  +19 -8
  2. backend/app/core/inference.py  +49 -33
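Why the fix takes this shape: on a ZeroGPU Space, CUDA is only available inside a function decorated with @spaces.GPU, so the chat model has to be loaded lazily inside that GPU worker rather than at import time, and the Gradio handler calls the decorated wrapper directly instead of relying on a monkey-patched method. A minimal sketch of the pattern, with a hypothetical Engine class standing in for ChatEngine (the model id, settings, and generation parameters below are illustrative, not the app's actual values):

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

class Engine:
    """Hypothetical stand-in for ChatEngine: lazy load, then generate."""
    def __init__(self, model_id):
        self.model_id = model_id
        self.is_loaded = False

    def load_model(self):
        if self.is_loaded:          # idempotent: cheap to call on every request
            return
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        ).to(device)
        self.device, self.is_loaded = device, True

    def chat(self, system_context, user_message):
        self.load_model()           # runs inside the GPU worker, where CUDA exists
        prompt = f"System: {system_context}\n\nUser: {user_message}\n\nAssistant:"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.inference_mode():
            out = self.model.generate(**inputs, max_new_tokens=128)
        return self.tokenizer.decode(
            out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        ).strip()

engine = Engine("Qwen/Qwen2.5-0.5B-Instruct")    # illustrative model id
_original_chat = engine.chat                     # keep a direct reference

@spaces.GPU(duration=60)                         # GPU is attached only for this call
def gpu_run_chat(system_context, user_message):
    return _original_chat(system_context, user_message)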
app.py CHANGED
@@ -60,11 +60,15 @@ inference_engine._run_inference = gpu_run_inference
 _original_chat = chat_engine.chat
 
 @spaces.GPU(duration=60)
-def gpu_run_chat(system_context: str, user_message: str):
-    """GPU-accelerated chat inference"""
-    return _original_chat(system_context, user_message)
-
-chat_engine.chat = gpu_run_chat
+def gpu_run_chat(system_context, user_message):
+    """GPU-accelerated chat inference."""
+    try:
+        # We call the engine's original method directly to avoid monkey-patch recursion
+        # and let the engine handle its own loading inside this GPU worker
+        return _original_chat(system_context, user_message)
+    except Exception as e:
+        logger.error(f"ZeroGPU Chat Worker Error: {e}")
+        return f"Worker Error: {e}"
 
 
 # ── Async helpers ──────────────────────────────────────────────────────
@@ -542,14 +546,21 @@ SCENE ANALYSES:\n"""
 def chat_respond(user_message, history, system_ctx):
     if not user_message or not user_message.strip():
         return history, "", system_ctx
-    ensure_init()
-    if not chat_engine.is_loaded:
-        chat_engine.load_model()
+
+    # ensure_init connects DB and loads rules, but not the models
+    run_async(_ensure_init())
+
+    logger.info(f"Chat request: {user_message[:50]}...")
     try:
+        # Call the @spaces.GPU decorated function directly
         response = gpu_run_chat(system_ctx, user_message.strip())
+        logger.info(f"Received response: {response[:50]}...")
     except Exception as e:
+        logger.error(f"Chat failed: {e}")
         response = f"Error: {e}"
+
     history = history or []
+    # Use Gradio 5.0 message format (dict)
     history.append({"role": "user", "content": user_message.strip()})
     history.append({"role": "assistant", "content": response})
     return history, "", system_ctx
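For context, the history entries appended above follow Gradio's "messages" format (a list of {"role": ..., "content": ...} dicts), which is what a Chatbot component configured with type="messages" expects. A minimal wiring sketch, assuming the rest of app.py hooks chat_respond up roughly like this (the component names and labels here are illustrative, not taken from the file):

import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")   # expects [{"role": ..., "content": ...}]
    msg = gr.Textbox(label="Ask about the case")
    system_ctx = gr.State("")               # system context built elsewhere in app.py

    # chat_respond returns (updated history, cleared textbox, unchanged context)
    msg.submit(chat_respond, [msg, chatbot, system_ctx], [chatbot, msg, system_ctx])

demo.launch()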
backend/app/core/inference.py CHANGED
@@ -253,36 +253,45 @@ class ChatEngine:
 
     def load_model(self):
         """Load the text-only chat model."""
+        if self.is_loaded:
+            return
+
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
         model_id = settings.chat_model_id
-        logger.info(f"Loading chat model: {model_id}")
-
         device = settings.resolve_device()
         dtype = settings.resolve_torch_dtype()
+
+        logger.info(f"DEBUG: ChatEngine loading model {model_id} on {device}...")
 
-        self._tokenizer = AutoTokenizer.from_pretrained(
-            model_id, trust_remote_code=settings.model_trust_remote_code,
-        )
-        self._model = AutoModelForCausalLM.from_pretrained(
-            model_id, torch_dtype=dtype, trust_remote_code=settings.model_trust_remote_code,
-        )
+        try:
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=settings.model_trust_remote_code,
+            )
+            # Use float16 for GPU, float32 for CPU to avoid issues
+            self._model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16 if "cuda" in str(device) else torch.float32,
+                trust_remote_code=settings.model_trust_remote_code,
+                low_cpu_mem_usage=True
+            )
 
-        if device != "cpu":
-            self._model = self._model.to(device)
+            if device != "cpu":
+                self._model = self._model.to(device)
 
-        self._device = device
-        self.is_loaded = True
-        logger.info(f"Chat model loaded on {device}")
+            self._device = device
+            self.is_loaded = True
+            logger.info(f"DEBUG: Chat model loaded successfully on {device}")
+        except Exception as e:
+            logger.error(f"DEBUG ERROR: Chat model load failed: {str(e)}")
+            raise e
 
     def chat(self, system_context: str, user_message: str) -> str:
         """
         Generate a response given system context and a user question.
-        - system_context: case data, traffic rules, etc.
-        - user_message: the user's question
         """
         if not self.is_loaded:
-            raise RuntimeError("Chat model not loaded. Call load_model() first.")
+            self.load_model()
 
         messages = [
             {"role": "system", "content": system_context},
@@ -293,25 +302,32 @@ class ChatEngine:
             text_prompt = self._tokenizer.apply_chat_template(
                 messages, add_generation_prompt=True, tokenize=False,
             )
-        except Exception:
-            # Fallback if no chat template
+            logger.info(f"DEBUG: Chat prompt prepared (length: {len(text_prompt)})")
+        except Exception as e:
+            logger.warning(f"DEBUG: Chat template failed ({e}), using fallback")
             text_prompt = f"System: {system_context}\n\nUser: {user_message}\n\nAssistant:"
 
-        inputs = self._tokenizer(text_prompt, return_tensors="pt").to(self._device)
-
-        with torch.inference_mode():
-            outputs = self._model.generate(
-                **inputs,
-                max_new_tokens=512,
-                repetition_penalty=1.2,
-                temperature=0.4,
-                do_sample=True,
-            )
-
-        prompt_length = inputs["input_ids"].shape[1]
-        generated_tokens = outputs[0][prompt_length:]
-        response = self._tokenizer.decode(generated_tokens, skip_special_tokens=True)
-        return response.strip()
+        try:
+            inputs = self._tokenizer(text_prompt, return_tensors="pt").to(self._device)
+            logger.info(f"DEBUG: Inputs tokenized (length: {inputs['input_ids'].shape[1]})")
+
+            with torch.inference_mode():
+                outputs = self._model.generate(
+                    **inputs,
+                    max_new_tokens=512,
+                    repetition_penalty=1.2,
+                    temperature=0.4,
+                    do_sample=True,
+                )
+
+            prompt_length = inputs["input_ids"].shape[1]
+            generated_tokens = outputs[0][prompt_length:]
+            response = self._tokenizer.decode(generated_tokens, skip_special_tokens=True)
+            logger.info(f"DEBUG: Response generated successfully (length: {len(response)})")
+            return response.strip()
+        except Exception as e:
+            logger.error(f"DEBUG ERROR: Inference failed: {str(e)}")
+            raise e
 
 
 # Singleton instance
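With load_model() now idempotent and chat() loading on demand, callers no longer need an explicit load step; the first request simply pays the load cost inside whichever process (or GPU worker) handles it. A hypothetical usage sketch (the import path and singleton name are assumptions based on the file layout above; the prompt text is illustrative):

from backend.app.core.inference import chat_engine  # assumed singleton name

# No explicit load_model() call: chat() lazy-loads on first use and
# load_model() returns immediately if the model is already resident.
answer = chat_engine.chat(
    system_context="You are a traffic-incident assistant. Case data: ...",
    user_message="Who had right of way at the intersection?",
)
print(answer)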