Spaces: Added Streaming

app.py CHANGED
@@ -270,16 +270,18 @@ You have the ability to create graphs and charts to enhance your explanations. U
 - Provide honest, accurate feedback even when it may not be what the student wants to hear
 Your goal is to be an educational partner who empowers students to succeed through understanding, not a service that completes their work for them."""
 
-# ---
-class Qwen25SmallLLM(Runnable):
-    """LLM class
+# --- Updated LLM Class with Microsoft Phi-2 and TinyLlama fallback ---
+class Phi2EducationalLLM(Runnable):
+    """LLM class optimized for Microsoft Phi-2 with TinyLlama fallback for educational tasks"""
 
-    def __init__(self, model_path: str = "
+    def __init__(self, model_path: str = "microsoft/phi-2", fallback_model: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0", use_4bit: bool = False):
         super().__init__()
         logger.info(f"Loading model: {model_path} (use_4bit={use_4bit})")
         start_Loading_Model_time = time.perf_counter()
         current_time = datetime.now()
 
+        self.model_name = model_path
+
         try:
             # Load tokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

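The renamed class keeps LangChain's `Runnable` interface, so the agent and tool-decision code downstream can stay unchanged. For orientation, a minimal sketch of that contract — illustrative only, not part of this commit:

    # A Runnable subclass only has to implement invoke() to be usable.
    from langchain_core.runnables import Runnable

    class EchoLLM(Runnable):
        def invoke(self, input, config=None):
            # Runnable.invoke(input, config) is the one required method.
            return f"echo: {input}"

    print(EchoLLM().invoke("hi"))  # -> echo: hi
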
@@ -294,46 +296,63 @@ class Qwen25SmallLLM(Runnable):
                     llm_int8_skip_modules=["lm_head"]
                 )
 
-                # Try quantized load
+                # Try quantized load
                 self.model = AutoModelForCausalLM.from_pretrained(
                     model_path,
                     quantization_config=quant_config,
                     device_map="auto",
-
+                    torch_dtype=torch.float16,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 )
             else:
-                self.
+                self._load_optimized_model(model_path)
 
             # Success path - log timing
             end_Loading_Model_time = time.perf_counter()
             Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
-            log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+            log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Model: {model_path}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
         except Exception as e:
-            logger.warning(f"
-            self._load_fallback_model(
+            logger.warning(f"Primary model {model_path} failed, trying fallback {fallback_model}: {e}")
+            self._load_fallback_model(fallback_model)
             end_Loading_Model_time = time.perf_counter()
             Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
-            log_metric(f"Model Load time (fallback): {Loading_Model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+            log_metric(f"Model Load time (fallback): {Loading_Model_time:0.4f} seconds. Model: {fallback_model}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
         # Ensure pad token
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
-    def
-        """
+    def _load_optimized_model(self, model_path: str):
+        """Optimized model loading for 16GB RAM systems."""
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
-
-            device_map="
+            torch_dtype=torch.float16,  # Use float16 to save memory
+            device_map="cpu",  # Force CPU for stability
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            max_memory={"cpu": "14GB"}  # Reserve 2GB for system/gradio
+        )
+
+    def _load_fallback_model(self, fallback_model: str):
+        """Fallback to TinyLlama if Phi-2 fails."""
+        logger.info(f"Loading fallback model: {fallback_model}")
+
+        # Update tokenizer for fallback model
+        self.tokenizer = AutoTokenizer.from_pretrained(fallback_model, trust_remote_code=True)
+        self.model_name = fallback_model
+
+        self.model = AutoModelForCausalLM.from_pretrained(
+            fallback_model,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="cpu",
             trust_remote_code=True,
             low_cpu_mem_usage=True
         )
 
     def invoke(self, input: Input, config=None) -> Output:
-        """Main invoke method for
+        """Main invoke method optimized for educational tasks"""
         start_invoke_time = time.perf_counter()
         current_time = datetime.now()
 

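The load order above is: quantized path when `use_4bit` is set, otherwise the CPU float16 path, with `_load_fallback_model` reached only on exception. A condensed sketch of that try-primary/except-fallback pattern, assuming only `transformers` and `torch` (the model names match the diff; the helper itself is illustrative):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def load_with_fallback(primary="microsoft/phi-2",
                           fallback="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
        for name in (primary, fallback):
            try:
                tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
                model = AutoModelForCausalLM.from_pretrained(
                    name,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                )
                return name, tok, model
            except Exception as exc:
                print(f"loading {name} failed: {exc}")  # fall through to the next candidate
        raise RuntimeError("no model could be loaded")
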
@@ -344,26 +363,38 @@ class Qwen25SmallLLM(Runnable):
             prompt = str(input)
 
         try:
-
-
-
-
-
+            # Try chat template first (works with Phi-2 and TinyLlama)
+            try:
+                messages = [
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": prompt}
+                ]
+                text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            except Exception:
+                # Fallback for models without chat template support
+                if "phi" in self.model_name.lower():
+                    # Phi-2 format
+                    text = f"Instruct: {SYSTEM_PROMPT}\n\nUser: {prompt}\nOutput:"
+                else:
+                    # Generic format for other models
+                    text = f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
 
-            inputs = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=
+            inputs = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=1024)
             if torch.cuda.is_available():
                 inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
-                    max_new_tokens=
+                    max_new_tokens=600,  # Sufficient for comprehensive educational responses
                     do_sample=True,
-                    temperature=0.7,
+                    temperature=0.7,  # Good balance for educational content
                     top_p=0.9,
-                    top_k=50,
+                    top_k=50,  # Reasonable variety for educational explanations
                     repetition_penalty=1.1,
-                    pad_token_id=self.tokenizer.eos_token_id
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    early_stopping=True,
+                    use_cache=True  # Enable KV cache for faster generation
                 )
 
             new_tokens = [out[len(inp):] for inp, out in zip(inputs.input_ids, outputs)]

@@ -371,17 +402,131 @@ class Qwen25SmallLLM(Runnable):
 
             end_invoke_time = time.perf_counter()
             invoke_time = end_invoke_time - start_invoke_time
-            log_metric(f"LLM Invoke time: {invoke_time:0.4f} seconds. Input length: {len(prompt)} chars. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+            log_metric(f"LLM Invoke time: {invoke_time:0.4f} seconds. Input length: {len(prompt)} chars. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
-            return result
+            return result if result else "I'm still learning how to respond to that properly."
 
         except Exception as e:
             logger.error(f"Generation error: {e}")
             end_invoke_time = time.perf_counter()
             invoke_time = end_invoke_time - start_invoke_time
-            log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+            log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
             return f"[Error generating response: {str(e)}]"
 
+    def stream_generate(self, input: Input, config=None):
+        """Streaming generation method for real-time response display"""
+        start_stream_time = time.perf_counter()
+        current_time = datetime.now()
+
+        # Handle both string and dict inputs for flexibility
+        if isinstance(input, dict):
+            prompt = input.get('input', str(input))
+        else:
+            prompt = str(input)
+
+        try:
+            # Prepare input text
+            try:
+                messages = [
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": prompt}
+                ]
+                text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            except Exception:
+                if "phi" in self.model_name.lower():
+                    text = f"Instruct: {SYSTEM_PROMPT}\n\nUser: {prompt}\nOutput:"
+                else:
+                    text = f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
+
+            inputs = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=1024)
+            if torch.cuda.is_available():
+                inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+
+            # Initialize for streaming
+            generated_tokens = []
+            input_length = inputs["input_ids"].shape[1]
+            max_new_tokens = 600
+
+            # Generate token by token
+            current_input_ids = inputs["input_ids"]
+            current_attention_mask = inputs["attention_mask"]
+
+            for step in range(max_new_tokens):
+                with torch.no_grad():
+                    outputs = self.model(
+                        input_ids=current_input_ids,
+                        attention_mask=current_attention_mask,
+                        use_cache=True
+                    )
+
+                # Get next token probabilities
+                next_token_logits = outputs.logits[:, -1, :]
+
+                # Apply temperature and sampling
+                next_token_logits = next_token_logits / 0.7
+
+                # Apply top-k and top-p filtering
+                filtered_logits = self._top_k_top_p_filtering(next_token_logits, top_k=50, top_p=0.9)
+
+                # Sample next token
+                probs = torch.nn.functional.softmax(filtered_logits, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+
+                # Check for end of sequence
+                if next_token.item() == self.tokenizer.eos_token_id:
+                    break
+
+                # Add to generated tokens
+                generated_tokens.append(next_token.item())
+
+                # Decode and yield partial result
+                partial_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+                yield partial_text
+
+                # Update input for next iteration
+                current_input_ids = torch.cat([current_input_ids, next_token], dim=-1)
+                current_attention_mask = torch.cat([
+                    current_attention_mask,
+                    torch.ones((1, 1), dtype=current_attention_mask.dtype, device=current_attention_mask.device)
+                ], dim=-1)
+
+            # Final result
+            final_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+
+            end_stream_time = time.perf_counter()
+            stream_time = end_stream_time - start_stream_time
+            log_metric(f"LLM Stream time: {stream_time:0.4f} seconds. Tokens generated: {len(generated_tokens)}. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+
+        except Exception as e:
+            logger.error(f"Streaming generation error: {e}")
+            end_stream_time = time.perf_counter()
+            stream_time = end_stream_time - start_stream_time
+            log_metric(f"LLM Stream time (error): {stream_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+            yield f"[Error in streaming generation: {str(e)}]"
+
+    def _top_k_top_p_filtering(self, logits, top_k=50, top_p=0.9):
+        """Apply top-k and top-p filtering to logits"""
+        if top_k > 0:
+            # Get top-k indices
+            top_k = min(top_k, logits.size(-1))
+            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+            logits[indices_to_remove] = float('-inf')
+
+        if top_p < 1.0:
+            # Sort and get cumulative probabilities
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+            cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
+
+            # Remove tokens with cumulative probability above the threshold
+            sorted_indices_to_remove = cumulative_probs > top_p
+            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+            sorted_indices_to_remove[..., 0] = 0
+
+            indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+            logits[indices_to_remove] = float('-inf')
+
+        return logits
+
     @property
     def InputType(self) -> Type[Input]:
         return str

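One caveat on `stream_generate`: it passes `use_cache=True` but never feeds `past_key_values` back in, so every step re-encodes the whole growing prefix. A sketch of equivalent streaming built on `transformers.TextIteratorStreamer`, which lets `model.generate` manage the KV cache; the wrapper function is illustrative, not part of the commit:

    from threading import Thread
    from transformers import TextIteratorStreamer

    def stream_with_builtin(model, tokenizer, text, max_new_tokens=600):
        inputs = tokenizer([text], return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        kwargs = dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens,
                      do_sample=True, temperature=0.7, top_p=0.9, top_k=50,
                      repetition_penalty=1.1, pad_token_id=tokenizer.eos_token_id)
        Thread(target=model.generate, kwargs=kwargs).start()  # generate in the background
        partial = ""
        for piece in streamer:  # blocks until the next decoded chunk arrives
            partial += piece
            yield partial
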
@@ -392,13 +537,13 @@ class Qwen25SmallLLM(Runnable):
 
 # --- LangGraph Agent Implementation ---
 class Educational_Agent:
-    """Modern LangGraph-based educational agent"""
+    """Modern LangGraph-based educational agent with Phi-2 and streaming"""
 
     def __init__(self):
         start_init_and_langgraph_time = time.perf_counter()
         current_time = datetime.now()
 
-        self.llm =
+        self.llm = Phi2EducationalLLM(model_path="microsoft/phi-2", fallback_model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", use_4bit=False)
         self.tool_decision_engine = Tool_Decision_Engine(self.llm)
 
         # Create LangGraph workflow

@@ -409,7 +554,7 @@ class Educational_Agent:
         log_metric(f"Init and LangGraph workflow setup time: {init_and_langgraph_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
     def _create_langgraph_workflow(self):
-        """Create the LangGraph workflow"""
+        """Create the complete LangGraph workflow"""
         # Define tools
         tools = [Create_Graph_Tool]
         tool_node = ToolNode(tools)

@@ -600,7 +745,31 @@ Otherwise, provide a regular educational response.
         return workflow.compile(checkpointer=memory)
 
     def chat(self, message: str, thread_id: str = "default") -> str:
-        """Main chat interface"""
+        """Main chat interface (non-streaming for backward compatibility)"""
+        start_chat_time = time.perf_counter()
+        current_time = datetime.now()
+
+        try:
+            # Collect all streaming parts into final response
+            final_response = ""
+            for partial_response in self.stream_chat(message, thread_id):
+                final_response = partial_response
+
+            end_chat_time = time.perf_counter()
+            chat_time = end_chat_time - start_chat_time
+            log_metric(f"Complete chat time: {chat_time:0.4f} seconds. Response length: {len(final_response)} chars. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+
+            return final_response
+
+        except Exception as e:
+            logger.error(f"Error in LangGraph chat: {e}")
+            end_chat_time = time.perf_counter()
+            chat_time = end_chat_time - start_chat_time
+            log_metric(f"Complete chat time (error): {chat_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+            return f"I apologize, but I encountered an error: {str(e)}"
+
+    def stream_chat(self, message: str, thread_id: str = "default"):
+        """Streaming chat interface that yields partial responses"""
         start_chat_time = time.perf_counter()
         current_time = datetime.now()
 

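With `chat()` reduced to draining `stream_chat()`, the two entry points stay interchangeable. A usage sketch (the query string is illustrative):

    agent = Educational_Agent()

    # Blocking: returns only the final text.
    answer = agent.chat("What is a derivative?")

    # Streaming: each iteration is the response accumulated so far.
    for partial in agent.stream_chat("What is a derivative?"):
        print(partial)
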
@@ -614,37 +783,45 @@ Otherwise, provide a regular educational response.
                 "educational_context": None
             }
 
-            #
-
-
-            # Extract the final response
-            final_messages = result["messages"]
-
-            # Build the response from all assistant and tool messages
-            response_parts = []
-            for msg in final_messages:
-                if isinstance(msg, AIMessage) and msg.content:
-                    response_parts.append(msg.content)
-                elif isinstance(msg, ToolMessage) and msg.content:
-                    response_parts.append(msg.content)
+            # First check if tools are needed
+            user_query = message
+            needs_tools = self.tool_decision_engine.should_use_visualization(user_query)
 
-            if
-
+            if needs_tools:
+                logger.info("Query requires visualization - handling tool call first")
+                # Handle tool generation first (non-streaming for tools)
+                result = self.app.invoke(initial_state, config=config)
+                final_messages = result["messages"]
+
+                # Build the response from all assistant and tool messages
+                response_parts = []
+                for msg in final_messages:
+                    if isinstance(msg, AIMessage) and msg.content:
+                        response_parts.append(msg.content)
+                    elif isinstance(msg, ToolMessage) and msg.content:
+                        response_parts.append(msg.content)
+
+                final_response = "\n\n".join(response_parts) if response_parts else "I couldn't generate a proper response."
+
+                # For tool responses, yield the complete result at once
+                yield final_response
+
             else:
-
+                logger.info("Streaming regular response without tools")
+                # Stream the LLM response directly
+                for partial_text in self.llm.stream_generate(message):
+                    yield smart_truncate(partial_text, max_length=3000)
 
             end_chat_time = time.perf_counter()
             chat_time = end_chat_time - start_chat_time
-            log_metric(f"Complete chat time: {chat_time:0.4f} seconds.
-
-            return final_response
+            log_metric(f"Complete streaming chat time: {chat_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
         except Exception as e:
-            logger.error(f"Error in
+            logger.error(f"Error in streaming chat: {e}")
             end_chat_time = time.perf_counter()
             chat_time = end_chat_time - start_chat_time
-            log_metric(f"Complete chat time (error): {chat_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
-
+            log_metric(f"Complete streaming chat time (error): {chat_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+            yield f"I apologize, but I encountered an error: {str(e)}"
 
 # --- Global Agent Instance ---
 agent = None

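`should_use_visualization` belongs to `Tool_Decision_Engine`, which this diff doesn't touch. Purely to illustrate the branch above, a hypothetical keyword gate (the hint list and logic are invented, not from app.py):

    # Hypothetical stand-in for Tool_Decision_Engine.should_use_visualization.
    VISUAL_HINTS = ("plot", "graph", "chart", "visualize", "draw")

    def should_use_visualization(query: str) -> bool:
        q = query.lower()
        return any(hint in q for hint in VISUAL_HINTS)
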
@@ -662,7 +839,7 @@ mathjax_config = '''
 window.MathJax = {
     tex: {
         inlineMath: [['\\\\(', '\\\\)']],
-        displayMath: [['
+        displayMath: [['$$', '$$'], ['\\\\[', '\\\\]']],
         packages: {'[+]': ['ams']}
     },
     svg: {fontCache: 'global'},

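With the reconstructed delimiters (the `$$` pair was eaten by the page's own math rendering), both `$$ ... $$` and `\[ ... \]` mark display math in model output. An illustrative Python string the config above would typeset, assuming the response reaches the page unescaped:

    demo = "Inline \\(a^2 + b^2 = c^2\\); display $$\\frac{d}{dx} x^n = n x^{n-1}$$"
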
@@ -732,7 +909,7 @@ def smart_truncate(text, max_length=3000):
     return result
 
 def generate_response_with_agent(message, max_retries=3):
-    """Generate response using LangGraph agent."""
+    """Generate streaming response using LangGraph agent."""
     start_generate_response_with_agent_time = time.perf_counter()
     current_time = datetime.now()
 

@@ -741,16 +918,15 @@ def generate_response_with_agent(message, max_retries=3):
             # Get the agent
             current_agent = get_agent()
 
-            # Use the agent's chat method
-
-
-            result = smart_truncate(response)
+            # Use the agent's streaming chat method
+            for partial_response in current_agent.stream_chat(message):
+                yield partial_response
 
             end_generate_response_with_agent_time = time.perf_counter()
             generate_response_with_agent_time = end_generate_response_with_agent_time - start_generate_response_with_agent_time
             log_metric(f"Generate response with agent time: {generate_response_with_agent_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
-            return
+            return
 
         except Exception as e:
             logger.error(f"Agent error (attempt {attempt + 1}): {e}")

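The bare `return` above now sits inside a generator (the function contains `yield`), where it simply ends iteration after a successful attempt instead of falling through to another retry. A tiny illustration of that behaviour:

    def one_shot(max_retries=3):
        for attempt in range(max_retries):
            yield f"attempt {attempt}"
            return  # ends the generator; the loop never reaches attempt 1

    print(list(one_shot()))  # ['attempt 0']
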
@@ -761,32 +937,33 @@
     end_generate_response_with_agent_time = time.perf_counter()
     generate_response_with_agent_time = end_generate_response_with_agent_time - start_generate_response_with_agent_time
     log_metric(f"Generate response with agent time (error): {generate_response_with_agent_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
-
+    yield f"I apologize, but I encountered an error while processing your message: {str(e)}"
 
 def chat_response(message, history=None):
-    """Process chat message and return response."""
+    """Process chat message and return streaming response."""
     start_chat_response_time = time.perf_counter()
     current_time = datetime.now()
 
     try:
-        # Generate response with LangGraph agent
-
+        # Generate streaming response with LangGraph agent
+        final_response = ""
+        for partial_response in generate_response_with_agent(message):
+            final_response = partial_response
+            yield partial_response
 
         end_chat_response_time = time.perf_counter()
         chat_response_time = end_chat_response_time - start_chat_response_time
         log_metric(f"Chat response time: {chat_response_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
-        return response
-
     except Exception as e:
         logger.error(f"Error in chat_response: {e}")
         end_chat_response_time = time.perf_counter()
         chat_response_time = end_chat_response_time - start_chat_response_time
         log_metric(f"Chat response time (error): {chat_response_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
-
+        yield f"I apologize, but I encountered an error while processing your message: {str(e)}"
 
 def respond_and_update(message, history):
-    """Main function to handle user submission."""
+    """Main function to handle user submission with streaming."""
     if not message.strip():
         return history, ""
 

@@ -794,11 +971,14 @@ def respond_and_update(message, history):
     history.append({"role": "user", "content": message})
     yield history, ""
 
-    #
-
+    # Start with empty assistant message
+    history.append({"role": "assistant", "content": ""})
 
-
-
+    # Stream the response
+    for partial_response in chat_response(message):
+        # Update the last message (assistant) with the partial response
+        history[-1]["content"] = partial_response
+        yield history, ""
 
 def clear_chat():
     """Clear the chat history."""

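Because `respond_and_update` now yields `(history, textbox_value)` pairs, Gradio streams each yield straight into the chat component. A wiring sketch, assuming a Blocks layout like the one `create_interface()` presumably builds (component names are illustrative):

    import gradio as gr

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(type="messages")  # expects {"role": ..., "content": ...} dicts
        msg = gr.Textbox()
        # Every yield from the generator updates chatbot and clears the textbox.
        msg.submit(respond_and_update, inputs=[msg, chatbot], outputs=[chatbot, msg])

    demo.launch()
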
@@ -905,7 +1085,7 @@ def create_interface():
 if __name__ == "__main__":
     try:
         logger.info("=" * 50)
-        logger.info("Starting Mimir Application with
+        logger.info("Starting Mimir Application with Microsoft Phi-2 and Streaming")
         logger.info("=" * 50)
 
         # Step 1: Preload the model and agent

@@ -929,5 +1109,5 @@
         )
 
     except Exception as e:
-        logger.error(f"❌ Failed to launch Mimir with
+        logger.error(f"❌ Failed to launch Mimir with Microsoft Phi-2: {e}")
         raise