jdesiree committed on
Commit
14fc2b4
·
verified ·
1 Parent(s): 393c789

Major Update to app.py

Browse files

- Removed fallback model
- Changed model to Phi-3-mini-4k-instruct
- Removed bulky streaming script for TextIteratorStreamer

Files changed (1) hide show
  1. app.py +247 -320
app.py CHANGED
@@ -11,11 +11,12 @@ from dotenv import load_dotenv
11
  import logging
12
  import re
13
  import json
 
14
  from datetime import datetime
15
  from typing import Annotated, Sequence, TypedDict, List, Optional, Any, Type
16
  from pydantic import BaseModel, Field
17
 
18
- # LangGraph imports (replacing deprecated LangChain agent imports)
19
  from langgraph.graph import StateGraph, START, END
20
  from langgraph.graph.message import add_messages
21
  from langgraph.checkpoint.memory import MemorySaver
@@ -24,11 +25,11 @@ from langgraph.prebuilt import ToolNode
24
  # Updated LangChain imports
25
  from langchain_core.tools import tool
26
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage, ToolMessage, BaseMessage
27
- from langchain_core.prompts import ChatPromptTemplate
28
  from langchain_core.runnables import Runnable
29
  from langchain_core.runnables.utils import Input, Output
30
 
31
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
32
  import torch
33
 
34
  load_dotenv(".env")
@@ -223,13 +224,15 @@ Decision:"""
223
  log_metric(f"Tool decision time (error): {graph_decision_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
224
  return False
225
 
226
- # --- System Prompt ---
227
  SYSTEM_PROMPT = """You are Mimir, an expert multi-concept tutor designed to facilitate genuine learning and understanding. Your primary mission is to guide students through the learning process rather than providing direct answers to academic work.
 
228
  ## Core Educational Principles
229
  - Provide comprehensive, educational responses that help students truly understand concepts
230
  - Use minimal formatting, with markdown bolding reserved for **key terms** only
231
  - Prioritize teaching methodology over answer delivery
232
  - Foster critical thinking and independent problem-solving skills
 
233
  ## Tone and Communication Style
234
  - Maintain an engaging, friendly tone appropriate for high school students
235
  - Write at a reading level that is accessible yet intellectually stimulating
@@ -239,52 +242,64 @@ SYSTEM_PROMPT = """You are Mimir, an expert multi-concept tutor designed to faci
239
  - Skip flattery and respond directly to questions
240
  - Do not use emojis or actions in asterisks unless specifically requested
241
  - Present critiques and corrections kindly as educational opportunities
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  ## Academic Integrity Approach
243
- You recognize that students may seek direct answers to homework, assignments, or test questions. Rather than providing complete solutions or making accusations about intent, you should:
244
- - **Guide through processes**: Break down problems into conceptual components and teach underlying principles
245
- - **Ask clarifying questions**: Understand what the student already knows and where their confusion lies
246
- - **Provide similar examples**: Work through analogous problems that demonstrate the same concepts without directly solving their specific assignment
247
- - **Encourage original thinking**: Help students develop their own reasoning and analytical skills
248
- - **Suggest study strategies**: Recommend effective learning approaches for the subject matter
249
- ## Visual Learning Enhancement
250
- You have the ability to create graphs and charts to enhance your explanations. Use this capability proactively when:
251
- - Explaining mathematical concepts (functions, distributions, relationships)
252
- - Teaching statistical analysis or data interpretation
253
- - Discussing scientific trends, patterns, or experimental results
254
- - Comparing different options, outcomes, or scenarios
255
- - Illustrating economic principles, business metrics, or financial concepts
256
- - Showing survey results, demographic data, or research findings
257
- - Demonstrating any concept where visualization aids comprehension
258
- **Important**: Only use the graph tool when visualization would genuinely help explain a concept. For general conversation, explanations, or questions that don't involve data or relationships, respond normally without tools.
259
  ## Response Guidelines
260
- - **For math problems**: Explain concepts, provide formula derivations, and guide through problem-solving steps without computing final numerical answers
261
- - **For multiple-choice questions**: Discuss the concepts being tested and help students understand how to analyze options rather than identifying the correct choice
262
- - **For essays or written work**: Discuss research strategies, organizational techniques, and critical thinking approaches rather than providing content or thesis statements
263
- - **For factual questions**: Provide educational context and encourage students to synthesize information rather than stating direct answers
264
- ## Communication Guidelines
265
- - Maintain a supportive, non-judgmental tone in all interactions
266
- - Assume positive intent while redirecting toward genuine learning
267
- - Use Socratic questioning to promote discovery and critical thinking
268
- - Celebrate understanding and progress in the learning process
269
- - Encourage students to explain their thinking and reasoning
270
- - Provide honest, accurate feedback even when it may not be what the student wants to hear
271
- Your goal is to be an educational partner who empowers students to succeed through understanding, not a service that completes their work for them."""
272
-
273
- # --- Updated LLM Class with Microsoft Phi-2 and TinyLlama fallback ---
274
- class Phi2EducationalLLM(Runnable):
275
- """LLM class optimized for Microsoft Phi-2 with TinyLlama fallback for educational tasks"""
276
 
277
- def __init__(self, model_path: str = "microsoft/phi-2", fallback_model: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0", use_4bit: bool = False):
278
  super().__init__()
279
- logger.info(f"Loading model: {model_path} (use_4bit={use_4bit})")
280
  start_Loading_Model_time = time.perf_counter()
281
  current_time = datetime.now()
282
 
283
  self.model_name = model_path
284
 
285
  try:
286
- # Load tokenizer
287
- self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
 
 
 
288
 
289
  if use_4bit:
290
  quant_config = BitsAndBytesConfig(
@@ -296,14 +311,14 @@ class Phi2EducationalLLM(Runnable):
296
  llm_int8_skip_modules=["lm_head"]
297
  )
298
 
299
- # Try quantized load
300
  self.model = AutoModelForCausalLM.from_pretrained(
301
  model_path,
302
  quantization_config=quant_config,
303
  device_map="auto",
304
- torch_dtype=torch.float16,
305
  trust_remote_code=True,
306
- low_cpu_mem_usage=True
 
307
  )
308
  else:
309
  self._load_optimized_model(model_path)
@@ -314,45 +329,48 @@ class Phi2EducationalLLM(Runnable):
314
  log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Model: {model_path}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
315
 
316
  except Exception as e:
317
- logger.warning(f"Primary model {model_path} failed, trying fallback {fallback_model}: {e}")
318
- self._load_fallback_model(fallback_model)
319
- end_Loading_Model_time = time.perf_counter()
320
- Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
321
- log_metric(f"Model Load time (fallback): {Loading_Model_time:0.4f} seconds. Model: {fallback_model}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
322
 
323
- # Ensure pad token
324
  if self.tokenizer.pad_token is None:
325
  self.tokenizer.pad_token = self.tokenizer.eos_token
326
 
 
 
 
327
  def _load_optimized_model(self, model_path: str):
328
- """Optimized model loading for 16GB RAM systems."""
329
  self.model = AutoModelForCausalLM.from_pretrained(
330
  model_path,
331
- torch_dtype=torch.float16, # Use float16 to save memory
332
- device_map="cpu", # Force CPU for stability
333
  trust_remote_code=True,
334
  low_cpu_mem_usage=True,
335
- max_memory={0: "14GB"} # Reserve 2GB for system/gradio
336
  )
337
 
338
- def _load_fallback_model(self, fallback_model: str):
339
- """Fallback to TinyLlama if Phi-2 fails."""
340
- logger.info(f"Loading fallback model: {fallback_model}")
341
-
342
- # Update tokenizer for fallback model
343
- self.tokenizer = AutoTokenizer.from_pretrained(fallback_model, trust_remote_code=True)
344
- self.model_name = fallback_model
345
-
346
- self.model = AutoModelForCausalLM.from_pretrained(
347
- fallback_model,
348
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
349
- device_map="cpu",
350
- trust_remote_code=True,
351
- low_cpu_mem_usage=True
352
- )
 
 
 
353
 
354
  def invoke(self, input: Input, config=None) -> Output:
355
- """Main invoke method optimized for educational tasks"""
356
  start_invoke_time = time.perf_counter()
357
  current_time = datetime.now()
358
 
@@ -363,42 +381,37 @@ class Phi2EducationalLLM(Runnable):
363
  prompt = str(input)
364
 
365
  try:
366
- # Try chat template first (works with Phi-2 and TinyLlama)
367
- try:
368
- messages = [
369
- {"role": "system", "content": SYSTEM_PROMPT},
370
- {"role": "user", "content": prompt}
371
- ]
372
- text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
373
- except:
374
- # Fallback for models without chat template support
375
- if "phi" in self.model_name.lower():
376
- # Phi-2 proper format
377
- text = f"{SYSTEM_PROMPT}\n\nQuestion: {prompt}\nAnswer:"
378
- else:
379
- # Generic format for other models
380
- text = f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
381
 
382
- inputs = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=1024)
383
- if torch.cuda.is_available():
384
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
 
 
 
 
 
 
385
 
386
  with torch.no_grad():
387
  outputs = self.model.generate(
388
  **inputs,
389
- max_new_tokens=600, # Sufficient for comprehensive educational responses
390
  do_sample=True,
391
- temperature=0.7, # Good balance for educational content
392
  top_p=0.9,
393
- top_k=50, # Reasonable variety for educational explanations
394
  repetition_penalty=1.1,
395
  pad_token_id=self.tokenizer.eos_token_id,
396
  early_stopping=True,
397
- use_cache=True # Enable KV cache for faster generation
398
  )
399
 
400
- new_tokens = [out[len(inp):] for inp, out in zip(inputs.input_ids, outputs)]
401
- result = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
 
402
 
403
  end_invoke_time = time.perf_counter()
404
  invoke_time = end_invoke_time - start_invoke_time
@@ -414,115 +427,78 @@ class Phi2EducationalLLM(Runnable):
414
  return f"[Error generating response: {str(e)}]"
415
 
416
  def stream_generate(self, input: Input, config=None):
417
- """Streaming generation method for real-time response display"""
418
  start_stream_time = time.perf_counter()
419
  current_time = datetime.now()
420
- logger.info("Starting stream_generate...")
421
 
422
- # Handle both string and dict inputs for flexibility
423
  if isinstance(input, dict):
424
  prompt = input.get('input', str(input))
425
  else:
426
  prompt = str(input)
427
 
428
  try:
429
- # Prepare input text with better error handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
  try:
431
- messages = [
432
- {"role": "system", "content": SYSTEM_PROMPT},
433
- {"role": "user", "content": prompt}
434
- ]
435
- text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
436
- logger.info("Successfully used chat template")
437
  except Exception as e:
438
- logger.warning(f"Failed to use chat template: {e}")
439
- if "phi" in self.model_name.lower():
440
- text = f"Instruct: {SYSTEM_PROMPT}\n\nUser: {prompt}\nOutput:"
441
- logger.info("Using Phi-2 format")
442
- else:
443
- text = f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
444
- logger.info("Using generic format")
445
-
446
- inputs = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=1024)
447
- if torch.cuda.is_available():
448
- inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
449
 
450
- # Initialize for streaming
451
- generated_tokens = []
452
- max_new_tokens = 600
453
- logger.info("Beginning token-by-token generation...")
454
-
455
- # Generate token by token
456
- current_input_ids = inputs.input_ids
457
- current_attention_mask = inputs.attention_mask
458
-
459
- for step in range(max_new_tokens):
460
- try:
461
- with torch.no_grad():
462
- outputs = self.model(
463
- input_ids=current_input_ids,
464
- attention_mask=current_attention_mask,
465
- use_cache=True
466
- )
467
-
468
- # Get next token probabilities
469
- next_token_logits = outputs.logits[:, -1, :]
470
-
471
- # Apply temperature and sampling
472
- next_token_logits = next_token_logits / 0.7
473
-
474
- # Apply top-k and top-p filtering
475
- filtered_logits = self._top_k_top_p_filtering(next_token_logits, top_k=50, top_p=0.9)
476
-
477
- # Sample next token
478
- probs = torch.nn.functional.softmax(filtered_logits, dim=-1)
479
- next_token = torch.multinomial(probs, num_samples=1)
480
-
481
- # Check for end of sequence
482
- if next_token.item() == self.tokenizer.eos_token_id:
483
- logger.info(f"Reached EOS token at step {step}")
484
- break
485
-
486
- # Add to generated tokens
487
- generated_tokens.append(next_token.item())
488
-
489
- # Decode and yield partial result every few tokens for efficiency
490
- if step % 3 == 0 or step < 10: # Yield more frequently at start
491
- partial_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
492
- if partial_text.strip(): # Only yield non-empty text
493
- yield partial_text
494
-
495
- # Safety checks to prevent infinite loops
496
- if step > 10 and len(generated_tokens) == 0:
497
- logger.error("No tokens generated after 10 steps, breaking")
498
- break
499
-
500
- if step > 50 and len(partial_text.strip()) < 10:
501
- logger.warning("Very little text generated, continuing...")
502
-
503
- # Update input for next iteration
504
- current_input_ids = torch.cat([current_input_ids, next_token], dim=-1)
505
- current_attention_mask = torch.cat([
506
- current_attention_mask,
507
- torch.ones((1, 1), dtype=current_attention_mask.dtype, device=current_attention_mask.device)
508
- ], dim=-1)
509
-
510
- except Exception as e:
511
- logger.error(f"Error in generation step {step}: {e}")
512
- break
513
-
514
- # Final result
515
- final_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
516
- if final_text:
517
- yield final_text
518
- else:
519
- logger.error("No final text generated")
520
- yield "I'm having trouble generating a response. Please try again."
521
 
522
  end_stream_time = time.perf_counter()
523
  stream_time = end_stream_time - start_stream_time
524
- log_metric(f"LLM Stream time: {stream_time:0.4f} seconds. Tokens generated: {len(generated_tokens)}. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
525
- logger.info(f"Stream generation completed: {len(generated_tokens)} tokens in {stream_time:.2f}s")
526
 
527
  except Exception as e:
528
  logger.error(f"Streaming generation error: {e}")
@@ -531,29 +507,6 @@ class Phi2EducationalLLM(Runnable):
531
  log_metric(f"LLM Stream time (error): {stream_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
532
  yield f"[Error in streaming generation: {str(e)}]"
533
 
534
- def _top_k_top_p_filtering(self, logits, top_k=50, top_p=0.9):
535
- """Apply top-k and top-p filtering to logits"""
536
- if top_k > 0:
537
- # Get top-k indices
538
- top_k = min(top_k, logits.size(-1))
539
- indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
540
- logits[indices_to_remove] = float('-inf')
541
-
542
- if top_p < 1.0:
543
- # Sort and get cumulative probabilities
544
- sorted_logits, sorted_indices = torch.sort(logits, descending=True)
545
- cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
546
-
547
- # Remove tokens with cumulative probability above the threshold
548
- sorted_indices_to_remove = cumulative_probs > top_p
549
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
550
- sorted_indices_to_remove[..., 0] = 0
551
-
552
- indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
553
- logits[indices_to_remove] = float('-inf')
554
-
555
- return logits
556
-
557
  @property
558
  def InputType(self) -> Type[Input]:
559
  return str
@@ -562,15 +515,15 @@ class Phi2EducationalLLM(Runnable):
562
  def OutputType(self) -> Type[Output]:
563
  return str
564
 
565
- # --- LangGraph Agent Implementation ---
566
  class Educational_Agent:
567
- """Modern LangGraph-based educational agent with Phi-2 and streaming"""
568
 
569
  def __init__(self):
570
  start_init_and_langgraph_time = time.perf_counter()
571
  current_time = datetime.now()
572
 
573
- self.llm = Phi2EducationalLLM(model_path="microsoft/phi-2", fallback_model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", use_4bit=False)
574
  self.tool_decision_engine = Tool_Decision_Engine(self.llm)
575
 
576
  # Create LangGraph workflow
@@ -581,29 +534,33 @@ class Educational_Agent:
581
  log_metric(f"Init and LangGraph workflow setup time: {init_and_langgraph_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
582
 
583
  def _create_langgraph_workflow(self):
584
- """Create the complete LangGraph workflow"""
585
  # Define tools
586
  tools = [Create_Graph_Tool]
587
  tool_node = ToolNode(tools)
588
 
589
- # Bind tools to model
590
- model_with_tools = self.llm
591
-
592
  def should_continue(state: EducationalAgentState) -> str:
593
  """Determine next step in the workflow"""
594
  messages = state["messages"]
595
  last_message = messages[-1]
596
 
597
- # If we have tool calls, execute them
598
  if hasattr(last_message, 'tool_calls') and last_message.tool_calls:
599
  logger.info("Executing tools based on model decision")
600
  return "tools"
601
 
602
- # Otherwise, we're done
 
 
 
 
 
 
 
603
  return END
604
 
605
  def call_model(state: EducationalAgentState) -> dict:
606
- """Call the model with tool decision logic"""
607
  start_call_model_time = time.perf_counter()
608
  current_time = datetime.now()
609
 
@@ -619,74 +576,38 @@ class Educational_Agent:
619
  # Decide if tools should be used
620
  needs_tools = self.tool_decision_engine.should_use_visualization(user_query)
621
 
622
- if needs_tools:
623
- logger.info("Query requires visualization - model will consider tools")
624
- # Create a special prompt that encourages tool use
625
- enhanced_messages = messages + [
626
- SystemMessage(content="The user's query would benefit from visualization. Consider using the Create_Graph_Tool if appropriate for educational purposes.")
627
- ]
628
- else:
629
- logger.info("Query doesn't need tools - responding normally")
630
- enhanced_messages = messages
631
-
632
  try:
633
- # For this implementation, we'll handle tool calling manually
634
- # since our custom LLM doesn't automatically generate tool calls
635
-
636
  if needs_tools:
637
- # Try to generate a tool call based on the query
638
- tool_call_prompt = f"""
639
- Based on the user's educational query: "{user_query}"
 
 
 
640
 
641
- If this query would benefit from a graph or chart, create a JSON configuration for the Create_Graph_Tool.
642
- Otherwise, provide a regular educational response.
643
 
644
- If you decide to create a visualization, format it as:
645
- TOOL_CALL: Create_Graph_Tool
646
  {{
647
- "data": {{"key": "value"}},
648
  "plot_type": "bar|line|pie",
649
- "title": "Chart Title",
650
- "x_label": "X Label",
651
- "y_label": "Y Label",
652
- "educational_context": "Why this visualization helps learning"
653
  }}
 
654
 
655
- Otherwise, provide a regular educational response.
656
  """
657
-
658
- response = model_with_tools.invoke(tool_call_prompt)
659
-
660
- # Check if the response contains a tool call
661
- if "TOOL_CALL:" in response and "Create_Graph_Tool" in response:
662
- # Extract the JSON part
663
- json_start = response.find("{")
664
- json_end = response.rfind("}") + 1
665
- if json_start != -1 and json_end > json_start:
666
- json_config = response[json_start:json_end]
667
-
668
- # Create a mock tool call message
669
- tool_call_message = AIMessage(
670
- content="I'll create a visualization to help explain this concept.",
671
- tool_calls=[{
672
- "name": "Create_Graph_Tool",
673
- "args": {"graph_config": json_config},
674
- "id": "tool_call_1"
675
- }]
676
- )
677
-
678
- end_call_model_time = time.perf_counter()
679
- call_model_time = end_call_model_time - start_call_model_time
680
- log_metric(f"Call model time (with tools): {call_model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
681
-
682
- return {"messages": [tool_call_message]}
683
-
684
- # Regular response without tools
685
- response = model_with_tools.invoke(enhanced_messages)
686
 
687
  end_call_model_time = time.perf_counter()
688
  call_model_time = end_call_model_time - start_call_model_time
689
- log_metric(f"Call model time (no tools): {call_model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
690
 
691
  return {"messages": [AIMessage(content=response)]}
692
 
@@ -700,7 +621,7 @@ Otherwise, provide a regular educational response.
700
  return {"messages": [error_response]}
701
 
702
  def handle_tools(state: EducationalAgentState) -> dict:
703
- """Handle tool execution"""
704
  start_handle_tools_time = time.perf_counter()
705
  current_time = datetime.now()
706
 
@@ -708,29 +629,39 @@ Otherwise, provide a regular educational response.
708
  messages = state["messages"]
709
  last_message = messages[-1]
710
 
711
- if hasattr(last_message, 'tool_calls') and last_message.tool_calls:
712
- tool_call = last_message.tool_calls[0]
713
- if tool_call["name"] == "Create_Graph_Tool":
714
- # Execute the graph tool
715
- graph_config = tool_call["args"]["graph_config"]
716
- result = Create_Graph_Tool.invoke({"graph_config": graph_config})
717
-
718
- # Create tool message
719
- tool_message = ToolMessage(
720
- content=result,
721
- tool_call_id=tool_call["id"]
722
- )
723
-
724
- end_handle_tools_time = time.perf_counter()
725
- handle_tools_time = end_handle_tools_time - start_handle_tools_time
726
- log_metric(f"Handle tools time: {handle_tools_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
727
-
728
- return {"messages": [tool_message]}
 
 
 
 
 
 
 
 
 
 
729
 
730
- # If no valid tool call, return empty
731
  end_handle_tools_time = time.perf_counter()
732
  handle_tools_time = end_handle_tools_time - start_handle_tools_time
733
- log_metric(f"Handle tools time (no valid call): {handle_tools_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
734
 
735
  return {"messages": []}
736
 
@@ -740,11 +671,7 @@ Otherwise, provide a regular educational response.
740
  handle_tools_time = end_handle_tools_time - start_handle_tools_time
741
  log_metric(f"Handle tools time (error): {handle_tools_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
742
 
743
- error_msg = ToolMessage(
744
- content=f"Tool execution failed: {str(e)}",
745
- tool_call_id="error"
746
- )
747
- return {"messages": [error_msg]}
748
 
749
  # Build the workflow
750
  workflow = StateGraph(EducationalAgentState)
@@ -763,7 +690,7 @@ Otherwise, provide a regular educational response.
763
  END: END,
764
  }
765
  )
766
- workflow.add_edge("tools", "agent")
767
 
768
  # Add memory
769
  memory = MemorySaver()
@@ -796,7 +723,7 @@ Otherwise, provide a regular educational response.
796
  return f"I apologize, but I encountered an error: {str(e)}"
797
 
798
  def stream_chat(self, message: str, thread_id: str = "default"):
799
- """Streaming chat interface that yields partial responses"""
800
  start_chat_time = time.perf_counter()
801
  current_time = datetime.now()
802
 
@@ -810,17 +737,17 @@ Otherwise, provide a regular educational response.
810
  "educational_context": None
811
  }
812
 
813
- # First check if tools are needed
814
  user_query = message
815
  needs_tools = self.tool_decision_engine.should_use_visualization(user_query)
816
 
817
  if needs_tools:
818
  logger.info("Query requires visualization - handling tool call first")
819
- # Handle tool generation first (non-streaming for tools)
820
  result = self.app.invoke(initial_state, config=config)
821
  final_messages = result["messages"]
822
 
823
- # Build the response from all assistant and tool messages
824
  response_parts = []
825
  for msg in final_messages:
826
  if isinstance(msg, AIMessage) and msg.content:
@@ -834,8 +761,8 @@ Otherwise, provide a regular educational response.
834
  yield final_response
835
 
836
  else:
837
- logger.info("Streaming regular response without tools")
838
- # Stream the LLM response directly
839
  for partial_text in self.llm.stream_generate(message):
840
  yield smart_truncate(partial_text, max_length=3000)
841
 
@@ -866,7 +793,7 @@ mathjax_config = '''
866
  window.MathJax = {
867
  tex: {
868
  inlineMath: [['\\\\(', '\\\\)']],
869
- displayMath: [['$$', '$$'], ['\\\\[', '\\\\]']],
870
  packages: {'[+]': ['ams']}
871
  },
872
  svg: {fontCache: 'global'},
@@ -936,7 +863,7 @@ def smart_truncate(text, max_length=3000):
936
  return result
937
 
938
  def generate_response_with_agent(message, max_retries=3):
939
- """Generate streaming response using LangGraph agent."""
940
  start_generate_response_with_agent_time = time.perf_counter()
941
  current_time = datetime.now()
942
 
@@ -1016,20 +943,20 @@ def warmup_agent():
1016
  start_agent_warmup_time = time.perf_counter()
1017
  current_time = datetime.now()
1018
 
1019
- logger.info("Warming up LangGraph agent with test query...")
1020
  try:
1021
  current_agent = get_agent()
1022
 
1023
  # Run a simple test query
1024
  test_response = current_agent.chat("Hello, this is a warmup test.")
1025
- logger.info(f"LangGraph agent warmup completed successfully! Test response length: {len(test_response)} chars")
1026
 
1027
  end_agent_warmup_time = time.perf_counter()
1028
  agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
1029
  log_metric(f"Agent warmup time: {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
1030
 
1031
  except Exception as e:
1032
- logger.error(f"LangGraph agent warmup failed: {e}")
1033
  end_agent_warmup_time = time.perf_counter()
1034
  agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
1035
  log_metric(f"Agent warmup time (error): {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
@@ -1064,7 +991,7 @@ def create_interface():
1064
 
1065
  with gr.Column(elem_classes=["main-container"]):
1066
  # Title Section
1067
- gr.HTML('<div class="title-header"><h1> Mimir 🎓</h1></div>')
1068
 
1069
  # Chat Section
1070
  with gr.Row():
@@ -1112,18 +1039,18 @@ def create_interface():
1112
  if __name__ == "__main__":
1113
  try:
1114
  logger.info("=" * 50)
1115
- logger.info("Starting Mimir Application with Microsoft Phi-2 and Streaming")
1116
  logger.info("=" * 50)
1117
 
1118
  # Step 1: Preload the model and agent
1119
- logger.info("Loading AI model and LangGraph workflow...")
1120
  start_time = time.time()
1121
  agent = Educational_Agent()
1122
  load_time = time.time() - start_time
1123
- logger.info(f"LangGraph agent loaded successfully in {load_time:.2f} seconds")
1124
 
1125
  # Step 2: Warm up the model
1126
- logger.info("Warming up LangGraph model...")
1127
  warmup_agent()
1128
 
1129
  interface = create_interface()
@@ -1136,5 +1063,5 @@ if __name__ == "__main__":
1136
  )
1137
 
1138
  except Exception as e:
1139
- logger.error(f"❌ Failed to launch Mimir with Microsoft Phi-2: {e}")
1140
  raise
 
11
  import logging
12
  import re
13
  import json
14
+ import threading
15
  from datetime import datetime
16
  from typing import Annotated, Sequence, TypedDict, List, Optional, Any, Type
17
  from pydantic import BaseModel, Field
18
 
19
+ # LangGraph imports
20
  from langgraph.graph import StateGraph, START, END
21
  from langgraph.graph.message import add_messages
22
  from langgraph.checkpoint.memory import MemorySaver
 
25
  # Updated LangChain imports
26
  from langchain_core.tools import tool
27
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage, ToolMessage, BaseMessage
28
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
29
  from langchain_core.runnables import Runnable
30
  from langchain_core.runnables.utils import Input, Output
31
 
32
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
33
  import torch
34
 
35
  load_dotenv(".env")
 
224
  log_metric(f"Tool decision time (error): {graph_decision_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
225
  return False
226
 
227
+ # --- System Prompt with Tool-Usage Instructions for Phi-3-mini ---
228
  SYSTEM_PROMPT = """You are Mimir, an expert multi-concept tutor designed to facilitate genuine learning and understanding. Your primary mission is to guide students through the learning process rather than providing direct answers to academic work.
229
+
230
  ## Core Educational Principles
231
  - Provide comprehensive, educational responses that help students truly understand concepts
232
  - Use minimal formatting, with markdown bolding reserved for **key terms** only
233
  - Prioritize teaching methodology over answer delivery
234
  - Foster critical thinking and independent problem-solving skills
235
+
236
  ## Tone and Communication Style
237
  - Maintain an engaging, friendly tone appropriate for high school students
238
  - Write at a reading level that is accessible yet intellectually stimulating
 
242
  - Skip flattery and respond directly to questions
243
  - Do not use emojis or actions in asterisks unless specifically requested
244
  - Present critiques and corrections kindly as educational opportunities
245
+
246
+ ## Tool Usage Instructions
247
+ You have access to a Create_Graph_Tool that can create educational visualizations. When a query would benefit from visual representation, you should use this tool by outputting a properly formatted JSON configuration.
248
+
249
+ To use the Create_Graph_Tool, format your response like this:
250
+ ```json
251
+ {
252
+ "data": {"Category 1": 30, "Category 2": 45, "Category 3": 25},
253
+ "plot_type": "bar",
254
+ "title": "Example Chart",
255
+ "x_label": "Categories",
256
+ "y_label": "Values",
257
+ "educational_context": "This visualization helps students understand..."
258
+ }
259
+ ```
260
+
261
+ Use this tool for:
262
+ - Mathematical functions and relationships
263
+ - Statistical distributions and data analysis
264
+ - Scientific trends and comparisons
265
+ - Economic models and business metrics
266
+ - Any concept where visualization aids comprehension
267
+
268
  ## Academic Integrity Approach
269
+ Rather than providing complete solutions, you should:
270
+ - **Guide through processes**: Break down problems into conceptual components
271
+ - **Ask clarifying questions**: Understand what the student knows
272
+ - **Provide similar examples**: Work through analogous problems
273
+ - **Encourage original thinking**: Help students develop reasoning skills
274
+ - **Suggest study strategies**: Recommend effective learning approaches
275
+
 
 
 
 
 
 
 
 
 
276
  ## Response Guidelines
277
+ - **For math problems**: Explain concepts and guide through steps without computing final answers
278
+ - **For multiple-choice questions**: Discuss concepts being tested rather than identifying correct choices
279
+ - **For essays**: Discuss research strategies and organizational techniques
280
+ - **For factual questions**: Provide educational context and encourage synthesis
281
+
282
+ Your goal is to be an educational partner who empowers students to succeed through understanding."""
283
+
284
+ # --- Updated LLM Class with Phi-3-mini and TextIteratorStreamer ---
285
+ class Phi3MiniEducationalLLM(Runnable):
286
+ """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with TextIteratorStreamer"""
 
 
 
 
 
 
287
 
288
+ def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_4bit: bool = False):
289
  super().__init__()
290
+ logger.info(f"Loading Phi-3-mini model: {model_path} (use_4bit={use_4bit})")
291
  start_Loading_Model_time = time.perf_counter()
292
  current_time = datetime.now()
293
 
294
  self.model_name = model_path
295
 
296
  try:
297
+ # Load tokenizer - Phi-3 requires trust_remote_code
298
+ self.tokenizer = AutoTokenizer.from_pretrained(
299
+ model_path,
300
+ trust_remote_code=True,
301
+ token=hf_token
302
+ )
303
 
304
  if use_4bit:
305
  quant_config = BitsAndBytesConfig(
 
311
  llm_int8_skip_modules=["lm_head"]
312
  )
313
 
 
314
  self.model = AutoModelForCausalLM.from_pretrained(
315
  model_path,
316
  quantization_config=quant_config,
317
  device_map="auto",
318
+ dtype=torch.float16,
319
  trust_remote_code=True,
320
+ low_cpu_mem_usage=True,
321
+ token=hf_token
322
  )
323
  else:
324
  self._load_optimized_model(model_path)
 
329
  log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Model: {model_path}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
330
 
331
  except Exception as e:
332
+ logger.error(f"Failed to load Phi-3-mini model {model_path}: {e}")
333
+ raise
 
 
 
334
 
335
+ # Ensure pad token exists
336
  if self.tokenizer.pad_token is None:
337
  self.tokenizer.pad_token = self.tokenizer.eos_token
338
 
339
+ # Initialize TextIteratorStreamer
340
+ self.streamer = None
341
+
342
  def _load_optimized_model(self, model_path: str):
343
+ """Optimized model loading for Phi-3-mini."""
344
  self.model = AutoModelForCausalLM.from_pretrained(
345
  model_path,
346
+ dtype=torch.float16, # Use float16 to save memory
347
+ device_map="auto", # Let transformers decide placement
348
  trust_remote_code=True,
349
  low_cpu_mem_usage=True,
350
+ token=hf_token
351
  )
352
 
353
+ def _format_chat_template(self, prompt: str) -> str:
354
+ """Format prompt using Phi-3's chat template"""
355
+ try:
356
+ messages = [
357
+ {"role": "system", "content": SYSTEM_PROMPT},
358
+ {"role": "user", "content": prompt}
359
+ ]
360
+ # Use Phi-3's chat template
361
+ formatted_text = self.tokenizer.apply_chat_template(
362
+ messages,
363
+ tokenize=False,
364
+ add_generation_prompt=True
365
+ )
366
+ return formatted_text
367
+ except Exception as e:
368
+ logger.warning(f"Chat template failed, using fallback format: {e}")
369
+ # Fallback to manual Phi-3 format
370
+ return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
371
 
372
  def invoke(self, input: Input, config=None) -> Output:
373
+ """Main invoke method optimized for Phi-3-mini"""
374
  start_invoke_time = time.perf_counter()
375
  current_time = datetime.now()
376
 
 
381
  prompt = str(input)
382
 
383
  try:
384
+ # Format using Phi-3 chat template
385
+ text = self._format_chat_template(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+ inputs = self.tokenizer(
388
+ text,
389
+ return_tensors="pt",
390
+ padding=True,
391
+ truncation=True,
392
+ max_length=3072 # Leave room for generation within 4k context
393
+ )
394
+
395
+ # Move to model device
396
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
397
 
398
  with torch.no_grad():
399
  outputs = self.model.generate(
400
  **inputs,
401
+ max_new_tokens=800, # Increased for comprehensive responses
402
  do_sample=True,
403
+ temperature=0.7, # Good balance for educational content
404
  top_p=0.9,
405
+ top_k=50,
406
  repetition_penalty=1.1,
407
  pad_token_id=self.tokenizer.eos_token_id,
408
  early_stopping=True,
409
+ use_cache=True
410
  )
411
 
412
+ # Decode only new tokens
413
+ new_tokens = outputs[0][len(inputs.input_ids[0]):]
414
+ result = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
415
 
416
  end_invoke_time = time.perf_counter()
417
  invoke_time = end_invoke_time - start_invoke_time
 
427
  return f"[Error generating response: {str(e)}]"
428
 
429
  def stream_generate(self, input: Input, config=None):
430
+ """Streaming generation using TextIteratorStreamer"""
431
  start_stream_time = time.perf_counter()
432
  current_time = datetime.now()
433
+ logger.info("Starting stream_generate with TextIteratorStreamer...")
434
 
435
+ # Handle both string and dict inputs
436
  if isinstance(input, dict):
437
  prompt = input.get('input', str(input))
438
  else:
439
  prompt = str(input)
440
 
441
  try:
442
+ # Format using Phi-3 chat template
443
+ text = self._format_chat_template(prompt)
444
+
445
+ inputs = self.tokenizer(
446
+ text,
447
+ return_tensors="pt",
448
+ padding=True,
449
+ truncation=True,
450
+ max_length=3072
451
+ )
452
+
453
+ # Move to model device
454
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
455
+
456
+ # Initialize TextIteratorStreamer
457
+ streamer = TextIteratorStreamer(
458
+ self.tokenizer,
459
+ skip_prompt=True,
460
+ skip_special_tokens=True
461
+ )
462
+
463
+ # Generation parameters
464
+ generation_kwargs = {
465
+ **inputs,
466
+ "max_new_tokens": 800,
467
+ "do_sample": True,
468
+ "temperature": 0.7,
469
+ "top_p": 0.9,
470
+ "top_k": 50,
471
+ "repetition_penalty": 1.1,
472
+ "pad_token_id": self.tokenizer.eos_token_id,
473
+ "streamer": streamer,
474
+ "use_cache": True
475
+ }
476
+
477
+ # Start generation in a separate thread
478
+ generation_thread = threading.Thread(
479
+ target=self.model.generate,
480
+ kwargs=generation_kwargs
481
+ )
482
+ generation_thread.start()
483
+
484
+ # Yield tokens as they become available
485
+ generated_text = ""
486
  try:
487
+ for new_text in streamer:
488
+ if new_text: # Only yield non-empty strings
489
+ generated_text += new_text
490
+ yield generated_text
 
 
491
  except Exception as e:
492
+ logger.error(f"Error in streaming iteration: {e}")
493
+ yield f"[Streaming error: {str(e)}]"
 
 
 
 
 
 
 
 
 
494
 
495
+ # Wait for generation to complete
496
+ generation_thread.join()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
 
498
  end_stream_time = time.perf_counter()
499
  stream_time = end_stream_time - start_stream_time
500
+ log_metric(f"LLM Stream time: {stream_time:0.4f} seconds. Generated length: {len(generated_text)} chars. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
501
+ logger.info(f"Stream generation completed: {len(generated_text)} chars in {stream_time:.2f}s")
502
 
503
  except Exception as e:
504
  logger.error(f"Streaming generation error: {e}")
 
507
  log_metric(f"LLM Stream time (error): {stream_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
508
  yield f"[Error in streaming generation: {str(e)}]"
509
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  @property
511
  def InputType(self) -> Type[Input]:
512
  return str
 
515
  def OutputType(self) -> Type[Output]:
516
  return str
517
 
518
+ # --- LangGraph Agent Implementation with Tool Calling ---
519
  class Educational_Agent:
520
+ """Modern LangGraph-based educational agent with Phi-3-mini and improved tool calling"""
521
 
522
  def __init__(self):
523
  start_init_and_langgraph_time = time.perf_counter()
524
  current_time = datetime.now()
525
 
526
+ self.llm = Phi3MiniEducationalLLM(model_path="microsoft/Phi-3-mini-4k-instruct", use_4bit=True)
527
  self.tool_decision_engine = Tool_Decision_Engine(self.llm)
528
 
529
  # Create LangGraph workflow
 
534
  log_metric(f"Init and LangGraph workflow setup time: {init_and_langgraph_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
535
 
536
  def _create_langgraph_workflow(self):
537
+ """Create the complete LangGraph workflow with improved tool calling"""
538
  # Define tools
539
  tools = [Create_Graph_Tool]
540
  tool_node = ToolNode(tools)
541
 
 
 
 
542
  def should_continue(state: EducationalAgentState) -> str:
543
  """Determine next step in the workflow"""
544
  messages = state["messages"]
545
  last_message = messages[-1]
546
 
547
+ # Check if we have tool calls to execute
548
  if hasattr(last_message, 'tool_calls') and last_message.tool_calls:
549
  logger.info("Executing tools based on model decision")
550
  return "tools"
551
 
552
+ # Check if the message content contains JSON for tool calling
553
+ if isinstance(last_message, AIMessage) and last_message.content:
554
+ content = last_message.content.strip()
555
+ # Look for JSON blocks that might be tool calls
556
+ if content.startswith('```json') and 'plot_type' in content:
557
+ logger.info("Found JSON tool configuration in message")
558
+ return "tools"
559
+
560
  return END
561
 
562
  def call_model(state: EducationalAgentState) -> dict:
563
+ """Call the model with enhanced tool decision logic"""
564
  start_call_model_time = time.perf_counter()
565
  current_time = datetime.now()
566
 
 
576
  # Decide if tools should be used
577
  needs_tools = self.tool_decision_engine.should_use_visualization(user_query)
578
 
 
 
 
 
 
 
 
 
 
 
579
  try:
 
 
 
580
  if needs_tools:
581
+ logger.info("Query requires visualization - prompting for tool use")
582
+ # Enhanced prompt that guides Phi-3 to generate tool calls
583
+ tool_prompt = f"""
584
+ You are an educational AI assistant. The user has asked: "{user_query}"
585
+
586
+ This query would benefit from a visualization. Please provide a helpful educational response AND include a JSON configuration for creating a graph or chart.
587
 
588
+ Format your response with explanatory text followed by a JSON block like this:
 
589
 
590
+ ```json
 
591
  {{
592
+ "data": {{"Category 1": value1, "Category 2": value2}},
593
  "plot_type": "bar|line|pie",
594
+ "title": "Descriptive Title",
595
+ "x_label": "X Axis Label",
596
+ "y_label": "Y Axis Label",
597
+ "educational_context": "Explanation of why this visualization helps learning"
598
  }}
599
+ ```
600
 
601
+ Make sure the data is relevant to the educational concept being discussed.
602
  """
603
+ response = self.llm.invoke(tool_prompt)
604
+ else:
605
+ # Regular educational response
606
+ response = self.llm.invoke(user_query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
607
 
608
  end_call_model_time = time.perf_counter()
609
  call_model_time = end_call_model_time - start_call_model_time
610
+ log_metric(f"Call model time: {call_model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
611
 
612
  return {"messages": [AIMessage(content=response)]}
613
 
 
621
  return {"messages": [error_response]}
622
 
623
  def handle_tools(state: EducationalAgentState) -> dict:
624
+ """Handle tool execution by parsing JSON from message content"""
625
  start_handle_tools_time = time.perf_counter()
626
  current_time = datetime.now()
627
 
 
629
  messages = state["messages"]
630
  last_message = messages[-1]
631
 
632
+ if isinstance(last_message, AIMessage) and last_message.content:
633
+ content = last_message.content
634
+
635
+ # Extract JSON from code blocks
636
+ json_pattern = r'```json\s*(\{.*?\})\s*```'
637
+ json_match = re.search(json_pattern, content, re.DOTALL)
638
+
639
+ if json_match:
640
+ json_str = json_match.group(1)
641
+ try:
642
+ # Validate and execute the tool
643
+ json.loads(json_str) # Validate JSON
644
+ result = Create_Graph_Tool.invoke({"graph_config": json_str})
645
+
646
+ # Create a response that combines the explanation with the visualization
647
+ text_before_json = content[:json_match.start()].strip()
648
+ combined_response = f"{text_before_json}\n\n{result}"
649
+
650
+ end_handle_tools_time = time.perf_counter()
651
+ handle_tools_time = end_handle_tools_time - start_handle_tools_time
652
+ log_metric(f"Handle tools time: {handle_tools_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
653
+
654
+ # Replace the last message with the combined response
655
+ return {"messages": [AIMessage(content=combined_response)]}
656
+
657
+ except json.JSONDecodeError as e:
658
+ logger.error(f"Invalid JSON in tool call: {e}")
659
+ return {"messages": [AIMessage(content=f"{content}\n\n[Error: Invalid JSON format for visualization]")]}
660
 
661
+ # If no valid tool call found, return the message as-is
662
  end_handle_tools_time = time.perf_counter()
663
  handle_tools_time = end_handle_tools_time - start_handle_tools_time
664
+ log_metric(f"Handle tools time (no tool found): {handle_tools_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
665
 
666
  return {"messages": []}
667
 
 
671
  handle_tools_time = end_handle_tools_time - start_handle_tools_time
672
  log_metric(f"Handle tools time (error): {handle_tools_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
673
 
674
+ return {"messages": [AIMessage(content=f"Tool execution failed: {str(e)}")]}
 
 
 
 
675
 
676
  # Build the workflow
677
  workflow = StateGraph(EducationalAgentState)
 
690
  END: END,
691
  }
692
  )
693
+ workflow.add_edge("tools", END) # After tools, we're done
694
 
695
  # Add memory
696
  memory = MemorySaver()
 
723
  return f"I apologize, but I encountered an error: {str(e)}"
724
 
725
  def stream_chat(self, message: str, thread_id: str = "default"):
726
+ """Streaming chat interface that yields partial responses using TextIteratorStreamer"""
727
  start_chat_time = time.perf_counter()
728
  current_time = datetime.now()
729
 
 
737
  "educational_context": None
738
  }
739
 
740
+ # Check if tools are needed
741
  user_query = message
742
  needs_tools = self.tool_decision_engine.should_use_visualization(user_query)
743
 
744
  if needs_tools:
745
  logger.info("Query requires visualization - handling tool call first")
746
+ # Handle tool generation (non-streaming for tools since they involve JSON parsing)
747
  result = self.app.invoke(initial_state, config=config)
748
  final_messages = result["messages"]
749
 
750
+ # Build the response from all messages
751
  response_parts = []
752
  for msg in final_messages:
753
  if isinstance(msg, AIMessage) and msg.content:
 
761
  yield final_response
762
 
763
  else:
764
+ logger.info("Streaming regular response without tools using TextIteratorStreamer")
765
+ # Stream the LLM response directly using TextIteratorStreamer
766
  for partial_text in self.llm.stream_generate(message):
767
  yield smart_truncate(partial_text, max_length=3000)
768
 
 
793
  window.MathJax = {
794
  tex: {
795
  inlineMath: [['\\\\(', '\\\\)']],
796
+ displayMath: [['$', '$'], ['\\\\[', '\\\\]']],
797
  packages: {'[+]': ['ams']}
798
  },
799
  svg: {fontCache: 'global'},
 
863
  return result
864
 
865
  def generate_response_with_agent(message, max_retries=3):
866
+ """Generate streaming response using LangGraph agent with Phi-3-mini."""
867
  start_generate_response_with_agent_time = time.perf_counter()
868
  current_time = datetime.now()
869
 
 
943
  start_agent_warmup_time = time.perf_counter()
944
  current_time = datetime.now()
945
 
946
+ logger.info("Warming up Phi-3-mini LangGraph agent with test query...")
947
  try:
948
  current_agent = get_agent()
949
 
950
  # Run a simple test query
951
  test_response = current_agent.chat("Hello, this is a warmup test.")
952
+ logger.info(f"Phi-3-mini LangGraph agent warmup completed successfully! Test response length: {len(test_response)} chars")
953
 
954
  end_agent_warmup_time = time.perf_counter()
955
  agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
956
  log_metric(f"Agent warmup time: {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
957
 
958
  except Exception as e:
959
+ logger.error(f"Phi-3-mini LangGraph agent warmup failed: {e}")
960
  end_agent_warmup_time = time.perf_counter()
961
  agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
962
  log_metric(f"Agent warmup time (error): {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
991
 
992
  with gr.Column(elem_classes=["main-container"]):
993
  # Title Section
994
+ gr.HTML('<div class="title-header"><h1>🎓 Mimir - Powered by Phi-3-mini</h1></div>')
995
 
996
  # Chat Section
997
  with gr.Row():
 
1039
  if __name__ == "__main__":
1040
  try:
1041
  logger.info("=" * 50)
1042
+ logger.info("Starting Mimir Application with Microsoft Phi-3-mini-4k-instruct and TextIteratorStreamer")
1043
  logger.info("=" * 50)
1044
 
1045
  # Step 1: Preload the model and agent
1046
+ logger.info("Loading Phi-3-mini model and LangGraph workflow...")
1047
  start_time = time.time()
1048
  agent = Educational_Agent()
1049
  load_time = time.time() - start_time
1050
+ logger.info(f"Phi-3-mini LangGraph agent loaded successfully in {load_time:.2f} seconds")
1051
 
1052
  # Step 2: Warm up the model
1053
+ logger.info("Warming up Phi-3-mini model...")
1054
  warmup_agent()
1055
 
1056
  interface = create_interface()
 
1063
  )
1064
 
1065
  except Exception as e:
1066
+ logger.error(f"❌ Failed to launch Mimir with Phi-3-mini: {e}")
1067
  raise