import os
import re
import json
import time
import torch
import gradio as gr
import threading
import logging
import platform
import warnings

from datetime import datetime
from dotenv import load_dotenv
from typing import Annotated, Sequence, TypedDict, List, Optional, Any, Type

from pydantic import BaseModel, Field

# Gradio Spaces decorator (for @spaces.GPU)
import spaces 

# LangGraph imports
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import ToolNode

# LangChain Core imports
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage, ToolMessage, BaseMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import Runnable
from langchain_core.runnables.utils import Input, Output

# Transformers imports
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList,
    BitsAndBytesConfig,
)

from graph_tool import generate_plot
from loading_animations import create_thinking_indicator, get_thinking_dots


# Point Hugging Face cache directories at /tmp
os.environ['HF_HOME'] = '/tmp/huggingface'
os.environ['HF_DATASETS_CACHE'] = '/tmp/huggingface'

# Suppress warnings
warnings.filterwarnings("ignore", message="Special tokens have been added")
warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub")
warnings.filterwarnings("ignore", message=".*TracerWarning.*")
warnings.filterwarnings("ignore", message=".*flash-attention.*")

load_dotenv(".env")
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
print("Environment variables loaded.")

DEBUG_STATE = os.getenv("DEBUG_STATE", "false").lower() == "true"

def debug_state(conversation_state, event_name="", force_debug=False):
    """Debug function to inspect current conversation state"""
    if not (DEBUG_STATE or force_debug):
        return conversation_state
    
    timestamp = datetime.now().strftime("%H:%M:%S")
    logger.info(f"[{timestamp}] DEBUG STATE - {event_name}")
    logger.info(f"Total messages: {len(conversation_state)}")
    
    for i, msg in enumerate(conversation_state):
        role = msg["role"]
        content_preview = msg["content"][:100] + "..." if len(msg["content"]) > 100 else msg["content"]
        logger.info(f"  {i+1}. {role}: {content_preview}")
    
    # Log to file for later analysis
    if DEBUG_STATE:
        debug_log_file = "debug_state.log"
        with open(debug_log_file, "a", encoding="utf-8") as f:
            f.write(f"\n=== {timestamp} - {event_name} ===\n")
            f.write(f"Total messages: {len(conversation_state)}\n")
            for i, msg in enumerate(conversation_state):
                f.write(f"{i+1}. {msg['role']}: {msg['content'][:200]}...\n")
            f.write("=" * 40 + "\n")
    
    return conversation_state
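
# Illustrative usage (only logs when DEBUG_STATE=true; the event name is arbitrary):
# conversation_state = debug_state(conversation_state, "after_user_message")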

# Setup main logger first
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# HTML content used by the Gradio interface (head metadata and a light-mode script)
html_head_content = """
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Mimir - Educational AI Assistant</title>
"""

force_light_mode = """
<script>
// Force light mode
if (document.documentElement) {
    document.documentElement.setAttribute('data-theme', 'light');
}
</script>
"""

# Environment and Logging Setup
def setup_metrics_logger():
    """Setup a simple file logger for human-readable metrics"""
    metrics_logger = logging.getLogger('metrics')
    metrics_logger.setLevel(logging.INFO)
    
    # Avoid duplicate handlers
    if metrics_logger.handlers:
        return metrics_logger
    
    # Create file handler
    log_file = 'performance_metrics.log'
    handler = logging.FileHandler(log_file)
    
    # Create formatter for clean output
    formatter = logging.Formatter('%(message)s')
    handler.setFormatter(formatter)
    
    metrics_logger.addHandler(handler)
    return metrics_logger

# Initialize the logger
metrics_logger = setup_metrics_logger()

def log_metric(message):
    """Log a human-readable metric message with automatic timestamp"""
    current_time = datetime.now()
    timestamped_message = f"{message} | Logged: {current_time:%Y-%m-%d %H:%M:%S}"
    metrics_logger.info(timestamped_message)
    logger.info(timestamped_message)
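
# Example of the resulting log line (illustrative timestamp):
# "Agent warmup completed in 1.23 seconds | Logged: 2025-01-01 12:00:00"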

# Support both token names for flexibility
hf_token = HF_TOKEN
if not hf_token:
    logger.warning("Neither HF_TOKEN nor HUGGINGFACEHUB_API_TOKEN is set, the application may not work.")

# Tool Decision Engine (Updated for LangGraph)
class Tool_Decision_Engine:
    """Uses LLM to intelligently decide when visualization tools would be beneficial"""
    
    def __init__(self, llm):
        self.decision_llm = llm
        self.decision_prompt = """Analyze this educational query and determine if creating a graph, chart, or visual representation would significantly enhance learning and understanding.
Query: "{query}"

EXCLUDE if query is:
- Greetings or casual conversation (hello, hi, hey)
- Simple definitions without data
- Test/warmup messages
- General explanations that don't involve data

INCLUDE if query involves:
- Mathematical functions or relationships
- Data analysis or statistics
- Comparisons that benefit from charts
- Trends or patterns over time
- Creating practice questions with data

Answer with exactly: YES or NO
Decision:"""

    def should_use_visualization(self, query: str) -> bool:
        """Enhanced decision logic with explicit exclusions"""
        start_graph_decision_time = time.perf_counter()
        current_time = datetime.now()
        
        try:
            # Explicit exclusions for common non-visual queries
            exclusion_patterns = [
                r'^(hello|hi|hey)\b',                            # greetings
                r'warmup.*test',                                 # warmup/test messages
                r'(what is|define|explain)\s+\w+\s*(of|the)?',   # simple definitions
                r'capital\s+of',                                 # simple factual lookups
                r'^(greet|greeting)'                             # explicit greeting requests
            ]
            
            query_lower = query.lower().strip()
            
            # Check exclusions first
            for pattern in exclusion_patterns:
                if re.search(pattern, query_lower):
                    end_graph_decision_time = time.perf_counter()
                    graph_decision_time = end_graph_decision_time - start_graph_decision_time
                    log_metric(f"Tool decision time (excluded): {graph_decision_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
                    return False
            
            # Create decision prompt
            decision_query = self.decision_prompt.format(query=query)
            
            # Get LLM decision
            decision_response = self.decision_llm.invoke(decision_query)
            
            # Parse response - look for YES/NO
            decision_text = decision_response.strip().upper()
            
            # Log the decision for debugging
            logger.info(f"Tool decision for '{query[:50]}...': {decision_text}")
            
            # More strict parsing
            result = "YES" in decision_text and "NO" not in decision_text
            
            end_graph_decision_time = time.perf_counter()
            graph_decision_time = end_graph_decision_time - start_graph_decision_time
            log_metric(f"Tool decision time: {graph_decision_time:0.4f} seconds. Decision: {result}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
            
            return result
            
        except Exception as e:
            logger.error(f"Error in tool decision making: {e}")
            end_graph_decision_time = time.perf_counter()
            graph_decision_time = end_graph_decision_time - start_graph_decision_time
            log_metric(f"Tool decision time (error): {graph_decision_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
            return False
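
# Illustrative usage (assumes an initialized LLM wrapper; not executed at import time):
# engine = Tool_Decision_Engine(llm)
# engine.should_use_visualization("Plot average rainfall by month")  # -> True
# engine.should_use_visualization("Hello!")                          # -> False (excluded)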

# LangGraph State Definition
class EducationalAgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], add_messages]
    needs_tools: bool
    educational_context: Optional[str]

@tool(return_direct=False)
def Create_Graph_Tool(
    data: dict,
    plot_type: str,
    title: str = "Generated Plot",
    x_label: str = "",
    y_label: str = "",
    educational_context: str = ""
) -> str:
    """
    Creates educational graphs and charts to help explain concepts to students.
    
    Use this tool ONLY when teaching concepts that would benefit from visual representation, such as:
    - Mathematical functions and relationships (quadratic equations, exponential growth)
    - Statistical distributions and data analysis (normal curves, survey results)
    - Scientific trends and comparisons (temperature changes, population growth)
    - Economic models and business metrics (profit over time, market shares)
    - Grade distributions or performance analysis (test score ranges)
    - Any quantitative concept that's clearer with visualization
    
    Args:
        data: Dictionary with string keys and numeric values {"Category A": 25, "Category B": 40}
        plot_type: "bar", "line", or "pie"
        title: Title for the chart
        x_label: X-axis label
        y_label: Y-axis label
        educational_context: Explanation of why this visualization helps learning
    """
    start_create_graph_tool_time = time.perf_counter()
    current_time = datetime.now()
    
    try:
        # Call the generate_plot function directly
        content, artifact = generate_plot(
            data=data,
            plot_type=plot_type,
            title=title,
            x_label=x_label,
            y_label=y_label
        )
        
        # Check if there was an error
        if "error" in artifact:
            return f'<p style="color:red;">Graph generation failed: {artifact["error"]}</p>'
        
        # Convert the base64 image to HTML
        base64_image = artifact["base64_image"]
        
        # Add educational context if provided
        context_html = ""
        if educational_context:
            context_html = f'<div style="margin: 10px 0; padding: 10px; background: #f8f9fa; border-left: 4px solid #007bff; font-style: italic;">💡 {educational_context}</div>'
        
        # Create the complete HTML with image
        result = f"""{context_html}
<div style="text-align: center; margin: 20px 0;">
    <img src="data:image/png;base64,{base64_image}" 
         style="max-width: 100%; height: auto; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1);" 
         alt="{title}" />
</div>"""
        
        end_create_graph_tool_time = time.perf_counter()
        graph_create_graph_tool_time = end_create_graph_tool_time - start_create_graph_tool_time
        log_metric(f"Graph tool creation time: {graph_create_graph_tool_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
        
        return result
        
    except Exception as e:
        logger.error(f"Error in graph generation: {e}")
        return f'<p style="color:red;">Error creating graph: {str(e)}</p>'

# System Prompt with ReAct Framework for Phi-3-mini
SYSTEM_PROMPT = """You are Mimir, an expert multi-concept tutor designed to facilitate genuine learning and understanding. Your primary mission is to guide students through the learning process. You do so concisely, without excessive filler language or flowery content.

## Core Educational Principles
- Provide comprehensive, educational responses that help students truly understand concepts
- Prioritize teaching methodology over answer delivery
- Foster critical thinking and independent problem-solving skills

## Formatting
- You have access to LaTeX and markdown rendering.
- Use ## and ### headings when needed. If only one heading level is needed, use ##.
- For inline math, use $ ... $, e.g. $\sum_{i=0}^n i^2$  
- For centered display math, use $$ ... $$ on its own line.  
- To show a literal dollar sign, use `\$` (e.g., \$5.00).  
- To show literal parentheses in LaTeX, use `\(` and `\)` (e.g., \(a+b\)).  
- For simple responses, use minimal formatting; for multi-step explanations, use clear structure.  
- Separate sections and paragraphs with a full blank line.  
- Emojis are disabled.

## Tone and Communication Style
- Write at a reading level that is accessible yet intellectually stimulating
- Be supportive and encouraging without being condescending
- Never use crude language or content inappropriate for an educational setting
- Avoid preachy, judgmental, or accusatory language
- Skip flattery and respond directly to questions
- Do not use emojis or actions in asterisks unless specifically requested
- Present critiques and corrections kindly as educational opportunities
- Keep responses between **1 and 4 sentences** unless step-by-step reasoning is required.
- Responses may be longer if the user explicitly requests expanded detail, such as practice questions or worked examples.

## Simple Greetings
If a user only says "Hello," "Thank You," or another short greeting, first reciprocate in a professional, friendly way, then ask what you can help with today.

### Tool Usage Instructions
You are equipped with a data visualization tool, `Create_Graph_Tool` (backed by the `generate_plot` plotting function), which creates precise, publication-quality charts. Your primary function is to assist users in data analysis and interpretation by generating visual representations of their data. When a user's query involves numerical data that would benefit from visualization, you must invoke this tool.

**Tool Signature:**
`Create_Graph_Tool(data: Dict[str, float], plot_type: Literal["bar", "line", "pie"], title: str, x_label: str, y_label: str, educational_context: str)`

**Parameter Guide:**
*   `data` **(Required)**: A dictionary where keys are string labels and values are the corresponding numeric data points.
    *   *Example:* `{"Experiment A": 88.5, "Experiment B": 92.1}`
*   `plot_type` **(Required)**: The specific type of chart to generate. This **must** be one of `"bar"`, `"line"`, or `"pie"`.
*   `title` (Optional): A formal title for the plot.
*   `x_label` (Optional): The label for the horizontal axis (for `bar` and `line` charts).
*   `y_label` (Optional): The label for the vertical axis (for `bar` and `line` charts).
*   `educational_context` (Optional): A short explanation of why the visualization aids learning; it is displayed above the chart.

**When to Use This Tool:**
Invoke `Create_Graph_Tool` to address analytical and academic queries, such as:
*   **Trend Analysis:** Visualizing data points over a sequence to identify trends, growth, or decay (use a `line` chart).
*   **Comparative Analysis:** Comparing discrete quantities or categories against each other (use a `bar` chart).
*   **Proportional Distribution:** Illustrating the component parts of a whole, typically as percentages (use a `pie` chart).

**Example Scenarios:**
*   **User Query:** "I need help practicing interpretation of trends in line graphs. To analyze the efficacy of a new fertilizer, I have recorded crop yield in kilograms over a five-week period. Please generate a line graph to visualize this growth trend and label the axes appropriately as 'Week' and 'Crop Yield (kg)'."
*   **Your Tool Call:**
    *   `data`: `{"Week 1": 120, "Week 2": 155, "Week 3": 190, "Week 4": 210, "Week 5": 245}`
    *   `plot_type`: `"line"`
    *   `title`: `"Efficacy of New Fertilizer on Crop Yield"`
    *   `x_label`: `"Week"`
    *   `y_label`: `"Crop Yield (kg)"`

*   **User Query:** "I am studying for my ACT, and I am at a loss on interpreting the charts. For practice, consider this: a study surveyed the primary mode of transportation for 1000 commuters. The results were: 450 drive, 300 use public transit, 150 cycle, and 100 walk. Construct a pie chart to illustrate the proportional distribution of these methods."
*   **Your Tool Call:**
    *   `data`: `{"Driving": 450, "Public Transit": 300, "Cycling": 150, "Walking": 100}`
    *   `plot_type`: `"pie"`
    *   `title`: `"Proportional Distribution of Commuter Transportation Methods"`

NOTE: If specific data is not supplied, invent reasonable sample data for your charts.

## Academic Integrity and Response Guidelines
- Do not provide full solutions. Instead:
  - **Guide through processes**: Break down problems into conceptual components
  - **Ask clarifying questions**: Understand what the student knows
  - **Provide similar examples**: Work through analogous problems
  - **Encourage original thinking**: Help students develop reasoning skills
  - **Suggest study strategies**: Recommend effective learning approaches
- **Math problems**: Explain concepts and guide through steps without computing final answers
- **Multiple-choice questions**: Discuss concepts being tested rather than identifying correct choices
- **Essays**: Discuss research strategies and organizational techniques
- **Factual questions**: Provide educational context and encourage synthesis

## Practice Question Templates

**Multiple Choice**

1. 1 to 4 sentence question  
OPTIONAL, IF NEEDED: include EITHER a graph (linked as an image) OR a table, NEVER both.  
![Chart, Graph](my_image.png "Scenic View")  

| Example C1 | Example C2 |...  
| :---------------: | :----------------: |...  
| Content...... | Content....... |...  

A. Option  
B. Option  
C. Option  
D. Option  

---

**All That Apply**

1. 1 to 4 sentence question  
OPTIONAL, IF NEEDED: include EITHER a graph (linked as an image) OR a table, NEVER both.  
![Chart, Graph](my_image.png "Scenic View")  

| Example C1 | Example C2 |...  
| :---------------: | :----------------: |...  
| Content...... | Content....... |...  

- [ ] A. Option  
- [ ] B. Option  
- [ ] C. Option  
- [ ] D. Option  

---

**Written Response**

1. 1 to 4 sentence question  
OPTIONAL, IF NEEDED: include EITHER a graph (linked as an image) OR a table, NEVER both.  
![Chart, Graph](my_image.png "Scenic View")  

| Example C1 | Example C2 |...  
| :---------------: | :----------------: |...  
| Content...... | Content....... |...  

Prompt the user, in one sentence, to write their response
"""

# --- Stop Criteria ---

class StopOnSequence(StoppingCriteria):
    """Stops generation once the newest tokens exactly match a given stop sequence."""

    def __init__(self, tokenizer, stop_sequence):
        self.tokenizer = tokenizer
        self.stop_sequence = tokenizer.encode(stop_sequence, add_special_tokens=False)

    def __call__(self, input_ids, scores, **kwargs):
        if input_ids[0, -len(self.stop_sequence):].tolist() == self.stop_sequence:
            return True
        return False

# --- LLM Class with Phi-3 Mini ---
class Phi3MiniEducationalLLM(Runnable):
    """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with 4-bit quantization"""
    
    def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct"):
        super().__init__()
        logger.info(f"Loading Phi-3-mini model with 4-bit quantization: {model_path}")
        start_Loading_Model_time = time.perf_counter()
        current_time = datetime.now()

        self.model_name = model_path
    
        try:
            # Load tokenizer (can be done on CPU)
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_path, 
                trust_remote_code=True,
                token=hf_token,
                use_fast=False
            )

            # Configure 4-bit quantization
            self.quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_quant_type="nf4",  # NormalFloat 4-bit
                bnb_4bit_use_double_quant=True,  # Nested quantization for extra savings
            )
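            # Rough memory math (illustrative): ~3.8B parameters at 4 bits each is
            # ~1.9GB of weights, consistent with the ~2.2GB footprint logged after loading.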

            # Store model path - model will be loaded inside GPU context
            self.model_path = model_path
            self.model = None
            
        except Exception as e:
            logger.error(f"Failed to initialize Phi-3-mini model {model_path}: {e}")
            raise

        # Ensure pad token exists
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.streamer = None

    def _load_model_if_needed(self):
        """Load model with 4-bit quantization only when needed inside GPU context"""
        if self.model is None:
            logger.info("Loading model with 4-bit quantization...")
            try:
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_path,
                    quantization_config=self.quantization_config,
                    torch_dtype=torch.bfloat16,
                    trust_remote_code=True,
                    low_cpu_mem_usage=True,
                    token=hf_token,
                    attn_implementation="eager",
                    device_map="auto"
                )
                logger.info(f"Model loaded successfully. Memory footprint reduced to ~2.2GB with 4-bit quantization")
            except Exception as e:
                logger.error(f"Failed to load quantized model: {e}")
                raise
        return self.model

    def _format_chat_template(self, prompt: str) -> str:
        """Format prompt using Phi-3's chat template"""
        try:
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt}
            ]
            # Use Phi-3's chat template
            formatted_text = self.tokenizer.apply_chat_template(
                messages, 
                tokenize=False, 
                add_generation_prompt=True
            )
            return formatted_text
        except Exception as e:
            logger.warning(f"Chat template failed, using fallback format: {e}")
            # Fallback to manual Phi-3 format
            return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"

    @spaces.GPU(duration=180)
    def invoke(self, input: Input, config=None) -> Output:
        """Main invoke method optimized for 4-bit quantized Phi‑3‑mini"""
        start_invoke_time = time.perf_counter()
        current_time = datetime.now()

        # Handle different input types
        if isinstance(input, dict):
            if 'input' in input:
                prompt = input['input']
            elif 'messages' in input:
                prompt = str(input['messages'])
            else:
                prompt = str(input)
        else:
            prompt = str(input)

        try:
            model = self._load_model_if_needed()
            text = self._format_chat_template(prompt)

            try:
                # Cap prompt tokens below the 2000-token budget, reserving headroom for generation
                max_input_length = 2000 - 400
                inputs = self.tokenizer(
                    text,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_input_length
                )
                if 'input_ids' not in inputs:
                    logger.error("Tokenizer did not return input_ids")
                    return "I encountered an error processing your request. Please try again."
            except Exception as tokenizer_error:
                logger.error(f"Tokenization error: {tokenizer_error}")
                return "I encountered an error processing your request. Please try again."

            try:
                inputs = {k: v.to(model.device) for k, v in inputs.items()}
            except Exception as device_error:
                logger.error(f"Device transfer error: {device_error}")
                return "I encountered an error processing your request. Please try again."

            # Define stopping criteria after tokenizer initialization
            stop_criteria = StoppingCriteriaList([StopOnSequence(self.tokenizer, "User:")])

            with torch.no_grad():
                try:
                    outputs = model.generate(
                        input_ids=inputs['input_ids'],
                        attention_mask=inputs.get('attention_mask', None),
                        max_new_tokens=250,
                        do_sample=True,
                        temperature=0.4,
                        top_p=0.9,
                        top_k=50,
                        repetition_penalty=1.1,
                        pad_token_id=self.tokenizer.eos_token_id,
                        use_cache=False,
                        past_key_values=None,
                        stopping_criteria=stop_criteria
                    )
                except Exception as generation_error:
                    logger.error(f"Generation error: {generation_error}")
                    return "I encountered an error generating the response. Please try again."

            try:
                new_tokens = outputs[0][len(inputs['input_ids'][0]):]
                result = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

                # Apply soft-stop cleanup. Note: splitting on "\n\n" also truncates the
                # response at the first paragraph break, which keeps replies short but
                # clips intentionally multi-paragraph answers.
                for stop_word in ["User:", "\n\n", "###"]:
                    if stop_word in result:
                        result = result.split(stop_word)[0].strip()
                        break
            except Exception as decode_error:
                logger.error(f"Decoding error: {decode_error}")
                return "I encountered an error processing the response. Please try again."

            end_invoke_time = time.perf_counter()
            invoke_time = end_invoke_time - start_invoke_time
            log_metric(
                f"LLM Invoke time (4-bit): {invoke_time:0.4f} seconds. "
                f"Input length: {len(prompt)} chars. "
                f"Model: {self.model_name}. "
                f"Timestamp: {current_time:%Y-%m-%d %H:%M:%S}"
            )

            return result if result else "I'm still learning how to respond to that properly."

        except Exception as e:
            logger.error(f"Generation error with 4‑bit model: {e}")
            end_invoke_time = time.perf_counter()
            invoke_time = end_invoke_time - start_invoke_time
            log_metric(
                f"LLM Invoke time (error): {invoke_time:0.4f} seconds. "
                f"Model: {self.model_name}. "
                f"Timestamp: {current_time:%Y-%m-%d %H:%M:%S}"
            )
            return f"I encountered an error: {str(e)}"

    @spaces.GPU(duration=240)
    def stream_generate(self, input: Input, config=None):
        """Streaming generation with 4‑bit quantized model and expanded context"""
        start_stream_time = time.perf_counter()
        current_time = datetime.now()
        logger.info("Starting stream_generate with 4‑bit quantized model...")

        # Handle input properly
        if isinstance(input, dict):
            prompt = input.get('input', str(input))
        else:
            prompt = str(input)

        try:
            model = self._load_model_if_needed()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            text = self._format_chat_template(prompt)

            try:
                inputs = self.tokenizer(
                    text,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=2000
                )
                if 'input_ids' not in inputs:
                    yield "I encountered an error processing your request. Please try again."
                    return
            except Exception as tokenizer_error:
                logger.error(f"Streaming tokenization error: {tokenizer_error}")
                yield "I encountered an error processing your request. Please try again."
                return

            try:
                inputs = {k: v.to(model.device) for k, v in inputs.items()}
            except Exception as device_error:
                logger.error(f"Streaming device transfer error: {device_error}")
                yield "I encountered an error processing your request. Please try again."
                return

            streamer = TextIteratorStreamer(
                self.tokenizer,
                skip_prompt=True,
                skip_special_tokens=True
            )

            generation_kwargs = {
                "input_ids": inputs['input_ids'],
                "attention_mask": inputs.get('attention_mask', None),
                "max_new_tokens": 250,
                "do_sample": True,
                "temperature": 0.7,
                "top_p": 0.9,
                "top_k": 50,
                "repetition_penalty": 1.2,
                "pad_token_id": self.tokenizer.eos_token_id,
                "streamer": streamer,
                "use_cache": False,
                "past_key_values": None
            }

            generation_thread = threading.Thread(
                target=model.generate,
                kwargs=generation_kwargs
            )
            generation_thread.start()

            generated_text = ""
            consecutive_repeats = 0
            last_chunk = ""

            try:
                for new_token_text in streamer:
                    if not new_token_text:
                        continue
                    generated_text += new_token_text
                    if new_token_text == last_chunk:
                        consecutive_repeats += 1
                        if consecutive_repeats >= 5:
                            logger.warning("Repetitive generation detected, stopping early")
                            break
                    else:
                        consecutive_repeats = 0
                        last_chunk = new_token_text
                    yield generated_text
            except Exception as e:
                logger.error(f"Error in streaming iteration: {e}")
                if not generated_text.strip():
                    generated_text = "I apologize, but I'm having trouble generating a response. Please try rephrasing your question."
                yield generated_text

            generation_thread.join()
            if not generated_text.strip():
                generated_text = "I apologize, but I'm having trouble generating a response. Please try rephrasing your question."
                yield generated_text

            end_stream_time = time.perf_counter()
            stream_time = end_stream_time - start_stream_time
            log_metric(
                f"LLM Stream time (4-bit): {stream_time:0.4f} seconds. "
                f"Generated length: {len(generated_text)} chars. "
                f"Model: {self.model_name}. "
                f"Timestamp: {current_time:%Y-%m-%d %H:%M:%S}"
            )
        except Exception as e:
            logger.error(f"4‑bit streaming generation error: {e}")
            end_stream_time = time.perf_counter()
            stream_time = end_stream_time - start_stream_time
            log_metric(
                f"LLM Stream time (error): {stream_time:0.4f} seconds. "
                f"Model: {self.model_name}. "
                f"Timestamp: {current_time:%Y-%m-%d %H:%M:%S}"
            )
            yield "I encountered an error generating the response. Please try again."

    @property
    def InputType(self) -> Type[Input]:
        return str

    @property 
    def OutputType(self) -> Type[Output]:
        return str
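
# Illustrative usage (requires a GPU context for the @spaces.GPU-decorated methods):
# llm = Phi3MiniEducationalLLM()
# print(llm.invoke("Explain the Pythagorean theorem in two sentences."))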

# LangGraph Agent Implementation with Tool Calling
class Educational_Agent:
    """Modern LangGraph-based educational agent with Phi-3-mini and improved tool calling"""
    
    def __init__(self):
        start_init_and_langgraph_time = time.perf_counter()
        current_time = datetime.now()
        
        self.llm = Phi3MiniEducationalLLM(model_path="microsoft/Phi-3-mini-4k-instruct")
        self.tool_decision_engine = Tool_Decision_Engine(self.llm)
        
        # Create LangGraph workflow
        self.app = self._create_langgraph_workflow()
        
        end_init_and_langgraph_time = time.perf_counter()
        init_and_langgraph_time = end_init_and_langgraph_time - start_init_and_langgraph_time
        log_metric(f"Init and LangGraph workflow setup time: {init_and_langgraph_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
        
    def _create_langgraph_workflow(self):
        """Create the complete LangGraph workflow with improved tool calling"""
        # Use the updated Create_Graph_Tool
        tools = [Create_Graph_Tool]
        tool_node = ToolNode(tools)
        
        def call_model(state: EducationalAgentState) -> dict:
            """Call the LLM to generate a response"""
            start_call_model_time = time.perf_counter()
            current_time = datetime.now()
            
            messages = state["messages"]
            
            # Get the latest human message
            user_query = ""
            for msg in reversed(messages):
                if isinstance(msg, HumanMessage):
                    user_query = msg.content
                    break
            
            if not user_query:
                return {"messages": [AIMessage(content="I didn't receive a question. Please ask me something!")]}
            
            try:
                # Check if tools are needed based on state
                needs_tools = state.get("needs_tools", False)
                
                if needs_tools:
                    # Create tool prompt that guides the model to use structured parameters
                    tool_prompt = f"""
    You are an educational AI assistant. The user has asked: "{user_query}"
    
    This query would benefit from a visualization. Please call the Create_Graph_Tool with appropriate structured parameters.
    
    For the data parameter, create a meaningful dictionary with string keys and numeric values that illustrate the concept being discussed.
    
    Choose the appropriate plot_type:
    - "bar" for comparing categories or discrete data
    - "line" for showing trends over time or continuous relationships  
    - "pie" for showing parts of a whole or proportions
    
    Create a descriptive title and appropriate axis labels. Include an educational_context explaining why this visualization helps learning.
    
    Call the tool with these structured parameters, don't format as JSON.
    """
                    prompt = tool_prompt
                else:
                    prompt = user_query
                
                # Bind tools to LLM if needed
                if needs_tools:
                    model_with_tools = self.llm
                    # For Phi-3, we need to manually bind tools if supported
                    try:
                        if hasattr(self.llm, 'bind_tools'):
                            model_with_tools = self.llm.bind_tools(tools)
                        response = model_with_tools.invoke(prompt)
                    except Exception:
                        # Fallback if tool binding is not supported
                        response = self.llm.invoke(prompt)
                else:
                    response = self.llm.invoke(prompt)
                
                # Create AI message
                ai_message = AIMessage(content=response)
                
                end_call_model_time = time.perf_counter()
                call_model_time = end_call_model_time - start_call_model_time
                log_metric(f"Call model time: {call_model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
                
                return {"messages": [ai_message]}
                
            except Exception as e:
                logger.error(f"Error in call_model: {e}")
                end_call_model_time = time.perf_counter()
                call_model_time = end_call_model_time - start_call_model_time
                log_metric(f"Call model time (error): {call_model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
                
                error_message = AIMessage(content=f"I encountered an error generating a response: {str(e)}")
                return {"messages": [error_message]}
    
        def should_continue(state: EducationalAgentState) -> str:
            """Route to tools or end based on the last message"""
            last_message = state["messages"][-1]
            
            # Check if the last message has tool calls
            if hasattr(last_message, "tool_calls") and last_message.tool_calls:
                return "tools"
            else:
                return END
    
        def make_tool_decision(state: EducationalAgentState) -> dict:
            """Decide whether tools are needed and update state"""
            start_tool_decision_time = time.perf_counter()
            current_time = datetime.now()
            
            messages = state["messages"]
            
            # Get the latest human message
            user_query = ""
            for msg in reversed(messages):
                if isinstance(msg, HumanMessage):
                    user_query = msg.content
                    break
            
            if not user_query:
                return {"needs_tools": False}
            
            # Use the tool decision engine
            needs_visualization = self.tool_decision_engine.should_use_visualization(user_query)
            
            end_tool_decision_time = time.perf_counter()
            tool_decision_time = end_tool_decision_time - start_tool_decision_time
            log_metric(f"Tool decision workflow time: {tool_decision_time:0.4f} seconds. Decision: {needs_visualization}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
            
            return {"needs_tools": needs_visualization}
    
        # Create the workflow graph
        workflow = StateGraph(EducationalAgentState)
        
        # Add nodes
        workflow.add_node("decide_tools", make_tool_decision)
        workflow.add_node("call_model", call_model)
        workflow.add_node("tools", tool_node)
        
        # Add edges
        workflow.add_edge(START, "decide_tools")
        workflow.add_edge("decide_tools", "call_model")
        
        # Add conditional edge from call_model
        workflow.add_conditional_edges(
            "call_model",
            should_continue,
            {"tools": "tools", END: END}
        )
        
        # After tools, go back to call_model for final response
        workflow.add_edge("tools", "call_model")
        
        # Compile the workflow
        return workflow.compile(checkpointer=MemorySaver())

    def process_query(self, user_input: str, thread_id: str = "default") -> str:
        """Process a user query through the LangGraph workflow"""
        start_process_query_time = time.perf_counter()
        current_time = datetime.now()
        
        try:
            # Create initial state
            initial_state = {
                "messages": [HumanMessage(content=user_input)],
                "needs_tools": False,
                "educational_context": None
            }
            
            # Run the workflow
            config = {"configurable": {"thread_id": thread_id}}
            result = self.app.invoke(initial_state, config)
            
            # Extract the final response
            messages = result["messages"]
            
            # Combine AI message and tool results
            response_parts = []
            
            for msg in messages:
                if isinstance(msg, AIMessage):
                    # Clean up the response - remove JSON blocks if tools were used
                    content = msg.content
                    if "```json" in content and result.get("needs_tools", False):
                        # Remove JSON blocks from display since tools handle visualization
                        content = re.sub(r'```json.*?```', '', content, flags=re.DOTALL)
                        content = content.strip()
                    response_parts.append(content)
                elif isinstance(msg, ToolMessage):
                    response_parts.append(msg.content)
            
            final_response = "\n\n".join(response_parts).strip()
            
            end_process_query_time = time.perf_counter()
            process_query_time = end_process_query_time - start_process_query_time
            log_metric(f"Total query processing time: {process_query_time:0.4f} seconds. Input: '{user_input[:50]}...'. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
            
            return final_response if final_response else "I'm having trouble generating a response. Please try rephrasing your question."
            
        except Exception as e:
            logger.error(f"Error in process_query: {e}")
            end_process_query_time = time.perf_counter()
            process_query_time = end_process_query_time - start_process_query_time
            log_metric(f"Total query processing time (error): {process_query_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
            return f"I encountered an error processing your request: {str(e)}"

    def stream_query(self, user_input: str, thread_id: str = "default"):
        """Stream a response for a user query"""
        start_stream_query_time = time.perf_counter()
        current_time = datetime.now()
        
        try:
            # For streaming, we'll use the LLM directly with tool decision
            needs_tools = self.tool_decision_engine.should_use_visualization(user_input)
            
            if needs_tools:
                # Create tool prompt
                tool_prompt = f"""
You are an educational AI assistant. The user has asked: "{user_input}"

This query would benefit from a visualization. Please provide a helpful educational response AND include a JSON configuration for creating a graph or chart. 

Format your response with explanatory text followed by a JSON block like this:

```json
{{
"data": {{"Category 1": value1, "Category 2": value2}},
"plot_type": "bar|line|pie",
"title": "Descriptive Title",
"x_label": "X Axis Label",
"y_label": "Y Axis Label",
"educational_context": "Explanation of why this visualization helps learning"
}}
```

Provide your educational response followed by the JSON configuration.
"""
                prompt = tool_prompt
            else:
                prompt = user_input
            
            # Stream the response
            full_response = ""
            for chunk in self.llm.stream_generate(prompt):
                full_response = chunk
                yield chunk
            
            # Process tools if needed after streaming completes
            if needs_tools and "```json" in full_response:
                json_pattern = r'```json\s*\n?(.*?)\n?```'
                json_matches = re.findall(json_pattern, full_response, re.DOTALL)
                
                if json_matches:
                    json_config = json_matches[0].strip()
                    try:
                        # Validate and process the JSON
                        config_dict = json.loads(json_config)
                        required_keys = ['data', 'plot_type', 'title']
                        
                        if all(key in config_dict for key in required_keys):
                            # Generate the visualization; the tool takes structured parameters
                            tool_result = Create_Graph_Tool.invoke(config_dict)
                            
                            # Clean response and add visualization
                            cleaned_response = re.sub(r'```json.*?```', '', full_response, flags=re.DOTALL).strip()
                            final_response = f"{cleaned_response}\n\n{tool_result}"
                            yield final_response
                    except Exception as e:  # includes json.JSONDecodeError
                        logger.error(f"Error processing streamed JSON: {e}")
            
            end_stream_query_time = time.perf_counter()
            stream_query_time = end_stream_query_time - start_stream_query_time
            log_metric(f"Stream query total time: {stream_query_time:0.4f} seconds. Input: '{user_input[:50]}...'. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
            
        except Exception as e:
            logger.error(f"Error in stream_query: {e}")
            end_stream_query_time = time.perf_counter()
            stream_query_time = end_stream_query_time - start_stream_query_time
            log_metric(f"Stream query total time (error): {stream_query_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
            yield f"I encountered an error: {str(e)}"

# Gradio Interface Functions
def warmup_agent():
    """Warm up the agent with a simple test query"""
    try:
        logger.info("Warming up Phi-3-mini educational agent...")
        start_warmup_time = time.perf_counter()
        
        # Simple warmup query
        warmup_response = agent.process_query("Hello", thread_id="warmup")
        
        end_warmup_time = time.perf_counter()
        warmup_time = end_warmup_time - start_warmup_time
        log_metric(f"Agent warmup completed in {warmup_time:.2f} seconds")
        logger.info(f"Warmup response: {warmup_response[:100]}...")
        
    except Exception as e:
        logger.error(f"Warmup failed: {e}")

def add_user_message(message, chat_history, conversation_state):
    """Add user message to state and display immediately"""
    if not message.strip():
        return "", chat_history, conversation_state
    
    # Add to conversation state
    conversation_state.append({"role": "user", "content": message})
    
    # Update chat display
    chat_history.append({"role": "user", "content": message})
    
    return "", chat_history, conversation_state

def add_thinking_indicator(chat_history, conversation_state):
    """Add thinking indicator to chat display"""
    if not conversation_state:
        return chat_history, conversation_state
    
    # Add simple animated dots to chat display (not permanent state)
    thinking_html = get_thinking_dots()
    chat_history.append({"role": "assistant", "content": thinking_html})
    
    return chat_history, conversation_state

def generate_response(chat_history, conversation_state):
    """Generate streaming response from the agent"""
    if not conversation_state:
        return chat_history, conversation_state
    
    # Get the last user message
    last_user_message = ""
    for msg in reversed(conversation_state):
        if msg["role"] == "user":
            last_user_message = msg["content"]
            break
    
    if not last_user_message:
        return chat_history, conversation_state
    
    try:
        # Stream the response
        full_response = ""
        
        for chunk in agent.stream_query(last_user_message):
            full_response = chunk
            
            # Update the last message in chat display (replace thinking indicator)
            if chat_history and chat_history[-1]["role"] == "assistant":
                chat_history[-1]["content"] = full_response
            else:
                chat_history.append({"role": "assistant", "content": full_response})
            
            yield chat_history, conversation_state
        
        # Add final response to permanent conversation state
        conversation_state.append({"role": "assistant", "content": full_response})
        yield chat_history, conversation_state
        
    except Exception as e:
        logger.error(f"Error in generate_response: {e}")
        error_msg = f"I encountered an error: {str(e)}"
        
        # Update display
        if chat_history and chat_history[-1]["role"] == "assistant":
            chat_history[-1]["content"] = error_msg
        else:
            chat_history.append({"role": "assistant", "content": error_msg})
        
        # Add to permanent state
        conversation_state.append({"role": "assistant", "content": error_msg})
        yield chat_history, conversation_state

def reset_conversation():
    """Reset both chat display and conversation state"""
    return [], []
    
# --- UI: Interface Creation ---
def create_interface():
    """Creates and configures the complete Gradio interface with proper state management."""
    start_create_interface_time = time.perf_counter()
    current_time = datetime.now()
    
    # Read CSS file (fall back to default styling if unavailable)
    custom_css = ""
    try:
        with open("styles.css", "r", encoding="utf-8") as css_file:
            custom_css = css_file.read()
        logger.info(f"CSS loaded successfully, length: {len(custom_css)} characters")
        # Verify the animated thinking-dots styles are present
        if ".thinking-indicator" in custom_css:
            logger.info("Thinking-indicator CSS found in styles.css")
        else:
            logger.warning("Thinking-indicator CSS NOT found in styles.css")
    except FileNotFoundError:
        logger.warning("styles.css file not found, using default styling")
    except Exception as e:
        logger.warning(f"Error reading styles.css: {e}")
    
    with gr.Blocks(
        title="Mimir", 
        fill_width=True, 
        fill_height=True,
        theme=gr.themes.Origin()
    ) as demo:
        # Add head content
        gr.HTML(html_head_content)
        gr.HTML(force_light_mode)
        
        # Session state: persistent conversation memory, kept separate from
        # the Chatbot's visible message list
        conversation_state = gr.State([])
        
        with gr.Column(elem_classes=["main-container"]):
            # Title Section
            gr.HTML('<div class="title-header"><h1>🎓 Mimir</h1></div>')
            
            # Chat Section
            with gr.Row():
                chatbot = gr.Chatbot(
                    type="messages",
                    show_copy_button=True,
                    show_share_button=False,
                    layout="bubble",
                    autoscroll=True,
                    avatar_images=None,
                    elem_id="main-chatbot",
                    scale=1,
                    height="70vh",
                    value=[],  # Initialize with empty list
                    latex_delimiters=[
                        {"left": "$$", "right": "$$", "display": True},
                        {"left": "$", "right": "$", "display": False},
                    ]
                )
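                # latex_delimiters above lets agent answers render inline ($...$)
                # and display ($$...$$) math directly in the transcript.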
            
            # Input Section
            with gr.Row(elem_classes=["input-controls"]):
                msg = gr.Textbox(
                    placeholder="Ask me about math, research, study strategies, or any educational topic...",
                    show_label=False,
                    lines=6,
                    max_lines=8,
                    elem_classes=["input-textbox"],
                    container=False,
                    scale=4
                )
                with gr.Column(elem_classes=["button-column"], scale=1):
                    send = gr.Button("Send", elem_classes=["send-button"], size="sm")
                    clear = gr.Button("Clear", elem_classes=["clear-button"], size="sm")
            
            # Event chaining with state management: Enter-to-submit and the
            # Send button trigger the same three-step pipeline
            for trigger in (msg.submit, send.click):
                trigger(
                    add_user_message,
                    inputs=[msg, chatbot, conversation_state],
                    outputs=[msg, chatbot, conversation_state],
                    show_progress="hidden"
                ).then(
                    add_thinking_indicator,
                    inputs=[chatbot, conversation_state],
                    outputs=[chatbot, conversation_state],
                    show_progress="hidden"
                ).then(
                    generate_response,
                    inputs=[chatbot, conversation_state],
                    outputs=[chatbot, conversation_state],
                    show_progress="hidden"
                )
            
            # Clear button
            clear.click(
                reset_conversation,
                inputs=None,
                outputs=[chatbot, conversation_state],
                show_progress="hidden"
            )
            
            # Apply CSS
            gr.HTML(f'<style>{custom_css}</style>')
    
    end_create_interface_time = time.perf_counter()
    create_interface_time = end_create_interface_time - start_create_interface_time
    log_metric(f"Create interface time: {create_interface_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
    
    return demo

# --- Main Execution ---
if __name__ == "__main__":
    try:
        logger.info("=" * 50)
        logger.info("Starting Mimir Application with Microsoft Phi-3-mini-4k-instruct")
        logger.info("=" * 50)
        
        # Step 1: Preload the model and agent
        logger.info("Loading Phi-3-mini model and LangGraph workflow...")
        start_time = time.perf_counter()
        agent = Educational_Agent()
        load_time = time.perf_counter() - start_time
        logger.info(f"Phi-3-mini LangGraph agent loaded successfully in {load_time:.2f} seconds")
        
        # Step 2: Warm up the model
        logger.info("Warming up Phi-3-mini model...")
        warmup_agent()
        
        interface = create_interface()
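        # server_name="0.0.0.0" binds all interfaces (needed inside containers);
        # debug=True keeps launch() blocking and prints tracebacks to the
        # console; ssr_mode=False opts out of Gradio's server-side rendering.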
        interface.launch(
            server_name="0.0.0.0",
            share=False,
            debug=True,
            favicon_path="favicon.ico",
            ssr_mode=False
        )
        
    except Exception as e:
        logger.error(f"❌ Failed to launch Mimir with Phi-3-mini: {e}")
        raise