Spaces:

jdesiree
/

Mimir

Sleeping

App Files Files Community

jdesiree committed on Sep 6, 2025

Commit

7868388

verified ·

1 Parent(s): adf3479

Update app.py

Browse files

Files changed (1) hide show

app.py +171 -139

app.py CHANGED Viewed

@@ -38,7 +38,6 @@ os.environ['HF_DATASETS_CACHE'] = '/tmp/huggingface'
 warnings.filterwarnings("ignore", message="Special tokens have been added")
 warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
 warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub")
-# Suppress transformer warnings specifically
 warnings.filterwarnings("ignore", message=".*TracerWarning.*")
 warnings.filterwarnings("ignore", message=".*flash-attention.*")
@@ -46,11 +45,43 @@ load_dotenv(".env")
 HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
 print("Environment variables loaded.")
-# --- Setup main logger first ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# --- Environment and Logging Setup ---
 def setup_metrics_logger():
     """Setup a simple file logger for human-readable metrics"""
     metrics_logger = logging.getLogger('metrics')
@@ -86,7 +117,7 @@ hf_token = HF_TOKEN
 if not hf_token:
     logger.warning("Neither HF_TOKEN nor HUGGINGFACEHUB_API_TOKEN is set, the application may not work.")
-# --- LangGraph State Definition ---
 class EducationalAgentState(TypedDict):
     messages: Annotated[Sequence[BaseMessage], add_messages]
     needs_tools: bool
@@ -156,7 +187,7 @@ def Create_Graph_Tool(graph_config: str) -> str:
         logger.error(f"Error in graph generation: {e}")
         return f'<p style="color:red;">Error creating graph: {str(e)}</p>'
-# --- Tool Decision Engine (Updated for LangGraph) ---
 class Tool_Decision_Engine:
     """Uses LLM to intelligently decide when visualization tools would be beneficial"""
@@ -234,7 +265,7 @@ Decision:"""
             log_metric(f"Tool decision time (error): {graph_decision_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
             return False
-# --- System Prompt with ReAct Framework for Phi-3-mini ---
 SYSTEM_PROMPT = """You are Mimir, an expert multi-concept tutor designed to facilitate genuine learning and understanding. Your primary mission is to guide students through the learning process rather than providing direct answers to academic work.
 ## Core Educational Principles
@@ -291,8 +322,7 @@ Rather than providing complete solutions, you should:
 Your goal is to be an educational partner who empowers students to succeed through understanding."""
-# --- Updated LLM Class with Phi-3-mini ---
 class Phi3MiniEducationalLLM(Runnable):
     """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with 4-bit quantization"""
@@ -381,8 +411,15 @@ class Phi3MiniEducationalLLM(Runnable):
         start_invoke_time = time.perf_counter()
         current_time = datetime.now()
         if isinstance(input, dict):
-            prompt = input.get('input', str(input))
         else:
             prompt = str(input)
@@ -393,36 +430,59 @@ class Phi3MiniEducationalLLM(Runnable):
             # Format using Phi-3 chat template
             text = self._format_chat_template(prompt)
-            inputs = self.tokenizer(
-                text,
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-                max_length=4096  # Expanded to use full 4k context
-            )
             # Move inputs to model device
-            inputs = {k: v.to(model.device) for k, v in inputs.items()}
             # Generate with optimized parameters for quantized model
             with torch.no_grad():
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=1200,  # Increased from 800
-                    do_sample=True,
-                    temperature=0.7,
-                    top_p=0.9,
-                    top_k=50,
-                    repetition_penalty=1.1,
-                    pad_token_id=self.tokenizer.eos_token_id,
-                    early_stopping=True,
-                    use_cache=False,  # Disable cache for compatibility
-                    past_key_values=None
-                )
             # Decode only new tokens
-            new_tokens = outputs[0][len(inputs.input_ids[0]):]
-            result = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
             end_invoke_time = time.perf_counter()
             invoke_time = end_invoke_time - start_invoke_time
@@ -435,7 +495,7 @@ class Phi3MiniEducationalLLM(Runnable):
             end_invoke_time = time.perf_counter()
             invoke_time = end_invoke_time - start_invoke_time
             log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
-            return f"[Error generating response: {str(e)}]"
     @spaces.GPU(duration=240)
     def stream_generate(self, input: Input, config=None):
@@ -444,8 +504,12 @@ class Phi3MiniEducationalLLM(Runnable):
         current_time = datetime.now()
         logger.info("Starting stream_generate with 4-bit quantized model...")
         if isinstance(input, dict):
-            prompt = input.get('input', str(input))
         else:
             prompt = str(input)
@@ -459,69 +523,88 @@ class Phi3MiniEducationalLLM(Runnable):
             text = self._format_chat_template(prompt)
-            inputs = self.tokenizer(
-                text,
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-                max_length=4096  # Expanded to use full 4k context
-            )
             # Move inputs to model device
-            inputs = {k: v.to(model.device) for k, v in inputs.items()}
-            # Initialize TextIteratorStreamer
             streamer = TextIteratorStreamer(
                 self.tokenizer,
-                skip_prompt=True,
                 skip_special_tokens=True
             )
             # Generation parameters optimized for 4-bit
             generation_kwargs = {
-                **inputs,
-                "max_new_tokens": 1200,  # Increased from 800
                 "do_sample": True,
                 "temperature": 0.7,
                 "top_p": 0.9,
                 "top_k": 50,
                 "repetition_penalty": 1.2,
                 "pad_token_id": self.tokenizer.eos_token_id,
-                "streamer": streamer,
                 "use_cache": False,
                 "past_key_values": None
             }
-            # Start generation in background
             generation_thread = threading.Thread(
                 target=model.generate,
                 kwargs=generation_kwargs
             )
             generation_thread.start()
-            # Clean streaming without error messages in stream
             generated_text = ""
             consecutive_repeats = 0
             last_chunk = ""
             try:
-                for new_text in streamer:
-                    if not new_text:
                         continue
-                    generated_text += new_text
                     # Simple repetition detection
-                    if new_text == last_chunk:
                         consecutive_repeats += 1
                         if consecutive_repeats >= 5:
                             logger.warning("Repetitive generation detected, stopping early")
                             break
                     else:
                         consecutive_repeats = 0
-                        last_chunk = new_text
-                    # Only yield actual content, never error messages
                     yield generated_text
             except Exception as e:
@@ -555,8 +638,8 @@ class Phi3MiniEducationalLLM(Runnable):
     @property
     def OutputType(self) -> Type[Output]:
         return str
-# --- LangGraph Agent Implementation with Tool Calling ---
 class Educational_Agent:
     """Modern LangGraph-based educational agent with Phi-3-mini and improved tool calling"""
@@ -592,87 +675,7 @@ class Educational_Agent:
             # Check if the message content contains JSON for tool calling
             if isinstance(last_message, AIMessage) and last_message.content:
-                content = last_message.content.strip()
-                # Look for JSON blocks that might be tool calls
-                if content.startswith('```json') and 'plot_type' in content:
-                    logger.info("Found JSON tool configuration in message")
-                    return "tools"
-            return END
-        def call_model(state: EducationalAgentState) -> dict:
-            """Call the model using the tool decision already made in state"""
-            start_call_model_time = time.perf_counter()
-            current_time = datetime.now()
-            messages = state["messages"]
-            needs_tools = state.get("needs_tools", False)  # Use the decision from state
-            # Extract original user query from messages
-            user_query = ""
-            for msg in reversed(messages):
-                if isinstance(msg, HumanMessage):
-                    user_query = msg.content
-                    break
-            if not user_query:
-                logger.error("No user query found in state messages")
-                return {"messages": [AIMessage(content="I didn't receive your message properly. Please try again.")]}
-            try:
-                if needs_tools:
-                    logger.info("Generating response with tool instructions based on state decision")
-                    # Create tool prompt but preserve original user query
-                    tool_prompt = f"""
-You are an educational AI assistant. The user has asked: "{user_query}"
-This query would benefit from a visualization. Please provide a helpful educational response AND include a JSON configuration for creating a graph or chart.
-Format your response with explanatory text followed by a JSON block like this:
-```json
-{{
-"data": {{"Category 1": value1, "Category 2": value2}},
-"plot_type": "bar|line|pie",
-"title": "Descriptive Title",
-"x_label": "X Axis Label",
-"y_label": "Y Axis Label",
-"educational_context": "Explanation of why this visualization helps learning"
-}}
-```
-Provide your educational response followed by the JSON configuration.
-"""
-                    response = self.llm.invoke(tool_prompt)
-                else:
-                    logger.info("Generating standard educational response")
-                    response = self.llm.invoke(user_query)
-                end_call_model_time = time.perf_counter()
-                call_model_time = end_call_model_time - start_call_model_time
-                log_metric(f"Call model time: {call_model_time:0.4f} seconds. Tool decision: {needs_tools}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
-                return {"messages": [AIMessage(content=response)]}
-            except Exception as e:
-                logger.error(f"Error in call_model: {e}")
-                end_call_model_time = time.perf_counter()
-                call_model_time = end_call_model_time - start_call_model_time
-                log_metric(f"Call model time (error): {call_model_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
-                return {"messages": [AIMessage(content=f"I encountered an error: {str(e)}")]}
-        def process_json_tools(state: EducationalAgentState) -> dict:
-            """Extract and process JSON tool configurations from AI messages"""
-            start_process_tools_time = time.perf_counter()
-            current_time = datetime.now()
-            messages = state["messages"]
-            last_message = messages[-1]
-            if not isinstance(last_message, AIMessage):
-                return {"messages": []}
-            content = last_message.content
             # Look for JSON blocks in the message
             json_pattern = r'```json\s*\n?(.*?)\n?```'
@@ -887,7 +890,7 @@ Provide your educational response followed by the JSON configuration.
             log_metric(f"Stream query total time (error): {stream_query_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
             yield f"I encountered an error: {str(e)}"
-# --- Gradio Interface Functions ---
 def warmup_agent():
     """Warm up the agent with a simple test query"""
     try:
@@ -904,6 +907,35 @@ def warmup_agent():
     except Exception as e:
         logger.error(f"Warmup failed: {e}")
 # --- UI: Interface Creation ---
 def create_interface():

 warnings.filterwarnings("ignore", message="Special tokens have been added")
 warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
 warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub")
 warnings.filterwarnings("ignore", message=".*TracerWarning.*")
 warnings.filterwarnings("ignore", message=".*flash-attention.*")
 HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
 print("Environment variables loaded.")
+# Setup main logger first
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# HTML/JS snippets defined at module level so that later references to
+# html_head_content, force_light_mode and mathjax_config resolve.
+
+# <head> metadata: charset, responsive viewport and the page title.
+html_head_content = """
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Mimir - Educational AI Assistant</title>
+"""
+
+# Inline script that sets data-theme="light" on the document root element.
+force_light_mode = """
+<script>
+// Force light mode
+if (document.documentElement) {
+    document.documentElement.setAttribute('data-theme', 'light');
+}
+</script>
+"""
+
+# MathJax configuration script.
+# This MUST be a raw string: the emitted JavaScript has to contain '\\(' (two
+# backslashes) so that the JS string value is the TeX delimiter \( . In a
+# non-raw Python literal '\\(' collapses to '\(' which JavaScript evaluates to
+# a bare '(' -- MathJax would then treat ordinary parentheses and square
+# brackets as math delimiters.
+mathjax_config = r"""
+<script>
+window.MathJax = {
+  tex: {
+    inlineMath: [['$', '$'], ['\\(', '\\)']],
+    displayMath: [['$$', '$$'], ['\\[', '\\]']],
+    processEscapes: true,
+    processEnvironments: true
+  },
+  options: {
+    skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+  }
+};
+</script>
+"""
+# Environment and Logging Setup
 def setup_metrics_logger():
     """Setup a simple file logger for human-readable metrics"""
     metrics_logger = logging.getLogger('metrics')
 if not hf_token:
     logger.warning("Neither HF_TOKEN nor HUGGINGFACEHUB_API_TOKEN is set, the application may not work.")
+# LangGraph State Definition
 class EducationalAgentState(TypedDict):
     messages: Annotated[Sequence[BaseMessage], add_messages]
     needs_tools: bool
         logger.error(f"Error in graph generation: {e}")
         return f'<p style="color:red;">Error creating graph: {str(e)}</p>'
+# Tool Decision Engine (Updated for LangGraph)
 class Tool_Decision_Engine:
     """Uses LLM to intelligently decide when visualization tools would be beneficial"""
             log_metric(f"Tool decision time (error): {graph_decision_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
             return False
+# System Prompt with ReAct Framework for Phi-3-mini
 SYSTEM_PROMPT = """You are Mimir, an expert multi-concept tutor designed to facilitate genuine learning and understanding. Your primary mission is to guide students through the learning process rather than providing direct answers to academic work.
 ## Core Educational Principles
 Your goal is to be an educational partner who empowers students to succeed through understanding."""
+# FIXED LLM Class with Phi-3-mini
 class Phi3MiniEducationalLLM(Runnable):
     """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with 4-bit quantization"""
         start_invoke_time = time.perf_counter()
         current_time = datetime.now()
+        # FIX: Handle different input types properly
         if isinstance(input, dict):
+            if 'input' in input:
+                prompt = input['input']
+            elif 'messages' in input:
+                # Handle messages format
+                prompt = str(input['messages'])
+            else:
+                prompt = str(input)
         else:
             prompt = str(input)
             # Format using Phi-3 chat template
             text = self._format_chat_template(prompt)
+            # FIX: Proper tokenization with error handling
+            try:
+                inputs = self.tokenizer(
+                    text,
+                    return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=4096
+                )
+                # Ensure inputs are properly formatted
+                if not hasattr(inputs, 'input_ids'):
+                    logger.error("Tokenizer did not return input_ids")
+                    return "I encountered an error processing your request. Please try again."
+            except Exception as tokenizer_error:
+                logger.error(f"Tokenization error: {tokenizer_error}")
+                return "I encountered an error processing your request. Please try again."
             # Move inputs to model device
+            try:
+                inputs = {k: v.to(model.device) for k, v in inputs.items()}
+            except Exception as device_error:
+                logger.error(f"Device transfer error: {device_error}")
+                return "I encountered an error processing your request. Please try again."
             # Generate with optimized parameters for quantized model
             with torch.no_grad():
+                try:
+                    outputs = model.generate(
+                        input_ids=inputs['input_ids'],
+                        attention_mask=inputs.get('attention_mask', None),
+                        max_new_tokens=1200,
+                        do_sample=True,
+                        temperature=0.7,
+                        top_p=0.9,
+                        top_k=50,
+                        repetition_penalty=1.1,
+                        pad_token_id=self.tokenizer.eos_token_id,
+                        use_cache=False,
+                        past_key_values=None
+                    )
+                except Exception as generation_error:
+                    logger.error(f"Generation error: {generation_error}")
+                    return "I encountered an error generating the response. Please try again."
             # Decode only new tokens
+            try:
+                new_tokens = outputs[0][len(inputs['input_ids'][0]):]
+                result = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
+            except Exception as decode_error:
+                logger.error(f"Decoding error: {decode_error}")
+                return "I encountered an error processing the response. Please try again."
             end_invoke_time = time.perf_counter()
             invoke_time = end_invoke_time - start_invoke_time
             end_invoke_time = time.perf_counter()
             invoke_time = end_invoke_time - start_invoke_time
             log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
+            return f"I encountered an error: {str(e)}"
     @spaces.GPU(duration=240)
     def stream_generate(self, input: Input, config=None):
         current_time = datetime.now()
         logger.info("Starting stream_generate with 4-bit quantized model...")
+        # Handle input properly
         if isinstance(input, dict):
+            if 'input' in input:
+                prompt = input['input']
+            else:
+                prompt = str(input)
         else:
             prompt = str(input)
             text = self._format_chat_template(prompt)
+            # Proper tokenization with error handling
+            try:
+                inputs = self.tokenizer(
+                    text,
+                    return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=4096
+                )
+                if not hasattr(inputs, 'input_ids'):
+                    yield "I encountered an error processing your request. Please try again."
+                    return
+            except Exception as tokenizer_error:
+                logger.error(f"Streaming tokenization error: {tokenizer_error}")
+                yield "I encountered an error processing your request. Please try again."
+                return
             # Move inputs to model device
+            try:
+                inputs = {k: v.to(model.device) for k, v in inputs.items()}
+            except Exception as device_error:
+                logger.error(f"Streaming device transfer error: {device_error}")
+                yield "I encountered an error processing your request. Please try again."
+                return
+            # Initialize TextIteratorStreamer - this streams the GENERATED TOKENS, not the input
             streamer = TextIteratorStreamer(
                 self.tokenizer,
+                skip_prompt=True,  # Skip the input prompt in output
                 skip_special_tokens=True
             )
             # Generation parameters optimized for 4-bit
             generation_kwargs = {
+                "input_ids": inputs['input_ids'],
+                "attention_mask": inputs.get('attention_mask', None),
+                "max_new_tokens": 1200,
                 "do_sample": True,
                 "temperature": 0.7,
                 "top_p": 0.9,
                 "top_k": 50,
                 "repetition_penalty": 1.2,
                 "pad_token_id": self.tokenizer.eos_token_id,
+                "streamer": streamer,  # This streams the OUTPUT tokens as they're generated
                 "use_cache": False,
                 "past_key_values": None
             }
+            # Start generation in background thread
             generation_thread = threading.Thread(
                 target=model.generate,
                 kwargs=generation_kwargs
             )
             generation_thread.start()
+            # Stream the generated tokens as they come from the model
             generated_text = ""
             consecutive_repeats = 0
             last_chunk = ""
             try:
+                # This loop receives tokens as they're generated by the model
+                for new_token_text in streamer:
+                    if not new_token_text:
                         continue
+                    # Accumulate the generated text
+                    generated_text += new_token_text
                     # Simple repetition detection
+                    if new_token_text == last_chunk:
                         consecutive_repeats += 1
                         if consecutive_repeats >= 5:
                             logger.warning("Repetitive generation detected, stopping early")
                             break
                     else:
                         consecutive_repeats = 0
+                        last_chunk = new_token_text
+                    # Yield the accumulated generated text (not the input prompt)
                     yield generated_text
             except Exception as e:
     @property
     def OutputType(self) -> Type[Output]:
         return str
+# LangGraph Agent Implementation with Tool Calling
 class Educational_Agent:
     """Modern LangGraph-based educational agent with Phi-3-mini and improved tool calling"""
             # Check if the message content contains JSON for tool calling
             if isinstance(last_message, AIMessage) and last_message.content:
+                            content = last_message.content
             # Look for JSON blocks in the message
             json_pattern = r'```json\s*\n?(.*?)\n?```'
             log_metric(f"Stream query total time (error): {stream_query_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
             yield f"I encountered an error: {str(e)}"
+# Gradio Interface Functions
 def warmup_agent():
     """Warm up the agent with a simple test query"""
     try:
     except Exception as e:
         logger.error(f"Warmup failed: {e}")
+def respond_and_update(message, history):
+    """Stream the agent's reply to `message` into the chat history.
+
+    This function is a generator, so every result has to be delivered with
+    `yield`; a `return <value>` inside a generator is never seen by the code
+    iterating over it.
+
+    Args:
+        message: Text submitted by the user (may be empty or None).
+        history: List of {"role": ..., "content": ...} dicts; mutated in place.
+
+    Yields:
+        (history, "") tuples: the updated history and an empty string.
+    """
+    # Blank input: emit the unchanged history exactly once. The earlier
+    # `return history, ""` produced no output at all from this generator.
+    if not message or not message.strip():
+        yield history, ""
+        return
+    # Record the user turn, then a placeholder assistant turn that is
+    # overwritten as chunks arrive.
+    history.append({"role": "user", "content": message})
+    history.append({"role": "assistant", "content": ""})
+    try:
+        # Each chunk replaces (not extends) the assistant message content.
+        for chunk in agent.stream_query(message):
+            history[-1]["content"] = chunk
+            yield history, ""
+    except Exception as e:
+        # Surface the failure in the chat instead of raising out of the generator.
+        logger.error(f"Error in respond_and_update: {e}")
+        history[-1]["content"] = f"I encountered an error: {str(e)}"
+        yield history, ""
+def clear_chat():
+    """Return the reset values for the chat: an empty list and an empty string."""
+    empty_history = []
+    empty_text = ""
+    return empty_history, empty_text
 # --- UI: Interface Creation ---
 def create_interface():