Update app.py

app.py CHANGED
@@ -1,12 +1,16 @@
 import gradio as gr
 from graph_tool import generate_plot
-from metrics import
+from metrics import MimirMetrics
 import os
+
+os.environ['HF_HOME'] = '/tmp/huggingface'
+os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'
+os.environ['HF_DATASETS_CACHE'] = '/tmp/huggingface'
+
 import time
+from dotenv import load_dotenv
 import logging
-import json
 import re
-import requests
 from langchain.tools import BaseTool
 from langchain.agents import initialize_agent, AgentType
 from langchain.memory import ConversationBufferWindowMemory
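
Note on the new cache setup: the three environment variables are set directly under import os so they take effect before anything imports transformers. Recent transformers and huggingface_hub releases derive every cache path from HF_HOME and treat TRANSFORMERS_CACHE as deprecated, so the HF_HOME line does most of the work. A minimal sketch of the same idea in isolation (paths as in the diff):

    import os
    os.environ["HF_HOME"] = "/tmp/huggingface"  # must run before any transformers/huggingface_hub import

    from huggingface_hub import constants
    print(constants.HF_HUB_CACHE)  # -> /tmp/huggingface/hub
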
@@ -14,22 +18,23 @@ from langchain.schema import SystemMessage
 from langchain.llms.base import LLM
 from typing import Optional, List, Any, Type
 from pydantic import BaseModel, Field
-from transformers import
-
-
-
-
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+import torch
+# Load environment variables from .env file (the filename is case-sensitive)
+load_dotenv(".env")
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+print("Environment variables loaded.")
 
 # --- Environment and Logging Setup ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Support both token names for flexibility
-hf_token =
+hf_token = HF_TOKEN
 if not hf_token:
     logger.warning("Neither HF_TOKEN nor HUGGINGFACEHUB_API_TOKEN is set, the application may not work.")
 
-metrics_tracker =
+metrics_tracker = MimirMetrics(save_file="Mimir_metrics.json")
 
 # --- LangChain Tool Definition ---
 class GraphInput(BaseModel):
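
The load_dotenv(".env") call above expects a dotenv file next to app.py; on Hugging Face Spaces the token is usually injected as a Space secret instead, in which case os.getenv picks it up with no file present. A minimal sketch of such a file, with placeholder values only (either variable name satisfies the lookup above):

    # .env (keep out of version control)
    HF_TOKEN=hf_xxxxxxxxxxxxxxxxx
    HUGGINGFACEHUB_API_TOKEN=hf_xxxxxxxxxxxxxxxxx
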
@@ -91,7 +96,7 @@ Always use proper JSON formatting with quotes around keys and string values."""
 
 
 # --- System Prompt ---
-SYSTEM_PROMPT = """You are
+SYSTEM_PROMPT = """You are Mimir, an expert multi-concept tutor designed to facilitate genuine learning and understanding. Your primary mission is to guide students through the learning process rather than providing direct answers to academic work.
 
 ## Core Educational Principles
 - Provide comprehensive, educational responses that help students truly understand concepts
@@ -161,44 +166,224 @@ def initialize_system_prompt(agent):
         agent.memory.chat_memory.add_message(system_message)
         system_prompt_initialized = True
 
-
+logger = logging.getLogger(__name__)
+
+class Qwen25SmallLLM(LLM):
     model: Any = None
-
+    tokenizer: Any = None
 
-    def __init__(self, model_path: str = "Qwen/Qwen2.5-
+    def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct", use_4bit: bool = True):
         super().__init__()
-
-
-
-
-
-
+        logger.info(f"Loading model with BitsAndBytes quantization: {model_path}")
+
+        # Configure BitsAndBytes quantization
+        if use_4bit:
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for better performance
+                bnb_4bit_use_double_quant=True,  # Double quantization for additional memory savings
+                bnb_4bit_quant_type="nf4"  # Normal Float 4-bit quantization
+            )
+            logger.info("Using 4-bit quantization with BitsAndBytes")
+        else:
+            quantization_config = BitsAndBytesConfig(
+                load_in_8bit=True,
+                llm_int8_enable_fp32_cpu_offload=True  # Offload to CPU if needed
+            )
+            logger.info("Using 8-bit quantization with BitsAndBytes")
+
+        try:
+            # Load tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_path,
+                trust_remote_code=True
+            )
+
+            # Load model with quantization
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                quantization_config=quantization_config,
+                device_map="auto",  # Automatically distribute across available devices
+                torch_dtype=torch.bfloat16,  # Use bfloat16 for memory efficiency
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,  # Reduce CPU memory usage during loading
+                max_memory={0: "15GB"} if torch.cuda.is_available() else None  # Limit GPU memory usage
+            )
+
+            # Ensure pad token is set
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            logger.info("Model loaded successfully with BitsAndBytes quantization")
+
+        except Exception as e:
+            logger.error(f"Failed to load model with quantization: {e}")
+            logger.info("Falling back to standard loading...")
+            # Fallback to standard loading if quantization fails
+            self._load_fallback_model(model_path)
+
+    def _load_fallback_model(self, model_path: str):
+        """Fallback method to load model without quantization if needed."""
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                device_map="auto" if torch.cuda.is_available() else None,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True
+            )
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            logger.info("Model loaded with fallback method")
+        except Exception as e:
+            logger.error(f"Fallback model loading also failed: {e}")
+            raise e
 
     def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
-
-
-
-
-
+        """Generate text response using the quantized local model."""
+        try:
+            # Format the conversation
+            messages = [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": prompt}
+            ]
+
+            # Apply chat template
+            text = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+
+            # Tokenize with proper padding
+            model_inputs = self.tokenizer(
+                [text],
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048  # Limit input length to prevent memory issues
+            )
+
+            # Move to model device if available (keep the BatchEncoding so .input_ids still works below)
+            if torch.cuda.is_available():
+                model_inputs = model_inputs.to(self.model.device)
+
+            # Generate with memory-efficient settings
+            with torch.no_grad():
+                generated_ids = self.model.generate(
+                    **model_inputs,
+                    max_new_tokens=800,  # Reduced for memory efficiency
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    top_k=50,
+                    repetition_penalty=1.1,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    use_cache=True,  # Enable KV cache for efficiency
+                    # attention_mask already arrives via **model_inputs
+                )
+
+            # Decode response (only new tokens)
+            generated_ids = [
+                output_ids[len(input_ids):]
+                for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+            ]
+
+            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+            # Clean up GPU memory
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            return response.strip()
+
+        except torch.cuda.OutOfMemoryError:
+            logger.error("GPU out of memory during generation")
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            return "I apologize, but I'm experiencing memory constraints. Please try a shorter message or restart the application."
+
+        except Exception as e:
+            logger.error(f"Error in model generation: {e}")
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            return f"I apologize, but I encountered an error while generating a response: {str(e)}"
+
+    @property
+    def _llm_type(self) -> str:
+        return "qwen25_small_quantized"
+    model: Any = None
+    tokenizer: Any = None
+
+    def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct"):
+        super().__init__()
+        logger.info(f"Loading model: {model_path}")
 
-
-
-
-
+        # Load tokenizer and model
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto" if torch.cuda.is_available() else None,
+            trust_remote_code=True
+        )
 
-
-
-
+        logger.info("Model loaded successfully")
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        """Generate text response using the local model."""
+        try:
+            # Format the conversation
+            messages = [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": prompt}
+            ]
+
+            # Apply chat template
+            text = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+
+            # Tokenize
+            model_inputs = self.tokenizer([text], return_tensors="pt")
+            if torch.cuda.is_available():
+                model_inputs = model_inputs.to(self.model.device)
+
+            # Generate
+            with torch.no_grad():
+                generated_ids = self.model.generate(
+                    **model_inputs,
+                    max_new_tokens=1000,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    pad_token_id=self.tokenizer.eos_token_id
+                )
+
+            # Decode response
+            generated_ids = [
+                output_ids[len(input_ids):]
+                for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+            ]
+
+            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+            return response.strip()
+
+        except Exception as e:
+            logger.error(f"Error in model generation: {e}")
+            return f"I apologize, but I encountered an error while generating a response: {str(e)}"
 
     @property
     def _llm_type(self) -> str:
-        return "
+        return "qwen25_small"
 
 def create_langchain_agent():
-    #
-    llm =
+    # Use the smaller local model
+    llm = Qwen25SmallLLM()
 
-    # Rest remains the same
     tools = [CreateGraphTool()]
     memory = ConversationBufferWindowMemory(
         memory_key="chat_history",
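
A quick way to exercise the new wrapper outside the Gradio app is a hypothetical snippet like the one below (it assumes the Space's dependencies are installed and there is enough memory for Qwen/Qwen2.5-3B-Instruct). Note that because the class body above ends up defining __init__, _call, and _llm_type twice, the later, non-quantized definitions are the ones Python keeps:

    # Hypothetical smoke test; not part of the commit. Run from the repo root.
    from app import Qwen25SmallLLM

    llm = Qwen25SmallLLM()  # runs the second __init__, i.e. the non-quantized loader
    print(llm._llm_type)    # -> "qwen25_small", again from the later definition
    print(llm("Summarize the Pythagorean theorem in two sentences."))
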
@@ -228,52 +413,6 @@ def get_agent():
         agent = create_langchain_agent()
     return agent
 
-def generate_voice_response(text_response: str, voice_enabled: bool = False) -> Optional[str]:
-    """Generate audio response if voice is enabled."""
-    if not voice_enabled:
-        return None
-
-    try:
-        current_agent = get_agent()
-        model = current_agent.llm.model
-        processor = current_agent.llm.processor
-
-        if not hasattr(model, 'generate') or not hasattr(model.generate, '__code__'):
-            logger.warning("Model may not support audio generation")
-            return None
-
-        conversation = [
-            {"role": "system", "content": [{"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]},
-            {"role": "user", "content": [{"type": "text", "text": "Please read this response aloud: " + text_response}]}
-        ]
-
-        text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-        audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
-        inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True)
-        inputs = inputs.to(model.device)
-
-        text_ids, audio = model.generate(**inputs, speaker="Ethan")
-
-        # Save audio to temporary file
-        audio_path = f"temp_audio_{int(time.time())}.wav"
-        sf.write(audio_path, audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
-        return audio_path
-
-    except Exception as e:
-        logger.error(f"Error generating voice response: {e}")
-        return None
-
-def cleanup_temp_audio():
-    """Clean up temporary audio files on exit."""
-    for file in glob.glob("temp_audio_*.wav"):
-        try:
-            os.remove(file)
-        except:
-            pass
-
-# Register cleanup function
-atexit.register(cleanup_temp_audio)
-
 # --- UI: MathJax Configuration ---
 mathjax_config = '''
 <script>
@@ -302,7 +441,7 @@ window.MathJax = {
 html_head_content = '''
 <meta charset="utf-8">
 <meta name="viewport" content="width=device-width, initial-scale=1">
-<title>
+<title>Mimir - AI Educational Assistant</title>
 '''
 
 # --- Force Light Mode Script ---
@@ -348,7 +487,6 @@ def generate_response_with_langchain(message, max_retries=3):
     initialize_system_prompt(current_agent)
 
     # Use the agent directly with the message
-    # LangChain will automatically handle adding HumanMessage and AIMessage to memory
     response = current_agent.run(input=message)
 
     return smart_truncate(response)
@@ -366,62 +504,58 @@ def chat_response(message, history=None):
     try:
         # Track metrics with timing context
        start_time = time.time()
-
-
-
-
+        timing_context = {
+            'start_time': start_time,
+            'chunk_count': 0,
+            'provider_latency': 0.0
+        }
 
         try:
-
+            # Log start of interaction
+            metrics_tracker.log_interaction(
+                query=message,
+                response="",
+                timing_context=timing_context,
+                error_occurred=False
+            )
             logger.info("Metrics interaction logged successfully")
         except Exception as metrics_error:
             logger.error(f"Error in metrics_tracker.log_interaction: {metrics_error}")
-            logger.error(f"Metrics error type: {type(metrics_error)}")
-            # Continue without metrics if this fails
 
         # Generate response with LangChain
-
-        try:
-            response = generate_response_with_langchain(message)
-            logger.info(f"Response type: {type(response)}")
-            logger.info(f"Response preview: {str(response)[:200]}...")
-        except Exception as langchain_error:
-            logger.error(f"Error in generate_response_with_langchain: {langchain_error}")
-            raise langchain_error
+        response = generate_response_with_langchain(message)
 
-        # Log metrics
+        # Log final metrics
         try:
-
-
-
+            metrics_tracker.log_interaction(
+                query=message,
+                response=response,
+                timing_context=timing_context,
+                error_occurred=False
+            )
         except Exception as metrics_error:
             logger.error(f"Error in final metrics logging: {metrics_error}")
-            # Continue without metrics if this fails
 
         return response
 
     except Exception as e:
         logger.error(f"Error in chat_response: {e}")
-        logger.error(f"Error type: {type(e)}")
-        import traceback
-        logger.error(f"Full traceback: {traceback.format_exc()}")
         return f"I apologize, but I encountered an error while processing your message: {str(e)}"
 
-def respond_and_update(message, history
-    """Main function to handle user submission."""
+def respond_and_update(message, history):
+    """Main function to handle user submission - no voice parameter."""
     if not message.strip():
-        return history, ""
+        return history, ""
 
     # Add user message to history
     history.append({"role": "user", "content": message})
-    yield history, ""
+    yield history, ""
 
-    # Generate response
+    # Generate response
     response = chat_response(message)
-    audio_path = generate_voice_response(response, voice_enabled) if voice_enabled else None
 
     history.append({"role": "assistant", "content": response})
-    yield history, ""
+    yield history, ""
 
 def clear_chat():
     """Clear the chat history and reset system prompt flag."""
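
metrics.py itself is not part of this diff, but the calls above pin down the interface it must expose: a constructor taking save_file, and a log_interaction(query, response, timing_context, error_occurred) method. A minimal stand-in consistent with those call sites (the real MimirMetrics presumably records much more) could look like:

    # Hypothetical stand-in for metrics.py, shaped by the call sites in this diff.
    import json
    import time

    class MimirMetrics:
        def __init__(self, save_file: str = "Mimir_metrics.json"):
            self.save_file = save_file

        def log_interaction(self, query: str, response: str,
                            timing_context: dict, error_occurred: bool = False) -> None:
            record = {
                "timestamp": time.time(),
                "query_chars": len(query),
                "response_chars": len(response),
                "timing": timing_context,
                "error": error_occurred,
            }
            # Append one JSON record per line
            with open(self.save_file, "a") as f:
                f.write(json.dumps(record) + "\n")
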
@@ -446,7 +580,7 @@ def create_interface():
         logger.warning(f"Error reading styles.css: {e}")
 
     with gr.Blocks(
-        title="
+        title="Mimir",
         fill_width=True,
         fill_height=True,
         theme=gr.themes.Origin()
@@ -459,7 +593,7 @@ def create_interface():
 
         with gr.Column(elem_classes=["main-container"]):
             # Title Section
-            gr.HTML('<div class="title-header"><h1
+            gr.HTML('<div class="title-header"><h1> Mimir 🎓</h1></div>')
 
             # Chat Section
             with gr.Row():
@@ -469,18 +603,18 @@ def create_interface():
                     show_share_button=False,
                     avatar_images=None,
                     elem_id="main-chatbot",
-                    container=False,
+                    container=False,
                     scale=1,
-                    height="70vh"
+                    height="70vh"
                 )
 
-            # Input Section
+            # Input Section
             with gr.Row(elem_classes=["input-controls"]):
                 msg = gr.Textbox(
                     placeholder="Ask me about math, research, study strategies, or any educational topic...",
                     show_label=False,
-                    lines=
-                    max_lines=
+                    lines=6,
+                    max_lines=8,
                     elem_classes=["input-textbox"],
                     container=False,
                     scale=4
@@ -488,14 +622,10 @@ def create_interface():
                 with gr.Column(elem_classes=["button-column"], scale=1):
                     send = gr.Button("Send", elem_classes=["send-button"], size="sm")
                     clear = gr.Button("Clear", elem_classes=["clear-button"], size="sm")
-                    voice_toggle = gr.Checkbox(label="Enable Voice (Ethan)", value=False, elem_classes=["voice-toggle"])
-
-            # Add audio output component
-            audio_output = gr.Audio(label="Voice Response", visible=True, autoplay=True)
 
-            # Event handlers -
-            msg.submit(respond_and_update, [msg, chatbot
-            send.click(respond_and_update, [msg, chatbot
+            # Event handlers - no voice parameter
+            msg.submit(respond_and_update, [msg, chatbot], [chatbot, msg])
+            send.click(respond_and_update, [msg, chatbot], [chatbot, msg])
             clear.click(clear_chat, outputs=[chatbot, msg])
 
     # Apply CSS at the very end
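
respond_and_update is wired here as a generator callback, so with interface.queue() enabled Gradio pushes each yield to [chatbot, msg] as it arrives. One thing worth double-checking: since the function contains yield statements, its early return history, "" ends the stream without emitting anything, so an empty submission updates nothing; yielding before returning is the usual pattern. A reduced sketch of the wiring, with illustrative names:

    # Reduced sketch of the streaming pattern used above; names are illustrative.
    import gradio as gr

    def respond(message, history):
        if not message.strip():
            yield history, ""  # yield, not return, so the UI still updates
            return
        history = history + [{"role": "user", "content": message}]
        yield history, ""      # show the user turn immediately
        history = history + [{"role": "assistant", "content": f"Echo: {message}"}]
        yield history, ""      # then stream in the assistant turn

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox()
        msg.submit(respond, [msg, chatbot], [chatbot, msg])

    demo.queue().launch()
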
@@ -506,10 +636,14 @@ def create_interface():
 # --- Main Execution ---
 if __name__ == "__main__":
     try:
-        logger.info("Starting
+        logger.info("Starting Mimir...")
         interface = create_interface()
         interface.queue()
-        interface.launch(
+        interface.launch(
+            server_name="0.0.0.0",
+            share=True,
+            debug=True,
+            favicon_path="assets/favicon.ico"
+        )
     except Exception as e:
-        logger.error(f"Failed to launch
-        raise
+        logger.error(f"Failed to launch Mimir: {e}")