jdesiree committed on
Commit
9a01e13
·
verified ·
1 Parent(s): 5a61673

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -103
app.py CHANGED
@@ -1,12 +1,6 @@
1
  import gradio as gr
2
  from graph_tool import generate_plot
3
  import os
4
-
5
- # Environment Variables
6
- os.environ['HF_HOME'] = '/tmp/huggingface'
7
- os.environ['HF_DATASETS_CACHE'] = '/tmp/huggingface'
8
-
9
- import time
10
  import platform
11
  from dotenv import load_dotenv
12
  import logging
@@ -30,10 +24,20 @@ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
30
  from langchain_core.runnables import Runnable
31
  from langchain_core.runnables.utils import Input, Output
32
 
33
- from transformers import AutoTokenizer, TextIteratorStreamer
34
- from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
35
- from optimum.onnxruntime.configuration import AutoQuantizationConfig
36
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  load_dotenv(".env")
39
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
@@ -284,20 +288,17 @@ Rather than providing complete solutions, you should:
284
 
285
  Your goal is to be an educational partner who empowers students to succeed through understanding."""
286
 
287
- # --- Updated LLM Class with Phi-3-mini and TextIteratorStreamer ---
288
  class Phi3MiniEducationalLLM(Runnable):
289
- """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with ONNX Runtime quantization"""
290
 
291
- def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_quantization: bool = True,
292
- quantization_type: str = "avx512_vnni"):
293
  super().__init__()
294
- logger.info(f"Loading Phi-3-mini model: {model_path} (quantization={use_quantization}, type={quantization_type})")
295
  start_Loading_Model_time = time.perf_counter()
296
  current_time = datetime.now()
297
 
298
  self.model_name = model_path
299
- self.use_quantization = use_quantization
300
- self.quantization_type = quantization_type
301
 
302
  try:
303
  # Load tokenizer - Phi-3 requires trust_remote_code
@@ -307,15 +308,21 @@ class Phi3MiniEducationalLLM(Runnable):
307
  token=hf_token
308
  )
309
 
310
- if use_quantization:
311
- self._load_quantized_model(model_path, quantization_type)
312
- else:
313
- self._load_standard_onnx_model(model_path)
 
 
 
 
 
 
314
 
315
  # Success path - log timing
316
  end_Loading_Model_time = time.perf_counter()
317
  Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
318
- log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Model: {model_path}. Quantization: {use_quantization}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
319
 
320
  except Exception as e:
321
  logger.error(f"Failed to load Phi-3-mini model {model_path}: {e}")
@@ -328,67 +335,6 @@ class Phi3MiniEducationalLLM(Runnable):
328
  # Initialize TextIteratorStreamer
329
  self.streamer = None
330
 
331
- def _load_quantized_model(self, model_path: str, quantization_type: str):
332
- """Load model with ONNX Runtime quantization."""
333
- try:
334
- # First, load the model as ONNX format
335
- onnx_model = ORTModelForCausalLM.from_pretrained(
336
- model_path,
337
- export=True, # Convert PyTorch to ONNX if needed
338
- trust_remote_code=True,
339
- token=hf_token,
340
- provider="CPUExecutionProvider" # Force CPU execution
341
- )
342
-
343
- # Create quantizer
344
- quantizer = ORTQuantizer.from_pretrained(onnx_model)
345
-
346
- # Define quantization configuration based on type
347
- if quantization_type == "avx512_vnni":
348
- qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
349
- elif quantization_type == "avx512":
350
- qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=False)
351
- elif quantization_type == "avx2":
352
- qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)
353
- elif quantization_type == "arm64":
354
- qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
355
- else:
356
- logger.warning(f"Unknown quantization type {quantization_type}, using avx512_vnni")
357
- qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
358
-
359
- # Create temporary directory for quantized model
360
- quantized_model_dir = f"./quantized_{model_path.replace('/', '_')}"
361
- os.makedirs(quantized_model_dir, exist_ok=True)
362
-
363
- # Quantize the model
364
- logger.info(f"Quantizing model with {quantization_type}...")
365
- model_quantized_path = quantizer.quantize(
366
- save_dir=quantized_model_dir,
367
- quantization_config=qconfig,
368
- )
369
-
370
- # Load the quantized model
371
- self.model = ORTModelForCausalLM.from_pretrained(
372
- quantized_model_dir,
373
- provider="CPUExecutionProvider"
374
- )
375
-
376
- logger.info(f"Successfully loaded quantized model from {model_quantized_path}")
377
-
378
- except Exception as e:
379
- logger.warning(f"Quantization failed ({e}), falling back to standard ONNX model")
380
- self._load_standard_onnx_model(model_path)
381
-
382
- def _load_standard_onnx_model(self, model_path: str):
383
- """Load standard ONNX model without quantization."""
384
- self.model = ORTModelForCausalLM.from_pretrained(
385
- model_path,
386
- export=True, # Convert PyTorch to ONNX if needed
387
- trust_remote_code=True,
388
- token=hf_token,
389
- provider="CPUExecutionProvider" # Force CPU execution
390
- )
391
-
392
  def _format_chat_template(self, prompt: str) -> str:
393
  """Format prompt using Phi-3's chat template"""
394
  try:
@@ -409,7 +355,7 @@ class Phi3MiniEducationalLLM(Runnable):
409
  return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
410
 
411
  def invoke(self, input: Input, config=None) -> Output:
412
- """Main invoke method optimized for Phi-3-mini with ONNX Runtime"""
413
  start_invoke_time = time.perf_counter()
414
  current_time = datetime.now()
415
 
@@ -431,19 +377,23 @@ class Phi3MiniEducationalLLM(Runnable):
431
  max_length=3072 # Leave room for generation within 4k context
432
  )
433
 
434
- # Generate with ONNX Runtime model
435
- outputs = self.model.generate(
436
- **inputs,
437
- max_new_tokens=800, # Increased for comprehensive responses
438
- do_sample=True,
439
- temperature=0.7, # Good balance for educational content
440
- top_p=0.9,
441
- top_k=50,
442
- repetition_penalty=1.1,
443
- pad_token_id=self.tokenizer.eos_token_id,
444
- early_stopping=True,
445
- use_cache=True # Enable cache for performance
446
- )
 
 
 
 
447
 
448
  # Decode only new tokens
449
  new_tokens = outputs[0][len(inputs.input_ids[0]):]
@@ -463,10 +413,10 @@ class Phi3MiniEducationalLLM(Runnable):
463
  return f"[Error generating response: {str(e)}]"
464
 
465
  def stream_generate(self, input: Input, config=None):
466
- """Streaming generation using TextIteratorStreamer with ONNX Runtime"""
467
  start_stream_time = time.perf_counter()
468
  current_time = datetime.now()
469
- logger.info("Starting stream_generate with TextIteratorStreamer and ONNX Runtime...")
470
 
471
  # Handle both string and dict inputs
472
  if isinstance(input, dict):
@@ -486,6 +436,9 @@ class Phi3MiniEducationalLLM(Runnable):
486
  max_length=3072
487
  )
488
 
 
 
 
489
  # Initialize TextIteratorStreamer
490
  streamer = TextIteratorStreamer(
491
  self.tokenizer,
@@ -493,7 +446,7 @@ class Phi3MiniEducationalLLM(Runnable):
493
  skip_special_tokens=True
494
  )
495
 
496
- # Generation parameters for ONNX Runtime model
497
  generation_kwargs = {
498
  **inputs,
499
  "max_new_tokens": 800,
@@ -529,14 +482,14 @@ class Phi3MiniEducationalLLM(Runnable):
529
  generation_thread.join()
530
 
531
  end_stream_time = time.perf_counter()
532
- stream_time = end_stream_time - start_invoke_time
533
  log_metric(f"LLM Stream time: {stream_time:0.4f} seconds. Generated length: {len(generated_text)} chars. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
534
  logger.info(f"Stream generation completed: {len(generated_text)} chars in {stream_time:.2f}s")
535
 
536
  except Exception as e:
537
  logger.error(f"Streaming generation error: {e}")
538
  end_stream_time = time.perf_counter()
539
- stream_time = end_stream_time - start_invoke_time
540
  log_metric(f"LLM Stream time (error): {stream_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
541
  yield f"[Error in streaming generation: {str(e)}]"
542
 
@@ -556,7 +509,7 @@ class Educational_Agent:
556
  start_init_and_langgraph_time = time.perf_counter()
557
  current_time = datetime.now()
558
 
559
- self.llm = Phi3MiniEducationalLLM(model_path="microsoft/Phi-3-mini-4k-instruct", use_quantization=True)
560
  self.tool_decision_engine = Tool_Decision_Engine(self.llm)
561
 
562
  # Create LangGraph workflow
@@ -1081,7 +1034,7 @@ def create_interface():
1081
  if __name__ == "__main__":
1082
  try:
1083
  logger.info("=" * 50)
1084
- logger.info("Starting Mimir Application with Microsoft Phi-3-mini-4k-instruct and ONNX Runtime Quantization")
1085
  logger.info("=" * 50)
1086
 
1087
  # Step 1: Preload the model and agent
 
1
  import gradio as gr
2
  from graph_tool import generate_plot
3
  import os
 
 
 
 
 
 
4
  import platform
5
  from dotenv import load_dotenv
6
  import logging
 
24
  from langchain_core.runnables import Runnable
25
  from langchain_core.runnables.utils import Input, Output
26
 
27
+ from transformers import AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
 
 
28
  import torch
29
+ import time
30
+ import warnings
31
+
32
+ # Updated environment variables
33
+ os.environ['HF_HOME'] = '/tmp/huggingface'
34
+ os.environ['HF_DATASETS_CACHE'] = '/tmp/huggingface'
35
+
36
+ # Suppress warnings
37
+ warnings.filterwarnings("ignore", message="Special tokens have been added")
38
+ warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
39
+ warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub")
40
+ warnings.filterwarnings("ignore", category=UserWarning, module="torch")
41
 
42
  load_dotenv(".env")
43
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
288
 
289
  Your goal is to be an educational partner who empowers students to succeed through understanding."""
290
 
291
+ # --- Updated LLM Class with Phi-3-mini ---
292
  class Phi3MiniEducationalLLM(Runnable):
293
+ """LLM class optimized for Microsoft Phi-3-mini-4k-instruct without quantization"""
294
 
295
+ def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct"):
 
296
  super().__init__()
297
+ logger.info(f"Loading Phi-3-mini model: {model_path}")
298
  start_Loading_Model_time = time.perf_counter()
299
  current_time = datetime.now()
300
 
301
  self.model_name = model_path
 
 
302
 
303
  try:
304
  # Load tokenizer - Phi-3 requires trust_remote_code
 
308
  token=hf_token
309
  )
310
 
311
+ # Load model with memory-efficient settings
312
+ self.model = AutoModelForCausalLM.from_pretrained(
313
+ model_path,
314
+ torch_dtype=torch.float16, # Use float16 to reduce memory usage
315
+ device_map="auto", # Let it handle device placement
316
+ trust_remote_code=True,
317
+ low_cpu_mem_usage=True, # Essential for memory efficiency
318
+ token=hf_token,
319
+ attn_implementation="eager" # Use eager attention for compatibility
320
+ )
321
 
322
  # Success path - log timing
323
  end_Loading_Model_time = time.perf_counter()
324
  Loading_Model_time = end_Loading_Model_time - start_Loading_Model_time
325
+ log_metric(f"Model Load time: {Loading_Model_time:0.4f} seconds. Model: {model_path}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
326
 
327
  except Exception as e:
328
  logger.error(f"Failed to load Phi-3-mini model {model_path}: {e}")
 
335
  # Initialize TextIteratorStreamer
336
  self.streamer = None
337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  def _format_chat_template(self, prompt: str) -> str:
339
  """Format prompt using Phi-3's chat template"""
340
  try:
 
355
  return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
356
 
357
  def invoke(self, input: Input, config=None) -> Output:
358
+ """Main invoke method optimized for Phi-3-mini"""
359
  start_invoke_time = time.perf_counter()
360
  current_time = datetime.now()
361
 
 
377
  max_length=3072 # Leave room for generation within 4k context
378
  )
379
 
380
+ # Move inputs to model device
381
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
382
+
383
+ # Generate with the model
384
+ with torch.no_grad():
385
+ outputs = self.model.generate(
386
+ **inputs,
387
+ max_new_tokens=800, # Increased for comprehensive responses
388
+ do_sample=True,
389
+ temperature=0.7, # Good balance for educational content
390
+ top_p=0.9,
391
+ top_k=50,
392
+ repetition_penalty=1.1,
393
+ pad_token_id=self.tokenizer.eos_token_id,
394
+ early_stopping=True,
395
+ use_cache=True # Enable cache for performance
396
+ )
397
 
398
  # Decode only new tokens
399
  new_tokens = outputs[0][len(inputs["input_ids"][0]):]
 
413
  return f"[Error generating response: {str(e)}]"
414
 
415
  def stream_generate(self, input: Input, config=None):
416
+ """Streaming generation using TextIteratorStreamer"""
417
  start_stream_time = time.perf_counter()
418
  current_time = datetime.now()
419
+ logger.info("Starting stream_generate with TextIteratorStreamer...")
420
 
421
  # Handle both string and dict inputs
422
  if isinstance(input, dict):
 
436
  max_length=3072
437
  )
438
 
439
+ # Move inputs to model device
440
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
441
+
442
  # Initialize TextIteratorStreamer
443
  streamer = TextIteratorStreamer(
444
  self.tokenizer,
 
446
  skip_special_tokens=True
447
  )
448
 
449
+ # Generation parameters
450
  generation_kwargs = {
451
  **inputs,
452
  "max_new_tokens": 800,
 
482
  generation_thread.join()
483
 
484
  end_stream_time = time.perf_counter()
485
+ stream_time = end_stream_time - start_stream_time
486
  log_metric(f"LLM Stream time: {stream_time:0.4f} seconds. Generated length: {len(generated_text)} chars. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
487
  logger.info(f"Stream generation completed: {len(generated_text)} chars in {stream_time:.2f}s")
488
 
489
  except Exception as e:
490
  logger.error(f"Streaming generation error: {e}")
491
  end_stream_time = time.perf_counter()
492
+ stream_time = end_stream_time - start_stream_time
493
  log_metric(f"LLM Stream time (error): {stream_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
494
  yield f"[Error in streaming generation: {str(e)}]"
495
 
 
509
  start_init_and_langgraph_time = time.perf_counter()
510
  current_time = datetime.now()
511
 
512
+ self.llm = Phi3MiniEducationalLLM(model_path="microsoft/Phi-3-mini-4k-instruct")
513
  self.tool_decision_engine = Tool_Decision_Engine(self.llm)
514
 
515
  # Create LangGraph workflow
 
1034
  if __name__ == "__main__":
1035
  try:
1036
  logger.info("=" * 50)
1037
+ logger.info("Starting Mimir Application with Microsoft Phi-3-mini-4k-instruct")
1038
  logger.info("=" * 50)
1039
 
1040
  # Step 1: Preload the model and agent