jdesiree committed on
Commit
89e465f
·
verified ·
1 Parent(s): f2731c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -12
app.py CHANGED
@@ -285,9 +285,9 @@ Your goal is to be an educational partner who empowers students to succeed throu
285
  class Phi3MiniEducationalLLM(Runnable):
286
  """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with TextIteratorStreamer and proper cache handling"""
287
 
288
- def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_4bit: bool = False):
289
  super().__init__()
290
- logger.info(f"Loading Phi-3-mini model: {model_path} (use_4bit={use_4bit})")
291
  start_Loading_Model_time = time.perf_counter()
292
  current_time = datetime.now()
293
 
@@ -302,24 +302,25 @@ class Phi3MiniEducationalLLM(Runnable):
302
  )
303
 
304
  if use_4bit:
 
305
  quant_config = BitsAndBytesConfig(
306
  load_in_4bit=True,
307
- bnb_4bit_compute_dtype=torch.float16,
308
  bnb_4bit_use_double_quant=True,
309
- bnb_4bit_quant_type="nf4",
310
- llm_int8_threshold=0.0,
311
  llm_int8_skip_modules=["lm_head"]
312
  )
313
 
314
  self.model = AutoModelForCausalLM.from_pretrained(
315
  model_path,
316
  quantization_config=quant_config,
317
- device_map="auto",
318
- torch_dtype=torch.float16,
319
  trust_remote_code=True,
320
  low_cpu_mem_usage=True,
321
  token=hf_token,
322
- # Use eager attention for better compatibility in HF Spaces
323
  attn_implementation="eager"
324
  )
325
  else:
@@ -342,15 +343,15 @@ class Phi3MiniEducationalLLM(Runnable):
342
  self.streamer = None
343
 
344
  def _load_optimized_model(self, model_path: str):
345
- """Optimized model loading for Phi-3-mini with proper cache support."""
346
  self.model = AutoModelForCausalLM.from_pretrained(
347
  model_path,
348
- torch_dtype=torch.float16, # Use float16 to save memory
349
- device_map="auto", # Let transformers decide placement
350
  trust_remote_code=True,
351
  low_cpu_mem_usage=True,
352
  token=hf_token,
353
- # Use eager attention for better compatibility in HF Spaces
354
  attn_implementation="eager"
355
  )
356
 
@@ -951,12 +952,18 @@ def clear_chat():
951
  """Clear the chat history."""
952
  return [], ""
953
 
 
 
 
 
954
  def warmup_agent():
955
  """Warm up the agent with a test query to preload everything."""
956
  start_agent_warmup_time = time.perf_counter()
957
  current_time = datetime.now()
958
 
959
  logger.info("Warming up Phi-3-mini LangGraph agent with test query...")
 
 
960
  try:
961
  current_agent = get_agent()
962
 
@@ -964,6 +971,9 @@ def warmup_agent():
964
  test_response = current_agent.chat("Hello, this is a warmup test.")
965
  logger.info(f"Phi-3-mini LangGraph agent warmup completed successfully! Test response length: {len(test_response)} chars")
966
 
 
 
 
967
  end_agent_warmup_time = time.perf_counter()
968
  agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
969
  log_metric(f"Agent warmup time: {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
 
285
  class Phi3MiniEducationalLLM(Runnable):
286
  """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with TextIteratorStreamer and proper cache handling"""
287
 
288
+ def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_4bit: bool = True):
289
  super().__init__()
290
+ logger.info(f"Loading Phi-3-mini model: {model_path} (use_4bit={use_4bit}) for CPU")
291
  start_Loading_Model_time = time.perf_counter()
292
  current_time = datetime.now()
293
 
 
302
  )
303
 
304
  if use_4bit:
305
+ # CPU-optimized 4-bit quantization configuration
306
  quant_config = BitsAndBytesConfig(
307
  load_in_4bit=True,
308
+ bnb_4bit_compute_dtype=torch.float32, # Use float32 for CPU compatibility
309
  bnb_4bit_use_double_quant=True,
310
+ bnb_4bit_quant_type="nf4", # NF4 is optimal for normally distributed weights
311
+ llm_int8_threshold=6.0, # Default threshold for outlier detection
312
  llm_int8_skip_modules=["lm_head"]
313
  )
314
 
315
  self.model = AutoModelForCausalLM.from_pretrained(
316
  model_path,
317
  quantization_config=quant_config,
318
+ device_map="cpu", # Force CPU placement
319
+ dtype=torch.float32, # Use float32 for CPU
320
  trust_remote_code=True,
321
  low_cpu_mem_usage=True,
322
  token=hf_token,
323
+ # Use eager attention for better compatibility
324
  attn_implementation="eager"
325
  )
326
  else:
 
343
  self.streamer = None
344
 
345
  def _load_optimized_model(self, model_path: str):
346
+ """Optimized model loading for Phi-3-mini with proper CPU support."""
347
  self.model = AutoModelForCausalLM.from_pretrained(
348
  model_path,
349
+ dtype=torch.float32, # Use float32 for CPU compatibility
350
+ device_map="cpu", # Force CPU placement
351
  trust_remote_code=True,
352
  low_cpu_mem_usage=True,
353
  token=hf_token,
354
+ # Use eager attention for better compatibility
355
  attn_implementation="eager"
356
  )
357
 
 
952
  """Clear the chat history."""
953
  return [], ""
954
 
955
+ def log_cpu_memory_usage():
956
+ """Placeholder for CPU/memory logging function."""
957
+ pass
958
+
959
  def warmup_agent():
960
  """Warm up the agent with a test query to preload everything."""
961
  start_agent_warmup_time = time.perf_counter()
962
  current_time = datetime.now()
963
 
964
  logger.info("Warming up Phi-3-mini LangGraph agent with test query...")
965
+ log_cpu_memory_usage() # Log usage before warmup
966
+
967
  try:
968
  current_agent = get_agent()
969
 
 
971
  test_response = current_agent.chat("Hello, this is a warmup test.")
972
  logger.info(f"Phi-3-mini LangGraph agent warmup completed successfully! Test response length: {len(test_response)} chars")
973
 
974
+ # Log usage after warmup
975
+ log_cpu_memory_usage()
976
+
977
  end_agent_warmup_time = time.perf_counter()
978
  agent_warmup_time = end_agent_warmup_time - start_agent_warmup_time
979
  log_metric(f"Agent warmup time: {agent_warmup_time:0.4f} seconds. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")