Spaces:

jdesiree
/

Mimir

Sleeping

App Files Files Community

jdesiree committed on Sep 5, 2025

Commit

dd436fe

verified ·

1 Parent(s): ef96f77

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -10

app.py CHANGED Viewed

@@ -29,7 +29,7 @@ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.runnables import Runnable
 from langchain_core.runnables.utils import Input, Output
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
 import torch
 load_dotenv(".env")
@@ -283,7 +283,7 @@ Your goal is to be an educational partner who empowers students to succeed throu
 # --- Updated LLM Class with Phi-3-mini and TextIteratorStreamer ---
 class Phi3MiniEducationalLLM(Runnable):
-    """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with TextIteratorStreamer"""
     def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_4bit: bool = False):
         super().__init__()
@@ -318,7 +318,9 @@ class Phi3MiniEducationalLLM(Runnable):
                     torch_dtype=torch.float16,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True,
-                    token=hf_token
                 )
             else:
                 self._load_optimized_model(model_path)
@@ -340,7 +342,7 @@ class Phi3MiniEducationalLLM(Runnable):
         self.streamer = None
     def _load_optimized_model(self, model_path: str):
-        """Optimized model loading for Phi-3-mini."""
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
             torch_dtype=torch.float16,  # Use float16 to save memory
@@ -348,7 +350,8 @@ class Phi3MiniEducationalLLM(Runnable):
             trust_remote_code=True,
             low_cpu_mem_usage=True,
             token=hf_token,
-            revision="0a67737cc96d2554230f90338b163bc6380a2a85"  # Pin revision for security
         )
     def _format_chat_template(self, prompt: str) -> str:
@@ -371,7 +374,7 @@ class Phi3MiniEducationalLLM(Runnable):
             return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
     def invoke(self, input: Input, config=None) -> Output:
-        """Main invoke method optimized for Phi-3-mini"""
         start_invoke_time = time.perf_counter()
         current_time = datetime.now()
@@ -396,6 +399,9 @@ class Phi3MiniEducationalLLM(Runnable):
             # Move to model device
             inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
@@ -407,7 +413,8 @@ class Phi3MiniEducationalLLM(Runnable):
                     repetition_penalty=1.1,
                     pad_token_id=self.tokenizer.eos_token_id,
                     early_stopping=True,
-                    use_cache=False         # Disable cache to avoid compatibility issues
                 )
             # Decode only new tokens
@@ -428,7 +435,7 @@ class Phi3MiniEducationalLLM(Runnable):
             return f"[Error generating response: {str(e)}]"
     def stream_generate(self, input: Input, config=None):
-        """Streaming generation using TextIteratorStreamer"""
         start_stream_time = time.perf_counter()
         current_time = datetime.now()
         logger.info("Starting stream_generate with TextIteratorStreamer...")
@@ -461,6 +468,9 @@ class Phi3MiniEducationalLLM(Runnable):
                 skip_special_tokens=True
             )
             # Generation parameters
             generation_kwargs = {
                 **inputs,
@@ -472,7 +482,8 @@ class Phi3MiniEducationalLLM(Runnable):
                 "repetition_penalty": 1.1,
                 "pad_token_id": self.tokenizer.eos_token_id,
                 "streamer": streamer,
-                "use_cache": True
             }
             # Start generation in a separate thread
@@ -794,7 +805,7 @@ mathjax_config = '''
 window.MathJax = {
   tex: {
     inlineMath: [['\\\\(', '\\\\)']],
-    displayMath: [['$', '$'], ['\\\\[', '\\\\]']],
     packages: {'[+]': ['ams']}
   },
   svg: {fontCache: 'global'},

 from langchain_core.runnables import Runnable
 from langchain_core.runnables.utils import Input, Output
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer, DynamicCache
 import torch
 load_dotenv(".env")
 # --- Updated LLM Class with Phi-3-mini and TextIteratorStreamer ---
 class Phi3MiniEducationalLLM(Runnable):
+    """LLM class optimized for Microsoft Phi-3-mini-4k-instruct with TextIteratorStreamer and proper cache handling"""
     def __init__(self, model_path: str = "microsoft/Phi-3-mini-4k-instruct", use_4bit: bool = False):
         super().__init__()
                     torch_dtype=torch.float16,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True,
+                    token=hf_token,
+                    # Use eager attention for better compatibility in HF Spaces
+                    attn_implementation="eager"
                 )
             else:
                 self._load_optimized_model(model_path)
         self.streamer = None
     def _load_optimized_model(self, model_path: str):
+        """Optimized model loading for Phi-3-mini with proper cache support."""
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
             torch_dtype=torch.float16,  # Use float16 to save memory
             trust_remote_code=True,
             low_cpu_mem_usage=True,
             token=hf_token,
+            # Use eager attention for better compatibility in HF Spaces
+            attn_implementation="eager"
         )
     def _format_chat_template(self, prompt: str) -> str:
             return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
     def invoke(self, input: Input, config=None) -> Output:
+        """Main invoke method optimized for Phi-3-mini with proper cache handling"""
         start_invoke_time = time.perf_counter()
         current_time = datetime.now()
             # Move to model device
             inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+            # Initialize DynamicCache for proper caching
+            past_key_values = DynamicCache()
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
                     repetition_penalty=1.1,
                     pad_token_id=self.tokenizer.eos_token_id,
                     early_stopping=True,
+                    past_key_values=past_key_values,  # Use DynamicCache properly
+                    use_cache=True         # Enable cache for performance
                 )
             # Decode only new tokens
             return f"[Error generating response: {str(e)}]"
     def stream_generate(self, input: Input, config=None):
+        """Streaming generation using TextIteratorStreamer with proper cache handling"""
         start_stream_time = time.perf_counter()
         current_time = datetime.now()
         logger.info("Starting stream_generate with TextIteratorStreamer...")
                 skip_special_tokens=True
             )
+            # Initialize DynamicCache for proper caching
+            past_key_values = DynamicCache()
             # Generation parameters
             generation_kwargs = {
                 **inputs,
                 "repetition_penalty": 1.1,
                 "pad_token_id": self.tokenizer.eos_token_id,
                 "streamer": streamer,
+                "past_key_values": past_key_values,  # Use DynamicCache properly
+                "use_cache": True                    # Enable cache for performance
             }
             # Start generation in a separate thread
 window.MathJax = {
   tex: {
     inlineMath: [['\\\\(', '\\\\)']],
+    displayMath: [['$$', '$$'], ['\\\\[', '\\\\]']],
     packages: {'[+]': ['ams']}
   },
   svg: {fontCache: 'global'},