jdesiree commited on
Commit
9f6709f
·
verified ·
1 Parent(s): d0d37ab

Add @spaces.GPU decorator and spaces import for ZeroGPU support

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  from graph_tool import generate_plot
3
  import os
@@ -310,15 +311,15 @@ class Phi3MiniEducationalLLM(Runnable):
310
  token=hf_token
311
  )
312
 
313
- # Load model with memory-efficient settings
314
  self.model = AutoModelForCausalLM.from_pretrained(
315
  model_path,
316
- torch_dtype=torch.float16, # Use float16 to reduce memory usage
317
- device_map="auto", # Let it handle device placement
318
  trust_remote_code=True,
319
- low_cpu_mem_usage=True, # Essential for memory efficiency
320
  token=hf_token,
321
- attn_implementation="eager" # Use eager attention for compatibility
322
  )
323
 
324
  # Success path - log timing
@@ -356,6 +357,7 @@ class Phi3MiniEducationalLLM(Runnable):
356
  # Fallback to manual Phi-3 format
357
  return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
358
 
 
359
  def invoke(self, input: Input, config=None) -> Output:
360
  """Main invoke method optimized for Phi-3-mini"""
361
  start_invoke_time = time.perf_counter()
@@ -414,6 +416,7 @@ class Phi3MiniEducationalLLM(Runnable):
414
  log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
415
  return f"[Error generating response: {str(e)}]"
416
 
 
417
  def stream_generate(self, input: Input, config=None):
418
  """Streaming generation using TextIteratorStreamer with loop detection and early escape."""
419
  start_stream_time = time.perf_counter()
 
1
+ import spaces
2
  import gradio as gr
3
  from graph_tool import generate_plot
4
  import os
 
311
  token=hf_token
312
  )
313
 
314
+ # Load model with ZeroGPU-optimized settings
315
  self.model = AutoModelForCausalLM.from_pretrained(
316
  model_path,
317
+ torch_dtype=torch.float16,
318
+ device_map="auto", # This will work with ZeroGPU allocation
319
  trust_remote_code=True,
320
+ low_cpu_mem_usage=True,
321
  token=hf_token,
322
+ attn_implementation="eager"
323
  )
324
 
325
  # Success path - log timing
 
357
  # Fallback to manual Phi-3 format
358
  return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
359
 
360
+ @spaces.GPU(duration=60)
361
  def invoke(self, input: Input, config=None) -> Output:
362
  """Main invoke method optimized for Phi-3-mini"""
363
  start_invoke_time = time.perf_counter()
 
416
  log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
417
  return f"[Error generating response: {str(e)}]"
418
 
419
+ @spaces.GPU(duration=120)
420
  def stream_generate(self, input: Input, config=None):
421
  """Streaming generation using TextIteratorStreamer with loop detection and early escape."""
422
  start_stream_time = time.perf_counter()