Spaces: Sleeping
Add the @spaces.GPU decorator and the `spaces` import for ZeroGPU support
Browse files
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from graph_tool import generate_plot
|
| 3 |
import os
|
|
@@ -310,15 +311,15 @@ class Phi3MiniEducationalLLM(Runnable):
|
|
| 310 |
token=hf_token
|
| 311 |
)
|
| 312 |
|
| 313 |
-
# Load model
|
| 314 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 315 |
model_path,
|
| 316 |
-
torch_dtype=torch.float16,
|
| 317 |
-
device_map="auto",
|
| 318 |
trust_remote_code=True,
|
| 319 |
-
low_cpu_mem_usage=True,
|
| 320 |
token=hf_token,
|
| 321 |
-
attn_implementation="eager"
|
| 322 |
)
|
| 323 |
|
| 324 |
# Success path - log timing
|
|
@@ -356,6 +357,7 @@ class Phi3MiniEducationalLLM(Runnable):
|
|
| 356 |
# Fallback to manual Phi-3 format
|
| 357 |
return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
|
| 358 |
|
|
|
|
| 359 |
def invoke(self, input: Input, config=None) -> Output:
|
| 360 |
"""Main invoke method optimized for Phi-3-mini"""
|
| 361 |
start_invoke_time = time.perf_counter()
|
|
@@ -414,6 +416,7 @@ class Phi3MiniEducationalLLM(Runnable):
|
|
| 414 |
log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
|
| 415 |
return f"[Error generating response: {str(e)}]"
|
| 416 |
|
|
|
|
| 417 |
def stream_generate(self, input: Input, config=None):
|
| 418 |
"""Streaming generation using TextIteratorStreamer with loop detection and early escape."""
|
| 419 |
start_stream_time = time.perf_counter()
|
|
|
|
| 1 |
+
import spaces
|
| 2 |
import gradio as gr
|
| 3 |
from graph_tool import generate_plot
|
| 4 |
import os
|
|
|
|
| 311 |
token=hf_token
|
| 312 |
)
|
| 313 |
|
| 314 |
+
# Load model with ZeroGPU-optimized settings
|
| 315 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 316 |
model_path,
|
| 317 |
+
torch_dtype=torch.float16,
|
| 318 |
+
device_map="auto", # This will work with ZeroGPU allocation
|
| 319 |
trust_remote_code=True,
|
| 320 |
+
low_cpu_mem_usage=True,
|
| 321 |
token=hf_token,
|
| 322 |
+
attn_implementation="eager"
|
| 323 |
)
|
| 324 |
|
| 325 |
# Success path - log timing
|
|
|
|
| 357 |
# Fallback to manual Phi-3 format
|
| 358 |
return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
|
| 359 |
|
| 360 |
+
@spaces.GPU(duration=60)
|
| 361 |
def invoke(self, input: Input, config=None) -> Output:
|
| 362 |
"""Main invoke method optimized for Phi-3-mini"""
|
| 363 |
start_invoke_time = time.perf_counter()
|
|
|
|
| 416 |
log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
|
| 417 |
return f"[Error generating response: {str(e)}]"
|
| 418 |
|
| 419 |
+
@spaces.GPU(duration=120)
|
| 420 |
def stream_generate(self, input: Input, config=None):
|
| 421 |
"""Streaming generation using TextIteratorStreamer with loop detection and early escape."""
|
| 422 |
start_stream_time = time.perf_counter()
|