jdesiree commited on
Commit
9f6709f
·
verified ·
1 Parent(s): d0d37ab

Add @spaces.GPU decorator and spaces import for ZeroGPU support

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  from graph_tool import generate_plot
3
  import os
@@ -310,15 +311,15 @@ class Phi3MiniEducationalLLM(Runnable):
310
  token=hf_token
311
  )
312
 
313
- # Load model with memory-efficient settings
314
  self.model = AutoModelForCausalLM.from_pretrained(
315
  model_path,
316
- torch_dtype=torch.float16, # Use float16 to reduce memory usage
317
- device_map="auto", # Let it handle device placement
318
  trust_remote_code=True,
319
- low_cpu_mem_usage=True, # Essential for memory efficiency
320
  token=hf_token,
321
- attn_implementation="eager" # Use eager attention for compatibility
322
  )
323
 
324
  # Success path - log timing
@@ -356,6 +357,7 @@ class Phi3MiniEducationalLLM(Runnable):
356
  # Fallback to manual Phi-3 format
357
  return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
358
 
 
359
  def invoke(self, input: Input, config=None) -> Output:
360
  """Main invoke method optimized for Phi-3-mini"""
361
  start_invoke_time = time.perf_counter()
@@ -414,6 +416,7 @@ class Phi3MiniEducationalLLM(Runnable):
414
  log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
415
  return f"[Error generating response: {str(e)}]"
416
 
 
417
  def stream_generate(self, input: Input, config=None):
418
  """Streaming generation using TextIteratorStreamer with loop detection and early escape."""
419
  start_stream_time = time.perf_counter()
 
1
+ import spaces
2
  import gradio as gr
3
  from graph_tool import generate_plot
4
  import os
 
311
  token=hf_token
312
  )
313
 
314
+ # Load model with ZeroGPU-optimized settings
315
  self.model = AutoModelForCausalLM.from_pretrained(
316
  model_path,
317
+ torch_dtype=torch.float16,
318
+ device_map="auto", # This will work with ZeroGPU allocation
319
  trust_remote_code=True,
320
+ low_cpu_mem_usage=True,
321
  token=hf_token,
322
+ attn_implementation="eager"
323
  )
324
 
325
  # Success path - log timing
 
357
  # Fallback to manual Phi-3 format
358
  return f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
359
 
360
+ @spaces.GPU(duration=60)
361
  def invoke(self, input: Input, config=None) -> Output:
362
  """Main invoke method optimized for Phi-3-mini"""
363
  start_invoke_time = time.perf_counter()
 
416
  log_metric(f"LLM Invoke time (error): {invoke_time:0.4f} seconds. Model: {self.model_name}. Timestamp: {current_time:%Y-%m-%d %H:%M:%S}")
417
  return f"[Error generating response: {str(e)}]"
418
 
419
+ @spaces.GPU(duration=120)
420
  def stream_generate(self, input: Input, config=None):
421
  """Streaming generation using TextIteratorStreamer with loop detection and early escape."""
422
  start_stream_time = time.perf_counter()