Spaces:
Sleeping
Sleeping
[fix]: Fixes for ZeroGPU deployment
Browse files
- app.py +4 -1
- requirements.txt +21 -19
- requirements_cpu.txt +21 -0
app.py
CHANGED
|
@@ -12,6 +12,7 @@ import random
|
|
| 12 |
from types import SimpleNamespace
|
| 13 |
|
| 14 |
import gradio as gr
|
|
|
|
| 15 |
|
| 16 |
# Add paths for imports
|
| 17 |
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
@@ -67,6 +68,7 @@ tasks = Tasks()
|
|
| 67 |
# Lazy-loaded fine-tuned model
|
| 68 |
_finetuned_llm = None
|
| 69 |
|
|
|
|
| 70 |
def get_finetuned_model():
|
| 71 |
"""Lazy load the fine-tuned model using llama-cpp-python"""
|
| 72 |
global _finetuned_llm
|
|
@@ -78,7 +80,7 @@ def get_finetuned_model():
|
|
| 78 |
repo_id="diegohc/rbrgs-finetuning",
|
| 79 |
filename="q4/unsloth.Q4_K_M.gguf",
|
| 80 |
n_ctx=4096,
|
| 81 |
-
n_gpu_layers=0,  # NOTE(review): removed-line value truncated in extraction; presumably 0 (CPU-only) given the replacement sets -1 — confirm against original commit
|
| 82 |
verbose=False
|
| 83 |
)
|
| 84 |
print("Fine-tuned model loaded successfully!")
|
|
@@ -87,6 +89,7 @@ def get_finetuned_model():
|
|
| 87 |
raise
|
| 88 |
return _finetuned_llm
|
| 89 |
|
|
|
|
| 90 |
def inference_finetuned(command: str) -> list:
|
| 91 |
"""
|
| 92 |
Run inference on the fine-tuned model.
|
|
|
|
| 12 |
from types import SimpleNamespace
|
| 13 |
|
| 14 |
import gradio as gr
|
| 15 |
+
from spaces import GPU
|
| 16 |
|
| 17 |
# Add paths for imports
|
| 18 |
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
| 68 |
# Lazy-loaded fine-tuned model
|
| 69 |
_finetuned_llm = None
|
| 70 |
|
| 71 |
+
@GPU(duration=60) # Reserve GPU for 60 seconds
|
| 72 |
def get_finetuned_model():
|
| 73 |
"""Lazy load the fine-tuned model using llama-cpp-python"""
|
| 74 |
global _finetuned_llm
|
|
|
|
| 80 |
repo_id="diegohc/rbrgs-finetuning",
|
| 81 |
filename="q4/unsloth.Q4_K_M.gguf",
|
| 82 |
n_ctx=4096,
|
| 83 |
+
n_gpu_layers=-1, # Use all GPU layers with ZeroGPU
|
| 84 |
verbose=False
|
| 85 |
)
|
| 86 |
print("Fine-tuned model loaded successfully!")
|
|
|
|
| 89 |
raise
|
| 90 |
return _finetuned_llm
|
| 91 |
|
| 92 |
+
@GPU(duration=30) # Reserve GPU for 30 seconds per inference
|
| 93 |
def inference_finetuned(command: str) -> list:
|
| 94 |
"""
|
| 95 |
Run inference on the fine-tuned model.
|
requirements.txt
CHANGED
|
@@ -1,19 +1,21 @@
|
|
| 1 |
-
# FRIDA Command Interpreter - HuggingFace Space Requirements
|
| 2 |
-
|
| 3 |
-
# BAML for structured LLM outputs
|
| 4 |
-
baml-py==0.89.0
|
| 5 |
-
|
| 6 |
-
# Embeddings and vector storage (for grounding stage)
|
| 7 |
-
chromadb>=1.0.0
|
| 8 |
-
sentence-transformers>=4.0.0
|
| 9 |
-
|
| 10 |
-
# Local fine-tuned model inference
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
# FRIDA Command Interpreter - HuggingFace Space Requirements (ZeroGPU Optimized)
|
| 2 |
+
|
| 3 |
+
# BAML for structured LLM outputs
|
| 4 |
+
baml-py==0.89.0
|
| 5 |
+
|
| 6 |
+
# Embeddings and vector storage (for grounding stage)
|
| 7 |
+
chromadb>=1.0.0
|
| 8 |
+
sentence-transformers>=4.0.0
|
| 9 |
+
|
| 10 |
+
# Local fine-tuned model inference - CUDA pre-built wheels for ZeroGPU
|
| 11 |
+
# This uses pre-built CUDA 12.4 wheels to avoid compilation
|
| 12 |
+
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
|
| 13 |
+
llama-cpp-python==0.3.16
|
| 14 |
+
|
| 15 |
+
# Utilities
|
| 16 |
+
pydantic>=2.0.0
|
| 17 |
+
python-dotenv>=1.0.0
|
| 18 |
+
termcolor>=2.0.0
|
| 19 |
+
|
| 20 |
+
# Note: gradio is pre-installed on HuggingFace Spaces
|
| 21 |
+
# Required environment variable: OPENROUTER_API_KEY (optional, for API-based models)
|
requirements_cpu.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FRIDA Command Interpreter - HuggingFace Space Requirements
|
| 2 |
+
|
| 3 |
+
# BAML for structured LLM outputs
|
| 4 |
+
baml-py==0.89.0
|
| 5 |
+
|
| 6 |
+
# Embeddings and vector storage (for grounding stage)
|
| 7 |
+
chromadb>=1.0.0
|
| 8 |
+
sentence-transformers>=4.0.0
|
| 9 |
+
|
| 10 |
+
# Local fine-tuned model inference
|
| 11 |
+
# Use pre-built wheels to avoid compilation timeout
|
| 12 |
+
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
| 13 |
+
llama-cpp-python==0.3.16
|
| 14 |
+
|
| 15 |
+
# Utilities
|
| 16 |
+
pydantic>=2.0.0
|
| 17 |
+
python-dotenv>=1.0.0
|
| 18 |
+
termcolor>=2.0.0
|
| 19 |
+
|
| 20 |
+
# Note: gradio is pre-installed on HuggingFace Spaces
|
| 21 |
+
# Required environment variable: OPENROUTER_API_KEY
|