Spaces:
Sleeping
Sleeping
[fix]: Fixes for ZeroGPU deployment
Browse files
- app.py +4 -1
- requirements.txt +21 -19
- requirements_cpu.txt +21 -0
app.py
CHANGED
|
@@ -12,6 +12,7 @@ import random
|
|
| 12 |
from types import SimpleNamespace
|
| 13 |
|
| 14 |
import gradio as gr
|
|
|
|
| 15 |
|
| 16 |
# Add paths for imports
|
| 17 |
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
@@ -67,6 +68,7 @@ tasks = Tasks()
|
|
| 67 |
# Lazy-loaded fine-tuned model
|
| 68 |
_finetuned_llm = None
|
| 69 |
|
|
|
|
| 70 |
def get_finetuned_model():
|
| 71 |
"""Lazy load the fine-tuned model using llama-cpp-python"""
|
| 72 |
global _finetuned_llm
|
|
@@ -78,7 +80,7 @@ def get_finetuned_model():
|
|
| 78 |
repo_id="diegohc/rbrgs-finetuning",
|
| 79 |
filename="q4/unsloth.Q4_K_M.gguf",
|
| 80 |
n_ctx=4096,
|
| 81 |
-
n_gpu_layers=0,  # NOTE(review): removed-line value truncated in extraction; presumably 0 (CPU-only) given the replacement sets -1 — confirm against original commit
|
| 82 |
verbose=False
|
| 83 |
)
|
| 84 |
print("Fine-tuned model loaded successfully!")
|
|
@@ -87,6 +89,7 @@ def get_finetuned_model():
|
|
| 87 |
raise
|
| 88 |
return _finetuned_llm
|
| 89 |
|
|
|
|
| 90 |
def inference_finetuned(command: str) -> list:
|
| 91 |
"""
|
| 92 |
Run inference on the fine-tuned model.
|
|
|
|
| 12 |
from types import SimpleNamespace
|
| 13 |
|
| 14 |
import gradio as gr
|
| 15 |
+
from spaces import GPU
|
| 16 |
|
| 17 |
# Add paths for imports
|
| 18 |
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
| 68 |
# Lazy-loaded fine-tuned model
|
| 69 |
_finetuned_llm = None
|
| 70 |
|
| 71 |
+
@GPU(duration=60) # Reserve GPU for 60 seconds
|
| 72 |
def get_finetuned_model():
|
| 73 |
"""Lazy load the fine-tuned model using llama-cpp-python"""
|
| 74 |
global _finetuned_llm
|
|
|
|
| 80 |
repo_id="diegohc/rbrgs-finetuning",
|
| 81 |
filename="q4/unsloth.Q4_K_M.gguf",
|
| 82 |
n_ctx=4096,
|
| 83 |
+
n_gpu_layers=-1, # Use all GPU layers with ZeroGPU
|
| 84 |
verbose=False
|
| 85 |
)
|
| 86 |
print("Fine-tuned model loaded successfully!")
|
|
|
|
| 89 |
raise
|
| 90 |
return _finetuned_llm
|
| 91 |
|
| 92 |
+
@GPU(duration=30) # Reserve GPU for 30 seconds per inference
|
| 93 |
def inference_finetuned(command: str) -> list:
|
| 94 |
"""
|
| 95 |
Run inference on the fine-tuned model.
|
requirements.txt
CHANGED
|
@@ -1,19 +1,21 @@
|
|
| 1 |
-
# FRIDA Command Interpreter - HuggingFace Space Requirements
|
| 2 |
-
|
| 3 |
-
# BAML for structured LLM outputs
|
| 4 |
-
baml-py==0.89.0
|
| 5 |
-
|
| 6 |
-
# Embeddings and vector storage (for grounding stage)
|
| 7 |
-
chromadb>=1.0.0
|
| 8 |
-
sentence-transformers>=4.0.0
|
| 9 |
-
|
| 10 |
-
# Local fine-tuned model inference
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
# FRIDA Command Interpreter - HuggingFace Space Requirements (ZeroGPU Optimized)
|
| 2 |
+
|
| 3 |
+
# BAML for structured LLM outputs
|
| 4 |
+
baml-py==0.89.0
|
| 5 |
+
|
| 6 |
+
# Embeddings and vector storage (for grounding stage)
|
| 7 |
+
chromadb>=1.0.0
|
| 8 |
+
sentence-transformers>=4.0.0
|
| 9 |
+
|
| 10 |
+
# Local fine-tuned model inference - CUDA pre-built wheels for ZeroGPU
|
| 11 |
+
# This uses pre-built CUDA 12.4 wheels to avoid compilation
|
| 12 |
+
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
|
| 13 |
+
llama-cpp-python==0.3.16
|
| 14 |
+
|
| 15 |
+
# Utilities
|
| 16 |
+
pydantic>=2.0.0
|
| 17 |
+
python-dotenv>=1.0.0
|
| 18 |
+
termcolor>=2.0.0
|
| 19 |
+
|
| 20 |
+
# Note: gradio is pre-installed on HuggingFace Spaces
|
| 21 |
+
# Required environment variable: OPENROUTER_API_KEY (optional, for API-based models)
|
requirements_cpu.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FRIDA Command Interpreter - HuggingFace Space Requirements
|
| 2 |
+
|
| 3 |
+
# BAML for structured LLM outputs
|
| 4 |
+
baml-py==0.89.0
|
| 5 |
+
|
| 6 |
+
# Embeddings and vector storage (for grounding stage)
|
| 7 |
+
chromadb>=1.0.0
|
| 8 |
+
sentence-transformers>=4.0.0
|
| 9 |
+
|
| 10 |
+
# Local fine-tuned model inference
|
| 11 |
+
# Use pre-built wheels to avoid compilation timeout
|
| 12 |
+
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
| 13 |
+
llama-cpp-python==0.3.16
|
| 14 |
+
|
| 15 |
+
# Utilities
|
| 16 |
+
pydantic>=2.0.0
|
| 17 |
+
python-dotenv>=1.0.0
|
| 18 |
+
termcolor>=2.0.0
|
| 19 |
+
|
| 20 |
+
# Note: gradio is pre-installed on HuggingFace Spaces
|
| 21 |
+
# Required environment variable: OPENROUTER_API_KEY
|