Add ZeroGPU support: @spaces.GPU + n_gpu_layers=-1
Browse files
- __pycache__/app.cpython-39.pyc +0 -0
- app.py +19 -3
- requirements.txt +1 -0
__pycache__/app.cpython-39.pyc
ADDED
|
Binary file (17.1 kB). View file
|
|
|
app.py
CHANGED
|
@@ -211,9 +211,15 @@ def demo_reply(prompt: str, state: GameState, mode: str) -> str:
|
|
| 211 |
|
| 212 |
|
| 213 |
# ---------------------------------------------------------------------------
|
| 214 |
-
# Model loading — llama-cpp-python (GGUF)
|
| 215 |
# ---------------------------------------------------------------------------
|
| 216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
@lru_cache(maxsize=1)
|
| 218 |
def get_llm():
|
| 219 |
"""Load the GGUF model. Raises RuntimeError when DEMO_MODE is forced."""
|
|
@@ -226,12 +232,13 @@ def get_llm():
|
|
| 226 |
repo_id=GGUF_REPO,
|
| 227 |
filename=GGUF_FILE,
|
| 228 |
n_ctx=2048,
|
| 229 |
-
n_threads=
|
|
|
|
| 230 |
verbose=False,
|
| 231 |
)
|
| 232 |
|
| 233 |
|
| 234 |
-
def call_model(messages: List[Dict[str, str]], state: GameState, fallback_mode: str) -> str:
|
| 235 |
if DEMO_MODE in {"1", "true", "yes", "on"}:
|
| 236 |
return demo_reply(messages[-1]["content"], state, fallback_mode)
|
| 237 |
|
|
@@ -256,6 +263,15 @@ def call_model(messages: List[Dict[str, str]], state: GameState, fallback_mode:
|
|
| 256 |
)
|
| 257 |
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
# ---------------------------------------------------------------------------
|
| 260 |
# Game logic
|
| 261 |
# ---------------------------------------------------------------------------
|
|
|
|
| 211 |
|
| 212 |
|
| 213 |
# ---------------------------------------------------------------------------
|
| 214 |
+
# Model loading — llama-cpp-python (GGUF) with ZeroGPU support
|
| 215 |
# ---------------------------------------------------------------------------
|
| 216 |
|
| 217 |
+
try:
|
| 218 |
+
import spaces
|
| 219 |
+
HAS_ZEROGPU = True
|
| 220 |
+
except ImportError:
|
| 221 |
+
HAS_ZEROGPU = False
|
| 222 |
+
|
| 223 |
@lru_cache(maxsize=1)
|
| 224 |
def get_llm():
|
| 225 |
"""Load the GGUF model. Raises RuntimeError when DEMO_MODE is forced."""
|
|
|
|
| 232 |
repo_id=GGUF_REPO,
|
| 233 |
filename=GGUF_FILE,
|
| 234 |
n_ctx=2048,
|
| 235 |
+
n_threads=4,
|
| 236 |
+
n_gpu_layers=-1, # offload all layers to GPU when available
|
| 237 |
verbose=False,
|
| 238 |
)
|
| 239 |
|
| 240 |
|
| 241 |
+
def _call_model_inner(messages: List[Dict[str, str]], state: GameState, fallback_mode: str) -> str:
|
| 242 |
if DEMO_MODE in {"1", "true", "yes", "on"}:
|
| 243 |
return demo_reply(messages[-1]["content"], state, fallback_mode)
|
| 244 |
|
|
|
|
| 263 |
)
|
| 264 |
|
| 265 |
|
| 266 |
+
# Wrap with @spaces.GPU when ZeroGPU is available
|
| 267 |
+
if HAS_ZEROGPU:
|
| 268 |
+
@spaces.GPU
|
| 269 |
+
def call_model(messages, state, fallback_mode):
|
| 270 |
+
return _call_model_inner(messages, state, fallback_mode)
|
| 271 |
+
else:
|
| 272 |
+
call_model = _call_model_inner
|
| 273 |
+
|
| 274 |
+
|
| 275 |
# ---------------------------------------------------------------------------
|
| 276 |
# Game logic
|
| 277 |
# ---------------------------------------------------------------------------
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
gradio==4.44.1
|
| 2 |
llama-cpp-python==0.3.22
|
| 3 |
huggingface_hub>=0.24.0
|
|
|
|
|
|
| 1 |
gradio==4.44.1
|
| 2 |
llama-cpp-python==0.3.22
|
| 3 |
huggingface_hub>=0.24.0
|
| 4 |
+
spaces
|