Spaces:
Sleeping
Sleeping
Commit ·
d1f6a06
1
Parent(s): 5a67c2d
feat: custom_hf without CUDA falls back to offline baseline
Browse files
- Router skips local HF load when no CUDA; auto chain skips HF the same way.
- Gradio benchmark/compare shows Info when downgrading custom_hf -> offline on CPU.
- Optional FORGE_ALLOW_CUSTOM_HF_CPU=1 for debug CPU loads; documented in README/.env.example.
Made-with: Cursor
- .env.example +3 -0
- README.md +1 -0
- app.py +14 -2
- forge/providers/router.py +21 -0
.env.example
CHANGED
|
@@ -33,6 +33,9 @@ CODE_PROVIDER_MODE=custom_hf
|
|
| 33 |
# custom_hf: max new tokens per generation (default 192 — short code tasks)
|
| 34 |
# FORGE_HF_MAX_NEW_TOKENS=192
|
| 35 |
|
|
|
|
|
|
|
|
|
|
| 36 |
# Cap local HF inference wait (seconds); increase on GPU machines if loads are slow.
|
| 37 |
ROUTER_HF_TIMEOUT_SEC=90
|
| 38 |
ROUTER_NIM_TIMEOUT_SEC=90
|
|
|
|
| 33 |
# custom_hf: max new tokens per generation (default 192 — short code tasks)
|
| 34 |
# FORGE_HF_MAX_NEW_TOKENS=192
|
| 35 |
|
| 36 |
+
# Debug only: set to 1 (also accepts true/yes) to allow loading custom_hf on CPU (slow / may OOM). When unset and CUDA is unavailable, custom_hf falls back to the offline baseline.
|
| 37 |
+
# FORGE_ALLOW_CUSTOM_HF_CPU=0
|
| 38 |
+
|
| 39 |
# Cap local HF inference wait (seconds); increase on GPU machines if loads are slow.
|
| 40 |
ROUTER_HF_TIMEOUT_SEC=90
|
| 41 |
ROUTER_NIM_TIMEOUT_SEC=90
|
README.md
CHANGED
|
@@ -62,6 +62,7 @@ suggested_hardware: t4-small
|
|
| 62 |
| **`FORGE_UI_CANDIDATES`** | Gradio benchmark/compare only — candidates per step (**default `1`** for faster `custom_hf`; raise for richer ranking) |
|
| 63 |
| **`FORGE_UI_STEPS`** | Gradio only — steps per episode (**default `2`**; set **`full`**, **`default`**, or **`0`** to use global `STEPS_PER_EPISODE` = 3) |
|
| 64 |
| **`FORGE_HF_MAX_NEW_TOKENS`** | `custom_hf` completion cap (**default `192`**) — lower is faster for short coding tasks |
|
|
|
|
| 65 |
|
| 66 |
### OpenEnv HTTP API on the Hugging Face Space
|
| 67 |
|
|
|
|
| 62 |
| **`FORGE_UI_CANDIDATES`** | Gradio benchmark/compare only — candidates per step (**default `1`** for faster `custom_hf`; raise for richer ranking) |
|
| 63 |
| **`FORGE_UI_STEPS`** | Gradio only — steps per episode (**default `2`**; set **`full`**, **`default`**, or **`0`** to use global `STEPS_PER_EPISODE` = 3) |
|
| 64 |
| **`FORGE_HF_MAX_NEW_TOKENS`** | `custom_hf` completion cap (**default `192`**) — lower is faster for short coding tasks |
|
| 65 |
+
| **`FORGE_ALLOW_CUSTOM_HF_CPU`** | Set **`1`** (also accepts `true` / `yes`) only for debugging — when it is unset, **`custom_hf` without CUDA** falls back to the **offline** baseline (no multi‑GB CPU load) |
|
| 66 |
|
| 67 |
### OpenEnv HTTP API on the Hugging Face Space
|
| 68 |
|
app.py
CHANGED
|
@@ -93,6 +93,12 @@ def run_benchmark_ui(episodes, forge_provider_label: str):
|
|
| 93 |
mode = forge_provider_label if forge_provider_label in (
|
| 94 |
"auto", "custom_hf", "nim", "openrouter", "offline", "mock"
|
| 95 |
) else "offline"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
report = run_benchmark_mode(
|
| 97 |
policy_name="model",
|
| 98 |
episodes=ep_count,
|
|
@@ -126,6 +132,12 @@ def run_compare_ui(episodes, forge_provider_label: str):
|
|
| 126 |
mode = forge_provider_label if forge_provider_label in (
|
| 127 |
"auto", "custom_hf", "nim", "openrouter", "offline", "mock"
|
| 128 |
) else "offline"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
report = run_compare_mode(
|
| 130 |
model_policy_name="model",
|
| 131 |
episodes=ep_count,
|
|
@@ -183,9 +195,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 183 |
value=default_forge_ui_provider(),
|
| 184 |
label="Inference provider",
|
| 185 |
info=(
|
| 186 |
-
"**custom_hf** = local PyTorch + Hub weights on
|
| 187 |
"**auto** = NIM → OpenRouter → optional local HF if **HF_TOKEN** is set → else offline. "
|
| 188 |
-
"**offline** = no
|
| 189 |
"Gradio uses **`FORGE_UI_CANDIDATES`** (default 1) and **`FORGE_UI_STEPS`** (default 2 steps/episode; set `full` for config default). CLI/training use full settings."
|
| 190 |
),
|
| 191 |
)
|
|
|
|
| 93 |
mode = forge_provider_label if forge_provider_label in (
|
| 94 |
"auto", "custom_hf", "nim", "openrouter", "offline", "mock"
|
| 95 |
) else "offline"
|
| 96 |
+
if mode == "custom_hf" and not _cuda_ready():
|
| 97 |
+
gr.Info(
|
| 98 |
+
"No GPU: using the offline deterministic baseline for inference "
|
| 99 |
+
"(environment and rewards are still real; no local Hub weight load)."
|
| 100 |
+
)
|
| 101 |
+
mode = "offline"
|
| 102 |
report = run_benchmark_mode(
|
| 103 |
policy_name="model",
|
| 104 |
episodes=ep_count,
|
|
|
|
| 132 |
mode = forge_provider_label if forge_provider_label in (
|
| 133 |
"auto", "custom_hf", "nim", "openrouter", "offline", "mock"
|
| 134 |
) else "offline"
|
| 135 |
+
if mode == "custom_hf" and not _cuda_ready():
|
| 136 |
+
gr.Info(
|
| 137 |
+
"No GPU: using the offline deterministic baseline for inference "
|
| 138 |
+
"(environment and rewards are still real; no local Hub weight load)."
|
| 139 |
+
)
|
| 140 |
+
mode = "offline"
|
| 141 |
report = run_compare_mode(
|
| 142 |
model_policy_name="model",
|
| 143 |
episodes=ep_count,
|
|
|
|
| 195 |
value=default_forge_ui_provider(),
|
| 196 |
label="Inference provider",
|
| 197 |
info=(
|
| 198 |
+
"**custom_hf** = local PyTorch + Hub weights on **GPU** only; on CPU it automatically uses **offline** baseline. "
|
| 199 |
"**auto** = NIM → OpenRouter → optional local HF if **HF_TOKEN** is set → else offline. "
|
| 200 |
+
"**offline** = deterministic baseline (no Hub load; fast on CPU). "
|
| 201 |
"Gradio uses **`FORGE_UI_CANDIDATES`** (default 1) and **`FORGE_UI_STEPS`** (default 2 steps/episode; set `full` for config default). CLI/training use full settings."
|
| 202 |
),
|
| 203 |
)
|
forge/providers/router.py
CHANGED
|
@@ -8,6 +8,7 @@
|
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
import logging
|
|
|
|
| 11 |
from concurrent.futures import ThreadPoolExecutor
|
| 12 |
from concurrent.futures import TimeoutError as FuturesTimeout
|
| 13 |
from typing import Callable
|
|
@@ -30,6 +31,18 @@ def get_inference_router() -> "InferenceRouter":
|
|
| 30 |
return _ROUTER_SINGLETON
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def _with_timeout(fn: Callable[[], LLMResponse], seconds: float) -> LLMResponse:
|
| 34 |
with ThreadPoolExecutor(max_workers=1) as pool:
|
| 35 |
fut = pool.submit(fn)
|
|
@@ -91,6 +104,9 @@ class InferenceRouter:
|
|
| 91 |
return self._mock.generate(prompt, system_prompt)
|
| 92 |
|
| 93 |
if mode == "custom_hf":
|
|
|
|
|
|
|
|
|
|
| 94 |
return self._try_hf(prompt, system_prompt, fallback=True)
|
| 95 |
if mode == "nim":
|
| 96 |
return self._try_nim(prompt, system_prompt, fallback=True)
|
|
@@ -116,6 +132,11 @@ class InferenceRouter:
|
|
| 116 |
return self._mock.generate(prompt, system_prompt)
|
| 117 |
|
| 118 |
def _try_hf(self, prompt: str, system_prompt: str, fallback: bool) -> LLMResponse:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
try:
|
| 120 |
sec = self._timeouts["custom_hf"]
|
| 121 |
|
|
|
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
import logging
|
| 11 |
+
import os
|
| 12 |
from concurrent.futures import ThreadPoolExecutor
|
| 13 |
from concurrent.futures import TimeoutError as FuturesTimeout
|
| 14 |
from typing import Callable
|
|
|
|
| 31 |
return _ROUTER_SINGLETON
|
| 32 |
|
| 33 |
|
| 34 |
+
def _cuda_ok_for_custom_hf() -> bool:
|
| 35 |
+
"""Local HF is only practical on GPU unless explicitly overridden (debug)."""
|
| 36 |
+
if os.getenv("FORGE_ALLOW_CUSTOM_HF_CPU", "").strip().lower() in ("1", "true", "yes"):
|
| 37 |
+
return True
|
| 38 |
+
try:
|
| 39 |
+
import torch # noqa: PLC0415
|
| 40 |
+
|
| 41 |
+
return bool(torch.cuda.is_available())
|
| 42 |
+
except ImportError:
|
| 43 |
+
return False
|
| 44 |
+
|
| 45 |
+
|
| 46 |
def _with_timeout(fn: Callable[[], LLMResponse], seconds: float) -> LLMResponse:
|
| 47 |
with ThreadPoolExecutor(max_workers=1) as pool:
|
| 48 |
fut = pool.submit(fn)
|
|
|
|
| 104 |
return self._mock.generate(prompt, system_prompt)
|
| 105 |
|
| 106 |
if mode == "custom_hf":
|
| 107 |
+
if not _cuda_ok_for_custom_hf():
|
| 108 |
+
logger.info("[router] custom_hf: no CUDA — offline baseline (set FORGE_ALLOW_CUSTOM_HF_CPU=1 to force CPU load)")
|
| 109 |
+
return self._mock.generate(prompt, system_prompt)
|
| 110 |
return self._try_hf(prompt, system_prompt, fallback=True)
|
| 111 |
if mode == "nim":
|
| 112 |
return self._try_nim(prompt, system_prompt, fallback=True)
|
|
|
|
| 132 |
return self._mock.generate(prompt, system_prompt)
|
| 133 |
|
| 134 |
def _try_hf(self, prompt: str, system_prompt: str, fallback: bool) -> LLMResponse:
|
| 135 |
+
if not _cuda_ok_for_custom_hf():
|
| 136 |
+
if fallback:
|
| 137 |
+
logger.info("[router] custom_hf skipped in chain: no CUDA")
|
| 138 |
+
return self._mock.generate(prompt, system_prompt)
|
| 139 |
+
raise RuntimeError("custom_hf requires CUDA (or FORGE_ALLOW_CUSTOM_HF_CPU=1)")
|
| 140 |
try:
|
| 141 |
sec = self._timeouts["custom_hf"]
|
| 142 |
|