Spaces:

sanjay7676
/

Team404_FORGE

Sleeping

sanjay7676 committed 30 days ago

Commit

d1f6a06

1 Parent(s): 5a67c2d

feat: custom_hf without CUDA falls back to offline baseline

- Router skips local HF load when no CUDA; auto chain skips HF the same way.
- Gradio benchmark/compare shows Info when downgrading custom_hf -> offline on CPU.
- Optional FORGE_ALLOW_CUSTOM_HF_CPU=1 for debug CPU loads; documented in README/.env.example.

Made-with: Cursor

Files changed (4) hide show

.env.example +3 -0
README.md +1 -0
app.py +14 -2
forge/providers/router.py +21 -0

.env.example CHANGED Viewed

@@ -33,6 +33,9 @@ CODE_PROVIDER_MODE=custom_hf
 # custom_hf: max new tokens per generation (default 192 — short code tasks)
 # FORGE_HF_MAX_NEW_TOKENS=192
 # Cap local HF inference wait (seconds); increase on GPU machines if loads are slow.
 ROUTER_HF_TIMEOUT_SEC=90
 ROUTER_NIM_TIMEOUT_SEC=90

 # custom_hf: max new tokens per generation (default 192 — short code tasks)
 # FORGE_HF_MAX_NEW_TOKENS=192
+# Debug only: allow loading custom_hf on CPU (slow / may OOM). Otherwise no CUDA => offline baseline.
+# FORGE_ALLOW_CUSTOM_HF_CPU=0
 # Cap local HF inference wait (seconds); increase on GPU machines if loads are slow.
 ROUTER_HF_TIMEOUT_SEC=90
 ROUTER_NIM_TIMEOUT_SEC=90

README.md CHANGED Viewed

@@ -62,6 +62,7 @@ suggested_hardware: t4-small
 | **`FORGE_UI_CANDIDATES`** | Gradio benchmark/compare only — candidates per step (**default `1`** for faster `custom_hf`; raise for richer ranking) |
 | **`FORGE_UI_STEPS`** | Gradio only — steps per episode (**default `2`**; set **`full`**, **`default`**, or **`0`** to use global `STEPS_PER_EPISODE` = 3) |
 | **`FORGE_HF_MAX_NEW_TOKENS`** | `custom_hf` completion cap (**default `192`**) — lower is faster for short coding tasks |
 ### OpenEnv HTTP API on the Hugging Face Space

 | **`FORGE_UI_CANDIDATES`** | Gradio benchmark/compare only — candidates per step (**default `1`** for faster `custom_hf`; raise for richer ranking) |
 | **`FORGE_UI_STEPS`** | Gradio only — steps per episode (**default `2`**; set **`full`**, **`default`**, or **`0`** to use global `STEPS_PER_EPISODE` = 3) |
 | **`FORGE_HF_MAX_NEW_TOKENS`** | `custom_hf` completion cap (**default `192`**) — lower is faster for short coding tasks |
+| **`FORGE_ALLOW_CUSTOM_HF_CPU`** | Set **`1`** only for debugging — otherwise **`custom_hf` without CUDA** uses the **offline** baseline (no multi‑GB CPU load) |
 ### OpenEnv HTTP API on the Hugging Face Space

app.py CHANGED Viewed

@@ -93,6 +93,12 @@ def run_benchmark_ui(episodes, forge_provider_label: str):
     mode = forge_provider_label if forge_provider_label in (
         "auto", "custom_hf", "nim", "openrouter", "offline", "mock"
     ) else "offline"
     report = run_benchmark_mode(
         policy_name="model",
         episodes=ep_count,
@@ -126,6 +132,12 @@ def run_compare_ui(episodes, forge_provider_label: str):
     mode = forge_provider_label if forge_provider_label in (
         "auto", "custom_hf", "nim", "openrouter", "offline", "mock"
     ) else "offline"
     report = run_compare_mode(
         model_policy_name="model",
         episodes=ep_count,
@@ -183,9 +195,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 value=default_forge_ui_provider(),
                 label="Inference provider",
                 info=(
-                    "**custom_hf** = local PyTorch + Hub weights on this machine (default on **GPU**). "
                     "**auto** = NIM → OpenRouter → optional local HF if **HF_TOKEN** is set → else offline. "
-                    "**offline** = no external APIs (CPU-friendly fallback). "
                     "Gradio uses **`FORGE_UI_CANDIDATES`** (default 1) and **`FORGE_UI_STEPS`** (default 2 steps/episode; set `full` for config default). CLI/training use full settings."
                 ),
             )

     mode = forge_provider_label if forge_provider_label in (
         "auto", "custom_hf", "nim", "openrouter", "offline", "mock"
     ) else "offline"
+    if mode == "custom_hf" and not _cuda_ready():
+        gr.Info(
+            "No GPU: using the offline deterministic baseline for inference "
+            "(environment and rewards are still real; no local Hub weight load)."
+        )
+        mode = "offline"
     report = run_benchmark_mode(
         policy_name="model",
         episodes=ep_count,
     mode = forge_provider_label if forge_provider_label in (
         "auto", "custom_hf", "nim", "openrouter", "offline", "mock"
     ) else "offline"
+    if mode == "custom_hf" and not _cuda_ready():
+        gr.Info(
+            "No GPU: using the offline deterministic baseline for inference "
+            "(environment and rewards are still real; no local Hub weight load)."
+        )
+        mode = "offline"
     report = run_compare_mode(
         model_policy_name="model",
         episodes=ep_count,
                 value=default_forge_ui_provider(),
                 label="Inference provider",
                 info=(
+                    "**custom_hf** = local PyTorch + Hub weights on **GPU** only; on CPU it automatically uses **offline** baseline. "
                     "**auto** = NIM → OpenRouter → optional local HF if **HF_TOKEN** is set → else offline. "
+                    "**offline** = deterministic baseline (no Hub load; fast on CPU). "
                     "Gradio uses **`FORGE_UI_CANDIDATES`** (default 1) and **`FORGE_UI_STEPS`** (default 2 steps/episode; set `full` for config default). CLI/training use full settings."
                 ),
             )

forge/providers/router.py CHANGED Viewed

@@ -8,6 +8,7 @@
 from __future__ import annotations
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import TimeoutError as FuturesTimeout
 from typing import Callable
@@ -30,6 +31,18 @@ def get_inference_router() -> "InferenceRouter":
     return _ROUTER_SINGLETON
 def _with_timeout(fn: Callable[[], LLMResponse], seconds: float) -> LLMResponse:
     with ThreadPoolExecutor(max_workers=1) as pool:
         fut = pool.submit(fn)
@@ -91,6 +104,9 @@ class InferenceRouter:
             return self._mock.generate(prompt, system_prompt)
         if mode == "custom_hf":
             return self._try_hf(prompt, system_prompt, fallback=True)
         if mode == "nim":
             return self._try_nim(prompt, system_prompt, fallback=True)
@@ -116,6 +132,11 @@ class InferenceRouter:
         return self._mock.generate(prompt, system_prompt)
     def _try_hf(self, prompt: str, system_prompt: str, fallback: bool) -> LLMResponse:
         try:
             sec = self._timeouts["custom_hf"]

 from __future__ import annotations
 import logging
+import os
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import TimeoutError as FuturesTimeout
 from typing import Callable
     return _ROUTER_SINGLETON
+def _cuda_ok_for_custom_hf() -> bool:
+    """Return whether the local ``custom_hf`` provider may be used.
+
+    Local HF is only practical on GPU unless explicitly overridden (debug).
+
+    Returns:
+        True when ``FORGE_ALLOW_CUSTOM_HF_CPU`` is set to ``1``, ``true`` or
+        ``yes`` (surrounding whitespace and letter case are ignored), or when
+        ``torch.cuda.is_available()`` is truthy. False when CUDA is not
+        available or ``torch`` cannot be imported.
+    """
+    # The explicit opt-in is checked first, so the override returns before torch is imported.
+    if os.getenv("FORGE_ALLOW_CUSTOM_HF_CPU", "").strip().lower() in ("1", "true", "yes"):
+        return True
+    try:
+        # Imported inside the function; a missing torch install is treated as "no CUDA".
+        import torch  # noqa: PLC0415
+        return bool(torch.cuda.is_available())
+    except ImportError:
+        return False
 def _with_timeout(fn: Callable[[], LLMResponse], seconds: float) -> LLMResponse:
     with ThreadPoolExecutor(max_workers=1) as pool:
         fut = pool.submit(fn)
             return self._mock.generate(prompt, system_prompt)
         if mode == "custom_hf":
+            if not _cuda_ok_for_custom_hf():
+                logger.info("[router] custom_hf: no CUDA — offline baseline (set FORGE_ALLOW_CUSTOM_HF_CPU=1 to force CPU load)")
+                return self._mock.generate(prompt, system_prompt)
             return self._try_hf(prompt, system_prompt, fallback=True)
         if mode == "nim":
             return self._try_nim(prompt, system_prompt, fallback=True)
         return self._mock.generate(prompt, system_prompt)
     def _try_hf(self, prompt: str, system_prompt: str, fallback: bool) -> LLMResponse:
+        if not _cuda_ok_for_custom_hf():
+            if fallback:
+                logger.info("[router] custom_hf skipped in chain: no CUDA")
+                return self._mock.generate(prompt, system_prompt)
+            raise RuntimeError("custom_hf requires CUDA (or FORGE_ALLOW_CUSTOM_HF_CPU=1)")
         try:
             sec = self._timeouts["custom_hf"]