sanjay7676 committed on
Commit
d1f6a06
·
1 Parent(s): 5a67c2d

feat: custom_hf without CUDA falls back to offline baseline

Browse files

- Router skips local HF load when no CUDA; auto chain skips HF the same way.
- Gradio benchmark/compare shows Info when downgrading custom_hf -> offline on CPU.
- Optional FORGE_ALLOW_CUSTOM_HF_CPU=1 for debug CPU loads; documented in README/.env.example.

Made-with: Cursor

Files changed (4) hide show
  1. .env.example +3 -0
  2. README.md +1 -0
  3. app.py +14 -2
  4. forge/providers/router.py +21 -0
.env.example CHANGED
@@ -33,6 +33,9 @@ CODE_PROVIDER_MODE=custom_hf
33
  # custom_hf: max new tokens per generation (default 192 — short code tasks)
34
  # FORGE_HF_MAX_NEW_TOKENS=192
35
 
 
 
 
36
  # Cap local HF inference wait (seconds); increase on GPU machines if loads are slow.
37
  ROUTER_HF_TIMEOUT_SEC=90
38
  ROUTER_NIM_TIMEOUT_SEC=90
 
33
  # custom_hf: max new tokens per generation (default 192 — short code tasks)
34
  # FORGE_HF_MAX_NEW_TOKENS=192
35
 
36
+ # Debug only: allow loading custom_hf on CPU (slow / may OOM). Otherwise no CUDA => offline baseline.
37
+ # FORGE_ALLOW_CUSTOM_HF_CPU=0
38
+
39
  # Cap local HF inference wait (seconds); increase on GPU machines if loads are slow.
40
  ROUTER_HF_TIMEOUT_SEC=90
41
  ROUTER_NIM_TIMEOUT_SEC=90
README.md CHANGED
@@ -62,6 +62,7 @@ suggested_hardware: t4-small
62
  | **`FORGE_UI_CANDIDATES`** | Gradio benchmark/compare only — candidates per step (**default `1`** for faster `custom_hf`; raise for richer ranking) |
63
  | **`FORGE_UI_STEPS`** | Gradio only — steps per episode (**default `2`**; set **`full`**, **`default`**, or **`0`** to use global `STEPS_PER_EPISODE` = 3) |
64
  | **`FORGE_HF_MAX_NEW_TOKENS`** | `custom_hf` completion cap (**default `192`**) — lower is faster for short coding tasks |
 
65
 
66
  ### OpenEnv HTTP API on the Hugging Face Space
67
 
 
62
  | **`FORGE_UI_CANDIDATES`** | Gradio benchmark/compare only — candidates per step (**default `1`** for faster `custom_hf`; raise for richer ranking) |
63
  | **`FORGE_UI_STEPS`** | Gradio only — steps per episode (**default `2`**; set **`full`**, **`default`**, or **`0`** to use global `STEPS_PER_EPISODE` = 3) |
64
  | **`FORGE_HF_MAX_NEW_TOKENS`** | `custom_hf` completion cap (**default `192`**) — lower is faster for short coding tasks |
65
+ | **`FORGE_ALLOW_CUSTOM_HF_CPU`** | Set **`1`** only for debugging — otherwise **`custom_hf` without CUDA** uses the **offline** baseline (no multi‑GB CPU load) |
66
 
67
  ### OpenEnv HTTP API on the Hugging Face Space
68
 
app.py CHANGED
@@ -93,6 +93,12 @@ def run_benchmark_ui(episodes, forge_provider_label: str):
93
  mode = forge_provider_label if forge_provider_label in (
94
  "auto", "custom_hf", "nim", "openrouter", "offline", "mock"
95
  ) else "offline"
 
 
 
 
 
 
96
  report = run_benchmark_mode(
97
  policy_name="model",
98
  episodes=ep_count,
@@ -126,6 +132,12 @@ def run_compare_ui(episodes, forge_provider_label: str):
126
  mode = forge_provider_label if forge_provider_label in (
127
  "auto", "custom_hf", "nim", "openrouter", "offline", "mock"
128
  ) else "offline"
 
 
 
 
 
 
129
  report = run_compare_mode(
130
  model_policy_name="model",
131
  episodes=ep_count,
@@ -183,9 +195,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
183
  value=default_forge_ui_provider(),
184
  label="Inference provider",
185
  info=(
186
- "**custom_hf** = local PyTorch + Hub weights on this machine (default on **GPU**). "
187
  "**auto** = NIM → OpenRouter → optional local HF if **HF_TOKEN** is set → else offline. "
188
- "**offline** = no external APIs (CPU-friendly fallback). "
189
  "Gradio uses **`FORGE_UI_CANDIDATES`** (default 1) and **`FORGE_UI_STEPS`** (default 2 steps/episode; set `full` for config default). CLI/training use full settings."
190
  ),
191
  )
 
93
  mode = forge_provider_label if forge_provider_label in (
94
  "auto", "custom_hf", "nim", "openrouter", "offline", "mock"
95
  ) else "offline"
96
+ if mode == "custom_hf" and not _cuda_ready():
97
+ gr.Info(
98
+ "No GPU: using the offline deterministic baseline for inference "
99
+ "(environment and rewards are still real; no local Hub weight load)."
100
+ )
101
+ mode = "offline"
102
  report = run_benchmark_mode(
103
  policy_name="model",
104
  episodes=ep_count,
 
132
  mode = forge_provider_label if forge_provider_label in (
133
  "auto", "custom_hf", "nim", "openrouter", "offline", "mock"
134
  ) else "offline"
135
+ if mode == "custom_hf" and not _cuda_ready():
136
+ gr.Info(
137
+ "No GPU: using the offline deterministic baseline for inference "
138
+ "(environment and rewards are still real; no local Hub weight load)."
139
+ )
140
+ mode = "offline"
141
  report = run_compare_mode(
142
  model_policy_name="model",
143
  episodes=ep_count,
 
195
  value=default_forge_ui_provider(),
196
  label="Inference provider",
197
  info=(
198
+ "**custom_hf** = local PyTorch + Hub weights on **GPU** only; on CPU it automatically uses **offline** baseline. "
199
  "**auto** = NIM → OpenRouter → optional local HF if **HF_TOKEN** is set → else offline. "
200
+ "**offline** = deterministic baseline (no Hub load; fast on CPU). "
201
  "Gradio uses **`FORGE_UI_CANDIDATES`** (default 1) and **`FORGE_UI_STEPS`** (default 2 steps/episode; set `full` for config default). CLI/training use full settings."
202
  ),
203
  )
forge/providers/router.py CHANGED
@@ -8,6 +8,7 @@
8
  from __future__ import annotations
9
 
10
  import logging
 
11
  from concurrent.futures import ThreadPoolExecutor
12
  from concurrent.futures import TimeoutError as FuturesTimeout
13
  from typing import Callable
@@ -30,6 +31,18 @@ def get_inference_router() -> "InferenceRouter":
30
  return _ROUTER_SINGLETON
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def _with_timeout(fn: Callable[[], LLMResponse], seconds: float) -> LLMResponse:
34
  with ThreadPoolExecutor(max_workers=1) as pool:
35
  fut = pool.submit(fn)
@@ -91,6 +104,9 @@ class InferenceRouter:
91
  return self._mock.generate(prompt, system_prompt)
92
 
93
  if mode == "custom_hf":
 
 
 
94
  return self._try_hf(prompt, system_prompt, fallback=True)
95
  if mode == "nim":
96
  return self._try_nim(prompt, system_prompt, fallback=True)
@@ -116,6 +132,11 @@ class InferenceRouter:
116
  return self._mock.generate(prompt, system_prompt)
117
 
118
  def _try_hf(self, prompt: str, system_prompt: str, fallback: bool) -> LLMResponse:
 
 
 
 
 
119
  try:
120
  sec = self._timeouts["custom_hf"]
121
 
 
8
  from __future__ import annotations
9
 
10
  import logging
11
+ import os
12
  from concurrent.futures import ThreadPoolExecutor
13
  from concurrent.futures import TimeoutError as FuturesTimeout
14
  from typing import Callable
 
31
  return _ROUTER_SINGLETON
32
 
33
 
34
+ def _cuda_ok_for_custom_hf() -> bool:
35
+ """Local HF is only practical on GPU unless explicitly overridden (debug)."""
36
+ if os.getenv("FORGE_ALLOW_CUSTOM_HF_CPU", "").strip().lower() in ("1", "true", "yes"):
37
+ return True
38
+ try:
39
+ import torch # noqa: PLC0415
40
+
41
+ return bool(torch.cuda.is_available())
42
+ except ImportError:
43
+ return False
44
+
45
+
46
  def _with_timeout(fn: Callable[[], LLMResponse], seconds: float) -> LLMResponse:
47
  with ThreadPoolExecutor(max_workers=1) as pool:
48
  fut = pool.submit(fn)
 
104
  return self._mock.generate(prompt, system_prompt)
105
 
106
  if mode == "custom_hf":
107
+ if not _cuda_ok_for_custom_hf():
108
+ logger.info("[router] custom_hf: no CUDA — offline baseline (set FORGE_ALLOW_CUSTOM_HF_CPU=1 to force CPU load)")
109
+ return self._mock.generate(prompt, system_prompt)
110
  return self._try_hf(prompt, system_prompt, fallback=True)
111
  if mode == "nim":
112
  return self._try_nim(prompt, system_prompt, fallback=True)
 
132
  return self._mock.generate(prompt, system_prompt)
133
 
134
  def _try_hf(self, prompt: str, system_prompt: str, fallback: bool) -> LLMResponse:
135
+ if not _cuda_ok_for_custom_hf():
136
+ if fallback:
137
+ logger.info("[router] custom_hf skipped in chain: no CUDA")
138
+ return self._mock.generate(prompt, system_prompt)
139
+ raise RuntimeError("custom_hf requires CUDA (or FORGE_ALLOW_CUSTOM_HF_CPU=1)")
140
  try:
141
  sec = self._timeouts["custom_hf"]
142