Remove paid providers from menu; circuit-break on 402 (no credits)
Browse files
- _choose_provider: offer only free providers (HF SmolLM3, Groq, Ollama);
  Claude/Gemini can still be forced via the SOCI_PROVIDER env var
- HFInferenceClient: add 402 (Payment Required) to the circuit-breaker, alongside
  401/403/410, so the sim stops retrying when the token has no credits
  (i.e. it lacks Inference Providers permission)
- Operator-side fix (not part of this code change): add an hf_soci_token Space
  secret containing a personal HF token that has Inference Providers (Write)
  permission; this enables SmolLM3 for free
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- src/soci/api/server.py +4 -4
- src/soci/engine/llm.py +6 -4
src/soci/api/server.py
CHANGED
|
@@ -263,13 +263,13 @@ def _choose_provider() -> str:
|
|
| 263 |
or os.environ.get("HW_WR_TOKEN")
|
| 264 |
)
|
| 265 |
|
|
|
|
|
|
|
| 266 |
options = []
|
| 267 |
if has_hf:
|
| 268 |
-
options.append(("hf", "HF Inference (free,
|
| 269 |
if has_groq:
|
| 270 |
-
options.append(("groq", "Groq (
|
| 271 |
-
if has_claude:
|
| 272 |
-
options.append(("claude", "Claude (Anthropic API, paid)"))
|
| 273 |
options.append(("ollama", "Ollama (local, free, no rate limit)"))
|
| 274 |
|
| 275 |
# If only one option, use it
|
|
|
|
| 263 |
or os.environ.get("HW_WR_TOKEN")
|
| 264 |
)
|
| 265 |
|
| 266 |
+
# Only free providers are offered; paid providers (Claude, Gemini) can be
|
| 267 |
+
# forced via SOCI_PROVIDER / LLM_PROVIDER env var if needed.
|
| 268 |
options = []
|
| 269 |
if has_hf:
|
| 270 |
+
options.append(("hf", "HF Inference (free, SmolLM3 via hf-inference)"))
|
| 271 |
if has_groq:
|
| 272 |
+
options.append(("groq", "Groq (free tier, 30 req/min)"))
|
|
|
|
|
|
|
| 273 |
options.append(("ollama", "Ollama (local, free, no rate limit)"))
|
| 274 |
|
| 275 |
# If only one option, use it
|
src/soci/engine/llm.py
CHANGED
|
@@ -936,13 +936,15 @@ class HFInferenceClient:
|
|
| 936 |
return ""
|
| 937 |
logger.warning(f"HF rate limited, waiting {wait}s")
|
| 938 |
await asyncio.sleep(wait)
|
| 939 |
-
elif status in (401, 403, 410):
|
| 940 |
-
# Auth failure
|
|
|
|
|
|
|
| 941 |
self._rate_limited_until = time.monotonic() + 3600
|
| 942 |
self._auth_error = body
|
| 943 |
logger.error(
|
| 944 |
-
f"HF auth error ({status}): {body} — "
|
| 945 |
-
"
|
| 946 |
)
|
| 947 |
return ""
|
| 948 |
elif status in (503, 504):
|
|
|
|
| 936 |
return ""
|
| 937 |
logger.warning(f"HF rate limited, waiting {wait}s")
|
| 938 |
await asyncio.sleep(wait)
|
| 939 |
+
elif status in (401, 402, 403, 410):
|
| 940 |
+
# Auth/payment failure — circuit-break for 1h to stop spam retries.
|
| 941 |
+
# 402 means no credits (token lacks Inference Providers permission).
|
| 942 |
+
# Fix: add hf_soci_token secret in Space with a token that has inference perms.
|
| 943 |
self._rate_limited_until = time.monotonic() + 3600
|
| 944 |
self._auth_error = body
|
| 945 |
logger.error(
|
| 946 |
+
f"HF auth error ({status}): {body[:120]} — "
|
| 947 |
+
"Add hf_soci_token Space secret with a token that has Inference Providers permission"
|
| 948 |
)
|
| 949 |
return ""
|
| 950 |
elif status in (503, 504):
|