Spaces:

RayMelius
/

soci2

Running

RayMelius Claude Sonnet 4.6 committed 16 days ago

Commit

5e37c7d

1 Parent(s): 7aa1d5f

Remove paid providers from menu; circuit-break on 402 (no credits)

- _choose_provider: offer only free providers (HF SmolLM3, Groq, Ollama);
Claude/Gemini can still be forced via SOCI_PROVIDER env var
- HFInferenceClient: add 402 to circuit-breaker (same as 401/403/410) so
the sim stops retrying when the token lacks Inference Providers permission
- Operator fix for 402: add an hf_soci_token Space secret containing a personal
HF token with Inference Providers (Write) permission to enable SmolLM3 for free

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

src/soci/api/server.py +4 -4
src/soci/engine/llm.py +6 -4

src/soci/api/server.py CHANGED Viewed

@@ -263,13 +263,13 @@ def _choose_provider() -> str:
         or os.environ.get("HW_WR_TOKEN")
     )
     options = []
     if has_hf:
-        options.append(("hf", "HF Inference (free, serverless, auto-available in HF Spaces)"))
     if has_groq:
-        options.append(("groq", "Groq (fast cloud, free tier 30 req/min)"))
-    if has_claude:
-        options.append(("claude", "Claude (Anthropic API, paid)"))
     options.append(("ollama", "Ollama (local, free, no rate limit)"))
     # If only one option, use it

         or os.environ.get("HW_WR_TOKEN")
     )
+    # Only free providers are offered; paid providers (Claude, Gemini) can be
+    # forced via SOCI_PROVIDER / LLM_PROVIDER env var if needed.
     options = []
     if has_hf:
+        options.append(("hf", "HF Inference (free, SmolLM3 via hf-inference)"))
     if has_groq:
+        options.append(("groq", "Groq (free tier, 30 req/min)"))
     options.append(("ollama", "Ollama (local, free, no rate limit)"))
     # If only one option, use it

src/soci/engine/llm.py CHANGED Viewed

@@ -936,13 +936,15 @@ class HFInferenceClient:
                         return ""
                     logger.warning(f"HF rate limited, waiting {wait}s")
                     await asyncio.sleep(wait)
-                elif status in (401, 403, 410):
-                    # Auth failure, gated model, or gone endpoint — disable for a long window
                     self._rate_limited_until = time.monotonic() + 3600
                     self._auth_error = body
                     logger.error(
-                        f"HF auth error ({status}): {body} — "
-                        "Check HF_TOKEN and accept model license at huggingface.co"
                     )
                     return ""
                 elif status in (503, 504):

                         return ""
                     logger.warning(f"HF rate limited, waiting {wait}s")
                     await asyncio.sleep(wait)
+                elif status in (401, 402, 403, 410):
+                    # Auth/payment failure — circuit-break for 1h to stop spam retries.
+                    # 402 means no credits (token lacks Inference Providers permission).
+                    # Fix: add hf_soci_token secret in Space with a token that has inference perms.
                     self._rate_limited_until = time.monotonic() + 3600
                     self._auth_error = body
                     logger.error(
+                        f"HF auth error ({status}): {body[:120]} — "
+                        "Add hf_soci_token Space secret with a token that has Inference Providers permission"
                     )
                     return ""
                 elif status in (503, 504):