RayMelius Claude Sonnet 4.6 committed on
Commit
5e37c7d
·
1 Parent(s): 7aa1d5f

Remove paid providers from menu; circuit-break on 402 (no credits)

Browse files

- _choose_provider: offer only free providers (HF SmolLM3, Groq, Ollama);
Claude/Gemini can still be forced via SOCI_PROVIDER env var
- HFInferenceClient: add 402 to circuit-breaker (same as 401/403/410) so
the sim stops retrying when the token lacks Inference Providers permission
- Setup note (not a code change): to enable SmolLM3 for free, add an hf_soci_token
Space secret containing a personal HF token that has Inference Providers (Write) permission

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. src/soci/api/server.py +4 -4
  2. src/soci/engine/llm.py +6 -4
src/soci/api/server.py CHANGED
@@ -263,13 +263,13 @@ def _choose_provider() -> str:
263
  or os.environ.get("HW_WR_TOKEN")
264
  )
265
 
 
 
266
  options = []
267
  if has_hf:
268
- options.append(("hf", "HF Inference (free, serverless, auto-available in HF Spaces)"))
269
  if has_groq:
270
- options.append(("groq", "Groq (fast cloud, free tier 30 req/min)"))
271
- if has_claude:
272
- options.append(("claude", "Claude (Anthropic API, paid)"))
273
  options.append(("ollama", "Ollama (local, free, no rate limit)"))
274
 
275
  # If only one option, use it
 
263
  or os.environ.get("HW_WR_TOKEN")
264
  )
265
 
266
+ # Only free providers are offered; paid providers (Claude, Gemini) can be
267
+ # forced via SOCI_PROVIDER / LLM_PROVIDER env var if needed.
268
  options = []
269
  if has_hf:
270
+ options.append(("hf", "HF Inference (free, SmolLM3 via hf-inference)"))
271
  if has_groq:
272
+ options.append(("groq", "Groq (free tier, 30 req/min)"))
 
 
273
  options.append(("ollama", "Ollama (local, free, no rate limit)"))
274
 
275
  # If only one option, use it
src/soci/engine/llm.py CHANGED
@@ -936,13 +936,15 @@ class HFInferenceClient:
936
  return ""
937
  logger.warning(f"HF rate limited, waiting {wait}s")
938
  await asyncio.sleep(wait)
939
- elif status in (401, 403, 410):
940
- # Auth failure, gated model, or gone endpoint disable for a long window
 
 
941
  self._rate_limited_until = time.monotonic() + 3600
942
  self._auth_error = body
943
  logger.error(
944
- f"HF auth error ({status}): {body} — "
945
- "Check HF_TOKEN and accept model license at huggingface.co"
946
  )
947
  return ""
948
  elif status in (503, 504):
 
936
  return ""
937
  logger.warning(f"HF rate limited, waiting {wait}s")
938
  await asyncio.sleep(wait)
939
+ elif status in (401, 402, 403, 410):
940
+ # Auth/payment failure — circuit-break for 1h to stop spam retries.
941
+ # 402 means no credits (token lacks Inference Providers permission).
942
+ # Fix: add hf_soci_token secret in Space with a token that has inference perms.
943
  self._rate_limited_until = time.monotonic() + 3600
944
  self._auth_error = body
945
  logger.error(
946
+ f"HF auth error ({status}): {body[:120]} — "
947
+ "Add hf_soci_token Space secret with a token that has Inference Providers permission"
948
  )
949
  return ""
950
  elif status in (503, 504):