RayMelius Claude Sonnet 4.6 committed on
Commit
beb2a11
·
1 Parent(s): 5e37c7d

Clean up token env vars; add Gemini to free provider menu

Browse files

- HFInferenceClient: drop hf_soci_token / soci_token / HW_WR_TOKEN —
HF_TOKEN is the only supported var (auto-injected by HF Spaces)
- create_llm_client: same — only HF_TOKEN triggers HF auto-detection
- _choose_provider: show Groq + Gemini (both free-tier) before HF and Ollama;
remove has_claude (Claude never appeared in the menu anyway)
- HF now labelled "requires HF PRO / credits" to set correct expectations
- SOCI_PROVIDER / LLM_PROVIDER env var still overrides everything (can force
any provider, including claude)

Tokens safe to delete from HF settings: soci_token, soci_read, hf_soci_token
SOCI_PROVIDER is not a secret — safe to use as a public Space variable

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. src/soci/api/server.py +8 -12
  2. src/soci/engine/llm.py +9 -22
src/soci/api/server.py CHANGED
@@ -253,23 +253,19 @@ def _choose_provider() -> str:
253
  if provider in ("claude", "groq", "gemini", "hf", "ollama"):
254
  return provider
255
 
256
- # Check if keys are available
257
- has_claude = bool(os.environ.get("ANTHROPIC_API_KEY"))
258
  has_groq = bool(os.environ.get("GROQ_API_KEY"))
259
- has_hf = bool(
260
- os.environ.get("HF_TOKEN")
261
- or os.environ.get("hf_soci_token")
262
- or os.environ.get("soci_token")
263
- or os.environ.get("HW_WR_TOKEN")
264
- )
265
 
266
- # Only free providers are offered; paid providers (Claude, Gemini) can be
267
- # forced via SOCI_PROVIDER / LLM_PROVIDER env var if needed.
268
  options = []
269
- if has_hf:
270
- options.append(("hf", "HF Inference (free, SmolLM3 via hf-inference)"))
271
  if has_groq:
272
  options.append(("groq", "Groq (free tier, 30 req/min)"))
 
 
 
 
273
  options.append(("ollama", "Ollama (local, free, no rate limit)"))
274
 
275
  # If only one option, use it
 
253
  if provider in ("claude", "groq", "gemini", "hf", "ollama"):
254
  return provider
255
 
256
+ # Check which keys are available
 
257
  has_groq = bool(os.environ.get("GROQ_API_KEY"))
258
+ has_gemini = bool(os.environ.get("GEMINI_API_KEY"))
259
+ has_hf = bool(os.environ.get("HF_TOKEN"))
 
 
 
 
260
 
261
+ # Free providers only; Claude can be forced via SOCI_PROVIDER / LLM_PROVIDER.
 
262
  options = []
 
 
263
  if has_groq:
264
  options.append(("groq", "Groq (free tier, 30 req/min)"))
265
+ if has_gemini:
266
+ options.append(("gemini", "Gemini (free tier, 15 req/min via AI Studio)"))
267
+ if has_hf:
268
+ options.append(("hf", "HF Inference (requires HF PRO / credits)"))
269
  options.append(("ollama", "Ollama (local, free, no rate limit)"))
270
 
271
  # If only one option, use it
src/soci/engine/llm.py CHANGED
@@ -813,12 +813,11 @@ class GeminiClient:
813
  # ============================================================
814
 
815
  class HFInferenceClient:
816
- """Hugging Face Serverless Inference via OpenAI-compatible endpoint.
817
 
818
- Free tier (no credit card required):
819
- - Llama-3.2-3B-Instruct, Qwen2.5-7B-Instruct, Mistral-7B, and many others.
820
- - HF_TOKEN is auto-injected in HF Spaces β€” no manual setup needed.
821
- - Get a token at https://huggingface.co/settings/tokens
822
  """
823
 
824
  def __init__(
@@ -827,20 +826,10 @@ class HFInferenceClient:
827
  default_model: str = MODEL_HF_SMOL,
828
  max_retries: int = 3,
829
  ) -> None:
830
- # Priority: explicit arg β†’ named secrets (personal token) β†’ Space auto-injected HF_TOKEN
831
- # HF_TOKEN is auto-injected in HF Spaces but only has basic inference (no credits for routed models).
832
- # A personal token stored as hf_soci_token / soci_token / HW_WR_TOKEN takes precedence.
833
- self.api_key = (
834
- api_key
835
- or os.environ.get("hf_soci_token", "")
836
- or os.environ.get("soci_token", "")
837
- or os.environ.get("HW_WR_TOKEN", "")
838
- or os.environ.get("HF_TOKEN", "")
839
- )
840
  if not self.api_key:
841
  logger.warning(
842
- "Neither HF_TOKEN nor soci_token is set β€” HF Inference will not make LLM calls. "
843
- "Get a free token at https://huggingface.co/settings/tokens"
844
  )
845
  self.default_model = default_model
846
  self.max_retries = max_retries
@@ -888,7 +877,7 @@ class HFInferenceClient:
888
  max_tokens: int = 1024,
889
  ) -> str:
890
  if not self.api_key:
891
- self._last_error = "HF_TOKEN / soci_token not set β€” add it to your HF Space secrets"
892
  return ""
893
  if self._is_quota_exhausted():
894
  logger.debug("HF quota circuit breaker active β€” skipping complete()")
@@ -939,12 +928,11 @@ class HFInferenceClient:
939
  elif status in (401, 402, 403, 410):
940
  # Auth/payment failure β€” circuit-break for 1h to stop spam retries.
941
  # 402 means no credits (token lacks Inference Providers permission).
942
- # Fix: add hf_soci_token secret in Space with a token that has inference perms.
943
  self._rate_limited_until = time.monotonic() + 3600
944
  self._auth_error = body
945
  logger.error(
946
  f"HF auth error ({status}): {body[:120]} β€” "
947
- "Add hf_soci_token Space secret with a token that has Inference Providers permission"
948
  )
949
  return ""
950
  elif status in (503, 504):
@@ -1033,8 +1021,7 @@ def create_llm_client(
1033
  provider = PROVIDER_GROQ
1034
  elif os.environ.get("GEMINI_API_KEY"):
1035
  provider = PROVIDER_GEMINI
1036
- elif (os.environ.get("HF_TOKEN") or os.environ.get("hf_soci_token")
1037
- or os.environ.get("soci_token") or os.environ.get("HW_WR_TOKEN")):
1038
  provider = PROVIDER_HF
1039
  else:
1040
  provider = PROVIDER_OLLAMA
 
813
  # ============================================================
814
 
815
  class HFInferenceClient:
816
+ """Hugging Face Serverless Inference via router.huggingface.co/v1.
817
 
818
+ Requires an HF_TOKEN with 'Inference Providers (Write)' permission.
819
+ HF_TOKEN is auto-injected in HF Spaces but only has repo-read access;
820
+ a PRO account or purchased credits is needed for LLM inference.
 
821
  """
822
 
823
  def __init__(
 
826
  default_model: str = MODEL_HF_SMOL,
827
  max_retries: int = 3,
828
  ) -> None:
829
+ self.api_key = api_key or os.environ.get("HF_TOKEN", "")
 
 
 
 
 
 
 
 
 
830
  if not self.api_key:
831
  logger.warning(
832
+ "HF_TOKEN is not set β€” HF Inference will not make LLM calls."
 
833
  )
834
  self.default_model = default_model
835
  self.max_retries = max_retries
 
877
  max_tokens: int = 1024,
878
  ) -> str:
879
  if not self.api_key:
880
+ self._last_error = "HF_TOKEN not set"
881
  return ""
882
  if self._is_quota_exhausted():
883
  logger.debug("HF quota circuit breaker active β€” skipping complete()")
 
928
  elif status in (401, 402, 403, 410):
929
  # Auth/payment failure β€” circuit-break for 1h to stop spam retries.
930
  # 402 means no credits (token lacks Inference Providers permission).
 
931
  self._rate_limited_until = time.monotonic() + 3600
932
  self._auth_error = body
933
  logger.error(
934
  f"HF auth error ({status}): {body[:120]} β€” "
935
+ "HF_TOKEN needs 'Inference Providers (Write)' permission"
936
  )
937
  return ""
938
  elif status in (503, 504):
 
1021
  provider = PROVIDER_GROQ
1022
  elif os.environ.get("GEMINI_API_KEY"):
1023
  provider = PROVIDER_GEMINI
1024
+ elif os.environ.get("HF_TOKEN"):
 
1025
  provider = PROVIDER_HF
1026
  else:
1027
  provider = PROVIDER_OLLAMA