Clean up token env vars; add Gemini to free provider menu
Browse files
- HFInferenceClient: drop hf_soci_token / soci_token / HW_WR_TOKEN —
HF_TOKEN is the only supported var (auto-injected by HF Spaces)
- create_llm_client: same — only HF_TOKEN triggers HF auto-detection
- _choose_provider: show Groq + Gemini (both free-tier) before HF and Ollama;
remove has_claude (Claude never appeared in the menu anyway)
- HF now labelled "requires HF PRO / credits" to set correct expectations
- SOCI_PROVIDER / LLM_PROVIDER env var still overrides everything (can force
any provider including claude)
Tokens safe to delete from HF settings: soci_token, soci_read, hf_soci_token
SOCI_PROVIDER is not a secret — safe to use as a public Space variable
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- src/soci/api/server.py +8 -12
- src/soci/engine/llm.py +9 -22
src/soci/api/server.py
CHANGED
|
@@ -253,23 +253,19 @@ def _choose_provider() -> str:
|
|
| 253 |
if provider in ("claude", "groq", "gemini", "hf", "ollama"):
|
| 254 |
return provider
|
| 255 |
|
| 256 |
-
# Check
|
| 257 |
-
has_claude = bool(os.environ.get("ANTHROPIC_API_KEY"))
|
| 258 |
has_groq = bool(os.environ.get("GROQ_API_KEY"))
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
or os.environ.get("hf_soci_token")
|
| 262 |
-
or os.environ.get("soci_token")
|
| 263 |
-
or os.environ.get("HW_WR_TOKEN")
|
| 264 |
-
)
|
| 265 |
|
| 266 |
-
#
|
| 267 |
-
# forced via SOCI_PROVIDER / LLM_PROVIDER env var if needed.
|
| 268 |
options = []
|
| 269 |
-
if has_hf:
|
| 270 |
-
options.append(("hf", "HF Inference (free, SmolLM3 via hf-inference)"))
|
| 271 |
if has_groq:
|
| 272 |
options.append(("groq", "Groq (free tier, 30 req/min)"))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
options.append(("ollama", "Ollama (local, free, no rate limit)"))
|
| 274 |
|
| 275 |
# If only one option, use it
|
|
|
|
| 253 |
if provider in ("claude", "groq", "gemini", "hf", "ollama"):
|
| 254 |
return provider
|
| 255 |
|
| 256 |
+
# Check which keys are available
|
|
|
|
| 257 |
has_groq = bool(os.environ.get("GROQ_API_KEY"))
|
| 258 |
+
has_gemini = bool(os.environ.get("GEMINI_API_KEY"))
|
| 259 |
+
has_hf = bool(os.environ.get("HF_TOKEN"))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
| 261 |
+
# Free providers only; Claude can be forced via SOCI_PROVIDER / LLM_PROVIDER.
|
|
|
|
| 262 |
options = []
|
|
|
|
|
|
|
| 263 |
if has_groq:
|
| 264 |
options.append(("groq", "Groq (free tier, 30 req/min)"))
|
| 265 |
+
if has_gemini:
|
| 266 |
+
options.append(("gemini", "Gemini (free tier, 15 req/min via AI Studio)"))
|
| 267 |
+
if has_hf:
|
| 268 |
+
options.append(("hf", "HF Inference (requires HF PRO / credits)"))
|
| 269 |
options.append(("ollama", "Ollama (local, free, no rate limit)"))
|
| 270 |
|
| 271 |
# If only one option, use it
|
src/soci/engine/llm.py
CHANGED
|
@@ -813,12 +813,11 @@ class GeminiClient:
|
|
| 813 |
# ============================================================
|
| 814 |
|
| 815 |
class HFInferenceClient:
|
| 816 |
-
"""Hugging Face Serverless Inference via
|
| 817 |
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
- Get a token at https://huggingface.co/settings/tokens
|
| 822 |
"""
|
| 823 |
|
| 824 |
def __init__(
|
|
@@ -827,20 +826,10 @@ class HFInferenceClient:
|
|
| 827 |
default_model: str = MODEL_HF_SMOL,
|
| 828 |
max_retries: int = 3,
|
| 829 |
) -> None:
|
| 830 |
-
|
| 831 |
-
# HF_TOKEN is auto-injected in HF Spaces but only has basic inference (no credits for routed models).
|
| 832 |
-
# A personal token stored as hf_soci_token / soci_token / HW_WR_TOKEN takes precedence.
|
| 833 |
-
self.api_key = (
|
| 834 |
-
api_key
|
| 835 |
-
or os.environ.get("hf_soci_token", "")
|
| 836 |
-
or os.environ.get("soci_token", "")
|
| 837 |
-
or os.environ.get("HW_WR_TOKEN", "")
|
| 838 |
-
or os.environ.get("HF_TOKEN", "")
|
| 839 |
-
)
|
| 840 |
if not self.api_key:
|
| 841 |
logger.warning(
|
| 842 |
-
"
|
| 843 |
-
"Get a free token at https://huggingface.co/settings/tokens"
|
| 844 |
)
|
| 845 |
self.default_model = default_model
|
| 846 |
self.max_retries = max_retries
|
|
@@ -888,7 +877,7 @@ class HFInferenceClient:
|
|
| 888 |
max_tokens: int = 1024,
|
| 889 |
) -> str:
|
| 890 |
if not self.api_key:
|
| 891 |
-
self._last_error = "HF_TOKEN
|
| 892 |
return ""
|
| 893 |
if self._is_quota_exhausted():
|
| 894 |
logger.debug("HF quota circuit breaker active β skipping complete()")
|
|
@@ -939,12 +928,11 @@ class HFInferenceClient:
|
|
| 939 |
elif status in (401, 402, 403, 410):
|
| 940 |
# Auth/payment failure β circuit-break for 1h to stop spam retries.
|
| 941 |
# 402 means no credits (token lacks Inference Providers permission).
|
| 942 |
-
# Fix: add hf_soci_token secret in Space with a token that has inference perms.
|
| 943 |
self._rate_limited_until = time.monotonic() + 3600
|
| 944 |
self._auth_error = body
|
| 945 |
logger.error(
|
| 946 |
f"HF auth error ({status}): {body[:120]} β "
|
| 947 |
-
"
|
| 948 |
)
|
| 949 |
return ""
|
| 950 |
elif status in (503, 504):
|
|
@@ -1033,8 +1021,7 @@ def create_llm_client(
|
|
| 1033 |
provider = PROVIDER_GROQ
|
| 1034 |
elif os.environ.get("GEMINI_API_KEY"):
|
| 1035 |
provider = PROVIDER_GEMINI
|
| 1036 |
-
elif
|
| 1037 |
-
or os.environ.get("soci_token") or os.environ.get("HW_WR_TOKEN")):
|
| 1038 |
provider = PROVIDER_HF
|
| 1039 |
else:
|
| 1040 |
provider = PROVIDER_OLLAMA
|
|
|
|
| 813 |
# ============================================================
|
| 814 |
|
| 815 |
class HFInferenceClient:
|
| 816 |
+
"""Hugging Face Serverless Inference via router.huggingface.co/v1.
|
| 817 |
|
| 818 |
+
Requires an HF_TOKEN with 'Inference Providers (Write)' permission.
|
| 819 |
+
HF_TOKEN is auto-injected in HF Spaces but only has repo-read access;
|
| 820 |
+
a PRO account or purchased credits is needed for LLM inference.
|
|
|
|
| 821 |
"""
|
| 822 |
|
| 823 |
def __init__(
|
|
|
|
| 826 |
default_model: str = MODEL_HF_SMOL,
|
| 827 |
max_retries: int = 3,
|
| 828 |
) -> None:
|
| 829 |
+
self.api_key = api_key or os.environ.get("HF_TOKEN", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 830 |
if not self.api_key:
|
| 831 |
logger.warning(
|
| 832 |
+
"HF_TOKEN is not set β HF Inference will not make LLM calls."
|
|
|
|
| 833 |
)
|
| 834 |
self.default_model = default_model
|
| 835 |
self.max_retries = max_retries
|
|
|
|
| 877 |
max_tokens: int = 1024,
|
| 878 |
) -> str:
|
| 879 |
if not self.api_key:
|
| 880 |
+
self._last_error = "HF_TOKEN not set"
|
| 881 |
return ""
|
| 882 |
if self._is_quota_exhausted():
|
| 883 |
logger.debug("HF quota circuit breaker active β skipping complete()")
|
|
|
|
| 928 |
elif status in (401, 402, 403, 410):
|
| 929 |
# Auth/payment failure β circuit-break for 1h to stop spam retries.
|
| 930 |
# 402 means no credits (token lacks Inference Providers permission).
|
|
|
|
| 931 |
self._rate_limited_until = time.monotonic() + 3600
|
| 932 |
self._auth_error = body
|
| 933 |
logger.error(
|
| 934 |
f"HF auth error ({status}): {body[:120]} β "
|
| 935 |
+
"HF_TOKEN needs 'Inference Providers (Write)' permission"
|
| 936 |
)
|
| 937 |
return ""
|
| 938 |
elif status in (503, 504):
|
|
|
|
| 1021 |
provider = PROVIDER_GROQ
|
| 1022 |
elif os.environ.get("GEMINI_API_KEY"):
|
| 1023 |
provider = PROVIDER_GEMINI
|
| 1024 |
+
elif os.environ.get("HF_TOKEN"):
|
|
|
|
| 1025 |
provider = PROVIDER_HF
|
| 1026 |
else:
|
| 1027 |
provider = PROVIDER_OLLAMA
|