Add Hugging Face Serverless Inference as free LLM provider
- HFInferenceClient uses the OpenAI-compatible endpoint at api-inference.huggingface.co
- Default model: Qwen2.5-7B-Instruct (also supports Llama-3.2-3B, Mistral-7B)
- HF_TOKEN is auto-injected in HF Spaces → zero config needed on Spaces deployments
- Circuit breaker on quota exhaustion (same pattern as Groq/Gemini)
- Auto-detected in create_llm_client() when HF_TOKEN is set (after Gemini, before Ollama); see the usage sketch after the file list
- Provider switcher UI shows the 🤗 icon for the hf provider
- routes.py and server.py updated to accept "hf" as a valid provider
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- src/soci/api/routes.py +3 -1
- src/soci/api/server.py +1 -1
- src/soci/engine/llm.py +161 -5
- web/index.html +1 -1
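
For orientation, a minimal usage sketch of the new provider path. It uses only names that appear in this diff; the token value is a placeholder, and on HF Spaces the env var is already present.

import asyncio
import os

from soci.engine.llm import create_llm_client

os.environ.setdefault("HF_TOKEN", "hf_xxx")  # placeholder; auto-injected on HF Spaces

async def main() -> None:
    # Explicit selection shown here; with only HF_TOKEN set, auto-detection
    # would also resolve to "hf" (after Gemini, before Ollama).
    client = create_llm_client(provider="hf")
    text = await client.complete(
        system="You are terse.",
        user_message="Say hello in five words.",
    )
    print(text or "(empty: rate-limit circuit breaker tripped)")

asyncio.run(main())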
src/soci/api/routes.py
CHANGED
@@ -278,6 +278,8 @@ async def get_llm_providers():
         providers.append({"id": "groq", "label": "Groq (Llama 8B)", "icon": "⚡"})
     if os.environ.get("GEMINI_API_KEY"):
         providers.append({"id": "gemini", "label": "Gemini 2.0 Flash", "icon": "✦"})
+    if os.environ.get("HF_TOKEN"):
+        providers.append({"id": "hf", "label": "HF Qwen 2.5 7B", "icon": "🤗"})
     providers.append( {"id": "ollama", "label": "Ollama (local)", "icon": "🦙"})
     return {"current": current, "providers": providers}
 
@@ -286,7 +288,7 @@ async def get_llm_providers():
 async def set_llm_provider(req: SwitchProviderRequest):
     """Hot-swap the active LLM provider."""
     from soci.api.server import switch_llm_provider
-    valid = {"claude", "groq", "gemini", "ollama"}
+    valid = {"claude", "groq", "gemini", "hf", "ollama"}
     if req.provider not in valid:
         raise HTTPException(status_code=400, detail=f"Unknown provider '{req.provider}'")
     try:
src/soci/api/server.py
CHANGED
@@ -250,7 +250,7 @@ def _choose_provider() -> str:
     """
     # Check explicit env vars first
     provider = os.environ.get("SOCI_PROVIDER", "").lower() or os.environ.get("LLM_PROVIDER", "").lower()
-    if provider in ("claude", "groq", "gemini", "ollama"):
+    if provider in ("claude", "groq", "gemini", "hf", "ollama"):
         return provider
 
     # Check if keys are available
src/soci/engine/llm.py
CHANGED
@@ -19,6 +19,7 @@ PROVIDER_CLAUDE = "claude"
 PROVIDER_OLLAMA = "ollama"
 PROVIDER_GROQ = "groq"
 PROVIDER_GEMINI = "gemini"
+PROVIDER_HF = "hf"
 
 # Claude model IDs
 MODEL_SONNET = "claude-sonnet-4-5-20250929"
 
@@ -40,6 +41,11 @@ MODEL_GROQ_MIXTRAL = "mixtral-8x7b-32768"
 MODEL_GEMINI_FLASH = "gemini-2.0-flash"
 MODEL_GEMINI_PRO = "gemini-1.5-pro"
 
+# Hugging Face Serverless Inference model IDs (free, no credit card)
+MODEL_HF_LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
+MODEL_HF_QWEN = "Qwen/Qwen2.5-7B-Instruct"
+MODEL_HF_MISTRAL = "mistralai/Mistral-7B-Instruct-v0.3"
+
 # Approximate cost per 1M tokens (USD) — Ollama is free, Groq is very cheap
 COST_PER_1M = {
     MODEL_SONNET: {"input": 3.0, "output": 15.0},
 
@@ -799,6 +805,149 @@ class GeminiClient:
         return {}
 
 
+# ============================================================
+# Hugging Face Serverless Inference Client (free tier)
+# ============================================================
+
+class HFInferenceClient:
+    """Hugging Face Serverless Inference via OpenAI-compatible endpoint.
+
+    Free tier (no credit card required):
+    - Llama-3.2-3B-Instruct, Qwen2.5-7B-Instruct, Mistral-7B, and many others.
+    - HF_TOKEN is auto-injected in HF Spaces → no manual setup needed.
+    - Get a token at https://huggingface.co/settings/tokens
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        default_model: str = MODEL_HF_QWEN,
+        max_retries: int = 3,
+    ) -> None:
+        self.api_key = api_key or os.environ.get("HF_TOKEN", "")
+        if not self.api_key:
+            raise ValueError(
+                "HF_TOKEN not set. Get a free token at https://huggingface.co/settings/tokens"
+            )
+        self.default_model = default_model
+        self.max_retries = max_retries
+        self.usage = LLMUsage()
+        self.provider = PROVIDER_HF
+        self._http = httpx.AsyncClient(
+            base_url="https://api-inference.huggingface.co/v1/",
+            headers={
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            },
+            timeout=120.0,  # HF can be slow under load
+        )
+        self._rate_limited_until: float = 0.0
+
+    def _is_quota_exhausted(self) -> bool:
+        return time.monotonic() < self._rate_limited_until
+
+    def _map_model(self, model: str) -> str:
+        """Map Claude/Groq/Gemini model names to HF equivalents."""
+        mapping = {
+            MODEL_SONNET: self.default_model,
+            MODEL_HAIKU: self.default_model,
+            MODEL_GROQ_LLAMA_8B: MODEL_HF_LLAMA,
+            MODEL_GEMINI_FLASH: self.default_model,
+        }
+        return mapping.get(model, self.default_model)
+
+    @property
+    def llm_status(self) -> str:
+        return "limited" if self._is_quota_exhausted() else "active"
+
+    async def complete(
+        self,
+        system: str,
+        user_message: str,
+        model: Optional[str] = None,
+        temperature: float = 0.7,
+        max_tokens: int = 1024,
+    ) -> str:
+        if self._is_quota_exhausted():
+            logger.debug("HF quota circuit breaker active — skipping complete()")
+            return ""
+
+        model = self._map_model(model or self.default_model)
+        payload = {
+            "model": model,
+            "messages": [
+                {"role": "system", "content": system},
+                {"role": "user", "content": user_message},
+            ],
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+
+        for attempt in range(self.max_retries):
+            try:
+                resp = await self._http.post("chat/completions", json=payload)
+                resp.raise_for_status()
+                data = resp.json()
+                usage = data.get("usage", {})
+                self.usage.record(model, usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
+                return data["choices"][0]["message"]["content"]
+            except httpx.HTTPStatusError as e:
+                status = e.response.status_code
+                if status == 429:
+                    retry_after = e.response.headers.get("retry-after", "10")
+                    try:
+                        wait = float(retry_after)
+                    except (ValueError, TypeError):
+                        wait = 10.0
+                    if wait > 60:
+                        self._rate_limited_until = time.monotonic() + wait
+                        logger.warning(f"HF quota exhausted for {wait:.0f}s")
+                        return ""
+                    logger.warning(f"HF rate limited, waiting {wait}s")
+                    await asyncio.sleep(wait)
+                elif status in (503, 504):
+                    # Model loading / gateway timeout — back off and retry
+                    wait = 5.0 * (attempt + 1)
+                    logger.warning(f"HF model loading ({status}), waiting {wait}s")
+                    await asyncio.sleep(wait)
+                else:
+                    logger.error(f"HF HTTP error: {status} {e.response.text[:200]}")
+                    if attempt == self.max_retries - 1:
+                        return ""
+                    await asyncio.sleep(2)
+            except Exception as e:
+                logger.error(f"HF error: {e}")
+                if attempt == self.max_retries - 1:
+                    return ""
+                await asyncio.sleep(2)
+        return ""
+
+    async def complete_json(
+        self,
+        system: str,
+        user_message: str,
+        model: Optional[str] = None,
+        temperature: float = 0.7,
+        max_tokens: int = 1024,
+    ) -> dict:
+        if self._is_quota_exhausted():
+            logger.debug("HF quota circuit breaker active — skipping complete_json()")
+            return {}
+
+        json_instruction = (
+            "\n\nRespond ONLY with valid JSON. No markdown, no explanation, no extra text. "
+            "Just the JSON object."
+        )
+        text = await self.complete(
+            system=system,
+            user_message=user_message + json_instruction,
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+        return _parse_json_response(text)
+
+
 # ============================================================
 # Factory — create the right client based on config
 # ============================================================
 
@@ -807,27 +956,31 @@ def create_llm_client(
     provider: Optional[str] = None,
     model: Optional[str] = None,
     ollama_url: str = "http://localhost:11434",
-) -> ClaudeClient | OllamaClient | GroqClient:
+) -> ClaudeClient | OllamaClient | GroqClient | GeminiClient | HFInferenceClient:
     """Create an LLM client based on environment or explicit config.
 
     Provider detection order:
     1. Explicit provider argument
     2. LLM_PROVIDER env var
     3. If ANTHROPIC_API_KEY is set → Claude
-    4. If GROQ_API_KEY is set → Groq (fast cloud
-    5.
+    4. If GROQ_API_KEY is set → Groq (fast cloud)
+    5. If GEMINI_API_KEY is set → Gemini (free tier)
+    6. If HF_TOKEN is set → HF Inference (free, auto-available in HF Spaces)
+    7. Default → Ollama (local)
     """
     if provider is None:
         provider = os.environ.get("LLM_PROVIDER", "").lower()
 
     if not provider:
-        # Auto-detect: Claude → Groq → Gemini → Ollama
+        # Auto-detect: Claude → Groq → Gemini → HF → Ollama
        if os.environ.get("ANTHROPIC_API_KEY"):
             provider = PROVIDER_CLAUDE
         elif os.environ.get("GROQ_API_KEY"):
             provider = PROVIDER_GROQ
         elif os.environ.get("GEMINI_API_KEY"):
             provider = PROVIDER_GEMINI
+        elif os.environ.get("HF_TOKEN"):
+            provider = PROVIDER_HF
         else:
             provider = PROVIDER_OLLAMA
 
@@ -840,11 +993,14 @@ def create_llm_client(
     elif provider == PROVIDER_GEMINI:
         default_model = model or os.environ.get("GEMINI_MODEL", MODEL_GEMINI_FLASH)
         return GeminiClient(default_model=default_model)
+    elif provider == PROVIDER_HF:
+        default_model = model or os.environ.get("HF_MODEL", MODEL_HF_QWEN)
+        return HFInferenceClient(default_model=default_model)
     elif provider == PROVIDER_OLLAMA:
         default_model = model or os.environ.get("OLLAMA_MODEL", MODEL_LLAMA)
         return OllamaClient(base_url=ollama_url, default_model=default_model)
     else:
-        raise ValueError(f"Unknown LLM provider: {provider}. Use 'claude', 'groq', 'gemini', or 'ollama'.")
+        raise ValueError(f"Unknown LLM provider: {provider}. Use 'claude', 'groq', 'gemini', 'hf', or 'ollama'.")
 
 
 # --- Prompt Templates ---
web/index.html
CHANGED
@@ -2892,7 +2892,7 @@ function processStateData(data) {
       .replace(/-\d{8}$/, '') // remove trailing date e.g. -20251001
       .replace(/-instant$/, '') // groq suffix
       .replace(/^gemini-/, ''); // "gemini-2.0-flash" → "2.0-flash"
-    const providerIcon = { gemini: '✦', groq: '⚡', claude: '❋', ollama: '🦙' };
+    const providerIcon = { gemini: '✦', groq: '⚡', claude: '❋', ollama: '🦙', hf: '🤗' };
     const icon = providerIcon[data.llm_provider] || '⚡';
 
     // Status: limited > skipped > idle > active (calls happening)