RayMelius Claude Sonnet 4.6 committed on
Commit
15a201c
·
1 Parent(s): 1abc674

Add Hugging Face Serverless Inference as free LLM provider


- HFInferenceClient uses the OpenAI-compatible endpoint at api-inference.huggingface.co
- Default model: Qwen2.5-7B-Instruct (also supports Llama-3.2-3B, Mistral-7B)
- HF_TOKEN auto-injected in HF Spaces — zero config needed on Spaces deployment
- Circuit breaker on quota exhaustion (same pattern as Groq/Gemini)
- Auto-detected in create_llm_client() when HF_TOKEN is set (after Gemini, before Ollama); see the usage sketch below
- Provider switcher UI shows 🤗 icon for the hf provider
- routes.py and server.py updated to accept "hf" as a valid provider

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
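For orientation, a minimal usage sketch of the new provider (not part of the commit itself): it assumes the `soci.engine.llm` module path implied by the file list below, and the token value is a placeholder.

import asyncio
import os

from soci.engine.llm import create_llm_client

async def main() -> None:
    # HF_TOKEN is auto-injected on HF Spaces; set it manually elsewhere.
    os.environ.setdefault("HF_TOKEN", "hf_xxx")  # placeholder token
    # Explicit selection; with provider=None, auto-detection picks "hf"
    # when no Claude/Groq/Gemini key is present.
    client = create_llm_client(provider="hf")
    reply = await client.complete(
        system="You are a concise assistant.",
        user_message="Say hello in five words.",
        max_tokens=64,
    )
    print(reply or "(empty — circuit breaker open or request failed)")

asyncio.run(main())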

src/soci/api/routes.py CHANGED
@@ -278,6 +278,8 @@ async def get_llm_providers():
         providers.append({"id": "groq", "label": "Groq (Llama 8B)", "icon": "⚡"})
     if os.environ.get("GEMINI_API_KEY"):
         providers.append({"id": "gemini", "label": "Gemini 2.0 Flash", "icon": "✦"})
+    if os.environ.get("HF_TOKEN"):
+        providers.append({"id": "hf", "label": "HF Qwen 2.5 7B", "icon": "🤗"})
     providers.append({"id": "ollama", "label": "Ollama (local)", "icon": "🦙"})
     return {"current": current, "providers": providers}

@@ -286,7 +288,7 @@ async def get_llm_providers():
 async def set_llm_provider(req: SwitchProviderRequest):
     """Hot-swap the active LLM provider."""
     from soci.api.server import switch_llm_provider
-    valid = {"claude", "groq", "gemini", "ollama"}
+    valid = {"claude", "groq", "gemini", "hf", "ollama"}
     if req.provider not in valid:
         raise HTTPException(status_code=400, detail=f"Unknown provider '{req.provider}'")
     try:
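For context, a sketch of how a client might exercise these endpoints. The `/api/llm/...` prefix and port are hypothetical — the router prefix is not shown in this diff:

import httpx

BASE = "http://localhost:8000/api/llm"  # hypothetical prefix and port

# List providers; with HF_TOKEN set, the "hf" entry appears between
# Gemini and Ollama, mirroring the append order above.
print(httpx.get(f"{BASE}/providers").json())

# Switch providers; SwitchProviderRequest carries a `provider` field,
# and "hf" is now in the valid set.
httpx.post(f"{BASE}/provider", json={"provider": "hf"}).raise_for_status()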
src/soci/api/server.py CHANGED
@@ -250,7 +250,7 @@ def _choose_provider() -> str:
     """
     # Check explicit env vars first
     provider = os.environ.get("SOCI_PROVIDER", "").lower() or os.environ.get("LLM_PROVIDER", "").lower()
-    if provider in ("claude", "groq", "gemini", "ollama"):
+    if provider in ("claude", "groq", "gemini", "hf", "ollama"):
         return provider

     # Check if keys are available
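A small sketch of the precedence this hunk affects, assuming `_choose_provider` can be imported in isolation:

import os

os.environ["SOCI_PROVIDER"] = "hf"   # checked first
os.environ["LLM_PROVIDER"] = "groq"  # only consulted if SOCI_PROVIDER is empty

from soci.api.server import _choose_provider

# "hf" is now in the allow-list, so the explicit setting wins outright
# instead of falling through to key-based auto-detection.
assert _choose_provider() == "hf"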
src/soci/engine/llm.py CHANGED
@@ -19,6 +19,7 @@ PROVIDER_CLAUDE = "claude"
 PROVIDER_OLLAMA = "ollama"
 PROVIDER_GROQ = "groq"
 PROVIDER_GEMINI = "gemini"
+PROVIDER_HF = "hf"

 # Claude model IDs
 MODEL_SONNET = "claude-sonnet-4-5-20250929"

@@ -40,6 +41,11 @@ MODEL_GROQ_MIXTRAL = "mixtral-8x7b-32768"
 MODEL_GEMINI_FLASH = "gemini-2.0-flash"
 MODEL_GEMINI_PRO = "gemini-1.5-pro"

+# Hugging Face Serverless Inference model IDs (free, no credit card)
+MODEL_HF_LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
+MODEL_HF_QWEN = "Qwen/Qwen2.5-7B-Instruct"
+MODEL_HF_MISTRAL = "mistralai/Mistral-7B-Instruct-v0.3"
+
 # Approximate cost per 1M tokens (USD) — Ollama is free, Groq is very cheap
 COST_PER_1M = {
     MODEL_SONNET: {"input": 3.0, "output": 15.0},

@@ -799,6 +805,149 @@ class GeminiClient:
         return {}


+# ============================================================
+# Hugging Face Serverless Inference Client (free tier)
+# ============================================================
+
+class HFInferenceClient:
+    """Hugging Face Serverless Inference via OpenAI-compatible endpoint.
+
+    Free tier (no credit card required):
+    - Llama-3.2-3B-Instruct, Qwen2.5-7B-Instruct, Mistral-7B, and many others.
+    - HF_TOKEN is auto-injected in HF Spaces — no manual setup needed.
+    - Get a token at https://huggingface.co/settings/tokens
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        default_model: str = MODEL_HF_QWEN,
+        max_retries: int = 3,
+    ) -> None:
+        self.api_key = api_key or os.environ.get("HF_TOKEN", "")
+        if not self.api_key:
+            raise ValueError(
+                "HF_TOKEN not set. Get a free token at https://huggingface.co/settings/tokens"
+            )
+        self.default_model = default_model
+        self.max_retries = max_retries
+        self.usage = LLMUsage()
+        self.provider = PROVIDER_HF
+        self._http = httpx.AsyncClient(
+            base_url="https://api-inference.huggingface.co/v1/",
+            headers={
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            },
+            timeout=120.0,  # HF can be slow under load
+        )
+        self._rate_limited_until: float = 0.0
+
+    def _is_quota_exhausted(self) -> bool:
+        return time.monotonic() < self._rate_limited_until
+
+    def _map_model(self, model: str) -> str:
+        """Map Claude/Groq/Gemini model names to HF equivalents."""
+        mapping = {
+            MODEL_SONNET: self.default_model,
+            MODEL_HAIKU: self.default_model,
+            MODEL_GROQ_LLAMA_8B: MODEL_HF_LLAMA,
+            MODEL_GEMINI_FLASH: self.default_model,
+        }
+        return mapping.get(model, self.default_model)
+
+    @property
+    def llm_status(self) -> str:
+        return "limited" if self._is_quota_exhausted() else "active"
+
+    async def complete(
+        self,
+        system: str,
+        user_message: str,
+        model: Optional[str] = None,
+        temperature: float = 0.7,
+        max_tokens: int = 1024,
+    ) -> str:
+        if self._is_quota_exhausted():
+            logger.debug("HF quota circuit breaker active — skipping complete()")
+            return ""
+
+        model = self._map_model(model or self.default_model)
+        payload = {
+            "model": model,
+            "messages": [
+                {"role": "system", "content": system},
+                {"role": "user", "content": user_message},
+            ],
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+
+        for attempt in range(self.max_retries):
+            try:
+                resp = await self._http.post("chat/completions", json=payload)
+                resp.raise_for_status()
+                data = resp.json()
+                usage = data.get("usage", {})
+                self.usage.record(model, usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
+                return data["choices"][0]["message"]["content"]
+            except httpx.HTTPStatusError as e:
+                status = e.response.status_code
+                if status == 429:
+                    retry_after = e.response.headers.get("retry-after", "10")
+                    try:
+                        wait = float(retry_after)
+                    except (ValueError, TypeError):
+                        wait = 10.0
+                    if wait > 60:
+                        self._rate_limited_until = time.monotonic() + wait
+                        logger.warning(f"HF quota exhausted for {wait:.0f}s")
+                        return ""
+                    logger.warning(f"HF rate limited, waiting {wait}s")
+                    await asyncio.sleep(wait)
+                elif status in (503, 504):
+                    # Model loading / gateway timeout — back off and retry
+                    wait = 5.0 * (attempt + 1)
+                    logger.warning(f"HF model loading ({status}), waiting {wait}s")
+                    await asyncio.sleep(wait)
+                else:
+                    logger.error(f"HF HTTP error: {status} {e.response.text[:200]}")
+                    if attempt == self.max_retries - 1:
+                        return ""
+                    await asyncio.sleep(2)
+            except Exception as e:
+                logger.error(f"HF error: {e}")
+                if attempt == self.max_retries - 1:
+                    return ""
+                await asyncio.sleep(2)
+        return ""
+
+    async def complete_json(
+        self,
+        system: str,
+        user_message: str,
+        model: Optional[str] = None,
+        temperature: float = 0.7,
+        max_tokens: int = 1024,
+    ) -> dict:
+        if self._is_quota_exhausted():
+            logger.debug("HF quota circuit breaker active — skipping complete_json()")
+            return {}
+
+        json_instruction = (
+            "\n\nRespond ONLY with valid JSON. No markdown, no explanation, no extra text. "
+            "Just the JSON object."
+        )
+        text = await self.complete(
+            system=system,
+            user_message=user_message + json_instruction,
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+        return _parse_json_response(text)
+
+
 # ============================================================
 # Factory — create the right client based on config
 # ============================================================

@@ -807,27 +956,31 @@ def create_llm_client(
     provider: Optional[str] = None,
     model: Optional[str] = None,
     ollama_url: str = "http://localhost:11434",
-) -> ClaudeClient | OllamaClient | GroqClient:
+) -> ClaudeClient | OllamaClient | GroqClient | GeminiClient | HFInferenceClient:
     """Create an LLM client based on environment or explicit config.

     Provider detection order:
     1. Explicit provider argument
     2. LLM_PROVIDER env var
     3. If ANTHROPIC_API_KEY is set → Claude
-    4. If GROQ_API_KEY is set → Groq (fast cloud, parallel)
-    5. Default → Ollama (free, local)
+    4. If GROQ_API_KEY is set → Groq (fast cloud)
+    5. If GEMINI_API_KEY is set → Gemini (free tier)
+    6. If HF_TOKEN is set → HF Inference (free, auto-available in HF Spaces)
+    7. Default → Ollama (local)
     """
     if provider is None:
         provider = os.environ.get("LLM_PROVIDER", "").lower()

     if not provider:
-        # Auto-detect: Claude → Groq → Gemini → Ollama
+        # Auto-detect: Claude → Groq → Gemini → HF → Ollama
         if os.environ.get("ANTHROPIC_API_KEY"):
             provider = PROVIDER_CLAUDE
         elif os.environ.get("GROQ_API_KEY"):
             provider = PROVIDER_GROQ
         elif os.environ.get("GEMINI_API_KEY"):
             provider = PROVIDER_GEMINI
+        elif os.environ.get("HF_TOKEN"):
+            provider = PROVIDER_HF
         else:
             provider = PROVIDER_OLLAMA

@@ -840,11 +993,14 @@ def create_llm_client(
     elif provider == PROVIDER_GEMINI:
         default_model = model or os.environ.get("GEMINI_MODEL", MODEL_GEMINI_FLASH)
         return GeminiClient(default_model=default_model)
+    elif provider == PROVIDER_HF:
+        default_model = model or os.environ.get("HF_MODEL", MODEL_HF_QWEN)
+        return HFInferenceClient(default_model=default_model)
     elif provider == PROVIDER_OLLAMA:
         default_model = model or os.environ.get("OLLAMA_MODEL", MODEL_LLAMA)
         return OllamaClient(base_url=ollama_url, default_model=default_model)
     else:
-        raise ValueError(f"Unknown LLM provider: {provider}. Use 'claude', 'groq', 'gemini', or 'ollama'.")
+        raise ValueError(f"Unknown LLM provider: {provider}. Use 'claude', 'groq', 'gemini', 'hf', or 'ollama'.")


 # --- Prompt Templates ---
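A short sketch of the JSON helper plus the circuit-breaker status property, assuming HF_TOKEN is exported in the environment:

import asyncio

from soci.engine.llm import HFInferenceClient, MODEL_HF_MISTRAL

async def main() -> None:
    client = HFInferenceClient(default_model=MODEL_HF_MISTRAL)  # reads HF_TOKEN
    data = await client.complete_json(
        system="You extract structured facts.",
        user_message='Return {"city": ..., "country": ...} for "Paris, France".',
        temperature=0.0,
    )
    # {} can mean a parse failure or an open circuit breaker; llm_status
    # distinguishes the two ("limited" while the breaker is open).
    print(data, client.llm_status)

asyncio.run(main())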
web/index.html CHANGED
@@ -2892,7 +2892,7 @@ function processStateData(data) {
         .replace(/-\d{8}$/, '')   // remove trailing date e.g. -20251001
         .replace(/-instant$/, '') // groq suffix
         .replace(/^gemini-/, ''); // "gemini-2.0-flash" → "2.0-flash"
-    const providerIcon = { gemini: '✦', groq: '⚡', claude: '◆', ollama: '🦙' };
+    const providerIcon = { gemini: '✦', groq: '⚡', claude: '◆', ollama: '🦙', hf: '🤗' };
     const icon = providerIcon[data.llm_provider] || '⚡';

     // Status: limited > skipped > idle > active (calls happening)