RayMelius Claude Sonnet 4.6 committed on
Commit 7aa1d5f · 1 Parent(s): 3e092a2

Fix HF LLM: use free SmolLM3 as default, strip thinking tokens


The Qwen/7B model was returning HTTP 402 (credit balance depleted) because
the HF router auto-routes it to a paid third-party provider.
SmolLM3-3B with the :hf-inference suffix runs on HF's own CPU cluster for free.

- Default model: HuggingFaceTB/SmolLM3-3B:hf-inference (no credits needed)
- Prepend /no_think to system prompts to disable SmolLM3 reasoning mode
- Strip <think>...</think> blocks from all HF responses as a safety net (see the sketch after this message)
- Add HW_WR_TOKEN to recognized env var names (Space write token)
- HW_WR_TOKEN / hf_soci_token now take priority over HF_TOKEN so a
personal token with credits overrides the Space's limited auto-token
- create_llm_client: detect HW_WR_TOKEN for HF auto-selection
- _choose_provider: detect HW_WR_TOKEN to offer HF in the menu
- Tested: complete() and complete_json() both work correctly with SmolLM3

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
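
As a standalone illustration of the <think>-stripping safety net, here is a minimal sketch using the same regex the diff below adds; the sample response string is invented for the demo:

import re

def strip_think_blocks(text: str) -> str:
    # Remove <think>...</think> reasoning blocks, including multi-line ones;
    # re.DOTALL lets "." match the newlines inside the block.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

# Hypothetical raw output from a thinking model:
raw = "<think>\nUser wants a greeting...\n</think>\nHello!"
print(strip_think_blocks(raw))  # -> Hello!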

Files changed (2)
  1. src/soci/api/server.py +1 -0
  2. src/soci/engine/llm.py +23 -6
src/soci/api/server.py CHANGED
@@ -260,6 +260,7 @@ def _choose_provider() -> str:
         os.environ.get("HF_TOKEN")
         or os.environ.get("hf_soci_token")
         or os.environ.get("soci_token")
+        or os.environ.get("HW_WR_TOKEN")
     )

     options = []
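
The detection in _choose_provider reduces to an any-token check over the recognized env var names. A minimal equivalent sketch (the helper name _hf_token_present is hypothetical, not in the repo):

import os

def _hf_token_present() -> bool:
    # True if any recognized HF token env var is set (same set as the expression above).
    return any(os.environ.get(name) for name in
               ("HF_TOKEN", "hf_soci_token", "soci_token", "HW_WR_TOKEN"))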
src/soci/engine/llm.py CHANGED
@@ -824,10 +824,19 @@ class HFInferenceClient:
     def __init__(
         self,
         api_key: Optional[str] = None,
-        default_model: str = MODEL_HF_QWEN,
+        default_model: str = MODEL_HF_SMOL,
         max_retries: int = 3,
     ) -> None:
-        self.api_key = api_key or os.environ.get("HF_TOKEN", "") or os.environ.get("hf_soci_token", "") or os.environ.get("soci_token", "")
+        # Priority: explicit arg > named secrets (personal token) > Space auto-injected HF_TOKEN.
+        # HF_TOKEN is auto-injected in HF Spaces but only has basic inference (no credits for routed models).
+        # A personal token stored as hf_soci_token / soci_token / HW_WR_TOKEN takes precedence.
+        self.api_key = (
+            api_key
+            or os.environ.get("hf_soci_token", "")
+            or os.environ.get("soci_token", "")
+            or os.environ.get("HW_WR_TOKEN", "")
+            or os.environ.get("HF_TOKEN", "")
+        )
         if not self.api_key:
             logger.warning(
                 "Neither HF_TOKEN nor soci_token is set — HF Inference will not make LLM calls. "
@@ -886,10 +895,13 @@ class HFInferenceClient:
             return ""

         model = self._map_model(model or self.default_model)
+        # /no_think disables chain-of-thought on SmolLM3 and similar thinking models;
+        # harmless for other models since it's prepended before the system prompt.
+        system_with_flag = "/no_think\n" + system
         payload = {
             "model": model,
             "messages": [
-                {"role": "system", "content": system},
+                {"role": "system", "content": system_with_flag},
                 {"role": "user", "content": user_message},
             ],
             "temperature": temperature,
@@ -904,7 +916,11 @@ class HFInferenceClient:
             usage = data.get("usage", {})
             self.usage.record(model, usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
             self._last_error = ""  # clear on success
-            return data["choices"][0]["message"]["content"]
+            text = data["choices"][0]["message"]["content"] or ""
+            # Strip any <think>...</think> blocks that thinking models may emit
+            import re as _re
+            text = _re.sub(r"<think>.*?</think>", "", text, flags=_re.DOTALL).strip()
+            return text
         except httpx.HTTPStatusError as e:
             status = e.response.status_code
             body = e.response.text[:300]
@@ -1015,7 +1031,8 @@ def create_llm_client(
         provider = PROVIDER_GROQ
     elif os.environ.get("GEMINI_API_KEY"):
         provider = PROVIDER_GEMINI
-    elif os.environ.get("HF_TOKEN") or os.environ.get("hf_soci_token") or os.environ.get("soci_token"):
+    elif (os.environ.get("HF_TOKEN") or os.environ.get("hf_soci_token")
+          or os.environ.get("soci_token") or os.environ.get("HW_WR_TOKEN")):
         provider = PROVIDER_HF
     else:
         provider = PROVIDER_OLLAMA
@@ -1030,7 +1047,7 @@ def create_llm_client(
         default_model = model or os.environ.get("GEMINI_MODEL", MODEL_GEMINI_FLASH)
         return GeminiClient(default_model=default_model)
     elif provider == PROVIDER_HF:
-        default_model = model or os.environ.get("HF_MODEL", MODEL_HF_QWEN)
+        default_model = model or os.environ.get("HF_MODEL", MODEL_HF_SMOL)
         return HFInferenceClient(default_model=default_model)
     elif provider == PROVIDER_OLLAMA:
         default_model = model or os.environ.get("OLLAMA_MODEL", MODEL_LLAMA)