Fix HF LLM: use free SmolLM3 as default, strip thinking tokens
The Qwen/7B model was causing 402 errors (credit balance depleted) because
the HF router auto-routes it to a paid third-party provider.
SmolLM3-3B with the :hf-inference suffix runs on HF's own CPU cluster for free.
- Default model: HuggingFaceTB/SmolLM3-3B:hf-inference (no credits needed)
- Prepend /no_think to system prompts to disable SmolLM3 reasoning mode
- Strip <think>...</think> blocks from all HF responses as a safety net
- Add HW_WR_TOKEN to recognized env var names (Space write token)
- HW_WR_TOKEN / hf_soci_token now take priority over HF_TOKEN so a
personal token with credits overrides the Space's limited auto-token
- create_llm_client: detect HW_WR_TOKEN for HF auto-selection
- _choose_provider: detect HW_WR_TOKEN to offer HF in the menu
- Tested: complete() and complete_json() both work correctly with SmolLM3
  (a smoke-test sketch follows the file list below)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- src/soci/api/server.py +1 -0
- src/soci/engine/llm.py +23 -6
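
A minimal smoke test matching the "Tested" bullet above. This is a sketch, not project code: the complete() keyword names (system, user_message, temperature) are assumed from the payload fields in the diff, and the import path assumes src/soci maps to the soci package.

    import os
    from soci.engine.llm import create_llm_client

    os.environ.setdefault("HW_WR_TOKEN", "hf_...")  # any recognized HF token works

    client = create_llm_client()  # selects the HF provider (no Groq/Gemini keys set)
    reply = client.complete(
        system="You are a concise assistant.",
        user_message="Name the capital of France.",
        temperature=0.2,
    )
    print(reply)  # plain text: /no_think plus the <think> stripper keep reasoning out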
src/soci/engine/llm.py

@@ -260,6 +260,7 @@ def _choose_provider() -> str:
         os.environ.get("HF_TOKEN")
         or os.environ.get("hf_soci_token")
         or os.environ.get("soci_token")
+        or os.environ.get("HW_WR_TOKEN")
     )
 
     options = []
@@ -824,10 +824,19 @@ class HFInferenceClient:
     def __init__(
         self,
         api_key: Optional[str] = None,
-        default_model: str =
+        default_model: str = MODEL_HF_SMOL,
         max_retries: int = 3,
     ) -> None:
-
+        # Priority: explicit arg → named secrets (personal token) → Space auto-injected HF_TOKEN
+        # HF_TOKEN is auto-injected in HF Spaces but only has basic inference (no credits for routed models).
+        # A personal token stored as hf_soci_token / soci_token / HW_WR_TOKEN takes precedence.
+        self.api_key = (
+            api_key
+            or os.environ.get("hf_soci_token", "")
+            or os.environ.get("soci_token", "")
+            or os.environ.get("HW_WR_TOKEN", "")
+            or os.environ.get("HF_TOKEN", "")
+        )
         if not self.api_key:
             logger.warning(
                 "Neither HF_TOKEN nor soci_token is set — HF Inference will not make LLM calls. "
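
For reference, the token precedence above as a standalone sketch. resolve_hf_token is a hypothetical helper; the env var names and ordering come straight from the diff.

    import os

    def resolve_hf_token(explicit=None):
        # Explicit arg first, then personal-token secrets, then the
        # Space's auto-injected (credit-limited) HF_TOKEN.
        return (
            explicit
            or os.environ.get("hf_soci_token", "")
            or os.environ.get("soci_token", "")
            or os.environ.get("HW_WR_TOKEN", "")
            or os.environ.get("HF_TOKEN", "")
        )

    os.environ["HF_TOKEN"] = "hf_auto_limited"  # injected by the Space
    os.environ["HW_WR_TOKEN"] = "hf_personal"   # personal token with credits
    assert resolve_hf_token() == "hf_personal"  # personal token wins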
@@ -886,10 +895,13 @@ class HFInferenceClient:
             return ""
 
         model = self._map_model(model or self.default_model)
+        # /no_think disables chain-of-thought on SmolLM3 and similar thinking models;
+        # harmless for other models since it's prepended before the system prompt.
+        system_with_flag = "/no_think\n" + system
         payload = {
             "model": model,
             "messages": [
-                {"role": "system", "content": system},
+                {"role": "system", "content": system_with_flag},
                 {"role": "user", "content": user_message},
             ],
             "temperature": temperature,
@@ -904,7 +916,11 @@ class HFInferenceClient:
             usage = data.get("usage", {})
             self.usage.record(model, usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
             self._last_error = ""  # clear on success
-            return data["choices"][0]["message"]["content"] or ""
+            text = data["choices"][0]["message"]["content"] or ""
+            # Strip any <think>...</think> blocks that thinking models may emit
+            import re as _re
+            text = _re.sub(r"<think>.*?</think>", "", text, flags=_re.DOTALL).strip()
+            return text
         except httpx.HTTPStatusError as e:
             status = e.response.status_code
             body = e.response.text[:300]
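
A quick check of the safety-net regex above; the raw string is an illustrative stand-in for SmolLM3 output.

    import re

    raw = "<think>\nUser wants one word...\n</think>\n\nParis."
    clean = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
    assert clean == "Paris."

Note the non-greedy .*? with DOTALL: it matches across newlines but stops at the first closing tag, so if a response contains several think blocks each one is removed.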
@@ -1015,7 +1031,8 @@ def create_llm_client(
         provider = PROVIDER_GROQ
     elif os.environ.get("GEMINI_API_KEY"):
         provider = PROVIDER_GEMINI
-    elif os.environ.get("HF_TOKEN") or os.environ.get("hf_soci_token") or os.environ.get("soci_token"):
+    elif (os.environ.get("HF_TOKEN") or os.environ.get("hf_soci_token")
+          or os.environ.get("soci_token") or os.environ.get("HW_WR_TOKEN")):
         provider = PROVIDER_HF
     else:
         provider = PROVIDER_OLLAMA
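
Provider auto-selection after this change, assuming only HW_WR_TOKEN is set (GROQ_API_KEY and GEMINI_API_KEY still win if present, per the earlier branches; import path as in the smoke test above):

    import os
    from soci.engine.llm import create_llm_client

    os.environ["HW_WR_TOKEN"] = "hf_..."  # personal write token
    client = create_llm_client()          # now resolves to PROVIDER_HF
    # default_model falls back to MODEL_HF_SMOL (next hunk) unless HF_MODEL is set;
    # per the commit message that constant is HuggingFaceTB/SmolLM3-3B:hf-inference.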
@@ -1030,7 +1047,7 @@ def create_llm_client(
         default_model = model or os.environ.get("GEMINI_MODEL", MODEL_GEMINI_FLASH)
         return GeminiClient(default_model=default_model)
     elif provider == PROVIDER_HF:
-        default_model = model or os.environ.get("HF_MODEL",
+        default_model = model or os.environ.get("HF_MODEL", MODEL_HF_SMOL)
         return HFInferenceClient(default_model=default_model)
     elif provider == PROVIDER_OLLAMA:
         default_model = model or os.environ.get("OLLAMA_MODEL", MODEL_LLAMA)