"""Build InferenceClient with a provider that accepts the user's HF token.""" from __future__ import annotations import os from huggingface_hub import InferenceClient def inference_client_kwargs(token: str) -> dict: """ Default: **no** ``provider`` → the library uses ``auto``: first provider for this model per your https://hf.co/settings/inference-providers order. Forcing ``hf-inference`` breaks many chat models (e.g. Qwen2.5-7B-Instruct is only on together / featherless-ai — the router then returns **404** for …/hf-inference/models/…). Set ``HF_INFERENCE_PROVIDER`` to pin one provider (e.g. ``together``, ``sambanova``) or ``auto`` explicitly. Use ``hf-inference`` only for models that actually list it. """ raw = os.environ.get("HF_INFERENCE_PROVIDER") if raw is None: return {"token": token} r = raw.strip().lower() if r in ("", "auto"): return {"token": token} return {"token": token, "provider": r} def make_inference_client(token: str) -> InferenceClient: return InferenceClient(**inference_client_kwargs(token))