"""LangChain chat model backed by the Hugging Face Inference API, with a fallback model."""

import logging
import os
from functools import lru_cache
from typing import Any, List, Optional

from huggingface_hub import InferenceClient
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
from langchain_core.outputs import ChatGeneration, ChatResult
from pydantic import Field

logger = logging.getLogger(__name__)

# Model tried first by default, and the one tried when the primary fails.
MODEL_ID = "allenai/Olmo-3-7B-Instruct"
FALLBACK_MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"


def get_hf_token() -> str:
    """Return the Hugging Face API token found in the environment.

    The variables ``HF_TOKEN``, ``HUGGINGFACEHUB_API_TOKEN`` and
    ``HUGGING_FACE_HUB_TOKEN`` are checked in that order; the first non-empty
    value wins. As a side effect, ``HUGGINGFACEHUB_API_TOKEN`` is populated
    with the token when it is not already set.

    Raises:
        ValueError: If none of the variables holds a non-empty value.
    """
    token = (
        os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    )
    if not token:
        raise ValueError(
            "HF_TOKEN is not set. Add it as a Space secret (Settings → Repository secrets)."
        )
    # LangChain / huggingface_hub also read this name
    os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", token)
    return token


@lru_cache(maxsize=1)
def get_inference_client() -> InferenceClient:
    """Return a process-wide ``InferenceClient`` built from the environment token.

    The client is created on first use and cached, so the token is read only
    once; a failed ``get_hf_token()`` call is not cached and will be retried.
    """
    return InferenceClient(api_key=get_hf_token())


class HuggingFaceInferenceChat(BaseChatModel):
    """LangChain chat model using Hugging Face Inference API chat.completions."""

    # Hub model identifier sent with every request.
    model_id: str = Field(default=MODEL_ID)
    # Generation settings forwarded unchanged to the completion call.
    max_tokens: int = 512
    temperature: float = 0.2

    @property
    def _llm_type(self) -> str:
        """Return the identifier LangChain uses for this model type."""
        return "huggingface-inference-chat"

    def _to_hf_messages(self, messages: List[BaseMessage]) -> list[dict[str, str]]:
        """Convert LangChain messages into ``{"role", "content"}`` dicts.

        System, human and AI messages map to the ``system``, ``user`` and
        ``assistant`` roles. Any other message type is skipped, with a warning
        so the omission is visible in the logs. Content is coerced with
        ``str()``.
        """
        hf_messages: list[dict[str, str]] = []
        for msg in messages:
            if isinstance(msg, SystemMessage):
                role = "system"
            elif isinstance(msg, HumanMessage):
                role = "user"
            elif isinstance(msg, AIMessage):
                role = "assistant"
            else:
                logger.warning(
                    "Skipping unsupported message type %s", type(msg).__name__
                )
                continue
            hf_messages.append({"role": role, "content": str(msg.content)})
        return hf_messages

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Any = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Run one chat completion and wrap the first choice in a ``ChatResult``.

        Args:
            messages: Conversation to send to the model.
            stop: Optional stop sequences, forwarded to the completion call.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Raises:
            RuntimeError: If the response contains no choices.
        """
        client = get_inference_client()
        response = client.chat.completions.create(
            model=self.model_id,
            messages=self._to_hf_messages(messages),
            max_tokens=self.max_tokens,
            temperature=self.temperature,
            # Forward caller-supplied stop sequences instead of dropping them.
            stop=stop,
        )
        if not response.choices:
            raise RuntimeError(f"No choices returned for model {self.model_id}")
        # A missing/None content is normalised to an empty string.
        content = response.choices[0].message.content or ""
        return ChatResult(
            generations=[ChatGeneration(message=AIMessage(content=content))]
        )


# Most recently built chat model; rebuilt when a different model id is requested.
_llm: HuggingFaceInferenceChat | None = None


def get_llm(model_id: str = MODEL_ID) -> HuggingFaceInferenceChat:
    """Return the cached chat model for ``model_id``, creating it if needed.

    A new instance replaces the cached one when none exists or when the cached
    instance targets a different model.

    Raises:
        ValueError: If no Hugging Face token is configured (checked before the
            model object is created).
    """
    global _llm
    if _llm is None or _llm.model_id != model_id:
        get_hf_token()  # fail early when the token is missing
        _llm = HuggingFaceInferenceChat(model_id=model_id)
    return _llm


def invoke_chat(messages: List[BaseMessage], model_id: str = MODEL_ID) -> str:
    """Call primary model, then fallback if the provider rejects the request.

    The fallback is also tried when the primary model returns only whitespace.
    If ``model_id`` equals ``FALLBACK_MODEL_ID`` the model is tried once.

    Returns:
        The stripped text of the first non-empty response.

    Raises:
        RuntimeError: If every candidate fails or returns empty text. When a
            candidate raised, the message is that error's text and the error
            is attached as ``__cause__``.
    """
    global _llm
    # dict.fromkeys de-duplicates while preserving order, so the same model is
    # never called twice in a row.
    candidates = tuple(dict.fromkeys((model_id, FALLBACK_MODEL_ID)))
    last_error: Exception | None = None

    for mid in candidates:
        try:
            llm = get_llm(mid)
            result = llm.invoke(messages)
        except Exception as exc:
            last_error = exc
            logger.warning("HF chat failed for model %s: %s", mid, exc)
            # Drop the cached model so the next call starts from a fresh one.
            _llm = None
            continue

        raw = result.content
        text = (raw if isinstance(raw, str) else str(raw)).strip()
        if text:
            return text
        logger.warning("HF chat returned empty content for model %s", mid)

    if last_error is not None:
        raise RuntimeError(str(last_error)) from last_error
    raise RuntimeError("Unknown inference error")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    out = invoke_chat(
        [
            SystemMessage(content="You are an ATS resume analyst."),
            HumanMessage(
                content=(
                    "ATS Scores: Semantic 0.45, Keyword 0.70, Final 0.68. "
                    "Missing: docker, tensorflow. Skill overlap: 70%. "
                    "Write 3 short sections: Score Explanation, Weak Areas, Actionable Improvements."
                )
            ),
        ]
    )
    print(out)