| import logging |
| import os |
| from functools import lru_cache |
| from typing import Any, List, Optional |
|
|
| from huggingface_hub import InferenceClient |
| from langchain_core.language_models.chat_models import BaseChatModel |
| from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage |
| from langchain_core.outputs import ChatGeneration, ChatResult |
| from pydantic import Field |
|
|
# Module-level logger, named after this module via ``__name__``.
logger: logging.Logger = logging.getLogger(__name__)

# Default model id used by the chat model and the helper functions below.
MODEL_ID: str = "allenai/Olmo-3-7B-Instruct"
# Second model id that invoke_chat tries after the requested one.
FALLBACK_MODEL_ID: str = "HuggingFaceH4/zephyr-7b-beta"
|
|
|
|
def get_hf_token() -> str:
    """Return the Hugging Face API token found in the environment.

    The variables ``HF_TOKEN``, ``HUGGINGFACEHUB_API_TOKEN`` and
    ``HUGGING_FACE_HUB_TOKEN`` are checked in that order; the first one that
    holds a non-blank value wins. Surrounding whitespace is stripped so that a
    secret saved with a trailing newline or padding does not leak into the
    credential, and a whitespace-only value is treated as "not set".

    As a side effect, ``HUGGINGFACEHUB_API_TOKEN`` is set to the resolved
    token when that variable is not already present in the environment.

    Returns:
        The token with leading and trailing whitespace removed.

    Raises:
        ValueError: If none of the variables holds a non-blank value.
    """
    token = ""
    for name in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
        token = os.environ.get(name, "").strip()
        if token:
            break

    if not token:
        raise ValueError(
            "HF_TOKEN is not set. Add it as a Space secret (Settings → Repository secrets)."
        )

    # setdefault: never overwrite a value the environment already provides.
    os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", token)
    return token
|
|
|
|
@lru_cache(maxsize=1)
def get_inference_client() -> InferenceClient:
    """Build the ``InferenceClient`` used for chat completions.

    The function is wrapped in ``lru_cache(maxsize=1)``, so the client is
    constructed on the first successful call and the same object is returned
    afterwards. The API key comes from ``get_hf_token``; any exception it
    raises propagates to the caller.
    """
    api_token = get_hf_token()
    client = InferenceClient(api_key=api_token)
    return client
|
|
|
|
class HuggingFaceInferenceChat(BaseChatModel):
    """LangChain chat model using Hugging Face Inference API chat.completions.

    Each call converts the LangChain messages to role/content dicts and sends
    them through the shared client returned by ``get_inference_client``.
    """

    # Model id sent as ``model`` with every request.
    model_id: str = Field(default=MODEL_ID)
    # Value sent as ``max_tokens`` with every request.
    max_tokens: int = 512
    # Value sent as ``temperature`` with every request.
    temperature: float = 0.2

    @property
    def _llm_type(self) -> str:
        """Return the identifier string for this chat model type."""
        return "huggingface-inference-chat"

    def _to_hf_messages(self, messages: List[BaseMessage]) -> list[dict[str, str]]:
        """Convert LangChain messages to ``{"role", "content"}`` dicts.

        System, human and AI messages map to the ``system``, ``user`` and
        ``assistant`` roles. Any other message type is left out of the
        request, and a warning is logged so the omission is visible.
        Content is passed through ``str()`` before being sent.
        """
        role_by_class = (
            (SystemMessage, "system"),
            (HumanMessage, "user"),
            (AIMessage, "assistant"),
        )
        hf_messages: list[dict[str, str]] = []
        for msg in messages:
            role = next(
                (name for cls, name in role_by_class if isinstance(msg, cls)),
                None,
            )
            if role is None:
                logger.warning(
                    "Skipping unsupported message type %s", type(msg).__name__
                )
                continue
            hf_messages.append({"role": role, "content": str(msg.content)})
        return hf_messages

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Any = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Run one chat completion and wrap the reply in a ``ChatResult``.

        Args:
            messages: Conversation to send.
            stop: Optional stop sequences, forwarded to the API as ``stop``.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A ``ChatResult`` holding a single generation whose text is the
            first choice's content (an empty string when that is missing).

        Raises:
            RuntimeError: If the response contains no choices.
        """
        client = get_inference_client()
        response = client.chat.completions.create(
            model=self.model_id,
            messages=self._to_hf_messages(messages),
            max_tokens=self.max_tokens,
            temperature=self.temperature,
            stop=stop,
        )
        if not response.choices:
            raise RuntimeError(f"No choices returned for model {self.model_id}")
        content = response.choices[0].message.content or ""
        return ChatResult(
            generations=[ChatGeneration(message=AIMessage(content=content))]
        )
|
|
|
|
# Most recently built chat model; None until get_llm creates one.
_llm: HuggingFaceInferenceChat | None = None


def get_llm(model_id: str = MODEL_ID) -> HuggingFaceInferenceChat:
    """Return a chat model for ``model_id``, reusing the stored one if it matches.

    Only one instance is kept in the module-level ``_llm``. When it is missing
    or was built for a different model id, ``get_hf_token`` is called first
    (so a missing token raises before anything is stored) and a new instance
    replaces the old one.
    """
    global _llm
    current = _llm
    if current is not None and current.model_id == model_id:
        return current

    get_hf_token()
    _llm = HuggingFaceInferenceChat(model_id=model_id)
    return _llm
|
|
|
|
def invoke_chat(messages: List[BaseMessage], model_id: str = MODEL_ID) -> str:
    """Call primary model, then fallback if the provider rejects the request.

    ``model_id`` is tried first and ``FALLBACK_MODEL_ID`` second; when both
    are the same id it is tried only once. A model is skipped when its call
    raises or when its reply is empty after stripping whitespace. Both cases
    are logged as warnings.

    Args:
        messages: Conversation to send to the model.
        model_id: Model to try first.

    Returns:
        The first non-empty reply, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If no model produced a non-empty reply. The message is
            the text of the last caught exception, which is also attached as
            ``__cause__``; it is "Unknown inference error" when no exception
            was caught.
    """
    global _llm

    # dict.fromkeys keeps order and drops the duplicate when both ids match.
    candidates = tuple(dict.fromkeys((model_id, FALLBACK_MODEL_ID)))
    last_error: Exception | None = None
    for mid in candidates:
        try:
            result = get_llm(mid).invoke(messages)
        except Exception as exc:  # boundary: any failure moves on to the next model
            last_error = exc
            logger.warning("HF chat failed for model %s: %s", mid, exc)
            # Discard the stored instance so the next get_llm call rebuilds it.
            _llm = None
            continue

        raw = result.content
        text = (raw if isinstance(raw, str) else str(raw)).strip()
        if text:
            return text
        logger.warning("HF chat returned an empty reply for model %s", mid)

    if last_error is None:
        raise RuntimeError("Unknown inference error")
    raise RuntimeError(str(last_error)) from last_error
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: send one sample ATS prompt and print the reply.
    logging.basicConfig(level=logging.INFO)
    system_prompt = SystemMessage(content="You are an ATS resume analyst.")
    user_prompt = HumanMessage(
        content=(
            "ATS Scores: Semantic 0.45, Keyword 0.70, Final 0.68. "
            "Missing: docker, tensorflow. Skill overlap: 70%. "
            "Write 3 short sections: Score Explanation, Weak Areas, Actionable Improvements."
        )
    )
    out = invoke_chat([system_prompt, user_prompt])
    print(out)
|
|