from __future__ import annotations import os from typing import Any from dotenv import load_dotenv from huggingface_hub import InferenceClient load_dotenv() class HFLLMClient: def __init__(self) -> None: self.api_key = os.getenv("HF_TOKEN") print("HF token present:", bool(self.api_key)) if not self.api_key: raise ValueError("HF_TOKEN is not set") self.model = os.getenv("HF_MODEL", "Qwen/Qwen2.5-7B-Instruct") self.max_tokens = int(os.getenv("HF_MAX_TOKENS", "128")) self.temperature = float(os.getenv("HF_TEMPERATURE", "0.1")) self.client = InferenceClient( provider="auto", api_key=self.api_key, ) def generate(self, prompt: str) -> str: """ Generate a deterministic short answer for benchmark submission tasks. """ try: output = self.client.chat_completion( model=self.model, messages=[ { "role": "system", "content": ( "You are an exact-match benchmark solver. " "Return only the final answer with no explanation." ), }, { "role": "user", "content": prompt, }, ], max_tokens=self.max_tokens, temperature=self.temperature, ) text = self._extract_text(output) print("LLM response preview:", text[:300]) return text except Exception as e: raise ValueError(f"Inference call failed: {e}") from e @staticmethod def _extract_text(output: Any) -> str: """ Safely extract text from HF chat completion responses. """ if output is None: return "" try: text = output.choices[0].message.content except Exception: return "" if text is None: return "" if isinstance(text, str): return text.strip() if isinstance(text, list): parts = [] for item in text: if isinstance(item, dict): piece = item.get("text") or item.get("content") or "" if piece: parts.append(str(piece)) elif item is not None: parts.append(str(item)) return " ".join(parts).strip() return str(text).strip()