import json
import logging
from typing import AsyncGenerator

from src.core.ports.llm_port import LlmPort
from src.core.config import settings

logger = logging.getLogger(__name__)

# HF Inference Providers router — OpenAI-compatible.
# Only models with chat/instruct fine-tuning work on the chat completions endpoint.
_ROUTER_URL = "https://router.huggingface.co/v1"

# Chat-compatible instruct models available on the HF router (free tier).
_DEFAULT_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
_FALLBACK_MODELS = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "Qwen/Qwen2.5-7B-Instruct",
    "microsoft/Phi-3.5-mini-instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]


class HuggingFaceAdapter(LlmPort):
    """
    HuggingFace Inference Providers adapter — free tier with an HF token.

    Uses the router.huggingface.co OpenAI-compatible endpoint.

    Get a token: https://huggingface.co/settings/tokens → New token →
    Fine-grained → enable "Make calls to Inference Providers".

    Set the HF_TOKEN env var. Optionally set HF_MODEL to choose a specific model.

    Free limits: generous free tier, no credit card needed.
    """

    def __init__(self):
        self.token = settings.HF_TOKEN
        self.model = (settings.HF_MODEL or _DEFAULT_MODEL).strip()
        self.llm = None

        if self.token and self.token != "your-hf-token-here":
            try:
                from langchain_openai import ChatOpenAI

                self.llm = ChatOpenAI(
                    api_key=self.token,
                    base_url=_ROUTER_URL,
                    model=f"{self.model}:fastest",  # :fastest = auto-select best provider
                    temperature=0.2,
                    max_tokens=1024,
                )
                logger.info(f"✅ HuggingFace adapter ready — model: {self.model}:fastest")
            except Exception as e:
                logger.error(f"Failed to initialize HuggingFace adapter: {e}")
        else:
            logger.warning("HF_TOKEN not set — HuggingFace adapter disabled.")

    def generate(self, prompt: str) -> str:
        if not self.llm:
            return "HuggingFace token not configured."
        try:
            return self.llm.invoke(prompt).content
        except Exception as e:
            error_msg = str(e)
            if "429" in error_msg or "rate" in error_msg.lower():
                return "HuggingFace rate limit reached. Please try again shortly."
            logger.error(f"HuggingFace generate error: {e}")
            return f"HuggingFace error: {error_msg}"

    async def generate_stream(self, prompt: str) -> AsyncGenerator[str, None]:
        if not self.llm:
            yield f"data: {json.dumps({'token': 'HuggingFace token not configured.'})}\n\n"
            yield "data: [DONE]\n\n"
            return
        try:
            # Use the async streaming API so the event loop is not blocked while
            # waiting on the router.
            async for chunk in self.llm.astream(prompt):
                if hasattr(chunk, "content") and chunk.content:
                    yield f"data: {json.dumps({'token': chunk.content})}\n\n"
            yield "data: [DONE]\n\n"
        except Exception as e:
            error_msg = str(e)
            msg = (
                "HuggingFace rate limit reached."
                if "429" in error_msg
                else f"HuggingFace error: {error_msg}"
            )
            yield f"data: {json.dumps({'token': msg})}\n\n"
            yield "data: [DONE]\n\n"
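

# A minimal manual smoke test, assuming HF_TOKEN (and optionally HF_MODEL) are
# already exported in the environment and picked up by src.core.config.settings.
# This guard is an illustrative sketch, not part of the adapter's public surface.
if __name__ == "__main__":
    import asyncio

    adapter = HuggingFaceAdapter()

    # Blocking call: returns the full completion (or a friendly error string).
    print(adapter.generate("Say hello in one short sentence."))

    async def _demo_stream() -> None:
        # Streaming call: yields Server-Sent-Events-style "data: ..." lines.
        async for event in adapter.generate_stream("Count from 1 to 5."):
            print(event, end="")

    asyncio.run(_demo_stream())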