import json
import logging
from typing import AsyncGenerator

from src.core.ports.llm_port import LlmPort
from src.core.config import settings

logger = logging.getLogger(__name__)

# HF Inference Providers router (OpenAI-compatible).
# Only models with chat/instruct fine-tuning work on the chat completions endpoint.
_ROUTER_URL = "https://router.huggingface.co/v1"

# Chat-compatible instruct models available on the HF router (free tier)
_DEFAULT_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
_FALLBACK_MODELS = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "Qwen/Qwen2.5-7B-Instruct",
    "microsoft/Phi-3.5-mini-instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]
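
# Illustrative sketch (not used by the adapter): the router speaks the plain
# OpenAI chat-completions protocol, so a bare `openai` client pointed at
# _ROUTER_URL behaves the same way the ChatOpenAI wrapper below does. The
# helper name and max_tokens value are arbitrary examples, not project
# conventions; the `openai` package is available transitively via langchain-openai.
def _router_chat_completion_example(token: str, prompt: str) -> str:
    from openai import OpenAI

    client = OpenAI(base_url=_ROUTER_URL, api_key=token)
    response = client.chat.completions.create(
        model=_DEFAULT_MODEL,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=256,
    )
    return response.choices[0].message.content or ""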
class HuggingFaceAdapter(LlmPort):
    """
    HuggingFace Inference Providers adapter (free tier with an HF token).
    Uses the new router.huggingface.co OpenAI-compatible endpoint.

    Get a token: https://huggingface.co/settings/tokens
      → New token → Fine-grained → enable "Make calls to Inference Providers"

    Set the HF_TOKEN env var. Optionally set HF_MODEL to choose a specific model.
    Free limits: generous free tier, no credit card needed.
    """

    def __init__(self):
        self.token = settings.HF_TOKEN
        self.model = (settings.HF_MODEL or _DEFAULT_MODEL).strip()
        self.llm = None

        if self.token and self.token != "your-hf-token-here":
            try:
                from langchain_openai import ChatOpenAI

                self.llm = ChatOpenAI(
                    api_key=self.token,
                    base_url=_ROUTER_URL,
                    model=f"{self.model}:fastest",  # :fastest = auto-select best provider
                    temperature=0.2,
                    max_tokens=1024,
                )
                logger.info(f"HuggingFace adapter ready - model: {self.model}:fastest")
            except Exception as e:
                logger.error(f"Failed to initialize HuggingFace adapter: {e}")
        else:
            logger.warning("HF_TOKEN not set - HuggingFace adapter disabled.")
    def generate(self, prompt: str) -> str:
        if not self.llm:
            return "HuggingFace token not configured."
        try:
            return self.llm.invoke(prompt).content
        except Exception as e:
            error_msg = str(e)
            if "429" in error_msg or "rate" in error_msg.lower():
                return "HuggingFace rate limit reached. Please try again shortly."
            logger.error(f"HuggingFace generate error: {e}")
            return f"HuggingFace error: {error_msg}"
    async def generate_stream(self, prompt: str) -> AsyncGenerator[str, None]:
        if not self.llm:
            yield f"data: {json.dumps({'token': 'HuggingFace token not configured.'})}\n\n"
            yield "data: [DONE]\n\n"
            return
        try:
            # Use the async streaming API so the event loop is not blocked
            # while tokens arrive from the router.
            async for chunk in self.llm.astream(prompt):
                if hasattr(chunk, "content") and chunk.content:
                    yield f"data: {json.dumps({'token': chunk.content})}\n\n"
            yield "data: [DONE]\n\n"
        except Exception as e:
            error_msg = str(e)
            msg = "HuggingFace rate limit reached." if "429" in error_msg else f"HuggingFace error: {error_msg}"
            yield f"data: {json.dumps({'token': msg})}\n\n"
            yield "data: [DONE]\n\n"