Spaces:
Running
Running
File size: 3,355 Bytes
import json
import logging
from typing import AsyncGenerator
from src.core.ports.llm_port import LlmPort
from src.core.config import settings
logger = logging.getLogger(__name__)
# HF Inference Providers router — OpenAI-compatible API base URL.
# Only models with chat/instruct fine-tuning work on the chat completions endpoint.
_ROUTER_URL = "https://router.huggingface.co/v1"
# Chat-compatible instruct models available on HF router (free tier).
# _DEFAULT_MODEL is the model used when settings.HF_MODEL is unset or empty.
_DEFAULT_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
# Candidate instruct models, default model first.
# NOTE(review): this list is not referenced by the adapter code visible in this
# file — confirm whether a fallback loop is planned or the list can be removed.
_FALLBACK_MODELS = [
"meta-llama/Llama-3.1-8B-Instruct",
"mistralai/Mistral-Nemo-Instruct-2407",
"Qwen/Qwen2.5-7B-Instruct",
"microsoft/Phi-3.5-mini-instruct",
"HuggingFaceH4/zephyr-7b-beta",
]
class HuggingFaceAdapter(LlmPort):
    """
    HuggingFace Inference Providers adapter — free tier with HF token.

    Uses the router.huggingface.co OpenAI-compatible endpoint through
    ``langchain_openai.ChatOpenAI``.

    Get token: https://huggingface.co/settings/tokens
      → New token → Fine-grained → enable "Make calls to Inference Providers"
    Set HF_TOKEN env var. Optionally set HF_MODEL to choose a specific model.
    Free limits: generous free tier, no credit card needed.

    Attributes:
        token: Value of ``settings.HF_TOKEN`` (may be empty/None).
        model: Model id taken from ``settings.HF_MODEL``, falling back to
            ``_DEFAULT_MODEL`` when that setting is empty or whitespace-only.
        llm: The ``ChatOpenAI`` client, or ``None`` when the token is missing,
            is the placeholder value, or client construction failed.
    """

    def __init__(self):
        """Read settings and build the chat client when a real token is present.

        Never raises: a construction failure is logged and leaves ``llm`` as
        ``None``, which the generate methods report to the caller as text.
        """
        self.token = settings.HF_TOKEN
        # Strip *before* falling back so a whitespace-only HF_MODEL does not
        # produce an empty model name (which would be sent as ":fastest").
        self.model = (settings.HF_MODEL or "").strip() or _DEFAULT_MODEL
        self.llm = None
        if self.token and self.token != "your-hf-token-here":
            try:
                # Imported lazily so the module still loads when the optional
                # langchain_openai dependency is not installed.
                from langchain_openai import ChatOpenAI

                self.llm = ChatOpenAI(
                    api_key=self.token,
                    base_url=_ROUTER_URL,
                    model=f"{self.model}:fastest",  # :fastest = auto-select best provider
                    temperature=0.2,
                    max_tokens=1024,
                )
                logger.info("✅ HuggingFace adapter ready — model: %s:fastest", self.model)
            except Exception as e:
                logger.error("Failed to initialize HuggingFace adapter: %s", e)
        else:
            logger.warning("HF_TOKEN not set — HuggingFace adapter disabled.")

    @staticmethod
    def _is_rate_limited(error_msg: str) -> bool:
        """Return True when *error_msg* looks like an HTTP 429 / rate-limit error.

        Matches on explicit rate-limit phrases rather than the bare substring
        "rate", which also occurs in unrelated words such as "generate" or
        "temperature" and would misreport ordinary failures.
        """
        lowered = error_msg.lower()
        if "429" in lowered:
            return True
        markers = ("rate limit", "rate_limit", "ratelimit", "too many requests")
        return any(marker in lowered for marker in markers)

    def generate(self, prompt: str) -> str:
        """Return the model's full reply to *prompt*.

        Errors are not raised; they are returned as a human-readable string
        (a fixed message for a missing client or a rate limit, otherwise
        ``"HuggingFace error: <details>"``).
        """
        if not self.llm:
            return "HuggingFace token not configured."
        try:
            return self.llm.invoke(prompt).content
        except Exception as e:
            error_msg = str(e)
            if self._is_rate_limited(error_msg):
                return "HuggingFace rate limit reached. Please try again shortly."
            logger.error("HuggingFace generate error: %s", e)
            return f"HuggingFace error: {error_msg}"

    async def generate_stream(self, prompt: str) -> AsyncGenerator[str, None]:
        """Yield the reply to *prompt* as server-sent-event ``data:`` frames.

        Each non-empty chunk is emitted as ``data: {"token": ...}`` and the
        stream always ends with ``data: [DONE]``. Errors are delivered as a
        single token frame followed by ``[DONE]`` instead of being raised.
        """
        if not self.llm:
            yield f"data: {json.dumps({'token': 'HuggingFace token not configured.'})}\n\n"
            yield "data: [DONE]\n\n"
            return
        try:
            # Use the async streaming API: the synchronous .stream() iterator
            # would block the event loop for the duration of the generation.
            async for chunk in self.llm.astream(prompt):
                content = getattr(chunk, "content", None)
                if content:
                    yield f"data: {json.dumps({'token': content})}\n\n"
            yield "data: [DONE]\n\n"
        except Exception as e:
            error_msg = str(e)
            if self._is_rate_limited(error_msg):
                msg = "HuggingFace rate limit reached."
            else:
                logger.error("HuggingFace stream error: %s", e)
                msg = f"HuggingFace error: {error_msg}"
            yield f"data: {json.dumps({'token': msg})}\n\n"
            yield "data: [DONE]\n\n"
|