# rag-api-node-1/src/infrastructure/adapters/huggingface_adapter.py
import json
import logging
from typing import AsyncGenerator
from src.core.ports.llm_port import LlmPort
from src.core.config import settings

logger = logging.getLogger(__name__)

# HF Inference Providers router (OpenAI-compatible).
# Only models with chat/instruct fine-tuning work on the chat completions endpoint.
_ROUTER_URL = "https://router.huggingface.co/v1"

# Chat-compatible instruct models available on the HF router (free tier).
_DEFAULT_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
_FALLBACK_MODELS = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "Qwen/Qwen2.5-7B-Instruct",
    "microsoft/Phi-3.5-mini-instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]
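# Note: _FALLBACK_MODELS is not referenced anywhere in this adapter as written;
# it is presumably intended for a retry/model-fallback path (e.g. on rate
# limits), which would need to be wired in separately.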


class HuggingFaceAdapter(LlmPort):
    """
    HuggingFace Inference Providers adapter (free tier with an HF token).

    Uses the router.huggingface.co OpenAI-compatible endpoint.

    Get a token at https://huggingface.co/settings/tokens
    → New token → Fine-grained → enable "Make calls to Inference Providers".

    Set the HF_TOKEN env var. Optionally set HF_MODEL to choose a specific model.
    Free limits: generous free tier, no credit card needed.
    """

    def __init__(self):
        self.token = settings.HF_TOKEN
        self.model = (settings.HF_MODEL or _DEFAULT_MODEL).strip()
        self.llm = None
        if self.token and self.token != "your-hf-token-here":
            try:
                from langchain_openai import ChatOpenAI

                self.llm = ChatOpenAI(
                    api_key=self.token,
                    base_url=_ROUTER_URL,
                    model=f"{self.model}:fastest",  # :fastest = auto-select best provider
                    temperature=0.2,
                    max_tokens=1024,
                )
                logger.info(f"✅ HuggingFace adapter ready - model: {self.model}:fastest")
            except Exception as e:
                logger.error(f"Failed to initialize HuggingFace adapter: {e}")
        else:
            logger.warning("HF_TOKEN not set - HuggingFace adapter disabled.")

    def generate(self, prompt: str) -> str:
        if not self.llm:
            return "HuggingFace token not configured."
        try:
            return self.llm.invoke(prompt).content
        except Exception as e:
            error_msg = str(e)
            if "429" in error_msg or "rate" in error_msg.lower():
                return "HuggingFace rate limit reached. Please try again shortly."
            logger.error(f"HuggingFace generate error: {e}")
            return f"HuggingFace error: {error_msg}"
    async def generate_stream(self, prompt: str) -> AsyncGenerator[str, None]:
        if not self.llm:
            yield f"data: {json.dumps({'token': 'HuggingFace token not configured.'})}\n\n"
            yield "data: [DONE]\n\n"
            return
        try:
            # Use the async streaming API so the event loop is not blocked while
            # waiting on the HF router.
            async for chunk in self.llm.astream(prompt):
                if hasattr(chunk, "content") and chunk.content:
                    yield f"data: {json.dumps({'token': chunk.content})}\n\n"
            yield "data: [DONE]\n\n"
        except Exception as e:
            error_msg = str(e)
            msg = "HuggingFace rate limit reached." if "429" in error_msg else f"HuggingFace error: {error_msg}"
            yield f"data: {json.dumps({'token': msg})}\n\n"
            yield "data: [DONE]\n\n"