File size: 3,355 Bytes
a63c61f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import json
import logging
from typing import AsyncGenerator

from src.core.ports.llm_port import LlmPort
from src.core.config import settings

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# HF Inference Providers router — OpenAI-compatible
# Only models with chat/instruct fine-tuning work on the chat completions endpoint
_ROUTER_URL = "https://router.huggingface.co/v1"

# Chat-compatible instruct models available on HF router (free tier)
# _DEFAULT_MODEL is the model used when settings.HF_MODEL is empty or unset.
_DEFAULT_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
# NOTE(review): _FALLBACK_MODELS is not referenced by any code visible in this
# module — confirm whether another module reads it or whether fallback logic
# is still to be implemented.
_FALLBACK_MODELS = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "Qwen/Qwen2.5-7B-Instruct",
    "microsoft/Phi-3.5-mini-instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]


class HuggingFaceAdapter(LlmPort):
    """
    HuggingFace Inference Providers adapter — free tier with HF token.
    Uses the new router.huggingface.co OpenAI-compatible endpoint.

    Get token: https://huggingface.co/settings/tokens
      → New token → Fine-grained → enable "Make calls to Inference Providers"
    Set HF_TOKEN env var. Optionally set HF_MODEL to choose a specific model.

    Free limits: generous free tier, no credit card needed.

    If no usable token is configured, or the client cannot be constructed,
    ``self.llm`` stays ``None`` and both generation methods answer with a
    "not configured" message instead of raising.
    """

    # Terminator frame emitted at the end of every SSE stream.
    _SSE_DONE = "data: [DONE]\n\n"

    # Lower-cased substrings that identify a rate-limit failure in an error
    # message. A bare "rate" is deliberately NOT used: it also matches
    # unrelated words such as "generate".
    _RATE_LIMIT_MARKERS = (
        "rate limit",
        "rate_limit",
        "ratelimit",
        "too many requests",
    )

    def __init__(self):
        """Read the HF settings and build the chat client when a token is set."""
        self.token = settings.HF_TOKEN
        self.model = (settings.HF_MODEL or _DEFAULT_MODEL).strip()
        self.llm = None

        # Treat the placeholder value from sample configs as "not set".
        if not self.token or self.token == "your-hf-token-here":
            logger.warning("HF_TOKEN not set — HuggingFace adapter disabled.")
            return

        try:
            # Imported lazily so the module loads even without langchain_openai.
            from langchain_openai import ChatOpenAI

            self.llm = ChatOpenAI(
                api_key=self.token,
                base_url=_ROUTER_URL,
                model=f"{self.model}:fastest",  # :fastest = auto-select best provider
                temperature=0.2,
                max_tokens=1024,
            )
            logger.info("✅ HuggingFace adapter ready — model: %s:fastest", self.model)
        except Exception as e:
            # Best-effort: a broken/missing client disables the adapter
            # rather than crashing application start-up.
            logger.error("Failed to initialize HuggingFace adapter: %s", e)

    @classmethod
    def _is_rate_limited(cls, exc: Exception) -> bool:
        """Return True when *exc* looks like an HTTP 429 / rate-limit failure."""
        # HTTP-status exceptions (e.g. from the OpenAI client) carry status_code.
        if getattr(exc, "status_code", None) == 429:
            return True
        text = str(exc)
        if "429" in text:
            return True
        lowered = text.lower()
        return any(marker in lowered for marker in cls._RATE_LIMIT_MARKERS)

    @staticmethod
    def _sse_token(text: str) -> str:
        """Format *text* as one ``data: {"token": ...}`` server-sent-event frame."""
        return f"data: {json.dumps({'token': text})}\n\n"

    def generate(self, prompt: str) -> str:
        """
        Run a blocking completion for *prompt* and return the reply text.

        Never raises for provider failures: returns a human-readable message
        when the adapter is disabled, rate limited, or the call fails.
        """
        if not self.llm:
            return "HuggingFace token not configured."
        try:
            return self.llm.invoke(prompt).content
        except Exception as e:
            if self._is_rate_limited(e):
                return "HuggingFace rate limit reached. Please try again shortly."
            logger.error("HuggingFace generate error: %s", e)
            return f"HuggingFace error: {e}"

    async def generate_stream(self, prompt: str) -> AsyncGenerator[str, None]:
        """
        Stream the completion for *prompt* as server-sent-event frames.

        Yields ``data: {"token": "<text>"}`` frames followed by a final
        ``data: [DONE]`` frame. Failures are reported as a token frame
        carrying the error message; the terminator is always emitted.
        """
        if not self.llm:
            yield self._sse_token("HuggingFace token not configured.")
            yield self._SSE_DONE
            return
        try:
            # Use the async streaming API: the synchronous ``stream()`` would
            # block the event loop for the whole duration of the generation.
            async for chunk in self.llm.astream(prompt):
                content = getattr(chunk, "content", None)
                if content:
                    yield self._sse_token(content)
        except Exception as e:
            if self._is_rate_limited(e):
                msg = "HuggingFace rate limit reached."
            else:
                logger.error("HuggingFace stream error: %s", e)
                msg = f"HuggingFace error: {e}"
            yield self._sse_token(msg)
        yield self._SSE_DONE