File size: 4,871 Bytes
0214972
a64025f
 
330e02a
5d60eec
a64025f
 
 
330e02a
0214972
 
 
cb13dc9
0214972
a64025f
0214972
 
cb13dc9
 
a64025f
 
 
2844ebb
 
 
330e02a
 
 
 
a64025f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330e02a
2844ebb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a64025f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330e02a
a64025f
330e02a
a64025f
 
cb13dc9
a64025f
 
 
 
 
 
 
 
0214972
330e02a
 
2844ebb
 
 
a64025f
2844ebb
a64025f
2844ebb
 
 
 
 
330e02a
 
 
 
 
 
 
 
 
 
 
 
2844ebb
 
 
 
 
 
 
 
 
 
330e02a
a64025f
 
 
 
 
 
 
 
 
 
 
5d60eec
 
a64025f
5d60eec
 
a64025f
70b94cb
 
a64025f
70b94cb
 
a64025f
 
70b94cb
a64025f
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
LLM module. HuggingFace Inference API as primary.
Works natively from HF Spaces — same infrastructure.
OpenRouter and Groq as fallback providers.

WHY HF Inference API?
HF Spaces can always reach HuggingFace's own APIs.
No network routing issues. Uses existing HF_TOKEN.
Same Llama 3.3 70B model as others.
"""

import os
import logging
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential

load_dotenv()
logger = logging.getLogger(__name__)

# ── HuggingFace Inference API ─────────────────────────────
# Module-level client singletons. Each is populated by its _init_* helper
# below and stays None when that provider's credentials or SDK are missing.
_hf_client = None

# ── OpenRouter (free tier, reliable fallback) ──────────────
_openrouter_client = None

# ── Groq fallback (works locally, may be blocked on HF Spaces) ──
_groq_client = None


def _init_hf():
    """Initialize the HuggingFace Inference API client.

    Reads ``HF_TOKEN`` from the environment and, when present, builds the
    module-level ``InferenceClient`` pinned to Llama-3.3-70B-Instruct.

    Returns:
        bool: True when the client was created; False when the token is
        missing or client construction failed (both outcomes are logged).
    """
    global _hf_client
    token = os.getenv("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set — HF Inference API disabled")
        return False
    try:
        # Imported lazily so the module still loads when huggingface_hub
        # is absent and only a fallback provider is configured.
        from huggingface_hub import InferenceClient
        _hf_client = InferenceClient(
            model="meta-llama/Llama-3.3-70B-Instruct",
            token=token
        )
        logger.info("HF Inference API ready (Llama-3.3-70B)")
        return True
    except Exception as e:
        # Lazy %-args: message is only interpolated if the record is emitted.
        logger.error("HF Inference API init failed: %s", e)
        return False


def _init_openrouter():
    """Initialize the OpenRouter fallback client (OpenAI-compatible API).

    Returns:
        bool: True when the client was created; False when
        ``OPENROUTER_API_KEY`` is unset or construction failed.
    """
    global _openrouter_client
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        # Fallback provider: absence is expected, so log quietly (debug)
        # rather than warn like the primary HF provider does.
        logger.debug("OPENROUTER_API_KEY not set — OpenRouter disabled")
        return False
    try:
        from openai import OpenAI  # lazy: optional dependency
        _openrouter_client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        logger.info("OpenRouter ready as fallback")
        return True
    except Exception as e:
        # Lazy %-args instead of eager f-string interpolation.
        logger.error("OpenRouter init failed: %s", e)
        return False


def _init_groq():
    """Initialize the Groq fallback client.

    Returns:
        bool: True when the client was created; False when
        ``GROQ_API_KEY`` is unset or construction failed.
    """
    global _groq_client
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        # Fallback provider: absence is expected, so log quietly.
        logger.debug("GROQ_API_KEY not set — Groq disabled")
        return False
    try:
        from groq import Groq  # lazy: optional dependency
        _groq_client = Groq(api_key=api_key)
        logger.info("Groq ready as fallback")
        return True
    except Exception as e:
        # Lazy %-args instead of eager f-string interpolation.
        logger.error("Groq init failed: %s", e)
        return False


# Eagerly initialize every provider at import time; these _ready flags
# gate which providers _call_with_fallback will attempt.
_hf_ready = _init_hf()
_openrouter_ready = _init_openrouter()
_groq_ready = _init_groq()


def _call_hf(messages: list) -> str:
    """Send one chat-completion request to the HuggingFace Inference API
    and return the assistant reply text."""
    result = _hf_client.chat_completion(
        messages=messages,
        temperature=0.3,
        max_tokens=1500,
    )
    first_choice = result.choices[0]
    return first_choice.message.content


def _call_openrouter(messages: list) -> str:
    """Send one chat-completion request to OpenRouter's free tier
    and return the assistant reply text."""
    completion = _openrouter_client.chat.completions.create(
        model="meta-llama/llama-3.3-70b-instruct:free",
        messages=messages,
        temperature=0.3,
        max_tokens=1500,
    )
    reply = completion.choices[0].message
    return reply.content


def _call_groq(messages: list) -> str:
    """Send one chat-completion request to Groq and return the
    assistant reply text."""
    completion = _groq_client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=messages,
        max_tokens=1500,
        temperature=0.3
    )
    reply = completion.choices[0].message
    return reply.content


def _call_with_fallback(messages: list) -> str:
    """Route a chat request through the providers in priority order.

    Order: HF Inference API → OpenRouter → Groq. A provider is only
    attempted when its init succeeded; any provider error is logged and
    the next provider is tried.

    Args:
        messages: OpenAI-style chat messages (role/content dicts).

    Returns:
        The assistant reply text from the first provider that succeeds.

    Raises:
        RuntimeError: when every available provider fails (or none is
            configured). RuntimeError subclasses Exception, so existing
            ``except Exception`` callers remain unaffected.
    """
    if _hf_ready and _hf_client:
        try:
            return _call_hf(messages)
        except Exception as e:
            # Lazy %-args: only interpolated if the record is emitted.
            logger.warning("HF Inference failed: %s, trying OpenRouter", e)

    if _openrouter_ready and _openrouter_client:
        try:
            return _call_openrouter(messages)
        except Exception as e:
            logger.warning("OpenRouter failed: %s, trying Groq", e)

    if _groq_ready and _groq_client:
        try:
            return _call_groq(messages)
        except Exception as e:
            logger.error("Groq also failed: %s", e)

    # Specific exception type instead of bare Exception (easier to catch
    # distinctly upstream while staying backward-compatible).
    raise RuntimeError("All LLM providers failed")


@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=8))
def call_llm_raw(messages: list) -> str:
    """
    Call LLM with pre-built messages list.
    Used by V2 agent for Pass 1 and Pass 3.

    Retries up to 3 times with exponential backoff (2–8 s) on any
    exception raised by the provider fallback chain.

    Args:
        messages: OpenAI-style chat messages (role/content dicts).

    Returns:
        The assistant reply text.
    """
    return _call_with_fallback(messages)


@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=8))
def call_llm(query: str, context: str) -> str:
    """
    Answer a legal query from retrieved context.
    Used by V1 agent.

    Builds a two-message chat (system persona + user prompt) and routes
    it through the provider fallback chain, retrying on failure.
    """
    system_msg = {
        "role": "system",
        "content": "You are NyayaSetu, an Indian legal research assistant. Answer only from provided excerpts. Cite judgment IDs. End with: NOTE: This is not legal advice."
    }
    user_msg = {
        "role": "user",
        "content": f"QUESTION: {query}\n\nSOURCES:\n{context}\n\nAnswer based on sources. Cite judgment IDs."
    }
    return _call_with_fallback([system_msg, user_msg])