"""
Multi-LLM Manager with Groq (ChatGroq) and HuggingFace Fallback Logic

Architecture:
- Primary: Groq API with 3 keys (sequential fallback)
- Fallback: HuggingFace Inference API with 3 tokens (sequential fallback)
- Llama 3.1 8B (llama-3.1-8b-instant) for the chat interface
- Llama 3.3 70B (llama-3.3-70b-versatile) for evaluation

Fallback Logic:
1. Try GROQ_API_KEY_1
2. If fails, try GROQ_API_KEY_2
3. If fails, try GROQ_API_KEY_3
4. If all Groq keys fail, try HF_TOKEN_1
5. If fails, try HF_TOKEN_2
6. If fails, try HF_TOKEN_3
"""

import time
from typing import List, Dict, Optional, Literal
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from huggingface_hub import AsyncInferenceClient
from app.config import settings

# ============================================================================
# GROQ MANAGER WITH FALLBACK
# ============================================================================
class GroqManager:
    """
    Groq API Manager with multiple API key fallback support
    Uses ChatGroq from langchain_groq
    """
    
    def __init__(self):
        """Initialize Groq manager with all available API keys"""
        self.api_keys = settings.get_groq_api_keys()
        self.chat_model_name = settings.GROQ_CHAT_MODEL  # llama-3.1-8b-instant
        self.eval_model_name = settings.GROQ_EVAL_MODEL  # llama-3.3-70b-versatile
        
        # Track current key index
        self.current_key_index = 0
        
        # Rate limiting tracking
        self.requests_this_minute = 0
        self.last_reset = time.time()
        
        if not self.api_keys:
            raise ValueError("No Groq API keys configured. Set GROQ_API_KEY_1 in .env")
        
        print(f"βœ… Groq Manager initialized with {len(self.api_keys)} API key(s)")
        print(f"   Chat Model: {self.chat_model_name}")
        print(f"   Eval Model: {self.eval_model_name}")
    
    def _check_rate_limits(self):
        """
        Check and reset rate limit counters.
        Groq free tier: 30 requests/min
        """
        current_time = time.time()
        
        # Reset counters every minute
        if current_time - self.last_reset > 60:
            self.requests_this_minute = 0
            self.last_reset = current_time
        
        # Check if limits exceeded
        # =================================================================
        # Uncomment below if rate limiting enforcement is needed
        # =================================================================

        # if self.requests_this_minute >= settings.GROQ_REQUESTS_PER_MINUTE:
        #     wait_time = 60 - (current_time - self.last_reset)
        #     print(f"⚠️ Groq rate limit hit. Waiting {wait_time:.1f}s...")
        #     time.sleep(wait_time)
        #     self._check_rate_limits()
    
    def _create_llm(self, api_key: str, model_name: str) -> ChatGroq:
        """Create ChatGroq instance with given API key and model"""
        return ChatGroq(
            api_key=api_key,
            model_name=model_name,
            temperature=settings.LLM_TEMPERATURE,
            max_tokens=settings.LLM_MAX_TOKENS,
            max_retries=0  # Disable automatic retries; fallback is handled manually
        )
    
    async def generate(
        self,
        messages: List[Dict[str, str]],
        system_prompt: Optional[str] = None,
        task: Literal["chat", "evaluation"] = "chat"
    ) -> str:
        """
        Generate response using Groq with fallback logic.
        
        Args:
            messages: List of conversation messages
            system_prompt: Optional system prompt
            task: Task type to determine model (chat uses 8B, evaluation uses 70B)
        
        Returns:
            str: Generated response text
        
        Raises:
            Exception: If all Groq API keys fail
        """
        self._check_rate_limits()
        
        # Select model based on task
        model_name = self.eval_model_name if task == "evaluation" else self.chat_model_name
        
        # Format messages for LangChain
        formatted_messages = []
        
        # Add system message if provided
        if system_prompt:
            formatted_messages.append(SystemMessage(content=system_prompt))
        
        # Convert conversation messages
        for msg in messages:
            if msg['role'] == 'user':
                formatted_messages.append(HumanMessage(content=msg['content']))
            elif msg['role'] == 'assistant':
                formatted_messages.append(AIMessage(content=msg['content']))
        
        # Try each Groq API key sequentially
        for key_index, api_key in enumerate(self.api_keys, 1):
            try:
                print(f"πŸ”‘ Trying Groq API Key {key_index}/{len(self.api_keys)} with {model_name}...")
                
                # Create LLM instance with current key
                llm = self._create_llm(api_key, model_name)
                
                # Generate response
                response = await llm.ainvoke(formatted_messages)
                
                # Track rate limits
                self.requests_this_minute += 1
                
                print(f"βœ… Groq API Key {key_index} succeeded")
                return response.content
                
            except Exception as e:
                print(f"❌ Groq API Key {key_index} failed: {e}")
                
                # If this was the last key, surface the failure with the cause chained
                if key_index == len(self.api_keys):
                    print(f"❌ All {len(self.api_keys)} Groq API keys exhausted")
                    raise Exception(f"All Groq API keys failed. Last error: {e}") from e
                
                # Otherwise, continue to next key
                print("⏭️ Falling back to next Groq API key...")
                continue

# ============================================================================
# HUGGINGFACE MANAGER WITH FALLBACK
# ============================================================================
class HuggingFaceManager:
    """
    HuggingFace Inference API Manager with multiple token fallback support
    Uses AsyncInferenceClient from huggingface_hub so requests don't block the event loop
    """
    
    def __init__(self):
        """Initialize HuggingFace manager with all available tokens"""
        self.tokens = settings.get_hf_tokens()
        self.chat_model_name = settings.HF_CHAT_MODEL
        self.eval_model_name = settings.HF_EVAL_MODEL
        
        if not self.tokens:
            raise ValueError("No HuggingFace tokens configured. Set HF_TOKEN_1 in .env")
        
        print(f"βœ… HuggingFace Manager initialized with {len(self.tokens)} token(s)")
        print(f"   Chat Model: {self.chat_model_name}")
        print(f"   Eval Model: {self.eval_model_name}")
    
    def _create_client(self, token: str, model_name: str) -> AsyncInferenceClient:
        """Create AsyncInferenceClient instance with given token and model"""
        return AsyncInferenceClient(
            model=model_name,
            token=token
        )
    
    async def generate(
        self,
        messages: List[Dict[str, str]],
        system_prompt: Optional[str] = None,
        task: Literal["chat", "evaluation"] = "chat"
    ) -> str:
        """
        Generate response using HuggingFace Inference API with fallback logic.
        
        Args:
            messages: List of conversation messages
            system_prompt: Optional system prompt
            task: Task type to determine model
        
        Returns:
            str: Generated response text
        
        Raises:
            Exception: If all HuggingFace tokens fail
        """
        # Select model based on task
        model_name = self.eval_model_name if task == "evaluation" else self.chat_model_name
        
        # Format messages for HuggingFace chat API
        formatted_messages = []
        
        # Add system message if provided
        if system_prompt:
            formatted_messages.append({
                "role": "system",
                "content": system_prompt
            })
        
        # Convert conversation messages
        for msg in messages:
            formatted_messages.append({
                "role": msg['role'],
                "content": msg['content']
            })
        
        # Try each HuggingFace token sequentially
        for token_index, token in enumerate(self.tokens, 1):
            try:
                print(f"πŸ”‘ Trying HuggingFace Token {token_index}/{len(self.tokens)} with {model_name}...")
                
                # Create async client with current token
                client = self._create_client(token, model_name)
                
                # Generate response using chat completion; awaited so the
                # event loop is not blocked while the request is in flight
                response = await client.chat_completion(
                    messages=formatted_messages,
                    max_tokens=settings.LLM_MAX_TOKENS,
                    temperature=settings.LLM_TEMPERATURE
                )
                
                # Extract content from response
                content = response.choices[0].message.content
                
                print(f"βœ… HuggingFace Token {token_index} succeeded")
                return content
                
            except Exception as e:
                print(f"❌ HuggingFace Token {token_index} failed: {e}")
                
                # If this was the last token, surface the failure with the cause chained
                if token_index == len(self.tokens):
                    print(f"❌ All {len(self.tokens)} HuggingFace tokens exhausted")
                    raise Exception(f"All HuggingFace tokens failed. Last error: {e}") from e
                
                # Otherwise, continue to next token
                print("⏭️ Falling back to next HuggingFace token...")
                continue

# ============================================================================
# UNIFIED LLM MANAGER (Groq Primary, HuggingFace Fallback)
# ============================================================================
class LLMManager:
    """
    Unified LLM Manager with cascading fallback logic:
    1. Try all Groq API keys (primary)
    2. If all fail, try all HuggingFace tokens (fallback)
    
    Models:
    - Chat: Llama 3.1 8B (for user-facing chat responses)
    - Evaluation: Llama 3.3 70B (for response evaluation)
    """
    
    def __init__(self):
        """Initialize all LLM managers"""
        self.groq = None
        self.huggingface = None
        
        # Initialize Groq if configured
        if settings.is_groq_enabled():
            try:
                self.groq = GroqManager()
            except Exception as e:
                print(f"⚠️ Failed to initialize Groq: {e}")
        
        # Initialize HuggingFace if configured
        if settings.is_hf_enabled():
            try:
                self.huggingface = HuggingFaceManager()
            except Exception as e:
                print(f"⚠️ Failed to initialize HuggingFace: {e}")
        
        # Check if at least one is available
        if not self.groq and not self.huggingface:
            raise ValueError("No LLM provider configured. Set either Groq or HuggingFace credentials in .env")
        
        print("βœ… LLM Manager initialized with fallback logic")
    
    async def generate(
        self,
        messages: List[Dict[str, str]],
        system_prompt: Optional[str] = None,
        task: Literal["chat", "evaluation"] = "chat"
    ) -> str:
        """
        Generate response with cascading fallback logic.
        
        Fallback order:
        1. Try all Groq API keys (3 keys)
        2. If all Groq keys fail, try all HuggingFace tokens (3 tokens)
        
        Args:
            messages: Conversation messages
            system_prompt: Optional system prompt
            task: Task type - "chat" (8B) or "evaluation" (70B)
        
        Returns:
            str: Generated response
        
        Raises:
            ValueError: If all providers fail
        """
        # Try Groq first (if available)
        if self.groq:
            try:
                print("πŸš€ Attempting Groq API (Primary)...")
                response = await self.groq.generate(messages, system_prompt, task)
                return response
            except Exception as groq_error:
                print(f"❌ All Groq API keys failed: {groq_error}")
                
                # Fall back to HuggingFace if available
                if self.huggingface:
                    print("πŸ”„ Falling back to HuggingFace Inference API...")
                else:
                    raise ValueError(f"Groq failed and no HuggingFace fallback configured: {groq_error}")
        
        # Try HuggingFace (if Groq failed or not available)
        if self.huggingface:
            try:
                print("πŸš€ Attempting HuggingFace API (Fallback)...")
                response = await self.huggingface.generate(messages, system_prompt, task)
                return response
            except Exception as hf_error:
                raise ValueError(f"All LLM providers exhausted. HuggingFace error: {hf_error}")
        
        raise ValueError("No LLM provider available")
    
    async def generate_chat_response(
        self,
        query: str,
        context: str,
        history: List[Dict[str, str]]
    ) -> str:
        """
        Generate chat response (uses the Llama 3.1 8B chat model).
        
        Args:
            query: User query
            context: Retrieved context (from FAISS)
            history: Conversation history
        
        Returns:
            str: Chat response
        """
        # Import the detailed prompt
        from app.services.chat_service import BANKING_SYSTEM_PROMPT
        
        # Build enhanced system prompt with context
        system_prompt = BANKING_SYSTEM_PROMPT
        if context:
            system_prompt += f"\n\nRelevant Knowledge Base Context:\n{context}"
        else:
            system_prompt += "\n\nNo specific banking documents were retrieved for this query. Provide a helpful general response while acknowledging your banking specialization."
        
        # Build messages
        messages = history + [{'role': 'user', 'content': query}]
        
        # Generate using chat task (Llama 3.1 8B)
        return await self.generate(messages, system_prompt, task="chat")
    
    async def evaluate_response(
        self,
        query: str,
        response: str,
        context: str = ""
    ) -> Dict:
        """
        Evaluate response quality (uses Llama 3.3 70B for better evaluation).
        Used during RL training.
        
        Args:
            query: User query
            response: Generated response
            context: Retrieved context (if any)
        
        Returns:
            dict: Evaluation results
            {'quality': 'Good'/'Bad', 'explanation': '...'}
        """
        eval_prompt = f"""Evaluate this response:

Query: {query}
Response: {response}
Context used: {context if context else 'None'}

Is this response Good or Bad? Start your reply with "Good" or "Bad", then give a brief explanation."""
        
        messages = [{'role': 'user', 'content': eval_prompt}]
        
        # Generate using evaluation task (Llama 3.3 70B)
        result = await self.generate(messages, task="evaluation")
        
        # Parse the verdict; the prompt asks the model to lead with "Good" or
        # "Bad", so check the start of the reply rather than matching "Good"
        # anywhere in the text
        quality = "Good" if result.strip().lower().startswith("good") else "Bad"
        
        return {
            'quality': quality,
            'explanation': result
        }
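
# Illustrative use of evaluate_response() as an RL reward signal (a sketch;
# the reward mapping and surrounding training loop are assumptions, only the
# Good/Bad verdict shape comes from the method above):
#
#   evaluation = await llm_manager.evaluate_response(query, response, context)
#   reward = 1.0 if evaluation['quality'] == 'Good' else -1.0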

# ============================================================================
# GLOBAL LLM MANAGER INSTANCE
# ============================================================================
llm_manager = LLMManager()

# ============================================================================
# USAGE EXAMPLE (for reference)
# ============================================================================
"""
# In your service file:
from app.core.llm_manager import llm_manager

# Generate chat response (uses Llama 3.1 8B with Groq → HF fallback)
response = await llm_manager.generate_chat_response(
    query="What is my account balance?",
    context="Your balance is $1000",
    history=[]
)

# Evaluate response (uses Llama 3.3 70B with Groq → HF fallback)
evaluation = await llm_manager.evaluate_response(
    query="What is my balance?",
    response="Your balance is $1000",
    context="Balance: $1000"
)
"""