""" Multi-LLM Manager with Groq (ChatGroq) and HuggingFace Fallback Logic Architecture: - Primary: Groq API with 3 keys (sequential fallback) - Fallback: HuggingFace Inference API with 3 tokens (sequential fallback) - Llama 3 8B for chat interface - Llama 3 70B for evaluation Fallback Logic: 1. Try GROQ_API_KEY_1 2. If fails, try GROQ_API_KEY_2 3. If fails, try GROQ_API_KEY_3 4. If all Groq keys fail, try HF_TOKEN_1 5. If fails, try HF_TOKEN_2 6. If fails, try HF_TOKEN_3 """ import time from typing import List, Dict, Optional, Literal from langchain_groq import ChatGroq from langchain_core.messages import HumanMessage, SystemMessage, AIMessage from huggingface_hub import InferenceClient from app.config import settings # ============================================================================ # GROQ MANAGER WITH FALLBACK # ============================================================================ class GroqManager: """ Groq API Manager with multiple API key fallback support Uses ChatGroq from langchain_groq """ def __init__(self): """Initialize Groq manager with all available API keys""" self.api_keys = settings.get_groq_api_keys() self.chat_model_name = settings.GROQ_CHAT_MODEL # llama-3.1-8b-instant self.eval_model_name = settings.GROQ_EVAL_MODEL # llama-3.3-70b-versatile # Track current key index self.current_key_index = 0 # Rate limiting tracking self.requests_this_minute = 0 self.last_reset = time.time() if not self.api_keys: raise ValueError("No Groq API keys configured. Set GROQ_API_KEY_1 in .env") print(f"✅ Groq Manager initialized with {len(self.api_keys)} API key(s)") print(f" Chat Model: {self.chat_model_name}") print(f" Eval Model: {self.eval_model_name}") def _check_rate_limits(self): """ Check and reset rate limit counters. Groq Free: 30 requests/min """ current_time = time.time() # Reset counters every minute if current_time - self.last_reset > 60: self.requests_this_minute = 0 self.last_reset = current_time # Check if limits exceeded # ================================================================= # Uncomment below if rate limiting enforcement is needed # ================================================================= # if self.requests_this_minute >= settings.GROQ_REQUESTS_PER_MINUTE: # wait_time = 60 - (current_time - self.last_reset) # print(f"⚠️ Groq rate limit hit. Waiting {wait_time:.1f}s...") # time.sleep(wait_time) # self._check_rate_limits() def _create_llm(self, api_key: str, model_name: str) -> ChatGroq: """Create ChatGroq instance with given API key and model""" return ChatGroq( api_key=api_key, model_name=model_name, temperature=settings.LLM_TEMPERATURE, max_tokens=settings.LLM_MAX_TOKENS, max_retries=0 # Disable automatic retries, we handle fallback manually ) async def generate( self, messages: List[Dict[str, str]], system_prompt: Optional[str] = None, task: Literal["chat", "evaluation"] = "chat" ) -> str: """ Generate response using Groq with fallback logic. 
# ============================================================================
# GROQ MANAGER WITH FALLBACK
# ============================================================================

class GroqManager:
    """
    Groq API Manager with multiple API key fallback support.
    Uses ChatGroq from langchain_groq.
    """

    def __init__(self):
        """Initialize Groq manager with all available API keys"""
        self.api_keys = settings.get_groq_api_keys()
        self.chat_model_name = settings.GROQ_CHAT_MODEL  # llama-3.1-8b-instant
        self.eval_model_name = settings.GROQ_EVAL_MODEL  # llama-3.3-70b-versatile

        # Track current key index
        self.current_key_index = 0

        # Rate limiting tracking
        self.requests_this_minute = 0
        self.last_reset = time.time()

        if not self.api_keys:
            raise ValueError("No Groq API keys configured. Set GROQ_API_KEY_1 in .env")

        print(f"✅ Groq Manager initialized with {len(self.api_keys)} API key(s)")
        print(f"   Chat Model: {self.chat_model_name}")
        print(f"   Eval Model: {self.eval_model_name}")

    def _check_rate_limits(self):
        """Check and reset rate limit counters. Groq Free tier: 30 requests/min."""
        current_time = time.time()

        # Reset counters every minute
        if current_time - self.last_reset > 60:
            self.requests_this_minute = 0
            self.last_reset = current_time

        # Check if limits exceeded
        # =====================================================================
        # Uncomment below if rate limiting enforcement is needed
        # =====================================================================
        # if self.requests_this_minute >= settings.GROQ_REQUESTS_PER_MINUTE:
        #     wait_time = 60 - (current_time - self.last_reset)
        #     print(f"⚠️ Groq rate limit hit. Waiting {wait_time:.1f}s...")
        #     time.sleep(wait_time)
        #     self._check_rate_limits()

    def _create_llm(self, api_key: str, model_name: str) -> ChatGroq:
        """Create a ChatGroq instance with the given API key and model"""
        return ChatGroq(
            api_key=api_key,
            model_name=model_name,
            temperature=settings.LLM_TEMPERATURE,
            max_tokens=settings.LLM_MAX_TOKENS,
            max_retries=0  # Disable automatic retries; we handle fallback manually
        )

    async def generate(
        self,
        messages: List[Dict[str, str]],
        system_prompt: Optional[str] = None,
        task: Literal["chat", "evaluation"] = "chat"
    ) -> str:
        """
        Generate a response using Groq with fallback logic.

        Args:
            messages: List of conversation messages
            system_prompt: Optional system prompt
            task: Task type to determine model (chat uses 8B, evaluation uses 70B)

        Returns:
            str: Generated response text

        Raises:
            Exception: If all Groq API keys fail
        """
        self._check_rate_limits()

        # Select model based on task
        model_name = self.eval_model_name if task == "evaluation" else self.chat_model_name

        # Format messages for LangChain
        formatted_messages = []

        # Add system message if provided
        if system_prompt:
            formatted_messages.append(SystemMessage(content=system_prompt))

        # Convert conversation messages
        for msg in messages:
            if msg['role'] == 'user':
                formatted_messages.append(HumanMessage(content=msg['content']))
            elif msg['role'] == 'assistant':
                formatted_messages.append(AIMessage(content=msg['content']))

        # Try each Groq API key sequentially
        for key_index, api_key in enumerate(self.api_keys, 1):
            try:
                print(f"🔑 Trying Groq API Key {key_index}/{len(self.api_keys)} with {model_name}...")

                # Create LLM instance with current key
                llm = self._create_llm(api_key, model_name)

                # Generate response
                response = await llm.ainvoke(formatted_messages)

                # Track rate limits
                self.requests_this_minute += 1

                print(f"✅ Groq API Key {key_index} succeeded")
                return response.content

            except Exception as e:
                print(f"❌ Groq API Key {key_index} failed: {e}")

                # If this was the last key, raise exception
                if key_index == len(self.api_keys):
                    print(f"❌ All {len(self.api_keys)} Groq API keys exhausted")
                    raise Exception(f"All Groq API keys failed. Last error: {e}")

                # Otherwise, continue to next key
                print("⏭️ Falling back to next Groq API key...")
                continue
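
# Illustrative direct use of GroqManager (sketch; normally requests go through
# the unified LLMManager below). Assumes valid GROQ_API_KEY_* values in .env:
#
#   import asyncio
#
#   async def demo():
#       groq = GroqManager()
#       answer = await groq.generate(
#           messages=[{"role": "user", "content": "Explain APR in one line."}],
#           system_prompt="You are a concise banking assistant.",
#           task="chat",  # "chat" -> 8B model, "evaluation" -> 70B model
#       )
#       print(answer)
#
#   asyncio.run(demo())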
# ============================================================================
# HUGGINGFACE MANAGER WITH FALLBACK
# ============================================================================

class HuggingFaceManager:
    """
    HuggingFace Inference API Manager with multiple token fallback support.
    Uses InferenceClient from huggingface_hub.
    """

    def __init__(self):
        """Initialize HuggingFace manager with all available tokens"""
        self.tokens = settings.get_hf_tokens()
        self.chat_model_name = settings.HF_CHAT_MODEL
        self.eval_model_name = settings.HF_EVAL_MODEL

        if not self.tokens:
            raise ValueError("No HuggingFace tokens configured. Set HF_TOKEN_1 in .env")

        print(f"✅ HuggingFace Manager initialized with {len(self.tokens)} token(s)")
        print(f"   Chat Model: {self.chat_model_name}")
        print(f"   Eval Model: {self.eval_model_name}")

    def _create_client(self, token: str, model_name: str) -> InferenceClient:
        """Create an InferenceClient instance with the given token and model"""
        return InferenceClient(
            model=model_name,
            token=token
        )

    async def generate(
        self,
        messages: List[Dict[str, str]],
        system_prompt: Optional[str] = None,
        task: Literal["chat", "evaluation"] = "chat"
    ) -> str:
        """
        Generate a response using the HuggingFace Inference API with fallback logic.

        Args:
            messages: List of conversation messages
            system_prompt: Optional system prompt
            task: Task type to determine model

        Returns:
            str: Generated response text

        Raises:
            Exception: If all HuggingFace tokens fail
        """
        # Select model based on task
        model_name = self.eval_model_name if task == "evaluation" else self.chat_model_name

        # Format messages for the HuggingFace chat API
        formatted_messages = []

        # Add system message if provided
        if system_prompt:
            formatted_messages.append({
                "role": "system",
                "content": system_prompt
            })

        # Convert conversation messages
        for msg in messages:
            formatted_messages.append({
                "role": msg['role'],
                "content": msg['content']
            })

        # Try each HuggingFace token sequentially
        for token_index, token in enumerate(self.tokens, 1):
            try:
                print(f"🔑 Trying HuggingFace Token {token_index}/{len(self.tokens)} with {model_name}...")

                # Create client with current token
                client = self._create_client(token, model_name)

                # Generate response using chat completion.
                # Note: this call is synchronous (see the note after this class).
                response = client.chat_completion(
                    messages=formatted_messages,
                    max_tokens=settings.LLM_MAX_TOKENS,
                    temperature=settings.LLM_TEMPERATURE
                )

                # Extract content from response
                content = response.choices[0].message.content

                print(f"✅ HuggingFace Token {token_index} succeeded")
                return content

            except Exception as e:
                print(f"❌ HuggingFace Token {token_index} failed: {e}")

                # If this was the last token, raise exception
                if token_index == len(self.tokens):
                    print(f"❌ All {len(self.tokens)} HuggingFace tokens exhausted")
                    raise Exception(f"All HuggingFace tokens failed. Last error: {e}")

                # Otherwise, continue to next token
                print("⏭️ Falling back to next HuggingFace token...")
                continue
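
# Note on blocking I/O (sketch of an alternative, not the current behavior):
# InferenceClient.chat_completion is synchronous, so HuggingFaceManager.generate
# blocks the event loop while a request is in flight even though it is declared
# async. huggingface_hub also provides AsyncInferenceClient with the same
# chat_completion interface; a non-blocking variant could look like:
#
#   from huggingface_hub import AsyncInferenceClient
#
#   def _create_client(self, token, model_name):
#       return AsyncInferenceClient(model=model_name, token=token)
#
#   # ...and inside generate():
#   response = await client.chat_completion(
#       messages=formatted_messages,
#       max_tokens=settings.LLM_MAX_TOKENS,
#       temperature=settings.LLM_TEMPERATURE,
#   )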
# ============================================================================
# UNIFIED LLM MANAGER (Groq Primary, HuggingFace Fallback)
# ============================================================================

class LLMManager:
    """
    Unified LLM Manager with cascading fallback logic:
    1. Try all Groq API keys (primary)
    2. If all fail, try all HuggingFace tokens (fallback)

    Models:
    - Chat: Llama 3 8B (for user-facing chat responses)
    - Evaluation: Llama 3 70B (for response evaluation)
    """

    def __init__(self):
        """Initialize all LLM managers"""
        self.groq = None
        self.huggingface = None

        # Initialize Groq if configured
        if settings.is_groq_enabled():
            try:
                self.groq = GroqManager()
            except Exception as e:
                print(f"⚠️ Failed to initialize Groq: {e}")

        # Initialize HuggingFace if configured
        if settings.is_hf_enabled():
            try:
                self.huggingface = HuggingFaceManager()
            except Exception as e:
                print(f"⚠️ Failed to initialize HuggingFace: {e}")

        # Check that at least one provider is available
        if not self.groq and not self.huggingface:
            raise ValueError("No LLM provider configured. Set either Groq or HuggingFace credentials in .env")

        print("✅ LLM Manager initialized with fallback logic")

    async def generate(
        self,
        messages: List[Dict[str, str]],
        system_prompt: Optional[str] = None,
        task: Literal["chat", "evaluation"] = "chat"
    ) -> str:
        """
        Generate a response with cascading fallback logic.

        Fallback order:
        1. Try all Groq API keys (3 keys)
        2. If all Groq keys fail, try all HuggingFace tokens (3 tokens)

        Args:
            messages: Conversation messages
            system_prompt: Optional system prompt
            task: Task type - "chat" (8B) or "evaluation" (70B)

        Returns:
            str: Generated response

        Raises:
            ValueError: If all providers fail
        """
        # Try Groq first (if available)
        if self.groq:
            try:
                print("🚀 Attempting Groq API (Primary)...")
                response = await self.groq.generate(messages, system_prompt, task)
                return response
            except Exception as groq_error:
                print(f"❌ All Groq API keys failed: {groq_error}")

                # Fall back to HuggingFace if available
                if self.huggingface:
                    print("🔄 Falling back to HuggingFace Inference API...")
                else:
                    raise ValueError(f"Groq failed and no HuggingFace fallback configured: {groq_error}")

        # Try HuggingFace (if Groq failed or is not available)
        if self.huggingface:
            try:
                print("🚀 Attempting HuggingFace API (Fallback)...")
                response = await self.huggingface.generate(messages, system_prompt, task)
                return response
            except Exception as hf_error:
                raise ValueError(f"All LLM providers exhausted. HuggingFace error: {hf_error}")

        raise ValueError("No LLM provider available")

    async def generate_chat_response(
        self,
        query: str,
        context: str,
        history: List[Dict[str, str]]
    ) -> str:
        """
        Generate a chat response (uses Llama 3 8B).

        Args:
            query: User query
            context: Retrieved context (from FAISS)
            history: Conversation history

        Returns:
            str: Chat response
        """
        # Import the detailed prompt lazily (avoids a circular import at module load)
        from app.services.chat_service import BANKING_SYSTEM_PROMPT

        # Build enhanced system prompt with context
        system_prompt = BANKING_SYSTEM_PROMPT
        if context:
            system_prompt += f"\n\nRelevant Knowledge Base Context:\n{context}"
        else:
            system_prompt += "\n\nNo specific banking documents were retrieved for this query. Provide a helpful general response while acknowledging your banking specialization."

        # Build messages
        messages = history + [{'role': 'user', 'content': query}]

        # Generate using the chat task (Llama 3 8B)
        return await self.generate(messages, system_prompt, task="chat")

    async def evaluate_response(
        self,
        query: str,
        response: str,
        context: str = ""
    ) -> Dict:
        """
        Evaluate response quality (uses Llama 3 70B for better evaluation).
        Used during RL training.

        Args:
            query: User query
            response: Generated response
            context: Retrieved context (if any)

        Returns:
            dict: Evaluation results {'quality': 'Good'/'Bad', 'explanation': '...'}
        """
        eval_prompt = f"""Evaluate this response:

Query: {query}
Response: {response}
Context used: {context if context else 'None'}

Is this response Good or Bad?
Respond with just "Good" or "Bad" and a brief explanation."""

        messages = [{'role': 'user', 'content': eval_prompt}]

        # Generate using the evaluation task (Llama 3 70B)
        result = await self.generate(messages, task="evaluation")

        # Parse the verdict. The prompt asks for the verdict first, so check the
        # start of the reply rather than searching the whole string (a "Bad"
        # explanation may still contain the word "Good").
        quality = "Good" if result.strip().lower().startswith("good") else "Bad"

        return {
            'quality': quality,
            'explanation': result
        }
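
# Illustrative end-to-end check of the cascade (sketch; assumes at least one
# provider is configured in .env). `asyncio.run` drives the async API from a
# plain script:
#
#   import asyncio
#
#   async def smoke_test():
#       manager = LLMManager()
#       verdict = await manager.evaluate_response(
#           query="What is my balance?",
#           response="Your balance is $1000",
#           context="Balance: $1000",
#       )
#       print(verdict)  # e.g. {'quality': 'Good', 'explanation': 'Good. ...'}
#
#   asyncio.run(smoke_test())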
# ============================================================================
# GLOBAL LLM MANAGER INSTANCE
# ============================================================================

llm_manager = LLMManager()


# ============================================================================
# USAGE EXAMPLE (for reference)
# ============================================================================

"""
# In your service file:
from app.core.llm_manager import llm_manager

# Generate chat response (uses Llama 3 8B with Groq → HF fallback)
response = await llm_manager.generate_chat_response(
    query="What is my account balance?",
    context="Your balance is $1000",
    history=[]
)

# Evaluate response (uses Llama 3 70B with Groq → HF fallback)
evaluation = await llm_manager.evaluate_response(
    query="What is my balance?",
    response="Your balance is $1000",
    context="Balance: $1000"
)
"""
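
# Optional local smoke test (hypothetical entry point; run with valid
# credentials in .env). Note that `llm_manager` above is constructed at import
# time, so simply importing this module already fails fast when no provider
# is configured.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        reply = await llm_manager.generate(
            messages=[{"role": "user", "content": "Say hello in one sentence."}],
            system_prompt="You are a helpful banking assistant.",
            task="chat",
        )
        print(reply)

    asyncio.run(_demo())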