""" generation.py — LLM answer generation using Groq API (free tier). Groq runs Llama 3 and Mixtral on custom LPU hardware. Free tier: 14,400 requests/day, responses in ~200-400ms. Model options (all free): llama-3.1-8b-instant — fastest, good for most Q&A llama-3.1-70b-versatile — best quality, use for complex documents mixtral-8x7b-32768 — 32k context, good for long documents gemma2-9b-it — Google model, good structured extraction """ import logging from groq import Groq from src.utils import get_required_env, get_env, count_tokens_estimate, Timer logger = logging.getLogger("enterprise-rag.generation") WEAK_CONTEXT_RESPONSE = ( "I was unable to find sufficient information in the uploaded document " "to answer this question confidently. Please ensure the document contains " "relevant content, or try rephrasing your question." ) SYSTEM_PROMPT = """You are an enterprise document assistant. Answer questions strictly based on the provided document context. Rules: 1. Answer ONLY using information from the provided context sections. 2. If the context does not contain enough information, say so clearly — do not guess or fabricate. 3. Be concise and precise. 4. Reproduce numbers, dates, and names exactly as they appear in the context. 5. Never use outside knowledge — only the provided context.""" def generate_answer( query: str, context_chunks: list, scores: list, is_relevant: bool, max_new_tokens: int = 512, ) -> dict: """ Generate a grounded answer using Groq API. Returns dict: answer — generated text string prompt_tokens — input token count response_tokens — output token count generation_latency_ms — time in milliseconds model_used — model identifier fallback_used — True if context was too weak error — error message or None """ result = { "answer": "", "prompt_tokens": 0, "response_tokens": 0, "generation_latency_ms": 0, "model_used": "", "fallback_used": False, "error": None, } # Return fallback immediately if retrieval quality is too low. # Never send weak context to the LLM — it will fill gaps with hallucinations. if not is_relevant or not context_chunks: result["answer"] = WEAK_CONTEXT_RESPONSE result["fallback_used"] = True logger.warning("Fallback triggered: low retrieval relevance or empty chunks") return result try: api_key = get_required_env("GROQ_API_KEY") model_id = get_env("GROQ_MODEL", "llama-3.1-8b-instant") result["model_used"] = model_id client = Groq(api_key=api_key) # Build numbered context block from retrieved chunks context_block = "\n\n".join( f"[Document Section {i + 1}]:\n{chunk.strip()}" for i, chunk in enumerate(context_chunks) ) full_prompt = SYSTEM_PROMPT + context_block + query result["prompt_tokens"] = count_tokens_estimate(full_prompt) messages = [ { "role": "system", "content": SYSTEM_PROMPT, }, { "role": "user", "content": ( f"Here is the relevant document context:\n\n" f"{context_block}\n\n" f"Question: {query}\n\n" f"Answer based only on the context above:" ), }, ] with Timer() as t: response = client.chat.completions.create( model=model_id, messages=messages, max_tokens=max_new_tokens, temperature=0.1, top_p=0.9, stream=False, ) result["generation_latency_ms"] = round(t.elapsed_ms, 2) answer_text = response.choices[0].message.content.strip() if not answer_text: result["answer"] = WEAK_CONTEXT_RESPONSE result["fallback_used"] = True else: result["answer"] = answer_text result["response_tokens"] = count_tokens_estimate(answer_text) # Use Groq's actual token counts when available — more accurate than estimates if hasattr(response, "usage") and response.usage: if response.usage.prompt_tokens: result["prompt_tokens"] = response.usage.prompt_tokens if response.usage.completion_tokens: result["response_tokens"] = response.usage.completion_tokens logger.info( f"Generated: {result['response_tokens']} tokens | " f"{t.elapsed_ms:.0f}ms | model={model_id}" ) except Exception as e: error_msg = str(e) logger.error(f"Groq generation error: {error_msg}") if "rate_limit" in error_msg.lower() or "429" in error_msg: result["answer"] = ( "⚠️ Groq API rate limit reached. " "Free tier allows 14,400 requests/day and 6,000 tokens/minute. " "Please wait a moment and try again." ) elif "authentication" in error_msg.lower() or "401" in error_msg or "api_key" in error_msg.lower(): result["answer"] = ( "⚠️ Invalid GROQ_API_KEY. " "Check your key in HF Space secrets. " "Get a free key at console.groq.com → API Keys." ) elif "model" in error_msg.lower() and "not found" in error_msg.lower(): result["answer"] = ( "⚠️ Model not found. Valid GROQ_MODEL options: " "llama-3.1-8b-instant, llama-3.1-70b-versatile, " "mixtral-8x7b-32768, gemma2-9b-it" ) else: result["answer"] = f"⚠️ Generation error: {error_msg}" result["error"] = error_msg result["fallback_used"] = True return result