"""LLM-based faithfulness and answer relevancy evaluation for generated support responses. Implements the same metrics as RAGAS (faithfulness, answer_relevancy) but calls the Anthropic API directly in a synchronous loop — no async timeouts, no OpenAI dependency. """ import json import os import statistics import time from pathlib import Path from typing import Dict, List import anthropic from dotenv import load_dotenv from loguru import logger from tqdm import tqdm load_dotenv() _FAITHFULNESS_PROMPT = """You are an evaluation assistant. Given a context and a generated response, rate how faithful the response is to the context on a scale from 0.0 to 1.0. Faithfulness means the response only contains information that is grounded in or consistent with the context. A score of 1.0 means every claim in the response is supported by the context. A score of 0.0 means the response contains claims that contradict or are completely absent from the context. Context: {context} Response: {response} Reply with ONLY a decimal number between 0.0 and 1.0. No explanation.""" _RELEVANCY_PROMPT = """You are an evaluation assistant. Given a customer question and a support response, rate how relevant the response is to the question on a scale from 0.0 to 1.0. Relevancy means the response directly addresses what the customer asked. A score of 1.0 means the response fully and directly answers the customer's question. A score of 0.0 means the response is completely off-topic or ignores the question. Customer question: {question} Support response: {response} Reply with ONLY a decimal number between 0.0 and 1.0. No explanation.""" def _score_single( client: anthropic.Anthropic, prompt: str, retries: int = 3, ) -> float: """Call Claude Haiku to get a 0-1 score from a prompt. Args: client: Anthropic client instance. prompt: Evaluation prompt string. retries: Number of retry attempts on failure. Returns: Float score between 0.0 and 1.0. """ text = "" for attempt in range(retries): try: msg = client.messages.create( model="claude-haiku-4-5-20251001", max_tokens=10, temperature=0, messages=[{"role": "user", "content": prompt}], ) text = msg.content[0].text.strip() score = float(text) return max(0.0, min(1.0, score)) except (ValueError, IndexError): logger.warning(f"Could not parse score from response: '{text}' -- defaulting to 0.5") return 0.5 except anthropic.RateLimitError: wait = 2 ** attempt logger.warning(f"Rate limit hit, retrying in {wait}s…") time.sleep(wait) except Exception as e: logger.warning(f"Score attempt {attempt+1} failed: {e}") time.sleep(1) return 0.5 def run_ragas_evaluation( results: List[Dict], results_dir: str, faithfulness_threshold: float = 0.5, ) -> Dict: """Evaluate faithfulness and answer relevancy using Claude Haiku directly. Implements the same metrics as RAGAS but calls Anthropic API synchronously to avoid async timeout issues. Args: results: List of pipeline result dicts containing 'query', 'response', 'context'. results_dir: Directory to save scores JSON. faithfulness_threshold: Responses below this faithfulness score are flagged. Returns: Dict with aggregate scores, per-query scores, and flagged responses. """ api_key = os.environ.get("ANTHROPIC_API_KEY") if not api_key: raise EnvironmentError("ANTHROPIC_API_KEY not set.") client = anthropic.Anthropic(api_key=api_key) logger.info(f"Running LLM evaluation on {len(results)} queries using Claude Haiku…") per_query = [] for r in tqdm(results, desc="Evaluating responses"): faith_prompt = _FAITHFULNESS_PROMPT.format( context=r["context"], response=r["response"] ) rel_prompt = _RELEVANCY_PROMPT.format( question=r["query"], response=r["response"] ) faithfulness_score = _score_single(client, faith_prompt) answer_relevancy_score = _score_single(client, rel_prompt) per_query.append({ "query": r["query"], "predicted_intent": r.get("predicted_intent", ""), "faithfulness": faithfulness_score, "answer_relevancy": answer_relevancy_score, }) # Aggregate statistics agg: Dict = {} for metric in ["faithfulness", "answer_relevancy"]: vals = [q[metric] for q in per_query if q[metric] is not None] if vals: agg[metric] = { "mean": round(sum(vals) / len(vals), 4), "median": round(statistics.median(vals), 4), "std": round(statistics.stdev(vals) if len(vals) > 1 else 0.0, 4), "min": round(min(vals), 4), "max": round(max(vals), 4), } logger.info( f"{metric}: mean={agg[metric]['mean']:.4f}, " f"std={agg[metric]['std']:.4f}, " f"min={agg[metric]['min']:.4f}, " f"max={agg[metric]['max']:.4f}" ) # Flag low-faithfulness flagged = [ {"index": i, "query": q["query"], "faithfulness": q["faithfulness"], "response": results[i]["response"]} for i, q in enumerate(per_query) if q["faithfulness"] < faithfulness_threshold ] pct_flagged = len(flagged) / len(results) * 100 if results else 0.0 if flagged: logger.warning(f"{len(flagged)} responses ({pct_flagged:.1f}%) flagged for faithfulness < {faithfulness_threshold}") output = { "aggregate": agg, "per_query": per_query, "flagged_low_faithfulness": flagged, "n_evaluated": len(results), "n_flagged": len(flagged), "pct_flagged": pct_flagged, } Path(results_dir).mkdir(parents=True, exist_ok=True) path = Path(results_dir) / "ragas_scores.json" with open(path, "w") as f: json.dump(output, f, indent=2) logger.info(f"Saved evaluation scores → {path}") return output