Spaces:
Running
Running
| """LLM-based faithfulness and answer relevancy evaluation for generated support responses. | |
| Implements the same metrics as RAGAS (faithfulness, answer_relevancy) but calls | |
| the Anthropic API directly in a synchronous loop — no async timeouts, no OpenAI dependency. | |
| """ | |
| import json | |
| import os | |
| import statistics | |
| import time | |
| from pathlib import Path | |
| from typing import Dict, List | |
| import anthropic | |
| from dotenv import load_dotenv | |
| from loguru import logger | |
| from tqdm import tqdm | |
| load_dotenv() | |
# Prompt template for the faithfulness metric. It is filled in with
# ``str.format`` and expects two placeholders: ``{context}`` (the retrieved
# context) and ``{response}`` (the generated answer). The model is asked to
# reply with nothing but a decimal number in [0.0, 1.0].
_FAITHFULNESS_PROMPT = "\n".join(
    [
        "You are an evaluation assistant. Given a context and a generated response,",
        "rate how faithful the response is to the context on a scale from 0.0 to 1.0.",
        "Faithfulness means the response only contains information that is grounded in or consistent with the context.",
        "A score of 1.0 means every claim in the response is supported by the context.",
        "A score of 0.0 means the response contains claims that contradict or are completely absent from the context.",
        "Context:",
        "{context}",
        "Response:",
        "{response}",
        "Reply with ONLY a decimal number between 0.0 and 1.0. No explanation.",
    ]
)
# Prompt template for the answer-relevancy metric. It is filled in with
# ``str.format`` and expects two placeholders: ``{question}`` (the customer's
# query) and ``{response}`` (the generated answer). The model is asked to
# reply with nothing but a decimal number in [0.0, 1.0].
_RELEVANCY_PROMPT = "\n".join(
    [
        "You are an evaluation assistant. Given a customer question and a support response,",
        "rate how relevant the response is to the question on a scale from 0.0 to 1.0.",
        "Relevancy means the response directly addresses what the customer asked.",
        "A score of 1.0 means the response fully and directly answers the customer's question.",
        "A score of 0.0 means the response is completely off-topic or ignores the question.",
        "Customer question:",
        "{question}",
        "Support response:",
        "{response}",
        "Reply with ONLY a decimal number between 0.0 and 1.0. No explanation.",
    ]
)
def _score_single(
    client: anthropic.Anthropic,
    prompt: str,
    retries: int = 3,
) -> float:
    """Call Claude Haiku to get a 0-1 score from a prompt.

    The model reply is parsed as a float and clamped into [0.0, 1.0].
    Rate-limit errors are retried with exponential backoff (1s, 2s, 4s, ...);
    any other API error is retried after a one-second pause. No sleep is
    performed after the final attempt, since nothing follows it.

    Args:
        client: Anthropic client instance.
        prompt: Evaluation prompt string.
        retries: Maximum number of API attempts.

    Returns:
        Float score between 0.0 and 1.0. Falls back to the neutral value 0.5
        when the reply cannot be parsed as a finite number or when every
        attempt fails.
    """
    import math  # local import: only needed for the finiteness check below

    text = ""
    for attempt in range(retries):
        has_retry_left = attempt + 1 < retries
        try:
            msg = client.messages.create(
                model="claude-haiku-4-5-20251001",
                max_tokens=10,
                temperature=0,
                messages=[{"role": "user", "content": prompt}],
            )
            text = msg.content[0].text.strip()
            score = float(text)
            # float() happily accepts "nan"/"inf"; min()/max() would then turn
            # a NaN into 1.0, so treat non-finite replies as unparseable.
            if not math.isfinite(score):
                raise ValueError(f"non-finite score: {text!r}")
            return max(0.0, min(1.0, score))
        except (ValueError, IndexError):
            # A malformed reply will not improve on retry (temperature=0),
            # so give up immediately with the neutral default.
            logger.warning(f"Could not parse score from response: '{text}' -- defaulting to 0.5")
            return 0.5
        except anthropic.RateLimitError:
            if has_retry_left:
                wait = 2 ** attempt
                logger.warning(f"Rate limit hit, retrying in {wait}s…")
                time.sleep(wait)
            else:
                logger.warning("Rate limit hit, no retries left")
        except Exception as e:
            logger.warning(f"Score attempt {attempt+1} failed: {e}")
            if has_retry_left:
                time.sleep(1)
    # Every attempt failed (or retries <= 0): make the fallback visible.
    logger.warning(f"No score obtained after {retries} attempt(s) -- defaulting to 0.5")
    return 0.5
def run_ragas_evaluation(
    results: List[Dict],
    results_dir: str,
    faithfulness_threshold: float = 0.5,
) -> Dict:
    """Score every pipeline result for faithfulness and answer relevancy.

    Each result is scored twice with Claude Haiku through synchronous
    Anthropic API calls (faithfulness first, then relevancy). Per-metric
    summary statistics are computed, responses whose faithfulness is below
    the threshold are flagged, and the full report is written to
    ``<results_dir>/ragas_scores.json``.

    Args:
        results: List of pipeline result dicts containing 'query', 'response'
            and 'context'; 'predicted_intent' is optional.
        results_dir: Directory to save scores JSON (created if missing).
        faithfulness_threshold: Responses below this faithfulness score are flagged.

    Returns:
        Dict with aggregate scores, per-query scores, flagged responses and
        the evaluated / flagged counts.

    Raises:
        EnvironmentError: If ANTHROPIC_API_KEY is unset or empty.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        raise EnvironmentError("ANTHROPIC_API_KEY not set.")

    client = anthropic.Anthropic(api_key=api_key)
    logger.info(f"Running LLM evaluation on {len(results)} queries using Claude Haiku…")

    # Score each result: build both prompts first, then call the model twice.
    per_query = []
    for item in tqdm(results, desc="Evaluating responses"):
        grounding_prompt = _FAITHFULNESS_PROMPT.format(
            context=item["context"], response=item["response"]
        )
        relevance_prompt = _RELEVANCY_PROMPT.format(
            question=item["query"], response=item["response"]
        )
        faith = _score_single(client, grounding_prompt)
        relevance = _score_single(client, relevance_prompt)
        per_query.append(
            {
                "query": item["query"],
                "predicted_intent": item.get("predicted_intent", ""),
                "faithfulness": faith,
                "answer_relevancy": relevance,
            }
        )

    # Summary statistics per metric; a metric with no values is omitted.
    summary: Dict = {}
    for name in ("faithfulness", "answer_relevancy"):
        scores = [row[name] for row in per_query if row[name] is not None]
        if not scores:
            continue
        # stdev needs at least two samples; report 0.0 for a single one.
        spread = statistics.stdev(scores) if len(scores) > 1 else 0.0
        stats = {
            "mean": round(sum(scores) / len(scores), 4),
            "median": round(statistics.median(scores), 4),
            "std": round(spread, 4),
            "min": round(min(scores), 4),
            "max": round(max(scores), 4),
        }
        summary[name] = stats
        logger.info(
            f"{name}: mean={stats['mean']:.4f}, "
            f"std={stats['std']:.4f}, "
            f"min={stats['min']:.4f}, "
            f"max={stats['max']:.4f}"
        )

    # Collect responses whose faithfulness falls below the threshold.
    flagged = []
    for idx, row in enumerate(per_query):
        if row["faithfulness"] < faithfulness_threshold:
            flagged.append(
                {
                    "index": idx,
                    "query": row["query"],
                    "faithfulness": row["faithfulness"],
                    "response": results[idx]["response"],
                }
            )

    if results:
        pct_flagged = len(flagged) / len(results) * 100
    else:
        pct_flagged = 0.0

    if flagged:
        logger.warning(f"{len(flagged)} responses ({pct_flagged:.1f}%) flagged for faithfulness < {faithfulness_threshold}")

    output = {
        "aggregate": summary,
        "per_query": per_query,
        "flagged_low_faithfulness": flagged,
        "n_evaluated": len(results),
        "n_flagged": len(flagged),
        "pct_flagged": pct_flagged,
    }

    # Persist the report, creating the target directory on demand.
    out_dir = Path(results_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir / "ragas_scores.json"
    with path.open("w") as f:
        json.dump(output, f, indent=2)
    logger.info(f"Saved evaluation scores → {path}")
    return output