""" Enhanced evaluation with proper groundedness checking. This module implements LLM-based groundedness evaluation that checks if generated answers are factually consistent with and fully supported by the retrieved evidence, going beyond simple token overlap. """ import json import os import statistics import time from typing import Any, Dict, List import requests from tqdm import tqdm ROOT = os.path.dirname(os.path.abspath(__file__)) EVAL_DIR = os.path.join(ROOT) QUESTIONS_FILE = os.path.join(EVAL_DIR, "questions.json") GOLD_FILE = os.path.join(EVAL_DIR, "gold_answers.json") OUT_FILE = os.path.join(EVAL_DIR, "enhanced_results.json") EVAL_RESULTS_DIR = os.path.join(os.path.dirname(EVAL_DIR), "evaluation_results") # Ensure results directory exists os.makedirs(EVAL_RESULTS_DIR, exist_ok=True) TARGET_URL = os.getenv("EVAL_TARGET_URL", "https://msse-team-3-ai-engineering-project.hf.space") CHAT_ENDPOINT = os.getenv("EVAL_CHAT_PATH", "/chat") TIMEOUT = int(os.getenv("EVAL_TIMEOUT", "30")) # LLM API for groundedness evaluation OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") GROUNDEDNESS_MODEL = "microsoft/wizardlm-2-8x22b" def load_json(path: str) -> Any: with open(path, "r", encoding="utf-8") as f: return json.load(f) def evaluate_groundedness_llm(generated_answer: str, retrieved_context: List[str]) -> Dict[str, Any]: """ Use LLM to evaluate if the generated answer is grounded in the retrieved context. Args: generated_answer: The generated response text retrieved_context: List of retrieved document excerpts Returns: Dictionary with groundedness score and explanation """ if not OPENROUTER_API_KEY: # Fallback to token overlap if no API key return { "grounded": True, "confidence": 0.5, "explanation": "Using fallback token overlap method - no OpenRouter API key available", "method": "token_overlap_fallback", } # Create context from retrieved documents context_text = "\n\n".join([f"Document {i+1}: {ctx}" for i, ctx in enumerate(retrieved_context)]) # Groundedness evaluation prompt prompt = f"""You are an expert evaluator tasked with determining if a generated answer is factually grounded in the provided context. CONTEXT (Retrieved Documents): {context_text} GENERATED ANSWER: {generated_answer} TASK: Evaluate whether the generated answer is: 1. FACTUALLY CONSISTENT with the context (no contradictions) 2. FULLY SUPPORTED by the context (all claims can be verified) 3. NOT HALLUCINATED (no information absent from context) Respond with a JSON object containing: - "grounded": boolean (true if fully grounded, false otherwise) - "confidence": float 0-1 (how confident you are in this assessment) - "explanation": string (detailed reasoning for your assessment) - "unsupported_claims": list of strings (any claims not supported by context) Be strict: if ANY part of the answer contains information not present in or contradicted by the context, mark as false.""" try: response = requests.post( "https://openrouter.ai/api/v1/chat/completions", headers={ "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json", }, json={ "model": GROUNDEDNESS_MODEL, "messages": [{"role": "user", "content": prompt}], "temperature": 0.1, "max_tokens": 500, }, timeout=30, ) if response.status_code == 200: result = response.json() content = result["choices"][0]["message"]["content"] # Try to parse JSON response try: evaluation = json.loads(content) evaluation["method"] = "llm_evaluation" return evaluation except json.JSONDecodeError: # Fallback if LLM didn't return valid JSON is_grounded = "true" in content.lower() and "grounded" in content.lower() return { "grounded": is_grounded, "confidence": 0.7, "explanation": f"LLM evaluation (non-JSON): {content[:200]}...", "method": "llm_evaluation_parsed", } else: # API error fallback return { "grounded": True, "confidence": 0.3, "explanation": f"API error {response.status_code}, using neutral assessment", "method": "api_error_fallback", } except Exception as e: # Exception fallback return { "grounded": True, "confidence": 0.3, "explanation": f"Evaluation error: {str(e)}, using neutral assessment", "method": "exception_fallback", } def evaluate_citation_accuracy_enhanced( expected_sources: List[str], returned_sources: List[Dict[str, Any]], generated_answer: str, ) -> Dict[str, Any]: """ Enhanced citation accuracy that considers both source presence and relevance. Args: expected_sources: List of expected source filenames returned_sources: List of returned source dictionaries generated_answer: The generated response text Returns: Dictionary with citation accuracy metrics """ if not expected_sources: return { "citation_accuracy": 1.0 if not returned_sources else 0.0, "expected_count": 0, "returned_count": len(returned_sources), "correctly_cited": 0, "method": "exact_match", } # Extract returned filenames returned_filenames = { s.get("document") or s.get("filename") or s.get("source_file") or s.get("file") for s in returned_sources } returned_filenames = {f for f in returned_filenames if f} # Count correct citations correctly_cited = 0 for expected in expected_sources: if expected in returned_filenames: correctly_cited += 1 citation_accuracy = correctly_cited / len(expected_sources) if expected_sources else 0.0 return { "citation_accuracy": citation_accuracy, "expected_count": len(expected_sources), "returned_count": len(returned_filenames), "correctly_cited": correctly_cited, "expected_sources": expected_sources, "returned_sources": list(returned_filenames), "method": "exact_match", } def token_overlap_score(gold: str, response: str) -> float: """Simple partial match score based on token overlap.""" gold_tokens = set(gold.lower().split()) resp_tokens = set(response.lower().split()) if not gold_tokens: return 0.0 overlap = gold_tokens & resp_tokens return len(overlap) / len(gold_tokens) def run_enhanced_evaluation(target: str = TARGET_URL): """Run enhanced evaluation with proper groundedness checking.""" questions = load_json(QUESTIONS_FILE) golds = load_json(GOLD_FILE) results = [] latencies = [] groundedness_scores = [] citation_accuracies = [] print(f"Running enhanced evaluation against {target}") print(f"Using groundedness evaluation: {'LLM-based' if OPENROUTER_API_KEY else 'Token overlap fallback'}") for q in tqdm(questions, desc="Enhanced Evaluation"): qid = str(q["id"]) payload = {"message": q["question"], "include_sources": True} url = target.rstrip("/") + CHAT_ENDPOINT start = time.time() try: # Add progress info print(f"\nEvaluating question {qid}: {q['question'][:50]}...") r = requests.post(url, json=payload, timeout=TIMEOUT) latency = time.time() - start latencies.append(latency) print(f"Response received in {latency:.2f}s") if r.status_code != 200: results.append( { "id": qid, "question": q["question"], "status_code": r.status_code, "error": r.text, "latency_s": latency, } ) continue data = r.json() response_text = data.get("response", "") returned_sources = data.get("sources", []) or [] gold_answer = golds.get(qid, {}).get("answer", "") expected_sources = golds.get(qid, {}).get("expected_sources", []) # Enhanced groundedness evaluation context_excerpts = [s.get("excerpt", "") for s in returned_sources if s.get("excerpt")] groundedness_eval = evaluate_groundedness_llm(response_text, context_excerpts) # Enhanced citation accuracy citation_eval = evaluate_citation_accuracy_enhanced(expected_sources, returned_sources, response_text) # Traditional token overlap for comparison overlap_score = token_overlap_score(gold_answer, response_text) # Store comprehensive results result = { "id": qid, "question": q["question"], "response": response_text, "latency_s": latency, # Enhanced groundedness metrics "groundedness": groundedness_eval, # Enhanced citation metrics "citation_evaluation": citation_eval, # Traditional metrics for comparison "overlap_score": overlap_score, "citation_accuracy": citation_eval["citation_accuracy"], # Source information "returned_sources": returned_sources, "expected_sources": expected_sources, "gold_answer": gold_answer, } results.append(result) # Track metrics for summary if groundedness_eval.get("grounded") is not None: groundedness_scores.append(1.0 if groundedness_eval["grounded"] else 0.0) citation_accuracies.append(citation_eval["citation_accuracy"]) except Exception as e: latency = time.time() - start latencies.append(latency) results.append( { "id": qid, "question": q["question"], "status_code": "error", "error": str(e), "latency_s": latency, } ) # Calculate summary metrics success_latencies = [lat for lat in latencies if lat is not None] p50 = statistics.median(success_latencies) if success_latencies else None p95 = sorted(success_latencies)[max(0, int(len(success_latencies) * 0.95) - 1)] if success_latencies else None # Enhanced summary metrics avg_groundedness = sum(groundedness_scores) / len(groundedness_scores) if groundedness_scores else None avg_citation_accuracy = sum(citation_accuracies) / len(citation_accuracies) if citation_accuracies else None # Count successful evaluations successful_evals = len([r for r in results if r.get("groundedness") is not None]) total_questions = len(questions) summary = { "target": target, "evaluation_method": "enhanced_llm_based", "n_questions": total_questions, "successful_evaluations": successful_evals, "success_rate": (successful_evals / total_questions if total_questions > 0 else 0), # Performance metrics "latency_p50_s": p50, "latency_p95_s": p95, "avg_latency_s": (sum(success_latencies) / len(success_latencies) if success_latencies else None), # Quality metrics (enhanced) "avg_groundedness_score": avg_groundedness, "avg_citation_accuracy": avg_citation_accuracy, "groundedness_method": ("llm_evaluation" if OPENROUTER_API_KEY else "token_overlap_fallback"), # Additional insights "grounded_responses": sum(groundedness_scores), "ungrounded_responses": (len(groundedness_scores) - sum(groundedness_scores) if groundedness_scores else 0), "perfect_citations": len([c for c in citation_accuracies if c == 1.0]), "no_citations": len([c for c in citation_accuracies if c == 0.0]), } # Save enhanced results output = { "summary": summary, "results": results, "metadata": { "evaluation_timestamp": time.time(), "evaluation_version": "enhanced_v1.0", "groundedness_model": (GROUNDEDNESS_MODEL if OPENROUTER_API_KEY else "token_overlap"), "target_endpoint": target + CHAT_ENDPOINT, }, } # Save to evaluation directory and a centralized evaluation_results folder with open(OUT_FILE, "w", encoding="utf-8") as f: json.dump(output, f, indent=2) # Also write a copy into evaluation_results for CI aggregation try: out_summary_path = os.path.join(EVAL_RESULTS_DIR, "enhanced_results_summary.json") with open(out_summary_path, "w", encoding="utf-8") as f2: json.dump(output["summary"], f2, indent=2) except Exception: pass print("\nEnhanced Evaluation Complete!") print("=" * 50) print(json.dumps(summary, indent=2)) print(f"\nDetailed results saved to {OUT_FILE}") return output if __name__ == "__main__": target = os.getenv("EVAL_TARGET_URL", TARGET_URL) run_enhanced_evaluation(target)