| """ | |
| Enhanced evaluation with proper groundedness checking. | |
| This module implements LLM-based groundedness evaluation that checks if | |
| generated answers are factually consistent with and fully supported by | |
| the retrieved evidence, going beyond simple token overlap. | |
| """ | |
import json
import os
import statistics
import time
from typing import Any, Dict, List

import requests
from tqdm import tqdm
ROOT = os.path.dirname(os.path.abspath(__file__))
EVAL_DIR = ROOT
QUESTIONS_FILE = os.path.join(EVAL_DIR, "questions.json")
GOLD_FILE = os.path.join(EVAL_DIR, "gold_answers.json")
OUT_FILE = os.path.join(EVAL_DIR, "enhanced_results.json")
EVAL_RESULTS_DIR = os.path.join(os.path.dirname(EVAL_DIR), "evaluation_results")

# Ensure results directory exists
os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)

TARGET_URL = os.getenv("EVAL_TARGET_URL", "https://msse-team-3-ai-engineering-project.hf.space")
CHAT_ENDPOINT = os.getenv("EVAL_CHAT_PATH", "/chat")
TIMEOUT = int(os.getenv("EVAL_TIMEOUT", "30"))

# LLM API for groundedness evaluation
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
GROUNDEDNESS_MODEL = "microsoft/wizardlm-2-8x22b"
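
# Illustrative override (values below are placeholders, not part of the original configuration):
# the evaluation target can be redirected by exporting these environment variables before running
# the module, e.g.
#   EVAL_TARGET_URL=http://localhost:8000 EVAL_CHAT_PATH=/chat EVAL_TIMEOUT=60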


def load_json(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def evaluate_groundedness_llm(generated_answer: str, retrieved_context: List[str]) -> Dict[str, Any]:
    """
    Use an LLM to evaluate whether the generated answer is grounded in the retrieved context.

    Args:
        generated_answer: The generated response text
        retrieved_context: List of retrieved document excerpts

    Returns:
        Dictionary with groundedness score and explanation
    """
    if not OPENROUTER_API_KEY:
        # Fall back to token overlap when no API key is available: treat the answer as grounded
        # if most of its tokens appear in the retrieved context (the 0.5 cutoff is a heuristic).
        overlap = token_overlap_score(generated_answer, " ".join(retrieved_context))
        return {
            "grounded": overlap >= 0.5,
            "confidence": 0.5,
            "explanation": f"No OpenRouter API key available; token overlap with context = {overlap:.2f}",
            "method": "token_overlap_fallback",
        }
    # Create context from retrieved documents
    context_text = "\n\n".join([f"Document {i+1}: {ctx}" for i, ctx in enumerate(retrieved_context)])

    # Groundedness evaluation prompt
    prompt = f"""You are an expert evaluator tasked with determining if a generated answer is
factually grounded in the provided context.

CONTEXT (Retrieved Documents):
{context_text}

GENERATED ANSWER:
{generated_answer}

TASK:
Evaluate whether the generated answer is:
1. FACTUALLY CONSISTENT with the context (no contradictions)
2. FULLY SUPPORTED by the context (all claims can be verified)
3. NOT HALLUCINATED (no information absent from the context)

Respond with a JSON object containing:
- "grounded": boolean (true if fully grounded, false otherwise)
- "confidence": float 0-1 (how confident you are in this assessment)
- "explanation": string (detailed reasoning for your assessment)
- "unsupported_claims": list of strings (any claims not supported by the context)

Be strict: if ANY part of the answer contains information not present in or
contradicted by the context, mark "grounded" as false."""
    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "model": GROUNDEDNESS_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 500,
            },
            timeout=30,
        )

        if response.status_code == 200:
            result = response.json()
            content = result["choices"][0]["message"]["content"]

            # Try to parse the JSON response
            try:
                evaluation = json.loads(content)
                evaluation["method"] = "llm_evaluation"
                return evaluation
            except json.JSONDecodeError:
                # Fallback if the LLM didn't return valid JSON
                is_grounded = "true" in content.lower() and "grounded" in content.lower()
                return {
                    "grounded": is_grounded,
                    "confidence": 0.7,
                    "explanation": f"LLM evaluation (non-JSON): {content[:200]}...",
                    "method": "llm_evaluation_parsed",
                }
        else:
            # API error fallback
            return {
                "grounded": True,
                "confidence": 0.3,
                "explanation": f"API error {response.status_code}, using neutral assessment",
                "method": "api_error_fallback",
            }
    except Exception as e:
        # Exception fallback
        return {
            "grounded": True,
            "confidence": 0.3,
            "explanation": f"Evaluation error: {str(e)}, using neutral assessment",
            "method": "exception_fallback",
        }
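
# Illustrative shape of a successful LLM-based evaluation (field values are hypothetical):
#   {"grounded": False, "confidence": 0.9, "explanation": "Claim X appears in no document.",
#    "unsupported_claims": ["Claim X"], "method": "llm_evaluation"}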


def evaluate_citation_accuracy_enhanced(
    expected_sources: List[str],
    returned_sources: List[Dict[str, Any]],
    generated_answer: str,
) -> Dict[str, Any]:
    """
    Citation accuracy based on exact filename matching between expected and returned sources.

    Args:
        expected_sources: List of expected source filenames
        returned_sources: List of returned source dictionaries
        generated_answer: The generated response text (currently unused; retained for future relevance checks)

    Returns:
        Dictionary with citation accuracy metrics
    """
    if not expected_sources:
        return {
            "citation_accuracy": 1.0 if not returned_sources else 0.0,
            "expected_count": 0,
            "returned_count": len(returned_sources),
            "correctly_cited": 0,
            "method": "exact_match",
        }

    # Extract returned filenames
    returned_filenames = {
        s.get("document") or s.get("filename") or s.get("source_file") or s.get("file") for s in returned_sources
    }
    returned_filenames = {f for f in returned_filenames if f}

    # Count correct citations
    correctly_cited = 0
    for expected in expected_sources:
        if expected in returned_filenames:
            correctly_cited += 1

    citation_accuracy = correctly_cited / len(expected_sources)

    return {
        "citation_accuracy": citation_accuracy,
        "expected_count": len(expected_sources),
        "returned_count": len(returned_filenames),
        "correctly_cited": correctly_cited,
        "expected_sources": expected_sources,
        "returned_sources": list(returned_filenames),
        "method": "exact_match",
    }
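
# Worked example (hypothetical filenames): expected_sources=["intro.md", "setup.md"] with a single
# returned source {"document": "intro.md"} yields correctly_cited=1 and citation_accuracy=0.5.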


def token_overlap_score(gold: str, response: str) -> float:
    """Simple partial match score based on token overlap."""
    gold_tokens = set(gold.lower().split())
    resp_tokens = set(response.lower().split())
    if not gold_tokens:
        return 0.0
    overlap = gold_tokens & resp_tokens
    return len(overlap) / len(gold_tokens)
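
# Worked example (hypothetical strings): token_overlap_score("the sky is blue", "I think the sky looks blue")
# compares the gold tokens {"the", "sky", "is", "blue"} against the response, finds 3 of 4, and returns 0.75.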


def run_enhanced_evaluation(target: str = TARGET_URL):
    """Run enhanced evaluation with proper groundedness checking."""
    questions = load_json(QUESTIONS_FILE)
    golds = load_json(GOLD_FILE)

    results = []
    latencies = []
    groundedness_scores = []
    citation_accuracies = []

    print(f"Running enhanced evaluation against {target}")
    print(f"Using groundedness evaluation: {'LLM-based' if OPENROUTER_API_KEY else 'Token overlap fallback'}")
    for q in tqdm(questions, desc="Enhanced Evaluation"):
        qid = str(q["id"])
        payload = {"message": q["question"], "include_sources": True}
        url = target.rstrip("/") + CHAT_ENDPOINT

        start = time.time()
        try:
            # Add progress info
            print(f"\nEvaluating question {qid}: {q['question'][:50]}...")

            r = requests.post(url, json=payload, timeout=TIMEOUT)
            latency = time.time() - start
            latencies.append(latency)

            print(f"Response received in {latency:.2f}s")

            if r.status_code != 200:
                results.append(
                    {
                        "id": qid,
                        "question": q["question"],
                        "status_code": r.status_code,
                        "error": r.text,
                        "latency_s": latency,
                    }
                )
                continue

            data = r.json()
            response_text = data.get("response", "")
            returned_sources = data.get("sources", []) or []

            gold_answer = golds.get(qid, {}).get("answer", "")
            expected_sources = golds.get(qid, {}).get("expected_sources", [])

            # Enhanced groundedness evaluation
            context_excerpts = [s.get("excerpt", "") for s in returned_sources if s.get("excerpt")]
            groundedness_eval = evaluate_groundedness_llm(response_text, context_excerpts)

            # Enhanced citation accuracy
            citation_eval = evaluate_citation_accuracy_enhanced(expected_sources, returned_sources, response_text)

            # Traditional token overlap for comparison
            overlap_score = token_overlap_score(gold_answer, response_text)

            # Store comprehensive results
            result = {
                "id": qid,
                "question": q["question"],
                "response": response_text,
                "latency_s": latency,
                # Enhanced groundedness metrics
                "groundedness": groundedness_eval,
                # Enhanced citation metrics
                "citation_evaluation": citation_eval,
                # Traditional metrics for comparison
                "overlap_score": overlap_score,
                "citation_accuracy": citation_eval["citation_accuracy"],
                # Source information
                "returned_sources": returned_sources,
                "expected_sources": expected_sources,
                "gold_answer": gold_answer,
            }
            results.append(result)

            # Track metrics for summary
            if groundedness_eval.get("grounded") is not None:
                groundedness_scores.append(1.0 if groundedness_eval["grounded"] else 0.0)
            citation_accuracies.append(citation_eval["citation_accuracy"])

        except Exception as e:
            latency = time.time() - start
            latencies.append(latency)
            results.append(
                {
                    "id": qid,
                    "question": q["question"],
                    "status_code": "error",
                    "error": str(e),
                    "latency_s": latency,
                }
            )
    # Calculate summary metrics
    success_latencies = [lat for lat in latencies if lat is not None]
    p50 = statistics.median(success_latencies) if success_latencies else None
    # Approximate p95: the value at the 95th-percentile index of the sorted latencies
    p95 = sorted(success_latencies)[max(0, int(len(success_latencies) * 0.95) - 1)] if success_latencies else None

    # Enhanced summary metrics
    avg_groundedness = sum(groundedness_scores) / len(groundedness_scores) if groundedness_scores else None
    avg_citation_accuracy = sum(citation_accuracies) / len(citation_accuracies) if citation_accuracies else None

    # Count successful evaluations
    successful_evals = len([r for r in results if r.get("groundedness") is not None])
    total_questions = len(questions)

    summary = {
        "target": target,
        "evaluation_method": "enhanced_llm_based",
        "n_questions": total_questions,
        "successful_evaluations": successful_evals,
        "success_rate": (successful_evals / total_questions if total_questions > 0 else 0),
        # Performance metrics
        "latency_p50_s": p50,
        "latency_p95_s": p95,
        "avg_latency_s": (sum(success_latencies) / len(success_latencies) if success_latencies else None),
        # Quality metrics (enhanced)
        "avg_groundedness_score": avg_groundedness,
        "avg_citation_accuracy": avg_citation_accuracy,
        "groundedness_method": ("llm_evaluation" if OPENROUTER_API_KEY else "token_overlap_fallback"),
        # Additional insights
        "grounded_responses": sum(groundedness_scores),
        "ungrounded_responses": (len(groundedness_scores) - sum(groundedness_scores) if groundedness_scores else 0),
        "perfect_citations": len([c for c in citation_accuracies if c == 1.0]),
        "no_citations": len([c for c in citation_accuracies if c == 0.0]),
    }
    # Save enhanced results
    output = {
        "summary": summary,
        "results": results,
        "metadata": {
            "evaluation_timestamp": time.time(),
            "evaluation_version": "enhanced_v1.0",
            "groundedness_model": (GROUNDEDNESS_MODEL if OPENROUTER_API_KEY else "token_overlap"),
            "target_endpoint": target.rstrip("/") + CHAT_ENDPOINT,
        },
    }

    # Save to the evaluation directory and a centralized evaluation_results folder
    with open(OUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    # Also write a copy of the summary into evaluation_results for CI aggregation
    try:
        out_summary_path = os.path.join(EVAL_RESULTS_DIR, "enhanced_results_summary.json")
        with open(out_summary_path, "w", encoding="utf-8") as f2:
            json.dump(output["summary"], f2, indent=2)
    except Exception:
        pass

    print("\nEnhanced Evaluation Complete!")
    print("=" * 50)
    print(json.dumps(summary, indent=2))
    print(f"\nDetailed results saved to {OUT_FILE}")

    return output


if __name__ == "__main__":
    target = os.getenv("EVAL_TARGET_URL", TARGET_URL)
    run_enhanced_evaluation(target)
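
# Illustrative invocation (the filename is a placeholder; the key is only needed for LLM-based
# groundedness, otherwise the token-overlap fallback is used):
#   OPENROUTER_API_KEY=... python path/to/this_module.py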