"""
Evaluate RAG pipeline using RAGAS framework.
Measures:
- Faithfulness: Answer accuracy vs. retrieved context
- Answer Relevancy: How relevant the answer is to the question
- Context Precision: How precise the retrieved context is
- Context Recall: Coverage of relevant information
"""
import logging
import sys
from pathlib import Path
import json
from typing import List, Dict, Any
from datetime import datetime
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src import create_rag_pipeline, settings
from src.config import EVALS_DIR, RESULTS_DIR
# Optional-dependency guard: RAGAS (and the HF `datasets` package it needs)
# may be absent in lightweight environments. The flag lets the rest of the
# module degrade to a simple keyword-overlap test instead of crashing.
try:
    from datasets import Dataset
    from ragas import evaluate
    from ragas.metrics import (
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    )
    RAGAS_AVAILABLE = True
except ImportError:
    RAGAS_AVAILABLE = False
    print("WARNING: RAGAS not installed. Install with: pip install ragas")
# Module-wide logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Evaluation dataset: question / ground-truth pairs, consumed both by the
# RAGAS run (as the `ground_truth` column) and by the keyword-overlap
# fallback in simple_accuracy_test().
TEST_QUERIES = [
    {
        "question": "How do I create a FastAPI application?",
        "ground_truth": "You create a FastAPI application by importing FastAPI and creating an instance: from fastapi import FastAPI; app = FastAPI()"
    },
    {
        "question": "What are path parameters in FastAPI?",
        "ground_truth": "Path parameters are variables in the URL path that FastAPI can extract and pass to your endpoint function."
    },
    {
        "question": "How do I add request validation?",
        "ground_truth": "FastAPI uses Pydantic models for request validation. You define a model with type hints and use it as a parameter type."
    },
    {
        "question": "What is dependency injection in FastAPI?",
        "ground_truth": "Dependency injection allows you to declare dependencies that FastAPI will resolve and inject into your endpoint functions."
    },
    {
        "question": "How do I handle authentication in FastAPI?",
        "ground_truth": "FastAPI provides security utilities for OAuth2, JWT tokens, and API keys. You can use dependencies to protect endpoints."
    },
]
def run_evaluation():
    """Run RAGAS evaluation on the RAG pipeline.

    Builds a (question, answer, contexts, ground_truth) dataset from
    TEST_QUERIES, scores it with the four core RAGAS metrics, logs the
    per-metric and overall scores, and writes a timestamped JSON report
    to RESULTS_DIR. Returns None; no-ops if RAGAS is not installed.
    """
    if not RAGAS_AVAILABLE:
        logger.error("RAGAS not available. Please install it.")
        return
    logger.info("=" * 60)
    logger.info("RAG Evaluation with RAGAS")
    logger.info("=" * 60)
    # Initialize pipeline
    logger.info("Initializing RAG pipeline...")
    pipeline = create_rag_pipeline()
    # Prepare evaluation data in the column layout RAGAS expects.
    logger.info(f"\nRunning evaluation on {len(TEST_QUERIES)} queries...")
    evaluation_data = {
        "question": [],
        "answer": [],
        "contexts": [],
        "ground_truth": []
    }
    for item in TEST_QUERIES:
        question = item["question"]
        logger.info(f"\nProcessing: {question}")
        # Get response from pipeline
        response = pipeline.query(question, top_k=5)
        evaluation_data["question"].append(question)
        evaluation_data["answer"].append(response["answer"])
        evaluation_data["ground_truth"].append(item["ground_truth"])
        # NOTE(review): contexts are re-retrieved here rather than taken from
        # the query() response, so they may differ from what the LLM actually
        # saw (would skew faithfulness if retrieval is non-deterministic).
        # Prefer response-provided contexts if the pipeline exposes them —
        # TODO confirm against the pipeline's query() return schema.
        retrieved_chunks = pipeline.retriever.retrieve(question, top_k=5)
        contexts = [chunk["content"] for chunk in retrieved_chunks]
        evaluation_data["contexts"].append(contexts)
        logger.info(f"  Answer length: {len(response['answer'])} chars")
        logger.info(f"  Contexts retrieved: {len(contexts)}")
    # Create dataset
    dataset = Dataset.from_dict(evaluation_data)
    # Run evaluation
    logger.info("\n" + "=" * 60)
    logger.info("Running RAGAS metrics...")
    logger.info("=" * 60)
    try:
        results = evaluate(
            dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                context_precision,
                context_recall,
            ],
        )
        # Display results
        logger.info("\n" + "=" * 60)
        logger.info("Evaluation Results")
        logger.info("=" * 60)
        metric_names = (
            "faithfulness",
            "answer_relevancy",
            "context_precision",
            "context_recall",
        )
        # float() guards against numpy scalar score types in the RAGAS
        # result, which json.dump cannot serialize.
        metrics = {name: float(results[name]) for name in metric_names}
        for metric_name, score in metrics.items():
            logger.info(f"{metric_name.replace('_', ' ').title()}: {score:.4f}")
        # Overall score: unweighted mean of the four metrics.
        overall_score = sum(metrics.values()) / len(metrics)
        logger.info(f"\nOverall Score: {overall_score:.4f}")
        # Interpretation
        logger.info("\n" + "=" * 60)
        logger.info("Interpretation")
        logger.info("=" * 60)
        logger.info("Scores range from 0 to 1 (higher is better)")
        logger.info("Target scores for production:")
        logger.info("  • Faithfulness: > 0.80 (answers are accurate)")
        logger.info("  • Answer Relevancy: > 0.70 (answers address the question)")
        logger.info("  • Context Precision: > 0.75 (retrieved context is relevant)")
        logger.info("  • Context Recall: > 0.80 (all relevant info is retrieved)")
        # Save results, creating the output directory on first run.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        RESULTS_DIR.mkdir(parents=True, exist_ok=True)
        results_file = RESULTS_DIR / f"ragas_eval_{timestamp}.json"
        results_dict = {
            "timestamp": timestamp,
            "metrics": metrics,
            "overall_score": overall_score,
            "test_queries": TEST_QUERIES,
            "settings": {
                "chunk_size": settings.chunk_size,
                "chunk_overlap": settings.chunk_overlap,
                "top_k": 5,
                "embedding_model": settings.embedding_model,
                "llm_model": settings.llm_model
            }
        }
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results_dict, f, indent=2)
        logger.info(f"\nResults saved to: {results_file}")
    except Exception as e:
        logger.error(f"Evaluation failed: {e}", exc_info=True)
def simple_accuracy_test():
    """Simple keyword-overlap accuracy test without RAGAS.

    For each test query, counts the answer as correct when at least 50% of
    the "key terms" (ground-truth words longer than 4 characters) appear in
    the answer. Logs per-question pass/fail and the overall accuracy.
    Returns None.
    """
    logger.info("Running simple accuracy test...")
    pipeline = create_rag_pipeline()
    correct = 0
    total = len(TEST_QUERIES)
    # Guard: nothing to evaluate (avoids ZeroDivisionError below).
    if total == 0:
        logger.info("No test queries defined; nothing to evaluate.")
        return
    for item in TEST_QUERIES:
        question = item["question"]
        response = pipeline.query(question)
        # Simple check: does the answer contain the ground truth's key terms?
        answer_lower = response["answer"].lower()
        ground_truth_lower = item["ground_truth"].lower()
        # Key terms = words longer than 4 chars (crude stopword filter).
        key_terms = [term for term in ground_truth_lower.split() if len(term) > 4]
        matches = sum(1 for term in key_terms if term in answer_lower)
        # `key_terms and` guards against a ground truth with no long words,
        # which would otherwise divide by zero.
        if key_terms and matches / len(key_terms) >= 0.5:
            correct += 1
            logger.info(f"✓ {question}")
        else:
            logger.info(f"✗ {question}")
    accuracy = correct / total
    logger.info(f"\nSimple Accuracy: {accuracy:.2%} ({correct}/{total})")
if __name__ == "__main__":
    # Prefer the full RAGAS evaluation; degrade to the keyword-overlap
    # check when the RAGAS import failed at module load.
    if RAGAS_AVAILABLE:
        run_evaluation()
    else:
        logger.warning("RAGAS not available. Running simple test instead.")
        simple_accuracy_test()