DeveloperDocs_RAG / evaluate_rag.py
Aishwarya30998's picture
Deploy DeveloperDocs-AI-Copilot-RAG to Hugging Face Space
14f13a5
"""
Evaluate RAG pipeline using RAGAS framework.
Measures:
- Faithfulness: Answer accuracy vs. retrieved context
- Answer Relevancy: How relevant the answer is to the question
- Context Precision: How precise the retrieved context is
- Context Recall: Coverage of relevant information
"""
import logging
import sys
from pathlib import Path
import json
from typing import List, Dict, Any
from datetime import datetime
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src import create_rag_pipeline, settings
from src.config import EVALS_DIR, RESULTS_DIR
try:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall,
)
RAGAS_AVAILABLE = True
except ImportError:
RAGAS_AVAILABLE = False
print("WARNING: RAGAS not installed. Install with: pip install ragas")
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Evaluation dataset
TEST_QUERIES = [
{
"question": "How do I create a FastAPI application?",
"ground_truth": "You create a FastAPI application by importing FastAPI and creating an instance: from fastapi import FastAPI; app = FastAPI()"
},
{
"question": "What are path parameters in FastAPI?",
"ground_truth": "Path parameters are variables in the URL path that FastAPI can extract and pass to your endpoint function."
},
{
"question": "How do I add request validation?",
"ground_truth": "FastAPI uses Pydantic models for request validation. You define a model with type hints and use it as a parameter type."
},
{
"question": "What is dependency injection in FastAPI?",
"ground_truth": "Dependency injection allows you to declare dependencies that FastAPI will resolve and inject into your endpoint functions."
},
{
"question": "How do I handle authentication in FastAPI?",
"ground_truth": "FastAPI provides security utilities for OAuth2, JWT tokens, and API keys. You can use dependencies to protect endpoints."
},
]
def run_evaluation():
"""Run RAGAS evaluation on the RAG pipeline."""
if not RAGAS_AVAILABLE:
logger.error("RAGAS not available. Please install it.")
return
logger.info("=" * 60)
logger.info("RAG Evaluation with RAGAS")
logger.info("=" * 60)
# Initialize pipeline
logger.info("Initializing RAG pipeline...")
pipeline = create_rag_pipeline()
# Prepare evaluation data
logger.info(f"\nRunning evaluation on {len(TEST_QUERIES)} queries...")
evaluation_data = {
"question": [],
"answer": [],
"contexts": [],
"ground_truth": []
}
for item in TEST_QUERIES:
question = item["question"]
logger.info(f"\nProcessing: {question}")
# Get response from pipeline
response = pipeline.query(question, top_k=5)
# Extract data for RAGAS
evaluation_data["question"].append(question)
evaluation_data["answer"].append(response["answer"])
evaluation_data["ground_truth"].append(item["ground_truth"])
# Get context from retrieved chunks
contexts = []
retrieved_chunks = pipeline.retriever.retrieve(question, top_k=5)
for chunk in retrieved_chunks:
contexts.append(chunk["content"])
evaluation_data["contexts"].append(contexts)
logger.info(f" Answer length: {len(response['answer'])} chars")
logger.info(f" Contexts retrieved: {len(contexts)}")
# Create dataset
dataset = Dataset.from_dict(evaluation_data)
# Run evaluation
logger.info("\n" + "=" * 60)
logger.info("Running RAGAS metrics...")
logger.info("=" * 60)
try:
results = evaluate(
dataset,
metrics=[
faithfulness,
answer_relevancy,
context_precision,
context_recall,
],
)
# Display results
logger.info("\n" + "=" * 60)
logger.info("Evaluation Results")
logger.info("=" * 60)
metrics = {
"faithfulness": results["faithfulness"],
"answer_relevancy": results["answer_relevancy"],
"context_precision": results["context_precision"],
"context_recall": results["context_recall"],
}
for metric_name, score in metrics.items():
logger.info(f"{metric_name.replace('_', ' ').title()}: {score:.4f}")
# Overall score
overall_score = sum(metrics.values()) / len(metrics)
logger.info(f"\nOverall Score: {overall_score:.4f}")
# Interpretation
logger.info("\n" + "=" * 60)
logger.info("Interpretation")
logger.info("=" * 60)
logger.info("Scores range from 0 to 1 (higher is better)")
logger.info("Target scores for production:")
logger.info(" • Faithfulness: > 0.80 (answers are accurate)")
logger.info(" • Answer Relevancy: > 0.70 (answers address the question)")
logger.info(" • Context Precision: > 0.75 (retrieved context is relevant)")
logger.info(" • Context Recall: > 0.80 (all relevant info is retrieved)")
# Save results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = RESULTS_DIR / f"ragas_eval_{timestamp}.json"
results_dict = {
"timestamp": timestamp,
"metrics": metrics,
"overall_score": overall_score,
"test_queries": TEST_QUERIES,
"settings": {
"chunk_size": settings.chunk_size,
"chunk_overlap": settings.chunk_overlap,
"top_k": 5,
"embedding_model": settings.embedding_model,
"llm_model": settings.llm_model
}
}
with open(results_file, 'w') as f:
json.dump(results_dict, f, indent=2)
logger.info(f"\nResults saved to: {results_file}")
except Exception as e:
logger.error(f"Evaluation failed: {e}", exc_info=True)
def simple_accuracy_test():
"""Simple accuracy test without RAGAS."""
logger.info("Running simple accuracy test...")
pipeline = create_rag_pipeline()
correct = 0
total = len(TEST_QUERIES)
for item in TEST_QUERIES:
question = item["question"]
response = pipeline.query(question)
# Simple check: does answer contain key terms?
answer_lower = response["answer"].lower()
ground_truth_lower = item["ground_truth"].lower()
# Extract key terms from ground truth
key_terms = [term for term in ground_truth_lower.split() if len(term) > 4]
# Check if at least 50% of key terms are in answer
matches = sum(1 for term in key_terms if term in answer_lower)
if matches / len(key_terms) >= 0.5:
correct += 1
logger.info(f"✓ {question}")
else:
logger.info(f"✗ {question}")
accuracy = correct / total
logger.info(f"\nSimple Accuracy: {accuracy:.2%} ({correct}/{total})")
if __name__ == "__main__":
if RAGAS_AVAILABLE:
run_evaluation()
else:
logger.warning("RAGAS not available. Running simple test instead.")
simple_accuracy_test()