File size: 7,675 Bytes
14f13a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
"""
Evaluate RAG pipeline using RAGAS framework.

Measures:
- Faithfulness: Answer accuracy vs. retrieved context
- Answer Relevancy: How relevant the answer is to the question
- Context Precision: How precise the retrieved context is
- Context Recall: Coverage of relevant information
"""
import logging
import sys
from pathlib import Path
import json
from typing import List, Dict, Any
from datetime import datetime

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from src import create_rag_pipeline, settings
from src.config import EVALS_DIR, RESULTS_DIR

# RAGAS and its `datasets` dependency are optional: when they are missing the
# script falls back to simple_accuracy_test() (see the __main__ guard below).
try:
    from datasets import Dataset
    from ragas import evaluate
    from ragas.metrics import (
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    )
    RAGAS_AVAILABLE = True
except ImportError:
    RAGAS_AVAILABLE = False
    print("WARNING: RAGAS not installed. Install with: pip install ragas")

# Timestamped INFO-level logging for the whole evaluation run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


# Evaluation dataset: each entry pairs a natural-language question with a
# reference ("ground truth") answer used by RAGAS's answer_relevancy /
# context_recall metrics and by the fallback keyword-overlap test.
TEST_QUERIES = [
    {
        "question": "How do I create a FastAPI application?",
        "ground_truth": "You create a FastAPI application by importing FastAPI and creating an instance: from fastapi import FastAPI; app = FastAPI()"
    },
    {
        "question": "What are path parameters in FastAPI?",
        "ground_truth": "Path parameters are variables in the URL path that FastAPI can extract and pass to your endpoint function."
    },
    {
        "question": "How do I add request validation?",
        "ground_truth": "FastAPI uses Pydantic models for request validation. You define a model with type hints and use it as a parameter type."
    },
    {
        "question": "What is dependency injection in FastAPI?",
        "ground_truth": "Dependency injection allows you to declare dependencies that FastAPI will resolve and inject into your endpoint functions."
    },
    {
        "question": "How do I handle authentication in FastAPI?",
        "ground_truth": "FastAPI provides security utilities for OAuth2, JWT tokens, and API keys. You can use dependencies to protect endpoints."
    },
]


def run_evaluation():
    """Run the RAGAS evaluation suite against the RAG pipeline.

    For each query in TEST_QUERIES, asks the pipeline for an answer and a set
    of retrieved contexts, then scores the collected (question, answer,
    contexts, ground_truth) records with four RAGAS metrics: faithfulness,
    answer relevancy, context precision, and context recall. Scores are
    logged and persisted as a timestamped JSON file under RESULTS_DIR.

    Returns:
        None. Results are written to disk and logged; evaluation errors are
        logged (with traceback) rather than raised.
    """
    if not RAGAS_AVAILABLE:
        logger.error("RAGAS not available. Please install it.")
        return

    logger.info("=" * 60)
    logger.info("RAG Evaluation with RAGAS")
    logger.info("=" * 60)

    # Initialize pipeline
    logger.info("Initializing RAG pipeline...")
    pipeline = create_rag_pipeline()

    # Prepare evaluation data
    logger.info(f"\nRunning evaluation on {len(TEST_QUERIES)} queries...")

    # Column-oriented layout expected by datasets.Dataset.from_dict.
    evaluation_data = {
        "question": [],
        "answer": [],
        "contexts": [],
        "ground_truth": []
    }

    for item in TEST_QUERIES:
        question = item["question"]
        logger.info(f"\nProcessing: {question}")

        # Get response from pipeline
        response = pipeline.query(question, top_k=5)

        # Extract data for RAGAS
        evaluation_data["question"].append(question)
        evaluation_data["answer"].append(response["answer"])
        evaluation_data["ground_truth"].append(item["ground_truth"])

        # NOTE(review): retrieval is re-run here, separately from the
        # pipeline.query call above. If retrieval is nondeterministic, the
        # contexts scored by RAGAS may differ from those the answer was
        # actually generated with — confirm whether `response` already
        # exposes its own contexts and prefer those if so.
        retrieved_chunks = pipeline.retriever.retrieve(question, top_k=5)
        contexts = [chunk["content"] for chunk in retrieved_chunks]
        evaluation_data["contexts"].append(contexts)

        logger.info(f"  Answer length: {len(response['answer'])} chars")
        logger.info(f"  Contexts retrieved: {len(contexts)}")

    # Create dataset
    dataset = Dataset.from_dict(evaluation_data)

    # Run evaluation
    logger.info("\n" + "=" * 60)
    logger.info("Running RAGAS metrics...")
    logger.info("=" * 60)

    try:
        results = evaluate(
            dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                context_precision,
                context_recall,
            ],
        )

        # Display results
        logger.info("\n" + "=" * 60)
        logger.info("Evaluation Results")
        logger.info("=" * 60)

        # Cast scores to plain floats: RAGAS typically returns numpy scalar
        # types, which json.dump rejects with a TypeError.
        metrics = {
            "faithfulness": float(results["faithfulness"]),
            "answer_relevancy": float(results["answer_relevancy"]),
            "context_precision": float(results["context_precision"]),
            "context_recall": float(results["context_recall"]),
        }

        for metric_name, score in metrics.items():
            logger.info(f"{metric_name.replace('_', ' ').title()}: {score:.4f}")

        # Overall score: unweighted mean of the four metrics.
        overall_score = sum(metrics.values()) / len(metrics)
        logger.info(f"\nOverall Score: {overall_score:.4f}")

        # Interpretation
        logger.info("\n" + "=" * 60)
        logger.info("Interpretation")
        logger.info("=" * 60)
        logger.info("Scores range from 0 to 1 (higher is better)")
        logger.info("Target scores for production:")
        logger.info("  • Faithfulness: > 0.80 (answers are accurate)")
        logger.info("  • Answer Relevancy: > 0.70 (answers address the question)")
        logger.info("  • Context Precision: > 0.75 (retrieved context is relevant)")
        logger.info("  • Context Recall: > 0.80 (all relevant info is retrieved)")

        # Save results; make sure the output directory exists first so the
        # open() below cannot fail with FileNotFoundError on a fresh checkout.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        RESULTS_DIR.mkdir(parents=True, exist_ok=True)
        results_file = RESULTS_DIR / f"ragas_eval_{timestamp}.json"

        results_dict = {
            "timestamp": timestamp,
            "metrics": metrics,
            "overall_score": overall_score,
            "test_queries": TEST_QUERIES,
            "settings": {
                "chunk_size": settings.chunk_size,
                "chunk_overlap": settings.chunk_overlap,
                "top_k": 5,
                "embedding_model": settings.embedding_model,
                "llm_model": settings.llm_model
            }
        }

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results_dict, f, indent=2)

        logger.info(f"\nResults saved to: {results_file}")

    except Exception as e:
        logger.error(f"Evaluation failed: {e}", exc_info=True)


def _answer_covers_ground_truth(answer: str, ground_truth: str,
                                threshold: float = 0.5) -> bool:
    """Crude relevance check used by simple_accuracy_test.

    Returns True when at least `threshold` of the ground truth's "key terms"
    (whitespace-split words longer than 4 characters, compared
    case-insensitively as substrings) appear in the answer.
    """
    answer_lower = answer.lower()
    key_terms = [term for term in ground_truth.lower().split() if len(term) > 4]
    if not key_terms:
        # No informative terms to match against; count as a pass rather than
        # dividing by zero (the original raised ZeroDivisionError here).
        return True
    matches = sum(1 for term in key_terms if term in answer_lower)
    return matches / len(key_terms) >= threshold


def simple_accuracy_test():
    """Fallback keyword-overlap accuracy test (no RAGAS required).

    Runs every TEST_QUERIES question through the pipeline and counts an
    answer as correct when it contains at least half of the ground truth's
    key terms. Logs per-question pass/fail and the overall accuracy.

    Returns:
        None. Results are logged only.
    """
    logger.info("Running simple accuracy test...")

    pipeline = create_rag_pipeline()

    correct = 0
    total = len(TEST_QUERIES)

    for item in TEST_QUERIES:
        question = item["question"]
        response = pipeline.query(question)

        if _answer_covers_ground_truth(response["answer"], item["ground_truth"]):
            correct += 1
            logger.info(f"✓ {question}")
        else:
            logger.info(f"✗ {question}")

    # Guard against an empty query set (ZeroDivisionError otherwise).
    accuracy = correct / total if total else 0.0
    logger.info(f"\nSimple Accuracy: {accuracy:.2%} ({correct}/{total})")


if __name__ == "__main__":
    if RAGAS_AVAILABLE:
        run_evaluation()
    else:
        logger.warning("RAGAS not available. Running simple test instead.")
        simple_accuracy_test()