Spaces:

guifav
/

rag_template

Sleeping

File size: 9,757 Bytes

a686b1b

"""
Script de benchmarking para avaliar diferentes configuracoes de RAG.

Uso:
    python scripts/benchmark.py
"""

import sys
import json
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
import time

# Adicionar src ao path
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.evaluation import RAGEvaluator, RAGEvaluationResult
from src.config import DATABASE_URL
from src.database import DatabaseManager
from src.embeddings import EmbeddingManager
from src.generation import GenerationManager


def load_test_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """Carrega dataset de teste."""
    with open(dataset_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def run_rag_pipeline(
    query: str,
    top_k: int = 5,
    use_reranking: bool = False,
    use_hybrid: bool = False,
    session_id: str = "benchmark"
) -> Dict[str, Any]:
    """
    Executa pipeline RAG completo.

    Returns:
        Dict com response e contexts
    """
    db = DatabaseManager(DATABASE_URL)
    embedding_manager = EmbeddingManager()
    generation_manager = GenerationManager()

    # 1. Criar embedding
    query_embedding = embedding_manager.encode(query)

    # 2. Buscar contextos
    if use_hybrid:
        # Implementar hybrid search
        contexts = db.search_similar(query_embedding, top_k=top_k, session_id=session_id)
    else:
        contexts = db.search_similar(query_embedding, top_k=top_k, session_id=session_id)

    # 3. Reranking (se ativado)
    if use_reranking and len(contexts) > 0:
        # Implementar reranking
        pass

    # 4. Gerar resposta
    context_texts = [ctx['content'] for ctx in contexts]
    response = generation_manager.generate_response(query, context_texts)

    return {
        'response': response,
        'contexts': context_texts
    }


def benchmark_configurations(
    test_cases: List[Dict[str, Any]],
    configs: List[Dict[str, Any]]
) -> Dict[str, List[RAGEvaluationResult]]:
    """
    Testa multiplas configuracoes.

    Args:
        test_cases: Casos de teste
        configs: Lista de configuracoes para testar

    Returns:
        Dict com resultados por configuracao
    """
    evaluator = RAGEvaluator(use_ragas=False)  # Usar metricas simples
    all_results = {}

    for config in configs:
        config_name = config['name']
        print(f"\n{'='*60}")
        print(f"Testando configuracao: {config_name}")
        print(f"{'='*60}")

        results = []

        for i, test_case in enumerate(test_cases):
            print(f"  Caso {i+1}/{len(test_cases)}: {test_case['query'][:50]}...")

            start_time = time.time()

            # Executar RAG com configuracao
            try:
                rag_result = run_rag_pipeline(
                    query=test_case['query'],
                    top_k=config.get('top_k', 5),
                    use_reranking=config.get('use_reranking', False),
                    use_hybrid=config.get('use_hybrid', False)
                )

                # Avaliar resultado
                eval_result = evaluator.evaluate_single(
                    query=test_case['query'],
                    response=rag_result['response'],
                    contexts=rag_result['contexts'],
                    ground_truth=test_case.get('ground_truth')
                )

                eval_result.response_time = time.time() - start_time
                results.append(eval_result)

            except Exception as e:
                print(f"    Erro: {e}")
                continue

        all_results[config_name] = results

    return all_results


def generate_html_report(
    all_results: Dict[str, List[RAGEvaluationResult]],
    output_path: str
):
    """Gera relatorio HTML."""
    evaluator = RAGEvaluator(use_ragas=False)

    html = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>RAG Benchmark Report</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                margin: 20px;
                background-color: #f5f5f5;
            }
            h1 {
                color: #333;
            }
            .config {
                background: white;
                padding: 20px;
                margin: 20px 0;
                border-radius: 8px;
                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            }
            table {
                width: 100%;
                border-collapse: collapse;
                margin: 10px 0;
            }
            th, td {
                padding: 12px;
                text-align: left;
                border-bottom: 1px solid #ddd;
            }
            th {
                background-color: #4CAF50;
                color: white;
            }
            .score-high {
                color: green;
                font-weight: bold;
            }
            .score-medium {
                color: orange;
            }
            .score-low {
                color: red;
                font-weight: bold;
            }
            .worst-cases {
                background: #fff3cd;
                padding: 15px;
                margin: 15px 0;
                border-radius: 5px;
                border-left: 4px solid #ffc107;
            }
        </style>
    </head>
    <body>
        <h1>RAG Benchmark Report</h1>
        <p>Data: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """</p>
    """

    for config_name, results in all_results.items():
        if not results:
            continue

        report = evaluator.generate_report(results)

        html += f"""
        <div class="config">
            <h2>{config_name}</h2>
            <p>Total de casos: {report['total_cases']}</p>

            <h3>Scores Medios</h3>
            <table>
                <tr>
                    <th>Metrica</th>
                    <th>Media</th>
                    <th>Min</th>
                    <th>Max</th>
                </tr>
        """

        for metric in ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall', 'overall']:
            avg = report['average_scores'].get(metric, 0)
            min_val = report['min_scores'].get(metric, 0) if metric != 'overall' else 0
            max_val = report['max_scores'].get(metric, 0) if metric != 'overall' else 0

            score_class = 'score-high' if avg >= 0.7 else ('score-medium' if avg >= 0.5 else 'score-low')

            html += f"""
                <tr>
                    <td>{metric.replace('_', ' ').title()}</td>
                    <td class="{score_class}">{avg:.3f}</td>
                    <td>{min_val:.3f}</td>
                    <td>{max_val:.3f}</td>
                </tr>
            """

        html += """
            </table>
        """

        # Piores casos
        if 'worst_cases' in report:
            html += """
            <div class="worst-cases">
                <h3>Top 5 Piores Casos (para analise)</h3>
                <ul>
            """
            for case in report['worst_cases']:
                html += f"""
                    <li>
                        <strong>Query:</strong> {case['query']}<br>
                        <strong>Score:</strong> {case['score']:.3f}
                    </li>
                """
            html += """
                </ul>
            </div>
            """

        html += """
        </div>
        """

    html += """
    </body>
    </html>
    """

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html)

    print(f"\nRelatorio HTML gerado: {output_path}")


def main():
    """Funcao principal."""
    print("RAG Benchmark")
    print("="*60)

    # Carregar dataset
    dataset_path = Path(__file__).parent.parent / "data" / "evaluation" / "test_dataset.json"

    if not dataset_path.exists():
        print(f"Erro: Dataset nao encontrado em {dataset_path}")
        return

    test_cases = load_test_dataset(str(dataset_path))
    print(f"Carregados {len(test_cases)} casos de teste\n")

    # Configuracoes para testar
    configs = [
        {
            'name': 'Baseline (top_k=5)',
            'top_k': 5,
            'use_reranking': False,
            'use_hybrid': False
        },
        {
            'name': 'Top_k=10',
            'top_k': 10,
            'use_reranking': False,
            'use_hybrid': False
        },
        {
            'name': 'Com Reranking',
            'top_k': 10,
            'use_reranking': True,
            'use_hybrid': False
        },
        {
            'name': 'Com Hybrid Search',
            'top_k': 5,
            'use_reranking': False,
            'use_hybrid': True
        },
        {
            'name': 'Tudo Ativado',
            'top_k': 10,
            'use_reranking': True,
            'use_hybrid': True
        }
    ]

    # Executar benchmark
    all_results = benchmark_configurations(test_cases, configs)

    # Gerar relatorio
    output_path = Path(__file__).parent.parent / "benchmark_report.html"
    generate_html_report(all_results, str(output_path))

    # Imprimir resumo
    print("\n" + "="*60)
    print("RESUMO")
    print("="*60)

    evaluator = RAGEvaluator(use_ragas=False)

    for config_name, results in all_results.items():
        if not results:
            continue

        report = evaluator.generate_report(results)
        avg_overall = report['average_scores']['overall']

        print(f"\n{config_name}:")
        print(f"  Overall Score: {avg_overall:.3f}")
        print(f"  Faithfulness: {report['average_scores']['faithfulness']:.3f}")
        print(f"  Answer Relevancy: {report['average_scores']['answer_relevancy']:.3f}")


if __name__ == "__main__":
    main()