""" Script de benchmarking para avaliar diferentes configuracoes de RAG. Uso: python scripts/benchmark.py """ import sys import json from pathlib import Path from typing import List, Dict, Any from datetime import datetime import time # Adicionar src ao path sys.path.insert(0, str(Path(__file__).parent.parent)) from src.evaluation import RAGEvaluator, RAGEvaluationResult from src.config import DATABASE_URL from src.database import DatabaseManager from src.embeddings import EmbeddingManager from src.generation import GenerationManager def load_test_dataset(dataset_path: str) -> List[Dict[str, Any]]: """Carrega dataset de teste.""" with open(dataset_path, 'r', encoding='utf-8') as f: return json.load(f) def run_rag_pipeline( query: str, top_k: int = 5, use_reranking: bool = False, use_hybrid: bool = False, session_id: str = "benchmark" ) -> Dict[str, Any]: """ Executa pipeline RAG completo. Returns: Dict com response e contexts """ db = DatabaseManager(DATABASE_URL) embedding_manager = EmbeddingManager() generation_manager = GenerationManager() # 1. Criar embedding query_embedding = embedding_manager.encode(query) # 2. Buscar contextos if use_hybrid: # Implementar hybrid search contexts = db.search_similar(query_embedding, top_k=top_k, session_id=session_id) else: contexts = db.search_similar(query_embedding, top_k=top_k, session_id=session_id) # 3. Reranking (se ativado) if use_reranking and len(contexts) > 0: # Implementar reranking pass # 4. Gerar resposta context_texts = [ctx['content'] for ctx in contexts] response = generation_manager.generate_response(query, context_texts) return { 'response': response, 'contexts': context_texts } def benchmark_configurations( test_cases: List[Dict[str, Any]], configs: List[Dict[str, Any]] ) -> Dict[str, List[RAGEvaluationResult]]: """ Testa multiplas configuracoes. Args: test_cases: Casos de teste configs: Lista de configuracoes para testar Returns: Dict com resultados por configuracao """ evaluator = RAGEvaluator(use_ragas=False) # Usar metricas simples all_results = {} for config in configs: config_name = config['name'] print(f"\n{'='*60}") print(f"Testando configuracao: {config_name}") print(f"{'='*60}") results = [] for i, test_case in enumerate(test_cases): print(f" Caso {i+1}/{len(test_cases)}: {test_case['query'][:50]}...") start_time = time.time() # Executar RAG com configuracao try: rag_result = run_rag_pipeline( query=test_case['query'], top_k=config.get('top_k', 5), use_reranking=config.get('use_reranking', False), use_hybrid=config.get('use_hybrid', False) ) # Avaliar resultado eval_result = evaluator.evaluate_single( query=test_case['query'], response=rag_result['response'], contexts=rag_result['contexts'], ground_truth=test_case.get('ground_truth') ) eval_result.response_time = time.time() - start_time results.append(eval_result) except Exception as e: print(f" Erro: {e}") continue all_results[config_name] = results return all_results def generate_html_report( all_results: Dict[str, List[RAGEvaluationResult]], output_path: str ): """Gera relatorio HTML.""" evaluator = RAGEvaluator(use_ragas=False) html = """ RAG Benchmark Report


def generate_html_report(
    all_results: Dict[str, List[RAGEvaluationResult]],
    output_path: str
):
    """Generate an HTML report."""
    evaluator = RAGEvaluator(use_ragas=False)

    html = """<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>RAG Benchmark Report</title>
    <style>
        body { font-family: sans-serif; margin: 2em; }
        table { border-collapse: collapse; }
        th, td { border: 1px solid #ccc; padding: 4px 8px; text-align: left; }
        .score-high { color: #2e7d32; }
        .score-medium { color: #f9a825; }
        .score-low { color: #c62828; }
    </style>
</head>
<body>
    <h1>RAG Benchmark Report</h1>
    <p>Date: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """</p>
"""

    for config_name, results in all_results.items():
        if not results:
            continue

        report = evaluator.generate_report(results)

        html += f"""
    <div class="config-section">
        <h2>{config_name}</h2>
        <p>Total cases: {report['total_cases']}</p>
        <h3>Average Scores</h3>
        <table>
            <tr><th>Metric</th><th>Mean</th><th>Min</th><th>Max</th></tr>
"""

        for metric in ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall', 'overall']:
            avg = report['average_scores'].get(metric, 0)
            min_val = report['min_scores'].get(metric, 0) if metric != 'overall' else 0
            max_val = report['max_scores'].get(metric, 0) if metric != 'overall' else 0
            score_class = 'score-high' if avg >= 0.7 else ('score-medium' if avg >= 0.5 else 'score-low')

            html += f"""
            <tr>
                <td>{metric.replace('_', ' ').title()}</td>
                <td class="{score_class}">{avg:.3f}</td>
                <td>{min_val:.3f}</td>
                <td>{max_val:.3f}</td>
            </tr>
"""

        html += """
        </table>
"""
""" # Piores casos if 'worst_cases' in report: html += """

Top 5 Piores Casos (para analise)

""" html += """
""" html += """ """ with open(output_path, 'w', encoding='utf-8') as f: f.write(html) print(f"\nRelatorio HTML gerado: {output_path}") def main(): """Funcao principal.""" print("RAG Benchmark") print("="*60) # Carregar dataset dataset_path = Path(__file__).parent.parent / "data" / "evaluation" / "test_dataset.json" if not dataset_path.exists(): print(f"Erro: Dataset nao encontrado em {dataset_path}") return test_cases = load_test_dataset(str(dataset_path)) print(f"Carregados {len(test_cases)} casos de teste\n") # Configuracoes para testar configs = [ { 'name': 'Baseline (top_k=5)', 'top_k': 5, 'use_reranking': False, 'use_hybrid': False }, { 'name': 'Top_k=10', 'top_k': 10, 'use_reranking': False, 'use_hybrid': False }, { 'name': 'Com Reranking', 'top_k': 10, 'use_reranking': True, 'use_hybrid': False }, { 'name': 'Com Hybrid Search', 'top_k': 5, 'use_reranking': False, 'use_hybrid': True }, { 'name': 'Tudo Ativado', 'top_k': 10, 'use_reranking': True, 'use_hybrid': True } ] # Executar benchmark all_results = benchmark_configurations(test_cases, configs) # Gerar relatorio output_path = Path(__file__).parent.parent / "benchmark_report.html" generate_html_report(all_results, str(output_path)) # Imprimir resumo print("\n" + "="*60) print("RESUMO") print("="*60) evaluator = RAGEvaluator(use_ragas=False) for config_name, results in all_results.items(): if not results: continue report = evaluator.generate_report(results) avg_overall = report['average_scores']['overall'] print(f"\n{config_name}:") print(f" Overall Score: {avg_overall:.3f}") print(f" Faithfulness: {report['average_scores']['faithfulness']:.3f}") print(f" Answer Relevancy: {report['average_scores']['answer_relevancy']:.3f}") if __name__ == "__main__": main()