""" Script de benchmarking para avaliar diferentes configuracoes de RAG. Uso: python scripts/benchmark.py """ import sys import json from pathlib import Path from typing import List, Dict, Any from datetime import datetime import time # Adicionar src ao path sys.path.insert(0, str(Path(__file__).parent.parent)) from src.evaluation import RAGEvaluator, RAGEvaluationResult from src.config import DATABASE_URL from src.database import DatabaseManager from src.embeddings import EmbeddingManager from src.generation import GenerationManager def load_test_dataset(dataset_path: str) -> List[Dict[str, Any]]: """Carrega dataset de teste.""" with open(dataset_path, 'r', encoding='utf-8') as f: return json.load(f) def run_rag_pipeline( query: str, top_k: int = 5, use_reranking: bool = False, use_hybrid: bool = False, session_id: str = "benchmark" ) -> Dict[str, Any]: """ Executa pipeline RAG completo. Returns: Dict com response e contexts """ db = DatabaseManager(DATABASE_URL) embedding_manager = EmbeddingManager() generation_manager = GenerationManager() # 1. Criar embedding query_embedding = embedding_manager.encode(query) # 2. Buscar contextos if use_hybrid: # Implementar hybrid search contexts = db.search_similar(query_embedding, top_k=top_k, session_id=session_id) else: contexts = db.search_similar(query_embedding, top_k=top_k, session_id=session_id) # 3. Reranking (se ativado) if use_reranking and len(contexts) > 0: # Implementar reranking pass # 4. Gerar resposta context_texts = [ctx['content'] for ctx in contexts] response = generation_manager.generate_response(query, context_texts) return { 'response': response, 'contexts': context_texts } def benchmark_configurations( test_cases: List[Dict[str, Any]], configs: List[Dict[str, Any]] ) -> Dict[str, List[RAGEvaluationResult]]: """ Testa multiplas configuracoes. Args: test_cases: Casos de teste configs: Lista de configuracoes para testar Returns: Dict com resultados por configuracao """ evaluator = RAGEvaluator(use_ragas=False) # Usar metricas simples all_results = {} for config in configs: config_name = config['name'] print(f"\n{'='*60}") print(f"Testando configuracao: {config_name}") print(f"{'='*60}") results = [] for i, test_case in enumerate(test_cases): print(f" Caso {i+1}/{len(test_cases)}: {test_case['query'][:50]}...") start_time = time.time() # Executar RAG com configuracao try: rag_result = run_rag_pipeline( query=test_case['query'], top_k=config.get('top_k', 5), use_reranking=config.get('use_reranking', False), use_hybrid=config.get('use_hybrid', False) ) # Avaliar resultado eval_result = evaluator.evaluate_single( query=test_case['query'], response=rag_result['response'], contexts=rag_result['contexts'], ground_truth=test_case.get('ground_truth') ) eval_result.response_time = time.time() - start_time results.append(eval_result) except Exception as e: print(f" Erro: {e}") continue all_results[config_name] = results return all_results def generate_html_report( all_results: Dict[str, List[RAGEvaluationResult]], output_path: str ): """Gera relatorio HTML.""" evaluator = RAGEvaluator(use_ragas=False) html = """
Data: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """
""" for config_name, results in all_results.items(): if not results: continue report = evaluator.generate_report(results) html += f"""Total de casos: {report['total_cases']}
| Metrica | Media | Min | Max |
|---|---|---|---|
| {metric.replace('_', ' ').title()} | {avg:.3f} | {min_val:.3f} | {max_val:.3f} |