Spaces:
Sleeping
Sleeping
| """ | |
| Script de benchmarking para avaliar diferentes configuracoes de RAG. | |
| Uso: | |
| python scripts/benchmark.py | |
| """ | |
| import sys | |
| import json | |
| from pathlib import Path | |
| from typing import List, Dict, Any | |
| from datetime import datetime | |
| import time | |
| # Adicionar src ao path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from src.evaluation import RAGEvaluator, RAGEvaluationResult | |
| from src.config import DATABASE_URL | |
| from src.database import DatabaseManager | |
| from src.embeddings import EmbeddingManager | |
| from src.generation import GenerationManager | |
def load_test_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """Read the benchmark test cases from a JSON file.

    Args:
        dataset_path: Location of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON document, exactly as parsed.
    """
    raw_text = Path(dataset_path).read_text(encoding='utf-8')
    return json.loads(raw_text)
def run_rag_pipeline(
    query: str,
    top_k: int = 5,
    use_reranking: bool = False,
    use_hybrid: bool = False,
    session_id: str = "benchmark"
) -> Dict[str, Any]:
    """Run one query through embedding, retrieval and answer generation.

    Args:
        query: Question to answer.
        top_k: Forwarded to ``DatabaseManager.search_similar`` as ``top_k``.
        use_reranking: Placeholder. Reranking is not implemented yet, so
            enabling it only emits a ``RuntimeWarning``; the retrieved
            contexts are used unchanged.
        use_hybrid: Placeholder. Hybrid search is not implemented yet, so
            enabling it emits a ``RuntimeWarning`` and the plain similarity
            search is used.
        session_id: Forwarded to ``DatabaseManager.search_similar``.

    Returns:
        Dict with two keys: ``'response'`` (whatever
        ``GenerationManager.generate_response`` returned) and ``'contexts'``
        (the ``'content'`` field of every retrieved item, in retrieval order).
    """
    import warnings  # local import keeps this block self-contained

    # Both flags used to be silently ignored, which made the corresponding
    # benchmark configurations look like distinct strategies while running
    # the very same code path. Surface that instead of hiding it.
    if use_hybrid:
        warnings.warn(
            "use_hybrid=True ignorado: busca hibrida ainda nao implementada; "
            "usando busca por similaridade.",
            RuntimeWarning,
            stacklevel=2,
        )
    if use_reranking:
        warnings.warn(
            "use_reranking=True ignorado: reranking ainda nao implementado.",
            RuntimeWarning,
            stacklevel=2,
        )

    # NOTE(review): the managers are rebuilt on every call, so their
    # construction cost is part of whatever latency the caller measures
    # around this function - consider sharing them across calls.
    db = DatabaseManager(DATABASE_URL)
    embedding_manager = EmbeddingManager()
    generation_manager = GenerationManager()

    # 1. Embed the query.
    query_embedding = embedding_manager.encode(query)

    # 2. Retrieve contexts (similarity search is the only implemented mode).
    contexts = db.search_similar(query_embedding, top_k=top_k, session_id=session_id)

    # 3. Reranking would go here once implemented.

    # 4. Generate the answer from the retrieved texts.
    context_texts = [ctx['content'] for ctx in contexts]
    response = generation_manager.generate_response(query, context_texts)

    return {
        'response': response,
        'contexts': context_texts
    }
def benchmark_configurations(
    test_cases: List[Dict[str, Any]],
    configs: List[Dict[str, Any]]
) -> Dict[str, List[RAGEvaluationResult]]:
    """Run every test case under every configuration and collect the scores.

    Args:
        test_cases: Test cases; each needs a ``'query'`` key and may carry a
            ``'ground_truth'`` key.
        configs: Configurations to try; each needs a ``'name'`` key and may
            carry ``'top_k'``, ``'use_reranking'`` and ``'use_hybrid'``
            (defaults: 5, False, False).

    Returns:
        Dict mapping each configuration name to the list of evaluation
        results of the cases that completed. Cases that raised are reported
        on stdout and left out, so a list may be shorter than ``test_cases``.
    """
    evaluator = RAGEvaluator(use_ragas=False)  # simple metrics only
    all_results = {}
    total_cases = len(test_cases)
    separator = '=' * 60

    for config in configs:
        config_name = config['name']
        print(f"\n{separator}")
        print(f"Testando configuracao: {config_name}")
        print(separator)

        results = []
        for case_number, test_case in enumerate(test_cases, start=1):
            try:
                # Read inside the try so one malformed test case is skipped
                # like any other failure instead of aborting the whole run.
                query = test_case['query']
                print(f" Caso {case_number}/{total_cases}: {query[:50]}...")

                # perf_counter is monotonic; time.time() can jump when the
                # system clock is adjusted and would corrupt the measurement.
                started = time.perf_counter()
                rag_result = run_rag_pipeline(
                    query=query,
                    top_k=config.get('top_k', 5),
                    use_reranking=config.get('use_reranking', False),
                    use_hybrid=config.get('use_hybrid', False)
                )
                # Stop the clock before scoring so response_time reflects
                # the pipeline only, not the evaluator's own cost.
                elapsed = time.perf_counter() - started

                eval_result = evaluator.evaluate_single(
                    query=query,
                    response=rag_result['response'],
                    contexts=rag_result['contexts'],
                    ground_truth=test_case.get('ground_truth')
                )
                eval_result.response_time = elapsed
            except Exception as e:
                # Deliberate per-case boundary: a single failing case must
                # not discard the results already gathered.
                print(f" Erro: {e}")
            else:
                results.append(eval_result)

        all_results[config_name] = results

    return all_results
# Static document head (markup + CSS). Kept as a plain, non-f string so the
# CSS braces need no escaping.
_REPORT_HEAD = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>RAG Benchmark Report</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }
        h1 {
            color: #333;
        }
        .config {
            background: white;
            padding: 20px;
            margin: 20px 0;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 10px 0;
        }
        th, td {
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid #ddd;
        }
        th {
            background-color: #4CAF50;
            color: white;
        }
        .score-high {
            color: green;
            font-weight: bold;
        }
        .score-medium {
            color: orange;
        }
        .score-low {
            color: red;
            font-weight: bold;
        }
        .worst-cases {
            background: #fff3cd;
            padding: 15px;
            margin: 15px 0;
            border-radius: 5px;
            border-left: 4px solid #ffc107;
        }
    </style>
</head>
<body>
    <h1>RAG Benchmark Report</h1>
"""

# Metrics shown in the per-configuration table, in display order.
_REPORT_METRICS = (
    'faithfulness',
    'answer_relevancy',
    'context_precision',
    'context_recall',
    'overall',
)


def _score_css_class(avg: float) -> str:
    """Map an average score to the CSS class used to colour it."""
    if avg >= 0.7:
        return 'score-high'
    if avg >= 0.5:
        return 'score-medium'
    return 'score-low'


def _render_worst_cases(worst_cases: List[Dict[str, Any]]) -> str:
    """Render the highlighted list of worst-scoring cases."""
    from html import escape  # local import keeps this block self-contained

    parts = ["""
        <div class="worst-cases">
            <h3>Top 5 Piores Casos (para analise)</h3>
            <ul>
"""]
    for case in worst_cases:
        # Queries come from the dataset file and may contain '<' or '&'.
        safe_query = escape(str(case['query']))
        parts.append(f"""
                <li>
                    <strong>Query:</strong> {safe_query}<br>
                    <strong>Score:</strong> {case['score']:.3f}
                </li>
""")
    parts.append("""
            </ul>
        </div>
""")
    return "".join(parts)


def _render_config_section(config_name: Any, report: Dict[str, Any]) -> str:
    """Render the HTML card (score table + worst cases) for one configuration."""
    from html import escape  # local import keeps this block self-contained

    parts = [f"""
    <div class="config">
        <h2>{escape(str(config_name))}</h2>
        <p>Total de casos: {report['total_cases']}</p>
        <h3>Scores Medios</h3>
        <table>
            <tr>
                <th>Metrica</th>
                <th>Media</th>
                <th>Min</th>
                <th>Max</th>
            </tr>
"""]

    for metric in _REPORT_METRICS:
        avg = report['average_scores'].get(metric, 0)
        # Min/Max of 'overall' are always rendered as 0.
        # NOTE(review): presumably the report carries no per-case overall
        # min/max - confirm against RAGEvaluator.generate_report.
        is_overall = metric == 'overall'
        min_val = 0 if is_overall else report['min_scores'].get(metric, 0)
        max_val = 0 if is_overall else report['max_scores'].get(metric, 0)
        parts.append(f"""
            <tr>
                <td>{metric.replace('_', ' ').title()}</td>
                <td class="{_score_css_class(avg)}">{avg:.3f}</td>
                <td>{min_val:.3f}</td>
                <td>{max_val:.3f}</td>
            </tr>
""")

    parts.append("""
        </table>
""")

    if 'worst_cases' in report:
        parts.append(_render_worst_cases(report['worst_cases']))

    parts.append("""
    </div>
""")
    return "".join(parts)


def generate_html_report(
    all_results: Dict[str, List[RAGEvaluationResult]],
    output_path: str
):
    """Write an HTML report summarising the results of every configuration.

    Configurations with no results are skipped. Configuration names and
    queries are HTML-escaped before being embedded, so text containing
    ``<``, ``>`` or ``&`` cannot break or inject markup into the report.

    Args:
        all_results: Evaluation results keyed by configuration name.
        output_path: Destination file; written as UTF-8 and overwritten if
            it already exists.
    """
    evaluator = RAGEvaluator(use_ragas=False)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Collect fragments and join once instead of growing a string with +=.
    parts = [_REPORT_HEAD, f"    <p>Data: {timestamp}</p>\n"]

    for config_name, results in all_results.items():
        if not results:
            continue
        report = evaluator.generate_report(results)
        parts.append(_render_config_section(config_name, report))

    parts.append("""
</body>
</html>
""")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"\nRelatorio HTML gerado: {output_path}")
def main():
    """Run the whole benchmark: load cases, evaluate, report, summarise.

    Looks for ``data/evaluation/test_dataset.json`` two directories above
    this file; when it is missing, prints an error and returns early.
    Otherwise benchmarks five fixed configurations, writes
    ``benchmark_report.html`` next to the ``data`` directory and prints the
    average overall, faithfulness and answer-relevancy scores per
    configuration.
    """
    rule = "=" * 60
    print("RAG Benchmark")
    print(rule)

    project_root = Path(__file__).parent.parent

    # Load the dataset.
    dataset_path = project_root / "data" / "evaluation" / "test_dataset.json"
    if not dataset_path.exists():
        print(f"Erro: Dataset nao encontrado em {dataset_path}")
        return
    test_cases = load_test_dataset(str(dataset_path))
    print(f"Carregados {len(test_cases)} casos de teste\n")

    # Configurations under test: (name, top_k, use_reranking, use_hybrid).
    variants = (
        ('Baseline (top_k=5)', 5, False, False),
        ('Top_k=10', 10, False, False),
        ('Com Reranking', 10, True, False),
        ('Com Hybrid Search', 5, False, True),
        ('Tudo Ativado', 10, True, True),
    )
    configs = [
        {
            'name': label,
            'top_k': k,
            'use_reranking': rerank,
            'use_hybrid': hybrid,
        }
        for label, k, rerank, hybrid in variants
    ]

    # Run the benchmark and write the HTML report.
    all_results = benchmark_configurations(test_cases, configs)
    report_file = project_root / "benchmark_report.html"
    generate_html_report(all_results, str(report_file))

    # Console summary.
    print("\n" + rule)
    print("RESUMO")
    print(rule)
    evaluator = RAGEvaluator(use_ragas=False)
    for config_name, results in all_results.items():
        if results:
            averages = evaluator.generate_report(results)['average_scores']
            avg_overall = averages['overall']
            print(f"\n{config_name}:")
            print(f" Overall Score: {avg_overall:.3f}")
            print(f" Faithfulness: {averages['faithfulness']:.3f}")
            print(f" Answer Relevancy: {averages['answer_relevancy']:.3f}")
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()