# rag_template/scripts/benchmark.py
# Author: Guilherme Favaron
# Sync: Complete project update (Phase 6) - API, Metadata, Eval, Docs
# Commit: a686b1b
"""
Script de benchmarking para avaliar diferentes configuracoes de RAG.
Uso:
python scripts/benchmark.py
"""
import sys
import json
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
import time
# Make `src` importable by adding the project root to sys.path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.evaluation import RAGEvaluator, RAGEvaluationResult
from src.config import DATABASE_URL
from src.database import DatabaseManager
from src.embeddings import EmbeddingManager
from src.generation import GenerationManager
def load_test_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """Load the evaluation test dataset from a UTF-8 encoded JSON file."""
    raw = Path(dataset_path).read_text(encoding='utf-8')
    return json.loads(raw)
def run_rag_pipeline(
    query: str,
    top_k: int = 5,
    use_reranking: bool = False,
    use_hybrid: bool = False,
    session_id: str = "benchmark"
) -> Dict[str, Any]:
    """
    Run the full RAG pipeline (embed -> retrieve -> generate) for one query.

    Args:
        query: User question to answer.
        top_k: Number of context chunks to retrieve.
        use_reranking: Placeholder flag — reranking is not implemented yet,
            so enabling it currently has no effect on the results.
        use_hybrid: Placeholder flag — hybrid search is not implemented yet;
            plain vector similarity search is used either way.
        session_id: Session scope passed to the similarity search.

    Returns:
        Dict with 'response' (generated answer string) and
        'contexts' (list of retrieved context texts).
    """
    db = DatabaseManager(DATABASE_URL)
    embedding_manager = EmbeddingManager()
    generation_manager = GenerationManager()

    # 1. Embed the query.
    query_embedding = embedding_manager.encode(query)

    # 2. Retrieve contexts.
    # TODO: implement true hybrid (vector + keyword) search. The original code
    # had an `if use_hybrid` branch that was byte-identical to the else branch;
    # until hybrid search exists, both flags fall through to vector search.
    contexts = db.search_similar(query_embedding, top_k=top_k, session_id=session_id)

    # 3. Reranking (if enabled).
    # TODO: implement reranking; the flag is accepted but currently a no-op.
    if use_reranking and contexts:
        pass

    # 4. Generate the answer from the retrieved context texts.
    context_texts = [ctx['content'] for ctx in contexts]
    response = generation_manager.generate_response(query, context_texts)

    return {
        'response': response,
        'contexts': context_texts
    }
def benchmark_configurations(
    test_cases: List[Dict[str, Any]],
    configs: List[Dict[str, Any]]
) -> Dict[str, List[RAGEvaluationResult]]:
    """
    Benchmark multiple RAG configurations over the same test cases.

    Args:
        test_cases: Test cases; each needs a 'query' key and may carry an
            optional 'ground_truth' answer.
        configs: Configurations to test; each needs a 'name' and may set
            'top_k', 'use_reranking' and 'use_hybrid'.

    Returns:
        Dict mapping configuration name to its list of evaluation results.
        Cases that raise are reported and skipped, so a result list may be
        shorter than len(test_cases).
    """
    evaluator = RAGEvaluator(use_ragas=False)  # simple metrics, no RAGAS
    all_results = {}

    for config in configs:
        config_name = config['name']
        print(f"\n{'='*60}")
        print(f"Testando configuracao: {config_name}")
        print(f"{'='*60}")

        results = []
        for i, test_case in enumerate(test_cases):
            print(f" Caso {i+1}/{len(test_cases)}: {test_case['query'][:50]}...")

            # perf_counter is monotonic, so the measured duration is immune
            # to system clock adjustments (unlike time.time()).
            start_time = time.perf_counter()

            try:
                # Run the RAG pipeline with this configuration.
                rag_result = run_rag_pipeline(
                    query=test_case['query'],
                    top_k=config.get('top_k', 5),
                    use_reranking=config.get('use_reranking', False),
                    use_hybrid=config.get('use_hybrid', False)
                )

                # Score the generated answer against the retrieved contexts.
                eval_result = evaluator.evaluate_single(
                    query=test_case['query'],
                    response=rag_result['response'],
                    contexts=rag_result['contexts'],
                    ground_truth=test_case.get('ground_truth')
                )
                eval_result.response_time = time.perf_counter() - start_time
                results.append(eval_result)
            except Exception as e:
                # Best-effort benchmark: report the failure and keep going
                # with the remaining cases.
                print(f" Erro: {e}")
                continue

        all_results[config_name] = results

    return all_results
def generate_html_report(
    all_results: Dict[str, List[RAGEvaluationResult]],
    output_path: str
):
    """
    Generate an HTML benchmark report.

    Args:
        all_results: Evaluation results keyed by configuration name.
        output_path: Destination path for the generated HTML file.
    """
    # Local import: the local variable `html` below would shadow a top-level
    # `import html`, so only the escape function is pulled into scope here.
    from html import escape

    evaluator = RAGEvaluator(use_ragas=False)

    html = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>RAG Benchmark Report</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                margin: 20px;
                background-color: #f5f5f5;
            }
            h1 {
                color: #333;
            }
            .config {
                background: white;
                padding: 20px;
                margin: 20px 0;
                border-radius: 8px;
                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            }
            table {
                width: 100%;
                border-collapse: collapse;
                margin: 10px 0;
            }
            th, td {
                padding: 12px;
                text-align: left;
                border-bottom: 1px solid #ddd;
            }
            th {
                background-color: #4CAF50;
                color: white;
            }
            .score-high {
                color: green;
                font-weight: bold;
            }
            .score-medium {
                color: orange;
            }
            .score-low {
                color: red;
                font-weight: bold;
            }
            .worst-cases {
                background: #fff3cd;
                padding: 15px;
                margin: 15px 0;
                border-radius: 5px;
                border-left: 4px solid #ffc107;
            }
        </style>
    </head>
    <body>
        <h1>RAG Benchmark Report</h1>
        <p>Data: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """</p>
    """

    for config_name, results in all_results.items():
        if not results:
            continue

        report = evaluator.generate_report(results)

        # escape() prevents config names from breaking the markup (or
        # injecting script) if they ever contain <, > or &.
        html += f"""
        <div class="config">
            <h2>{escape(config_name)}</h2>
            <p>Total de casos: {report['total_cases']}</p>
            <h3>Scores Medios</h3>
            <table>
                <tr>
                    <th>Metrica</th>
                    <th>Media</th>
                    <th>Min</th>
                    <th>Max</th>
                </tr>
        """

        for metric in ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall', 'overall']:
            avg = report['average_scores'].get(metric, 0)
            # The report carries no min/max for the aggregate 'overall' score.
            min_val = report['min_scores'].get(metric, 0) if metric != 'overall' else 0
            max_val = report['max_scores'].get(metric, 0) if metric != 'overall' else 0

            # Traffic-light thresholds: >=0.7 good, >=0.5 borderline, else bad.
            score_class = 'score-high' if avg >= 0.7 else ('score-medium' if avg >= 0.5 else 'score-low')

            html += f"""
                <tr>
                    <td>{metric.replace('_', ' ').title()}</td>
                    <td class="{score_class}">{avg:.3f}</td>
                    <td>{min_val:.3f}</td>
                    <td>{max_val:.3f}</td>
                </tr>
            """

        html += """
            </table>
        """

        # Worst-scoring cases, surfaced for manual analysis.
        if 'worst_cases' in report:
            html += """
            <div class="worst-cases">
                <h3>Top 5 Piores Casos (para analise)</h3>
                <ul>
            """
            for case in report['worst_cases']:
                # Queries are user-supplied text: escape before embedding
                # them in the HTML.
                html += f"""
                    <li>
                        <strong>Query:</strong> {escape(case['query'])}<br>
                        <strong>Score:</strong> {case['score']:.3f}
                    </li>
                """
            html += """
                </ul>
            </div>
            """

        html += """
        </div>
        """

    html += """
    </body>
    </html>
    """

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html)

    print(f"\nRelatorio HTML gerado: {output_path}")
def main():
    """Entry point: load the dataset, benchmark every configuration, report."""
    print("RAG Benchmark")
    print("="*60)

    project_root = Path(__file__).parent.parent

    # Load the test dataset.
    dataset_path = project_root / "data" / "evaluation" / "test_dataset.json"
    if not dataset_path.exists():
        print(f"Erro: Dataset nao encontrado em {dataset_path}")
        return

    test_cases = load_test_dataset(str(dataset_path))
    print(f"Carregados {len(test_cases)} casos de teste\n")

    def cfg(name, top_k, use_reranking, use_hybrid):
        # Small builder keeps the configuration table below compact.
        return {
            'name': name,
            'top_k': top_k,
            'use_reranking': use_reranking,
            'use_hybrid': use_hybrid,
        }

    # Configurations to benchmark.
    configs = [
        cfg('Baseline (top_k=5)', 5, False, False),
        cfg('Top_k=10', 10, False, False),
        cfg('Com Reranking', 10, True, False),
        cfg('Com Hybrid Search', 5, False, True),
        cfg('Tudo Ativado', 10, True, True),
    ]

    # Run the benchmark across all configurations.
    all_results = benchmark_configurations(test_cases, configs)

    # Write the HTML report at the project root.
    output_path = project_root / "benchmark_report.html"
    generate_html_report(all_results, str(output_path))

    # Console summary per configuration.
    print("\n" + "="*60)
    print("RESUMO")
    print("="*60)

    evaluator = RAGEvaluator(use_ragas=False)
    for config_name, results in all_results.items():
        if not results:
            continue
        report = evaluator.generate_report(results)
        scores = report['average_scores']
        print(f"\n{config_name}:")
        print(f" Overall Score: {scores['overall']:.3f}")
        print(f" Faithfulness: {scores['faithfulness']:.3f}")
        print(f" Answer Relevancy: {scores['answer_relevancy']:.3f}")


if __name__ == "__main__":
    main()