# rag_template/scripts/benchmark.py
# Author: Guilherme Favaron
# Sync: Complete project update (Phase 6) - API, Metadata, Eval, Docs
# Commit: a686b1b
"""
Script de benchmarking para avaliar diferentes configuracoes de RAG.
Uso:
python scripts/benchmark.py
"""
import sys
import json
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
import time
# Make `src` importable by adding the project root to sys.path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.evaluation import RAGEvaluator, RAGEvaluationResult
from src.config import DATABASE_URL
from src.database import DatabaseManager
from src.embeddings import EmbeddingManager
from src.generation import GenerationManager
def load_test_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """Load the evaluation test dataset from a UTF-8 encoded JSON file."""
    raw = Path(dataset_path).read_text(encoding='utf-8')
    return json.loads(raw)
def run_rag_pipeline(
    query: str,
    top_k: int = 5,
    use_reranking: bool = False,
    use_hybrid: bool = False,
    session_id: str = "benchmark"
) -> Dict[str, Any]:
    """
    Run the full RAG pipeline (embed -> retrieve -> generate) for one query.

    Args:
        query: User question to answer.
        top_k: Number of context chunks to retrieve.
        use_reranking: Placeholder flag — reranking is not implemented yet,
            so enabling it currently has no effect on the results.
        use_hybrid: Placeholder flag — hybrid search is not implemented yet;
            plain vector similarity search is used either way.
        session_id: Session scope passed to the similarity search.

    Returns:
        Dict with 'response' (generated answer string) and
        'contexts' (list of retrieved context texts).
    """
    db = DatabaseManager(DATABASE_URL)
    embedding_manager = EmbeddingManager()
    generation_manager = GenerationManager()

    # 1. Embed the query.
    query_embedding = embedding_manager.encode(query)

    # 2. Retrieve contexts.
    # TODO: implement true hybrid (vector + keyword) search. The original code
    # had an `if use_hybrid` branch that was byte-identical to the else branch;
    # until hybrid search exists, both flags fall through to vector search.
    contexts = db.search_similar(query_embedding, top_k=top_k, session_id=session_id)

    # 3. Reranking (if enabled).
    # TODO: implement reranking; the flag is accepted but currently a no-op.
    if use_reranking and contexts:
        pass

    # 4. Generate the answer from the retrieved context texts.
    context_texts = [ctx['content'] for ctx in contexts]
    response = generation_manager.generate_response(query, context_texts)

    return {
        'response': response,
        'contexts': context_texts
    }
def benchmark_configurations(
    test_cases: List[Dict[str, Any]],
    configs: List[Dict[str, Any]]
) -> Dict[str, List[RAGEvaluationResult]]:
    """
    Benchmark multiple RAG configurations over the same test cases.

    Args:
        test_cases: Test cases; each needs a 'query' key and may carry an
            optional 'ground_truth' answer.
        configs: Configurations to test; each needs a 'name' and may set
            'top_k', 'use_reranking' and 'use_hybrid'.

    Returns:
        Dict mapping configuration name to its list of evaluation results.
        Cases that raise are reported and skipped, so a result list may be
        shorter than len(test_cases).
    """
    evaluator = RAGEvaluator(use_ragas=False)  # simple metrics, no RAGAS
    all_results = {}

    for config in configs:
        config_name = config['name']
        print(f"\n{'='*60}")
        print(f"Testando configuracao: {config_name}")
        print(f"{'='*60}")

        results = []
        for i, test_case in enumerate(test_cases):
            print(f" Caso {i+1}/{len(test_cases)}: {test_case['query'][:50]}...")

            # perf_counter is monotonic, so the measured duration is immune
            # to system clock adjustments (unlike time.time()).
            start_time = time.perf_counter()

            try:
                # Run the RAG pipeline with this configuration.
                rag_result = run_rag_pipeline(
                    query=test_case['query'],
                    top_k=config.get('top_k', 5),
                    use_reranking=config.get('use_reranking', False),
                    use_hybrid=config.get('use_hybrid', False)
                )

                # Score the generated answer against the retrieved contexts.
                eval_result = evaluator.evaluate_single(
                    query=test_case['query'],
                    response=rag_result['response'],
                    contexts=rag_result['contexts'],
                    ground_truth=test_case.get('ground_truth')
                )
                eval_result.response_time = time.perf_counter() - start_time
                results.append(eval_result)
            except Exception as e:
                # Best-effort benchmark: report the failure and keep going
                # with the remaining cases.
                print(f" Erro: {e}")
                continue

        all_results[config_name] = results

    return all_results
def generate_html_report(
    all_results: Dict[str, List[RAGEvaluationResult]],
    output_path: str
):
    """
    Generate an HTML benchmark report.

    Args:
        all_results: Evaluation results keyed by configuration name.
        output_path: Destination path for the generated HTML file.
    """
    # Local import: the local variable `html` below would shadow a top-level
    # `import html`, so only the escape function is pulled into scope here.
    from html import escape

    evaluator = RAGEvaluator(use_ragas=False)

    html = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>RAG Benchmark Report</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                margin: 20px;
                background-color: #f5f5f5;
            }
            h1 {
                color: #333;
            }
            .config {
                background: white;
                padding: 20px;
                margin: 20px 0;
                border-radius: 8px;
                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            }
            table {
                width: 100%;
                border-collapse: collapse;
                margin: 10px 0;
            }
            th, td {
                padding: 12px;
                text-align: left;
                border-bottom: 1px solid #ddd;
            }
            th {
                background-color: #4CAF50;
                color: white;
            }
            .score-high {
                color: green;
                font-weight: bold;
            }
            .score-medium {
                color: orange;
            }
            .score-low {
                color: red;
                font-weight: bold;
            }
            .worst-cases {
                background: #fff3cd;
                padding: 15px;
                margin: 15px 0;
                border-radius: 5px;
                border-left: 4px solid #ffc107;
            }
        </style>
    </head>
    <body>
        <h1>RAG Benchmark Report</h1>
        <p>Data: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """</p>
    """

    for config_name, results in all_results.items():
        if not results:
            continue

        report = evaluator.generate_report(results)

        # escape() prevents config names from breaking the markup (or
        # injecting script) if they ever contain <, > or &.
        html += f"""
        <div class="config">
            <h2>{escape(config_name)}</h2>
            <p>Total de casos: {report['total_cases']}</p>
            <h3>Scores Medios</h3>
            <table>
                <tr>
                    <th>Metrica</th>
                    <th>Media</th>
                    <th>Min</th>
                    <th>Max</th>
                </tr>
        """

        for metric in ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall', 'overall']:
            avg = report['average_scores'].get(metric, 0)
            # The report carries no min/max for the aggregate 'overall' score.
            min_val = report['min_scores'].get(metric, 0) if metric != 'overall' else 0
            max_val = report['max_scores'].get(metric, 0) if metric != 'overall' else 0

            # Traffic-light thresholds: >=0.7 good, >=0.5 borderline, else bad.
            score_class = 'score-high' if avg >= 0.7 else ('score-medium' if avg >= 0.5 else 'score-low')

            html += f"""
                <tr>
                    <td>{metric.replace('_', ' ').title()}</td>
                    <td class="{score_class}">{avg:.3f}</td>
                    <td>{min_val:.3f}</td>
                    <td>{max_val:.3f}</td>
                </tr>
            """

        html += """
            </table>
        """

        # Worst-scoring cases, surfaced for manual analysis.
        if 'worst_cases' in report:
            html += """
            <div class="worst-cases">
                <h3>Top 5 Piores Casos (para analise)</h3>
                <ul>
            """
            for case in report['worst_cases']:
                # Queries are user-supplied text: escape before embedding
                # them in the HTML.
                html += f"""
                    <li>
                        <strong>Query:</strong> {escape(case['query'])}<br>
                        <strong>Score:</strong> {case['score']:.3f}
                    </li>
                """
            html += """
                </ul>
            </div>
            """

        html += """
        </div>
        """

    html += """
    </body>
    </html>
    """

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html)

    print(f"\nRelatorio HTML gerado: {output_path}")
def main():
    """Entry point: load the dataset, benchmark every configuration, report."""
    print("RAG Benchmark")
    print("="*60)

    project_root = Path(__file__).parent.parent

    # Load the test dataset.
    dataset_path = project_root / "data" / "evaluation" / "test_dataset.json"
    if not dataset_path.exists():
        print(f"Erro: Dataset nao encontrado em {dataset_path}")
        return

    test_cases = load_test_dataset(str(dataset_path))
    print(f"Carregados {len(test_cases)} casos de teste\n")

    def cfg(name, top_k, use_reranking, use_hybrid):
        # Small builder keeps the configuration table below compact.
        return {
            'name': name,
            'top_k': top_k,
            'use_reranking': use_reranking,
            'use_hybrid': use_hybrid,
        }

    # Configurations to benchmark.
    configs = [
        cfg('Baseline (top_k=5)', 5, False, False),
        cfg('Top_k=10', 10, False, False),
        cfg('Com Reranking', 10, True, False),
        cfg('Com Hybrid Search', 5, False, True),
        cfg('Tudo Ativado', 10, True, True),
    ]

    # Run the benchmark across all configurations.
    all_results = benchmark_configurations(test_cases, configs)

    # Write the HTML report at the project root.
    output_path = project_root / "benchmark_report.html"
    generate_html_report(all_results, str(output_path))

    # Console summary per configuration.
    print("\n" + "="*60)
    print("RESUMO")
    print("="*60)

    evaluator = RAGEvaluator(use_ragas=False)
    for config_name, results in all_results.items():
        if not results:
            continue
        report = evaluator.generate_report(results)
        scores = report['average_scores']
        print(f"\n{config_name}:")
        print(f" Overall Score: {scores['overall']:.3f}")
        print(f" Faithfulness: {scores['faithfulness']:.3f}")
        print(f" Answer Relevancy: {scores['answer_relevancy']:.3f}")


if __name__ == "__main__":
    main()