Spaces:

RamsesCamas
/

DocAgentSystem

Sleeping

App Files Files Community

DocAgentSystem / agents /comparison.py

RamsesCamas

Initial clean commit for HF Space deployment

d0d2f42 17 days ago

raw

history blame contribute delete

16.8 kB

	"""
	Comparación: Pipeline vs Agente Básico vs Agente ReAct.
	Ejecutar: python -m agents.comparison
	"""

	import logging
	import os
	import re
	import time

	from dotenv import load_dotenv
	from openai import OpenAI

	from agents.basic_agent import BasicAgent
	from agents.react_agent import ReactAgent
	from agents.tools import search_docs

	load_dotenv()

	logging.basicConfig(level=logging.WARNING)

	# ── Colores ANSI ──────────────────────────────────────────────

	BOLD = "\033[1m"
	DIM = "\033[2m"
	CYAN = "\033[36m"
	BLUE = "\033[34m" # Thought / Razonamiento
	GREEN = "\033[32m" # Respuesta
	YELLOW = "\033[33m" # Action
	MAGENTA = "\033[35m" # Observation
	RED = "\033[31m"
	RESET = "\033[0m"

	# ── Pipeline RAG estático ─────────────────────────────────────

	_pipeline_client = OpenAI(
	base_url="https://api.groq.com/openai/v1",
	api_key=os.getenv("GROQ_API_KEY"),
	timeout=30.0,
	)


	def run_pipeline(query: str) -> dict:
	"""Pipeline RAG estático: retrieve → generate. Sin loops, sin decisiones."""
	context = search_docs(query)

	try:
	response = _pipeline_client.chat.completions.create(
	model="openai/gpt-oss-120b",
	messages=[
	{
	"role": "system",
	"content": (
	"Responde la pregunta basándote SOLO en el contexto proporcionado. "
	"Si el contexto no contiene la respuesta, indica que no tienes "
	"información suficiente. Responde en español."
	),
	},
	{
	"role": "user",
	"content": f"Contexto:\n{context}\n\nPregunta: {query}",
	},
	],
	temperature=0,
	max_tokens=1024,
	)
	answer = response.choices[0].message.content.strip()
	except Exception as e:
	answer = f"Error: {e}"

	return {"answer": answer, "steps": 1, "approach": "pipeline"}


	# ── Preguntas de prueba ───────────────────────────────────────

	QUESTIONS = [
	# EASY — respuesta directa
	{
	"question": "¿Cuántos días de vacaciones tienen los empleados en su primer año?",
	"category": "FÁCIL",
	"keywords": ["12", "días", "primer"],
	},
	{
	"question": "¿Cuál es el horario de soporte técnico?",
	"category": "FÁCIL",
	"keywords": ["horario", "soporte", "lunes"],
	},
	# MEDIUM — requiere buscar en el documento correcto
	{
	"question": "¿Qué documentos necesito entregar en mi primer día de trabajo?",
	"category": "MEDIA",
	"keywords": ["primer día", "onboarding", "recepción"],
	},
	{
	"question": "¿Cómo reporto un problema técnico urgente?",
	"category": "MEDIA",
	"keywords": ["P1", "crítico", "soporte", "NovaHub", "ticket"],
	},
	# HARD — combinar info o reformular
	{
	"question": "Si soy nuevo empleado y mi laptop no funciona el primer día, ¿qué debo hacer?",
	"category": "DIFÍCIL",
	"keywords": ["soporte", "equipo", "TI", "ticket"],
	},
	{
	"question": "¿Puedo tomar vacaciones durante mi periodo de prueba?",
	"category": "DIFÍCIL",
	"keywords": ["prueba", "vacaciones", "90", "días"],
	},
	# UNANSWERABLE — no está en los documentos
	{
	"question": "¿Cuál es el salario promedio de los ingenieros?",
	"category": "SIN RESPUESTA",
	"keywords": [],
	},
	{
	"question": "¿La empresa tiene oficina en Barcelona?",
	"category": "SIN RESPUESTA",
	"keywords": [],
	},
	]


	# ── Evaluación heurística ─────────────────────────────────────


	def _check_answer(answer: str \| None, keywords: list[str]) -> bool:
	"""Evalúa si la respuesta contiene las keywords esperadas."""
	if not answer:
	return False
	if not keywords:
	# Para preguntas sin respuesta, aceptar si dice que no tiene info
	lower = answer.lower()
	return any(
	phrase in lower
	for phrase in [
	"no ",
	"no se encontr",
	"no tengo",
	"no cuento",
	"no hay información",
	"no dispongo",
	]
	)
	lower = answer.lower()
	return any(kw.lower() in lower for kw in keywords)


	def _has_source(answer: str \| None) -> bool:
	"""Verifica si la respuesta menciona un documento fuente."""
	if not answer:
	return False
	sources = [
	"manual_onboarding",
	"politica_vacaciones",
	"proceso_soporte",
	"onboarding",
	"política",
	"soporte técnico",
	]
	lower = answer.lower()
	return any(s.lower() in lower for s in sources)


	def _count_reformulations(steps: list[dict]) -> int:
	"""Cuenta cuántas veces se reformuló la búsqueda."""
	search_actions = []
	for s in steps:
	action = s.get("action", "")
	if "search_docs" in action:
	search_actions.append(action)
	return max(0, len(search_actions) - 1)


	# ── Formato de salida ─────────────────────────────────────────


	def _print_header(question: str, category: str):
	width = 64
	print()
	print(f"{BOLD}{CYAN}╔{'═' * width}╗{RESET}")
	print(f"{BOLD}{CYAN}║ [{category}] {question[:width - len(category) - 6]}{RESET}")
	print(f"{BOLD}{CYAN}╚{'═' * width}╝{RESET}")


	def _print_pipeline(result: dict, elapsed: float):
	answer = result["answer"]
	print(f"\n{CYAN}┌─ PIPELINE ────────────────────────────────────────────────────┐{RESET}")
	print(f"{CYAN}│{RESET} {GREEN}{BOLD}Respuesta:{RESET}")
	for line in _wrap(answer, 58):
	print(f"{CYAN}│{RESET} {GREEN}{line}{RESET}")
	print(f"{CYAN}│{RESET} {DIM}Pasos: {result['steps']} \| Tiempo: {elapsed:.1f}s{RESET}")
	print(f"{CYAN}└──────────────────────────────────────────────────────────────┘{RESET}")


	def _print_basic(result: dict, elapsed: float):
	print(f"\n{CYAN}┌─ AGENTE BÁSICO (Act-Only) ────────────────────────────────────┐{RESET}")
	for s in result["steps"]:
	action = s["action"][:65]
	print(f"{CYAN}│{RESET} {YELLOW}Action {s['step']}:{RESET} {YELLOW}{action}{RESET}")
	if s.get("observation") and s["action"] != s.get("observation", ""):
	obs = s["observation"][:62]
	print(f"{CYAN}│{RESET} {MAGENTA}Obs {s['step']}:{RESET} {MAGENTA}{obs}{RESET}")
	answer = result.get("answer") or "(sin respuesta)"
	print(f"{CYAN}│{RESET}")
	print(f"{CYAN}│{RESET} {GREEN}{BOLD}Respuesta:{RESET}")
	for line in _wrap(answer, 56):
	print(f"{CYAN}│{RESET} {GREEN}{line}{RESET}")
	print(
	f"{CYAN}│{RESET} {DIM}Pasos: {result['total_steps']} \| Tiempo: {elapsed:.1f}s{RESET}"
	)
	print(f"{CYAN}└──────────────────────────────────────────────────────────────┘{RESET}")


	def _print_react(result: dict, elapsed: float):
	print(f"\n{CYAN}┌─ AGENTE REACT ─────────────────────────────────────────────────┐{RESET}")
	for s in result["steps"]:
	thought = s.get("thought", "")[:62]
	action = s["action"][:65]
	print(f"{CYAN}│{RESET} {BLUE}Thought {s['step']}:{RESET} {BLUE}{thought}{RESET}")
	print(f"{CYAN}│{RESET} {YELLOW}Action {s['step']}:{RESET} {YELLOW}{action}{RESET}")
	if s.get("observation") and s["action"] != s.get("observation", ""):
	obs = s["observation"][:62]
	print(f"{CYAN}│{RESET} {MAGENTA}Obs {s['step']}:{RESET} {MAGENTA}{obs}{RESET}")
	answer = result.get("answer") or "(sin respuesta)"
	print(f"{CYAN}│{RESET}")
	print(f"{CYAN}│{RESET} {GREEN}{BOLD}Respuesta:{RESET}")
	for line in _wrap(answer, 56):
	print(f"{CYAN}│{RESET} {GREEN}{line}{RESET}")
	print(
	f"{CYAN}│{RESET} {DIM}Pasos: {result['total_steps']} \| Tiempo: {elapsed:.1f}s{RESET}"
	)
	print(f"{CYAN}└──────────────────────────────────────────────────────────────┘{RESET}")


	def _wrap(text: str, width: int) -> list[str]:
	"""Simple line wrapper."""
	words = text.split()
	lines: list[str] = []
	current = ""
	for word in words:
	if len(current) + len(word) + 1 > width:
	lines.append(current)
	current = word
	else:
	current = f"{current} {word}" if current else word
	if current:
	lines.append(current)
	return lines or [""]


	# ── Tabla resumen ─────────────────────────────────────────────


	def _print_summary(metrics: dict):
	"""Imprime la tabla resumen de la comparación."""
	p = metrics["pipeline"]
	b = metrics["basic"]
	r = metrics["react"]
	total = metrics["total"]

	def _winner(p_val, b_val, r_val, lower_is_better=True):
	vals = {"Pipeline": p_val, "Básico": b_val, "ReAct": r_val}
	if lower_is_better:
	return min(vals, key=vals.get)
	return max(vals, key=vals.get)

	rows = [
	(
	"Promedio pasos",
	f"{p['total_steps'] / total:.1f}",
	f"{b['total_steps'] / total:.1f}",
	f"{r['total_steps'] / total:.1f}",
	_winner(
	p["total_steps"] / total,
	b["total_steps"] / total,
	r["total_steps"] / total,
	),
	),
	(
	"Promedio tiempo",
	f"{p['total_time'] / total:.1f}s",
	f"{b['total_time'] / total:.1f}s",
	f"{r['total_time'] / total:.1f}s",
	_winner(p["total_time"], b["total_time"], r["total_time"]),
	),
	(
	"Resp. correctas",
	f"{p['correct']}/{total}",
	f"{b['correct']}/{total}",
	f"{r['correct']}/{total}",
	_winner(p["correct"], b["correct"], r["correct"], lower_is_better=False),
	),
	(
	"Resp. con fuente",
	f"{p['with_source']}/{total}",
	f"{b['with_source']}/{total}",
	f"{r['with_source']}/{total}",
	_winner(
	p["with_source"],
	b["with_source"],
	r["with_source"],
	lower_is_better=False,
	),
	),
	(
	"Trazabilidad",
	"Ninguna",
	"Parcial",
	"Completa",
	"ReAct",
	),
	(
	"Reformulaciones",
	"0",
	str(b["reformulations"]),
	str(r["reformulations"]),
	_winner(
	0,
	b["reformulations"],
	r["reformulations"],
	lower_is_better=False,
	),
	),
	]

	print(f"\n\n{BOLD}{'═' * 80}{RESET}")
	print(f"{BOLD} TABLA RESUMEN{RESET}")
	print(f"{BOLD}{'═' * 80}{RESET}")

	hdr = f"{'Métrica':<20} {'Pipeline':>10} {'Ag. Básico':>12} {'Ag. ReAct':>12} {'Ganador':>14}"
	print(f"\n{BOLD}{hdr}{RESET}")
	print("─" * 72)

	for metric, pv, bv, rv, winner in rows:
	print(f"{metric:<20} {pv:>10} {bv:>12} {rv:>12} {BOLD}{winner:>14}{RESET}")

	print("─" * 72)


	# ── Análisis ──────────────────────────────────────────────────


	def _print_analysis():
	print(f"\n{BOLD}ANÁLISIS DE RESULTADOS{RESET}")
	print("=" * 60)
	print(
	f"""
	{GREEN}1. PIPELINE:{RESET} Rápido y predecible, pero sin capacidad de
	adaptación. Si el retrieval falla, la respuesta falla.
	No hay forma de reformular o buscar alternativas.

	{YELLOW}2. AGENTE BÁSICO ({YELLOW}Action{RESET} only):{RESET} Puede usar herramientas
	iterativamente, pero sin razonamiento explícito tiende
	a repetir búsquedas similares sin reformular. No puede
	explicar POR QUÉ tomó cada decisión.

	{BLUE}3. AGENTE REACT{RESET} ({BLUE}Thought{RESET} → {YELLOW}Action{RESET} → {MAGENTA}Obs{RESET}):
	Más lento pero significativamente más preciso. Puede
	reformular búsquedas fallidas, combinar información de
	múltiples fuentes, y cada decisión es trazable a un
	{BLUE}pensamiento explícito{RESET}. Ideal para tareas complejas
	donde la precisión importa más que la velocidad.

	{DIM}Leyenda de colores:{RESET}
	{BLUE}Azul{RESET} = Razonamiento (Thought)
	{YELLOW}Amarillo{RESET} = Acción (Action)
	{MAGENTA}Rosa{RESET} = Observación (Observation)
	{GREEN}Verde{RESET} = Respuesta final
	"""
	)


	# ── Main ──────────────────────────────────────────────────────


	def main():
	print(f"\n{BOLD}{CYAN}{'═' * 70}{RESET}")
	print(f"{BOLD}{CYAN} COMPARACIÓN: Pipeline vs Agente Básico vs Agente ReAct{RESET}")
	print(f"{BOLD}{CYAN}{'═' * 70}{RESET}")
	print(f"{DIM} Modelo: gpt-oss-120b \| Docs: data/*.txt \| VectorDB: ChromaDB{RESET}")

	basic_agent = BasicAgent()
	react_agent = ReactAgent()

	metrics = {
	"pipeline": {
	"total_steps": 0,
	"total_time": 0.0,
	"correct": 0,
	"with_source": 0,
	"reformulations": 0,
	},
	"basic": {
	"total_steps": 0,
	"total_time": 0.0,
	"correct": 0,
	"with_source": 0,
	"reformulations": 0,
	},
	"react": {
	"total_steps": 0,
	"total_time": 0.0,
	"correct": 0,
	"with_source": 0,
	"reformulations": 0,
	},
	"total": len(QUESTIONS),
	}

	for q in QUESTIONS:
	question = q["question"]
	keywords = q["keywords"]
	category = q["category"]

	_print_header(question, category)

	# ── Pipeline ──
	t0 = time.time()
	p_result = run_pipeline(question)
	p_time = time.time() - t0
	_print_pipeline(p_result, p_time)

	metrics["pipeline"]["total_steps"] += p_result["steps"]
	metrics["pipeline"]["total_time"] += p_time
	if _check_answer(p_result["answer"], keywords):
	metrics["pipeline"]["correct"] += 1
	if _has_source(p_result["answer"]):
	metrics["pipeline"]["with_source"] += 1

	# ── Agente Básico ──
	t0 = time.time()
	b_result = basic_agent.run(question, verbose=False)
	b_time = time.time() - t0
	_print_basic(b_result, b_time)

	metrics["basic"]["total_steps"] += b_result["total_steps"]
	metrics["basic"]["total_time"] += b_time
	if _check_answer(b_result.get("answer"), keywords):
	metrics["basic"]["correct"] += 1
	if _has_source(b_result.get("answer")):
	metrics["basic"]["with_source"] += 1
	metrics["basic"]["reformulations"] += _count_reformulations(b_result["steps"])

	# ── Agente ReAct ──
	t0 = time.time()
	r_result = react_agent.run(question, verbose=False)
	r_time = time.time() - t0
	_print_react(r_result, r_time)

	metrics["react"]["total_steps"] += r_result["total_steps"]
	metrics["react"]["total_time"] += r_time
	if _check_answer(r_result.get("answer"), keywords):
	metrics["react"]["correct"] += 1
	if _has_source(r_result.get("answer")):
	metrics["react"]["with_source"] += 1
	metrics["react"]["reformulations"] += _count_reformulations(r_result["steps"])

	_print_summary(metrics)
	_print_analysis()


	if __name__ == "__main__":
	main()