# DocAgentSystem/agents/comparison.py
# RamsesCamas: Initial clean commit for HF Space deployment (d0d2f42)
"""
Comparación: Pipeline vs Agente Básico vs Agente ReAct.
Ejecutar: python -m agents.comparison
"""
import logging
import os
import re
import time
from dotenv import load_dotenv
from openai import OpenAI
from agents.basic_agent import BasicAgent
from agents.react_agent import ReactAgent
from agents.tools import search_docs
# Run load_dotenv() at import time, before the module-level os.getenv("GROQ_API_KEY")
# lookup performed when the pipeline client is built further down.
load_dotenv()
# Configure the root logger at WARNING level.
logging.basicConfig(level=logging.WARNING)
# ── Colores ANSI ──────────────────────────────────────────────
# Every escape sequence below shares the same ESC-[ prefix.
_CSI = "\033["
BOLD = f"{_CSI}1m"
DIM = f"{_CSI}2m"
CYAN = f"{_CSI}36m"
BLUE = f"{_CSI}34m"  # Thought / reasoning
GREEN = f"{_CSI}32m"  # Final answer
YELLOW = f"{_CSI}33m"  # Action
MAGENTA = f"{_CSI}35m"  # Observation
RED = f"{_CSI}31m"
RESET = f"{_CSI}0m"
# ── Pipeline RAG estático ─────────────────────────────────────
# Shared client used by run_pipeline. It is created once at import time, targets
# the Groq OpenAI-compatible base URL, and takes its key from the GROQ_API_KEY
# environment variable (None if unset). timeout=30.0 is presumably in seconds,
# per the OpenAI client.
_pipeline_client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.getenv("GROQ_API_KEY"),
    timeout=30.0,
)
def run_pipeline(query: str) -> dict:
    """Answer *query* with a static retrieve-then-generate RAG pass.

    Whatever ``search_docs`` returns for the query is passed to the model as
    context in a single chat completion: no loop, no tool selection.

    Args:
        query: The user's question.

    Returns:
        A dict with three keys: ``answer`` (the stripped model reply, or
        ``"Error: <exception>"`` if anything in the completion step raised),
        ``steps`` (always ``1``) and ``approach`` (always ``"pipeline"``).
    """
    context = search_docs(query)

    system_prompt = (
        "Responde la pregunta basándote SOLO en el contexto proporcionado. "
        "Si el contexto no contiene la respuesta, indica que no tienes "
        "información suficiente. Responde en español."
    )

    try:
        completion = _pipeline_client.chat.completions.create(
            model="openai/gpt-oss-120b",
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": f"Contexto:\n{context}\n\nPregunta: {query}",
                },
            ],
            temperature=0,
            max_tokens=1024,
        )
        # Kept inside the try so a missing/None content is reported as an
        # "Error: ..." answer instead of propagating.
        reply = completion.choices[0].message.content.strip()
    except Exception as exc:
        reply = f"Error: {exc}"

    return {"answer": reply, "steps": 1, "approach": "pipeline"}
# ── Preguntas de prueba ───────────────────────────────────────
# Benchmark questions as (category, question, expected keywords) rows.
# An empty keyword list marks a question the documents cannot answer.
_QUESTION_ROWS = (
    # EASY: direct answer
    (
        "FÁCIL",
        "¿Cuántos días de vacaciones tienen los empleados en su primer año?",
        ("12", "días", "primer"),
    ),
    (
        "FÁCIL",
        "¿Cuál es el horario de soporte técnico?",
        ("horario", "soporte", "lunes"),
    ),
    # MEDIUM: requires searching the right document
    (
        "MEDIA",
        "¿Qué documentos necesito entregar en mi primer día de trabajo?",
        ("primer día", "onboarding", "recepción"),
    ),
    (
        "MEDIA",
        "¿Cómo reporto un problema técnico urgente?",
        ("P1", "crítico", "soporte", "NovaHub", "ticket"),
    ),
    # HARD: combine information or reformulate
    (
        "DIFÍCIL",
        "Si soy nuevo empleado y mi laptop no funciona el primer día, ¿qué debo hacer?",
        ("soporte", "equipo", "TI", "ticket"),
    ),
    (
        "DIFÍCIL",
        "¿Puedo tomar vacaciones durante mi periodo de prueba?",
        ("prueba", "vacaciones", "90", "días"),
    ),
    # UNANSWERABLE: not present in the documents
    (
        "SIN RESPUESTA",
        "¿Cuál es el salario promedio de los ingenieros?",
        (),
    ),
    (
        "SIN RESPUESTA",
        "¿La empresa tiene oficina en Barcelona?",
        (),
    ),
)

QUESTIONS = [
    {"question": text, "category": level, "keywords": list(expected)}
    for level, text, expected in _QUESTION_ROWS
]
# ── Evaluación heurística ─────────────────────────────────────
def _check_answer(answer: str | None, keywords: list[str]) -> bool:
"""Evalúa si la respuesta contiene las keywords esperadas."""
if not answer:
return False
if not keywords:
# Para preguntas sin respuesta, aceptar si dice que no tiene info
lower = answer.lower()
return any(
phrase in lower
for phrase in [
"no ",
"no se encontr",
"no tengo",
"no cuento",
"no hay información",
"no dispongo",
]
)
lower = answer.lower()
return any(kw.lower() in lower for kw in keywords)
def _has_source(answer: str | None) -> bool:
"""Verifica si la respuesta menciona un documento fuente."""
if not answer:
return False
sources = [
"manual_onboarding",
"politica_vacaciones",
"proceso_soporte",
"onboarding",
"política",
"soporte técnico",
]
lower = answer.lower()
return any(s.lower() in lower for s in sources)
def _count_reformulations(steps: list[dict]) -> int:
"""Cuenta cuántas veces se reformuló la búsqueda."""
search_actions = []
for s in steps:
action = s.get("action", "")
if "search_docs" in action:
search_actions.append(action)
return max(0, len(search_actions) - 1)
# ── Formato de salida ─────────────────────────────────────────
def _print_header(question: str, category: str):
    """Print a blank line followed by a three-line banner for one question.

    The middle line shows ``[category]`` and the question, with the question
    cut to ``64 - len(category) - 6`` characters.
    """
    width = 64
    rule = f"{BOLD}{CYAN}{'═' * width}{RESET}"

    print()
    print(rule)
    title = question[: width - len(category) - 6]
    print(f"{BOLD}{CYAN}║ [{category}] {title}{RESET}")
    print(rule)
def _print_pipeline(result: dict, elapsed: float):
    """Print the pipeline's boxed result: wrapped answer, step count and time.

    Args:
        result: Dict with at least ``answer`` and ``steps``.
        elapsed: Elapsed time for the run, shown with one decimal.
    """
    margin = f"{CYAN}{RESET}"
    answer = result["answer"]

    print(f"\n{CYAN}┌─ PIPELINE ────────────────────────────────────────────────────┐{RESET}")
    print(f"{margin} {GREEN}{BOLD}Respuesta:{RESET}")
    for chunk in _wrap(answer, 58):
        print(f"{margin} {GREEN}{chunk}{RESET}")
    print(f"{margin} {DIM}Pasos: {result['steps']} | Tiempo: {elapsed:.1f}s{RESET}")
    print(f"{CYAN}└──────────────────────────────────────────────────────────────┘{RESET}")
def _print_basic(result: dict, elapsed: float):
    """Print the act-only agent's boxed trace and final answer.

    For each step the action is shown (first 65 characters). The observation
    (first 62 characters) is shown only when it is non-empty and differs from
    the full action text. A missing/empty answer prints as "(sin respuesta)".

    Args:
        result: Dict with ``steps`` (list of step dicts carrying ``step``,
            ``action`` and optionally ``observation``), ``total_steps`` and
            optionally ``answer``.
        elapsed: Elapsed time for the run, shown with one decimal.
    """
    margin = f"{CYAN}{RESET}"

    print(f"\n{CYAN}┌─ AGENTE BÁSICO (Act-Only) ────────────────────────────────────┐{RESET}")
    for step in result["steps"]:
        shown_action = step["action"][:65]
        number = step["step"]
        print(f"{margin} {YELLOW}Action {number}:{RESET} {YELLOW}{shown_action}{RESET}")

        observation = step.get("observation")
        if not observation or step["action"] == observation:
            continue
        print(f"{margin} {MAGENTA}Obs {number}:{RESET} {MAGENTA}{observation[:62]}{RESET}")

    answer = result.get("answer") or "(sin respuesta)"
    print(margin)
    print(f"{margin} {GREEN}{BOLD}Respuesta:{RESET}")
    for chunk in _wrap(answer, 56):
        print(f"{margin} {GREEN}{chunk}{RESET}")
    print(f"{margin} {DIM}Pasos: {result['total_steps']} | Tiempo: {elapsed:.1f}s{RESET}")
    print(f"{CYAN}└──────────────────────────────────────────────────────────────┘{RESET}")
def _print_react(result: dict, elapsed: float):
    """Print the ReAct agent's boxed thought/action/observation trace and answer.

    For each step the thought (first 62 characters, empty if absent) and the
    action (first 65 characters) are shown. The observation (first 62
    characters) is shown only when it is non-empty and differs from the full
    action text. A missing/empty answer prints as "(sin respuesta)".

    Args:
        result: Dict with ``steps`` (list of step dicts carrying ``step``,
            ``action`` and optionally ``thought`` / ``observation``),
            ``total_steps`` and optionally ``answer``.
        elapsed: Elapsed time for the run, shown with one decimal.
    """
    margin = f"{CYAN}{RESET}"

    print(f"\n{CYAN}┌─ AGENTE REACT ─────────────────────────────────────────────────┐{RESET}")
    for step in result["steps"]:
        thought = step.get("thought", "")[:62]
        shown_action = step["action"][:65]
        number = step["step"]
        print(f"{margin} {BLUE}Thought {number}:{RESET} {BLUE}{thought}{RESET}")
        print(f"{margin} {YELLOW}Action {number}:{RESET} {YELLOW}{shown_action}{RESET}")

        observation = step.get("observation")
        if not observation or step["action"] == observation:
            continue
        print(f"{margin} {MAGENTA}Obs {number}:{RESET} {MAGENTA}{observation[:62]}{RESET}")

    answer = result.get("answer") or "(sin respuesta)"
    print(margin)
    print(f"{margin} {GREEN}{BOLD}Respuesta:{RESET}")
    for chunk in _wrap(answer, 56):
        print(f"{margin} {GREEN}{chunk}{RESET}")
    print(f"{margin} {DIM}Pasos: {result['total_steps']} | Tiempo: {elapsed:.1f}s{RESET}")
    print(f"{CYAN}└──────────────────────────────────────────────────────────────┘{RESET}")
def _wrap(text: str, width: int) -> list[str]:
"""Simple line wrapper."""
words = text.split()
lines: list[str] = []
current = ""
for word in words:
if len(current) + len(word) + 1 > width:
lines.append(current)
current = word
else:
current = f"{current} {word}" if current else word
if current:
lines.append(current)
return lines or [""]
# ── Tabla resumen ─────────────────────────────────────────────
def _print_summary(metrics: dict):
    """Print the summary table comparing the three approaches.

    Args:
        metrics: Dict with ``pipeline``, ``basic`` and ``react`` entries (each
            holding ``total_steps``, ``total_time``, ``correct``,
            ``with_source`` and ``reformulations``) plus ``total``, the number
            of questions evaluated.

    The last column names the best approach per metric. Ties are reported
    explicitly: "Empate" when all three tie, otherwise the tied names joined
    with "/".
    """
    p = metrics["pipeline"]
    b = metrics["basic"]
    r = metrics["react"]
    total = metrics["total"]
    # With zero questions every accumulated sum is 0, so dividing by 1 shows
    # 0.0 averages instead of raising ZeroDivisionError.
    divisor = total or 1

    def _winner(p_val, b_val, r_val, lower_is_better=True):
        """Return the label of the best value(s) among the three approaches."""
        vals = {"Pipeline": p_val, "Básico": b_val, "ReAct": r_val}
        best = min(vals.values()) if lower_is_better else max(vals.values())
        # min()/max() over the dict would silently hand every tie to
        # "Pipeline" (first key), so collect all leaders instead.
        leaders = [name for name, val in vals.items() if val == best]
        if len(leaders) == len(vals):
            return "Empate"
        return "/".join(leaders)

    avg_steps = [m["total_steps"] / divisor for m in (p, b, r)]
    avg_time = [m["total_time"] / divisor for m in (p, b, r)]

    rows = [
        (
            "Promedio pasos",
            f"{avg_steps[0]:.1f}",
            f"{avg_steps[1]:.1f}",
            f"{avg_steps[2]:.1f}",
            _winner(*avg_steps),
        ),
        (
            "Promedio tiempo",
            f"{avg_time[0]:.1f}s",
            f"{avg_time[1]:.1f}s",
            f"{avg_time[2]:.1f}s",
            _winner(*avg_time),
        ),
        (
            "Resp. correctas",
            f"{p['correct']}/{total}",
            f"{b['correct']}/{total}",
            f"{r['correct']}/{total}",
            _winner(p["correct"], b["correct"], r["correct"], lower_is_better=False),
        ),
        (
            "Resp. con fuente",
            f"{p['with_source']}/{total}",
            f"{b['with_source']}/{total}",
            f"{r['with_source']}/{total}",
            _winner(
                p["with_source"],
                b["with_source"],
                r["with_source"],
                lower_is_better=False,
            ),
        ),
        # Qualitative row: fixed labels, not derived from metrics.
        (
            "Trazabilidad",
            "Ninguna",
            "Parcial",
            "Completa",
            "ReAct",
        ),
        (
            "Reformulaciones",
            # The pipeline column is fixed at 0 here.
            "0",
            str(b["reformulations"]),
            str(r["reformulations"]),
            _winner(
                0,
                b["reformulations"],
                r["reformulations"],
                lower_is_better=False,
            ),
        ),
    ]

    print(f"\n\n{BOLD}{'═' * 80}{RESET}")
    print(f"{BOLD} TABLA RESUMEN{RESET}")
    print(f"{BOLD}{'═' * 80}{RESET}")
    hdr = f"{'Métrica':<20} {'Pipeline':>10} {'Ag. Básico':>12} {'Ag. ReAct':>12} {'Ganador':>14}"
    print(f"\n{BOLD}{hdr}{RESET}")
    print("─" * 72)
    for metric, pv, bv, rv, winner in rows:
        print(f"{metric:<20} {pv:>10} {bv:>12} {rv:>12} {BOLD}{winner:>14}{RESET}")
    print("─" * 72)
# ── Análisis ──────────────────────────────────────────────────
def _print_analysis():
    """Print the fixed, color-coded analysis text and the color legend."""
    heading = f"\n{BOLD}ANÁLISIS DE RESULTADOS{RESET}"
    divider = "=" * 60
    # The report lines start at column 0 on purpose: the literal's leading
    # whitespace is part of what gets printed.
    report = f"""
{GREEN}1. PIPELINE:{RESET} Rápido y predecible, pero sin capacidad de
adaptación. Si el retrieval falla, la respuesta falla.
No hay forma de reformular o buscar alternativas.
{YELLOW}2. AGENTE BÁSICO ({YELLOW}Action{RESET} only):{RESET} Puede usar herramientas
iterativamente, pero sin razonamiento explícito tiende
a repetir búsquedas similares sin reformular. No puede
explicar POR QUÉ tomó cada decisión.
{BLUE}3. AGENTE REACT{RESET} ({BLUE}Thought{RESET}{YELLOW}Action{RESET}{MAGENTA}Obs{RESET}):
Más lento pero significativamente más preciso. Puede
reformular búsquedas fallidas, combinar información de
múltiples fuentes, y cada decisión es trazable a un
{BLUE}pensamiento explícito{RESET}. Ideal para tareas complejas
donde la precisión importa más que la velocidad.
{DIM}Leyenda de colores:{RESET}
{BLUE}Azul{RESET} = Razonamiento (Thought)
{YELLOW}Amarillo{RESET} = Acción (Action)
{MAGENTA}Rosa{RESET} = Observación (Observation)
{GREEN}Verde{RESET} = Respuesta final
"""
    print(heading)
    print(divider)
    print(report)
# ── Main ──────────────────────────────────────────────────────
def main():
    """Run every benchmark question through the three approaches and report.

    For each entry in ``QUESTIONS`` this runs the static pipeline, the basic
    agent and the ReAct agent in that order. It prints each result, accumulates
    per-approach metrics (steps, elapsed time, heuristic correctness, source
    mentions, search reformulations), then prints the summary table and the
    analysis text.
    """
    banner = f"{BOLD}{CYAN}{'═' * 70}{RESET}"
    print(f"\n{banner}")
    print(f"{BOLD}{CYAN} COMPARACIÓN: Pipeline vs Agente Básico vs Agente ReAct{RESET}")
    print(banner)
    print(f"{DIM} Modelo: gpt-oss-120b | Docs: data/*.txt | VectorDB: ChromaDB{RESET}")

    basic_agent = BasicAgent()
    react_agent = ReactAgent()

    metrics = {
        approach: {
            "total_steps": 0,
            "total_time": 0.0,
            "correct": 0,
            "with_source": 0,
            "reformulations": 0,
        }
        for approach in ("pipeline", "basic", "react")
    }
    metrics["total"] = len(QUESTIONS)

    def _timed(fn, *args, **kwargs):
        """Call *fn* and return ``(result, elapsed_seconds)``."""
        # perf_counter is monotonic; time.time() can jump if the system clock
        # is adjusted mid-run and would then skew the reported durations.
        start = time.perf_counter()
        outcome = fn(*args, **kwargs)
        return outcome, time.perf_counter() - start

    def _record(approach, steps, elapsed, answer, keywords):
        """Fold one run's numbers into the accumulated metrics for *approach*."""
        bucket = metrics[approach]
        bucket["total_steps"] += steps
        bucket["total_time"] += elapsed
        if _check_answer(answer, keywords):
            bucket["correct"] += 1
        if _has_source(answer):
            bucket["with_source"] += 1

    for q in QUESTIONS:
        question = q["question"]
        keywords = q["keywords"]
        _print_header(question, q["category"])

        # Static pipeline
        p_result, p_time = _timed(run_pipeline, question)
        _print_pipeline(p_result, p_time)
        _record("pipeline", p_result["steps"], p_time, p_result["answer"], keywords)

        # Basic (act-only) agent
        b_result, b_time = _timed(basic_agent.run, question, verbose=False)
        _print_basic(b_result, b_time)
        _record("basic", b_result["total_steps"], b_time, b_result.get("answer"), keywords)
        metrics["basic"]["reformulations"] += _count_reformulations(b_result["steps"])

        # ReAct agent
        r_result, r_time = _timed(react_agent.run, question, verbose=False)
        _print_react(r_result, r_time)
        _record("react", r_result["total_steps"], r_time, r_result.get("answer"), keywords)
        metrics["react"]["reformulations"] += _count_reformulations(r_result["steps"])

    _print_summary(metrics)
    _print_analysis()
# Run the comparison only when this module is executed as a script
# (e.g. `python -m agents.comparison`), not when it is imported.
if __name__ == "__main__":
    main()