letxinet / modules /research_tab.py
C2MV's picture
Initial upload for Build Small Hackathon
68fb5e2 verified
Raw
History Blame Contribute Delete
83.9 kB
import gradio as gr
import json
import asyncio
import os
import sys
import re
FLOATING_CARD_JS = ''
# Note: showCiteCard/closeCiteCard JS and MathJax are now globally loaded via THEME_JS in app.py
import time
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dotenv import load_dotenv
_project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
load_dotenv(os.path.join(_project_root, ".env"))
from backend.pipeline import ResearchPipeline
from backend.tools.search_engine import search
from backend.tools.graph_generator import generator as graph_generator
from modules.graph_module import generate_interactive_graph
from backend.synthesis import PROVIDERS
from backend.prompts.profiles import AGENT_PROFILES
from .utils import format_results_for_dataframe, format_error
# Model identifier constant; presumably the initial model selection
# (its use is outside this part of the file — TODO confirm).
DEFAULT_MODEL = "mistral-small-2506"
# Search groups
GROUPS = ["all", "latam", "global", "tesis", "iberoamerica", "peru", "brasil", "ecuador", "mexico", "ai_ml"]
# Individual sources
INDIVIDUAL_SOURCES = [
    "alicia", "renati", "lareferencia", "bdtd", "rraae",
    "semantic", "openalex", "pubmed", "arxiv", "crossref",
    "dblp", "scopus", "zenodo", "openaire", "doaj",
    "core", "redalyc", "serpapi"
]
# Every selectable source value: the groups first, then the individual sources.
ALL_SOURCES = GROUPS + INDIVIDUAL_SOURCES
# ─── Module-level pipeline reference for stop/pause/resume ───
# Read by _control_stop / _control_pause / _control_resume; while it is None
# (or otherwise falsy) those handlers only report the "idle" status.
_active_pipeline = None
def _control_stop():
    """Stop the active pipeline, if there is one, and return the status HTML.

    Returns the "error" status badge carrying the user-stopped message when a
    pipeline was active, otherwise the "idle" badge.
    """
    pipeline = _active_pipeline
    if not pipeline:
        return _build_status_html("idle")
    pipeline.stop()
    return _build_status_html("error", "⛔ Detenido por el usuario")
def _control_pause():
    """Pause the active pipeline, if there is one, and return the status HTML.

    Returns the "running" status badge carrying the paused hint when a
    pipeline was active, otherwise the "idle" badge.
    """
    pipeline = _active_pipeline
    if not pipeline:
        return _build_status_html("idle")
    pipeline.pause()
    return _build_status_html("running", "⏸️ Pausado — haz clic en Reanudar")
def _control_resume():
    """Resume the active pipeline, if there is one, and return the status HTML.

    Returns the "running" status badge carrying the resumed message when a
    pipeline was active, otherwise the "idle" badge.
    """
    pipeline = _active_pipeline
    if not pipeline:
        return _build_status_html("idle")
    pipeline.resume()
    return _build_status_html("running", "▶️ Reanudado")
def _build_controls_html(state="idle"):
"""Build the control buttons bar matching Next.js AgentView"""
if state == "idle":
return '''
<div style="display:flex; gap:8px; align-items:center; padding:8px 0;">
<span style="font-size:12px; color:var(--text-muted, #9ca3af);">
⏹️ Pipeline inactivo
</span>
</div>'''
if state == "paused":
return '''
<div style="
display:flex; gap:8px; align-items:center; padding:10px 16px;
background:rgba(245,158,11,0.08); border:1px solid rgba(245,158,11,0.3);
border-radius:12px; animation:pulse 2s infinite;
">
<span style="width:8px;height:8px;border-radius:50%;background:#f59e0b;box-shadow:0 0 8px rgba(245,158,11,0.5);"></span>
<span style="font-size:13px; font-weight:600; color:#f59e0b;">⏸️ Pipeline pausado</span>
<span style="font-size:11px; color:var(--text-muted, #9ca3af); margin-left:8px;">Haz clic en ▶ Reanudar para continuar</span>
</div>'''
if state == "stopped":
return '''
<div style="
display:flex; gap:8px; align-items:center; padding:10px 16px;
background:rgba(239,68,68,0.08); border:1px solid rgba(239,68,68,0.3);
border-radius:12px;
">
<span style="width:8px;height:8px;border-radius:50%;background:#ef4444;"></span>
<span style="font-size:13px; font-weight:600; color:#ef4444;">⛔ Pipeline detenido</span>
</div>'''
# running
return '''
<div style="
display:flex; gap:8px; align-items:center; padding:8px 0;
">
<span style="font-size:12px; color:var(--text-muted, #9ca3af);">
⚡ Pipeline activo — usa los botones para controlar
</span>
</div>'''
# Ordered pipeline phases read by _build_progress_html. Each entry has:
#   id    - phase identifier (negative ids are skipped when the dots are drawn)
#   label - text shown as the phase title
#   icon  - emoji shown next to the title
#   pct   - progress percentage displayed for the phase
#   color - hex colour used for the badge, the bar and the phase dot
PHASES = [
    {"id": -1, "label": "Verificación de Fuentes", "icon": "🏥", "pct": 0, "color": "#6b7280"},
    {"id": 0, "label": "Optimización de Queries", "icon": "🧠", "pct": 5, "color": "#8b5cf6"},
    {"id": 1, "label": "Búsqueda Iterativa", "icon": "🔍", "pct": 15, "color": "#3b82f6"},
    {"id": 2, "label": "Detección de Vacíos", "icon": "🔎", "pct": 35, "color": "#06b6d4"},
    {"id": 3, "label": "Búsqueda de Rescate", "icon": "🚑", "pct": 45, "color": "#f59e0b"},
    {"id": 4, "label": "Plan Maestro", "icon": "📋", "pct": 55, "color": "#10b981"},
    {"id": 5, "label": "Redacción de Secciones", "icon": "✍️", "pct": 65, "color": "#a855f7"},
    {"id": 6, "label": "Validación y Corrección", "icon": "✅", "pct": 90, "color": "#22c55e"},
    {"id": 7, "label": "Completado", "icon": "🎉", "pct": 100,"color": "#10b981"},
]
# ─── Source badge colors (matching search_tab.py) ───
# Maps a source key to a hex colour. Not referenced in the visible part of
# this module — NOTE(review): confirm it is still used further down.
SOURCE_COLORS = {
    "pubmed": "#3b82f6", "semantic_scholar": "#8b5cf6", "openalex": "#06b6d4",
    "crossref": "#f59e0b", "arxiv": "#ef4444", "doaj": "#10b981",
    "zenodo": "#6366f1", "dblp": "#ec4899", "openaire": "#14b8a6",
    "core": "#f97316", "scielo": "#22c55e", "redalyc": "#a855f7",
    "latindex": "#0ea5e9", "dialnet": "#e11d48", "la_referencia": "#84cc16",
}
# Hex colour per GRADE level code. Callers look it up with the upper-cased
# text that precedes " - " in a GRADE label and fall back to "#6b7280".
GRADE_COLORS = {
    "1A": "#10b981", "1B": "#22c55e", "2A": "#3b82f6", "2B": "#60a5fa",
    "3A": "#f59e0b", "3B": "#fbbf24", "4": "#f97316", "5": "#ef4444", "6": "#6b7280",
}
def update_models(prov_name):
    """Return a Gradio update carrying the model choices of a provider.

    Providers missing from ``PROVIDERS`` fall back to the "mistral" entry.
    The first model in the provider's list becomes the selected value.
    """
    fallback_cfg = PROVIDERS["mistral"]
    provider_cfg = PROVIDERS.get(prov_name, fallback_cfg)
    return gr.update(
        choices=provider_cfg["models"],
        value=provider_cfg["models"][0],
    )
def _build_progress_html(phase_id, extra=""):
    """Build the glassmorphic progress card for a pipeline phase.

    Args:
        phase_id: ``id`` of an entry in ``PHASES``. A value that matches no
            entry falls back to the last entry of ``PHASES``.
        extra: Optional note rendered in a box under the phase dots. It is
            inserted verbatim (not HTML-escaped).

    Returns:
        An HTML string with the phase header, the percentage badge, the
        progress bar and one dot per phase whose id is not negative.
    """
    phase = next((p for p in PHASES if p["id"] == phase_id), PHASES[-1])
    # Everything below is derived from the *resolved* phase. Using the raw
    # argument made unknown ids (e.g. the 8 that _detect_phase can return)
    # render "Fase 9 de 8" at 100% with no active dot, and crashed on None.
    current_id = phase["id"]
    pct = phase["pct"]
    label = phase["label"]
    icon = phase["icon"]
    color = phase["color"]
    phase_number = max(0, current_id + 1)
    phase_total = len(PHASES) - 1

    # Build phase dots (the negative-id pre-check phase gets no dot).
    dots = []
    for p in PHASES:
        if p["id"] < 0:
            continue
        is_done = p["pct"] <= pct and pct > 0
        is_active = p["id"] == current_id
        dot_color = p["color"] if is_done else "rgba(255,255,255,0.1)"
        dot_size = "10px" if is_active else "8px"
        glow = f"box-shadow:0 0 8px {p['color']}60;" if is_active else ""
        border = f"border:2px solid {p['color']};" if is_active else ""
        dots.append(f'''<div title="{p['icon']} {p['label']}" style="
width:{dot_size}; height:{dot_size}; border-radius:50%;
background:{dot_color}; {glow} {border}
transition:all 0.3s ease; cursor:pointer;
"></div>''')
    dots_html = "".join(dots)

    extra_html = f'''<div style="font-size:11px; color:var(--text-muted, #9ca3af); margin-top:6px;
padding:4px 10px; border-radius:6px; background:rgba(139,92,246,0.06);
border:1px solid rgba(139,92,246,0.15);
">{extra}</div>''' if extra else ""
    # The icon only pulses while the phase is strictly between 0% and 100%.
    pulse_anim = "animation:pulse 2s infinite;" if 0 < pct < 100 else ""
    return f'''
<div style="
background: var(--glass, rgba(17,24,39,0.6));
backdrop-filter: blur(16px);
border: 1px solid var(--glass-border, rgba(255,255,255,0.08));
border-radius: 14px; padding: 16px 20px;
font-family: Inter, system-ui, sans-serif;
">
<div style="display:flex; justify-content:space-between; align-items:center; margin-bottom:10px;">
<div style="display:flex; align-items:center; gap:10px;">
<span style="
font-size:20px; width:36px; height:36px; display:flex; align-items:center; justify-content:center;
background:linear-gradient(135deg, {color}20, {color}10);
border:1px solid {color}40; border-radius:10px; {pulse_anim}
">{icon}</span>
<div>
<div style="font-size:14px; font-weight:700; color:var(--text, #fff);">
{label}
</div>
<div style="font-size:11px; color:var(--text-muted, #9ca3af);">
Fase {phase_number} de {phase_total}
</div>
</div>
</div>
<span style="
font-size:13px; font-weight:700; color:{color};
padding:4px 12px; border-radius:20px;
background:{color}15; border:1px solid {color}30;
">{pct}%</span>
</div>
<div style="background:rgba(255,255,255,0.05); border-radius:8px; height:8px; overflow:hidden; margin-bottom:10px;">
<div style="width:{pct}%; height:100%; border-radius:8px;
background:linear-gradient(90deg, {color}, {color}cc);
transition:width 0.8s cubic-bezier(0.23,1,0.32,1);
box-shadow:0 0 12px {color}40;
"></div>
</div>
<div style="display:flex; gap:6px; align-items:center; justify-content:center;">
{dots_html}
</div>
{extra_html}
</div>'''
def _build_status_html(state="idle", extra=""):
"""Build a premium status indicator"""
configs = {
"idle": {"color": "#6b7280", "icon": "⏹️", "label": "Inactivo", "bg": "rgba(107,114,128,0.08)", "border": "rgba(107,114,128,0.2)"},
"running": {"color": "#8b5cf6", "icon": "⚡", "label": "En ejecución...", "bg": "rgba(139,92,246,0.08)", "border": "rgba(139,92,246,0.3)"},
"done": {"color": "#10b981", "icon": "✅", "label": "Completado", "bg": "rgba(16,185,129,0.08)", "border": "rgba(16,185,129,0.3)"},
"error": {"color": "#ef4444", "icon": "❌", "label": "Error", "bg": "rgba(239,68,68,0.08)", "border": "rgba(239,68,68,0.3)"},
}
cfg = configs.get(state, configs["idle"])
pulse = "animation:pulse 2s infinite;" if state == "running" else ""
extra_html = f'<span style="color:var(--text-muted, #9ca3af); margin-left:8px; font-size:12px;">{extra}</span>' if extra else ""
return f'''
<div style="
display:inline-flex; align-items:center; gap:10px;
background:{cfg['bg']}; border:1px solid {cfg['border']};
border-radius:10px; padding:8px 16px;
backdrop-filter:blur(12px); {pulse}
">
<span style="
width:8px; height:8px; border-radius:50%;
background:{cfg['color']}; box-shadow:0 0 8px {cfg['color']}60;
"></span>
<span style="font-size:13px; font-weight:600; color:{cfg['color']};">
{cfg['icon']} {cfg['label']}
</span>
{extra_html}
</div>'''
def _parse_sections_from_report(report_md):
if not report_md:
return {}
sections = {}
current = None
current_lines = []
for line in report_md.split("\n"):
# Match Markdown headers: ## Title or ### Title
m = re.match(r'^#{2,3}\s+(.+)', line)
# Match LaTeX headers: \section{Title}, \subsection{Title}, \subsubsection{Title}
if not m:
m = re.match(r'\\(?:sub)*section\{(.+?)\}', line)
if m:
if current:
sections[current] = "\n".join(current_lines).strip()
title = m.group(1).strip()
title = re.sub(r'^[🔬📝📊🔎🚑📋✍️✅🎉🏥🧠🔍\s]+', '', title).strip()
if not title:
title = current or "Sin título"
current = title
current_lines = []
else:
current_lines.append(line)
if current:
sections[current] = "\n".join(current_lines).strip()
return sections
def _build_references_html(docs_df, report_md=""):
if docs_df is None or docs_df.empty:
return "_Sin referencias disponibles aún..._"
import json as _json
import re
import math
import base64
# Extract cited indices from report_md
cited_indices = set()
if report_md:
for match in re.finditer(r'\[(\d+)\]', report_md):
cited_indices.add(int(match.group(1)))
has_text_produced = bool(report_md.strip())
html = '<div style="display:flex; justify-content:space-between; align-items:center; margin-bottom:16px; padding-bottom:12px; border-bottom:1px solid rgba(255,255,255,0.1);">'
html += '<div id="refs-stats" style="font-size:13px; color:#9ca3af; font-weight:500;"></div>'
# Filters
html += '<div style="display:flex; gap:12px; align-items:center;">'
if has_text_produced:
html += '''
<label style="display:flex; align-items:center; gap:6px; font-size:13px; color:#d1d5db; cursor:pointer;">
<input type="checkbox" id="refs-filter-cited" onchange="document.getElementById('refs-container').setAttribute('data-page', '1'); initRefsPagination()" style="accent-color:var(--accent, #8b5cf6); width:16px; height:16px;">
Solo citados en texto
</label>
'''
else:
html += '<input type="checkbox" id="refs-filter-cited" style="display:none;">'
html += '</div></div>'
html += '<div id="refs-container" data-page="1" style="display:flex; flex-direction:column; gap:12px; min-height:400px;">'
for idx, row in docs_df.iterrows():
num = idx + 1
autores = str(row.get("Autores", ""))
año = str(row.get("Año", ""))
titulo = str(row.get("Título", ""))
fuente = str(row.get("Fuente", ""))
grade = str(row.get("GRADE", ""))
parts = [a.strip() for a in autores.split(",")]
surnames = [p.split()[-1] for p in parts if p and "..." not in p]
if len(surnames) == 1:
cite_text = f"{surnames[0]} ({año})"
elif len(surnames) == 2:
cite_text = f"{surnames[0]} y {surnames[1]} ({año})"
elif len(surnames) > 2:
cite_text = f"{surnames[0]} et al. ({año})"
else:
cite_text = f"Sin Autor ({año})"
level_key = grade.split(" - ")[0].strip().upper() if grade else "UNKNOWN"
color = GRADE_COLORS.get(level_key, "#6b7280")
import math
import base64
found = {k: ("" if (isinstance(v, float) and math.isnan(v)) else v) for k, v in row.to_dict().items()}
data_json = _json.dumps(found, ensure_ascii=False)
data_b64 = base64.b64encode(data_json.encode('utf-8')).decode('utf-8')
is_cited = str(num in cited_indices).lower()
initial_display = "flex" if idx < 10 else "none"
html += f'''
<div class="ref-item" data-cited="{is_cited}" style="display:{initial_display}; padding:14px; border-radius:10px; background:var(--glass, rgba(17,24,39,0.4)); border:1px solid var(--glass-border, rgba(255,255,255,0.06)); gap:14px; align-items:flex-start; transition: all 0.2s;">
<div style="font-weight:800; color:var(--accent); min-width:32px; font-size:16px;">[{num}]</div>
<div style="flex-grow:1;">
<div style="margin-bottom:6px; line-height:1.4;">
<span class="cite-link" data-cite-b64="{data_b64}" onclick="showCiteCard(this, {idx})" style="font-weight:700; font-size:15px; cursor:pointer; color:var(--accent, #8b5cf6);">
[{num}] {cite_text}.
</span> <span style="font-style:italic; font-size:15px; opacity:0.9;">{titulo}</span>
</div>
<div style="display:flex; gap:8px; margin-top:8px; align-items:center; flex-wrap:wrap;">
<span style="font-size:11px; font-weight:600; padding:3px 10px; border-radius:12px; background:rgba(255,255,255,0.08);">{fuente}</span>
<span style="font-size:11px; font-weight:600; padding:3px 10px; border-radius:12px; background:{color}15; border:1px solid {color}40; color:{color};">{grade}</span>
</div>
</div>
</div>
'''
html += '</div>'
html += '<div id="refs-pagination" style="display:flex; justify-content:center; align-items:center; margin-top:24px; padding-top:16px; border-top:1px solid rgba(255,255,255,0.05); gap:4px;"></div>'
html += '<img src="data:image/gif;base64,R0lGODlhAQABAIAAAP///wAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw==" onload="if(window.initRefsPagination) window.initRefsPagination();" style="display:none;">'
html += FLOATING_CARD_JS
return html
def _build_stats_html(report_md, docs_df):
    """Build the stats dashboard: document, section and word counts plus GRADE badges.

    Args:
        report_md: Report text (may be empty or ``None``). Sections are
            counted with ``_parse_sections_from_report``; words are
            whitespace-separated tokens.
        docs_df: DataFrame of documents, or ``None``. When it has a "GRADE"
            column, one badge per distinct value is rendered, most frequent
            first.

    Returns:
        The dashboard as an HTML string.

    Notes:
        GRADE labels are coerced to ``str`` and HTML-escaped, so numeric
        grades (``GRADE_COLORS`` has "4", "5", "6" keys) no longer crash the
        ``" - "`` split. The level key is derived the same way as in
        ``_build_references_html``: the text before " - ", stripped and
        upper-cased.
    """
    from html import escape as _escape

    has_docs = docs_df is not None and not docs_df.empty
    total_docs = len(docs_df) if has_docs else 0
    total_sections = len(_parse_sections_from_report(report_md))
    word_count = len(report_md.split()) if report_md else 0

    grade_data = {}
    if has_docs and "GRADE" in docs_df.columns:
        grade_data = docs_df["GRADE"].value_counts().to_dict()

    # Build stat cards
    stats = [
        ("📄", "Documentos", str(total_docs), "#3b82f6"),
        ("📑", "Secciones", str(total_sections), "#8b5cf6"),
        ("📝", "Palabras", f"{word_count:,}", "#10b981"),
    ]
    cards = []
    for icon, label, val, color in stats:
        cards.append(f'''
<div class="stat-card" style="
position:relative; overflow:hidden;
">
<div style="
position:absolute; top:0; left:0; right:0; height:3px;
background:linear-gradient(90deg, {color}, {color}60);
"></div>
<div style="font-size:24px; margin-bottom:6px; margin-top:4px;">{icon}</div>
<div style="
font-size:26px; font-weight:800; color:{color};
letter-spacing:-0.5px; line-height:1;
">{val}</div>
<div style="
font-size:11px; color:var(--text-muted, #9ca3af); margin-top:6px;
font-weight:500; text-transform:uppercase; letter-spacing:0.5px;
">{label}</div>
</div>''')
    cards_html = "".join(cards)

    # GRADE distribution badges
    grade_html = ""
    if grade_data:
        badges = []
        for raw_label, count in sorted(grade_data.items(), key=lambda x: -x[1]):
            grade_label = str(raw_label)
            level_key = grade_label.split(" - ")[0].strip().upper()
            color = GRADE_COLORS.get(level_key, "#6b7280")
            label = _escape(grade_label)
            badges.append(f'''<span style="
display:inline-flex; align-items:center; gap:5px;
padding:4px 10px; border-radius:20px; font-size:11px; font-weight:600;
background:{color}15; border:1px solid {color}40; color:{color};
">
<span style="width:6px;height:6px;border-radius:50%;background:{color};"></span>
{label}: {count}
</span>''')
        grade_badges = "".join(badges)
        grade_html = f'''
<div style="margin-top:12px; padding-top:12px; border-top:1px solid var(--glass-border, rgba(255,255,255,0.06));">
<div style="font-size:12px; font-weight:600; color:var(--text-muted, #9ca3af); margin-bottom:8px;">
🏅 Distribución GRADE
</div>
<div style="display:flex; flex-wrap:wrap; gap:6px;">
{grade_badges}
</div>
</div>'''

    return f'''
<div style="
background:var(--glass, rgba(17,24,39,0.6));
backdrop-filter:blur(16px);
border:1px solid var(--glass-border, rgba(255,255,255,0.08));
border-radius:14px; padding:16px 20px;
">
<div style="display:grid; grid-template-columns:repeat(auto-fit, minmax(120px, 1fr)); gap:12px;">
{cards_html}
</div>
{grade_html}
</div>'''
def _generate_graph_from_df(df):
    """Delegate to ``generate_interactive_graph`` and return its result unchanged."""
    graph = generate_interactive_graph(df)
    return graph
def _detect_phase(report_md):
if not report_md:
return 0
text = report_md.lower()
if ("completado" in text and ("secciones:" in text or "docs citados:" in text)) or "fase 8" in text:
return 8
if "reporte final" in text or "generando reporte" in text:
return 7
if "grade" in text or "clasificación grade" in text:
return 6
if ("validación" in text or "validate" in text or "ara+" in text) and "recuperación" not in text:
return 6
if "redactando" in text or "redacción" in text or "writing" in text:
return 5
if "plan maestro" in text or "master plan" in text or "fase 4" in text:
return 4
if "rescate" in text or "rescue" in text or "fase 3" in text:
return 3
if "detección de vacíos" in text or "gap detection" in text or "fase 2" in text:
return 2
if "ronda" in text or "buscando" in text or "búsqueda" in text:
return 1
if "optimiz" in text or "query" in text:
return 0
return 0
# _refs_to_markdown removed, handled by _build_references_html
SECTION_COLORS = [
"#8b5cf6", "#3b82f6", "#06b6d4", "#10b981", "#f59e0b",
"#ef4444", "#ec4899", "#6366f1", "#14b8a6", "#f97316",
]
def _build_section_cards_html(sections_map, is_done=False):
"""Build glassmorphic expandable section cards"""
if not sections_map:
return '''<div style="
text-align:center; padding:40px 20px; color:#6b7280;
">
<div style="font-size:36px; margin-bottom:10px; opacity:0.5;">📑</div>
<div style="font-size:13px;">Las secciones aparecerán aquí durante la ejecución...</div>
</div>'''
cards = ""
for i, (title, content) in enumerate(sections_map.items()):
color = SECTION_COLORS[i % len(SECTION_COLORS)]
word_count = len(content.split()) if content else 0
status_icon = "✅" if (is_done or word_count > 50) else "⏳"
sec_id = f"sec_{i}"
# Escape content for display
content_preview = content[:300].replace("<", "&lt;").replace(">", "&gt;") if content else ""
content_full = content.replace("<", "&lt;").replace(">", "&gt;") if content else ""
# Copy section button
content_escaped = content.replace("'", "\\'").replace("\n", "\\n").replace('"', '&quot;') if content else ""
cards += f'''
<div class="section-card" style="animation:slideIn 0.3s ease {i * 0.06}s both;">
<!-- Color accent -->
<div style="height:3px; background:linear-gradient(90deg, {color}, {color}80);"></div>
<!-- Header (clickable to expand) -->
<div class="section-card-header" onclick="
var body=document.getElementById('{sec_id}_body');
var arrow=document.getElementById('{sec_id}_arrow');
if(body.style.display==='none'){{body.style.display='block';arrow.textContent='▲';}}
else{{body.style.display='none';arrow.textContent='▼';}}
">
<div style="display:flex; align-items:center; gap:10px;">
<div style="
width:28px; height:28px; border-radius:8px;
background:linear-gradient(135deg, {color}25, {color}10);
border:1px solid {color}40;
display:flex; align-items:center; justify-content:center;
font-size:12px; font-weight:700; color:{color};
">{i+1}</div>
<div>
<div style="font-size:13px; font-weight:600; color:var(--text, #fff);">{title}</div>
<div style="font-size:11px; color:var(--text-muted, #9ca3af); margin-top:2px;">
{status_icon} {word_count} palabras
</div>
</div>
</div>
<div style="display:flex; align-items:center; gap:8px;">
<button onclick="
event.stopPropagation();
navigator.clipboard.writeText('{content_escaped}');
this.textContent='✅ Copiado';
var btn=this;
setTimeout(function(){{btn.textContent='📋';}},1500);
" style="
background:rgba(139,92,246,0.08); border:1px solid rgba(139,92,246,0.2);
color:#8b5cf6; border-radius:6px; padding:4px 8px;
font-size:11px; cursor:pointer; transition:all 0.2s;
" title="Copiar sección">📋</button>
<span id="{sec_id}_arrow" style="color:var(--text-muted, #9ca3af); font-size:12px;">▼</span>
</div>
</div>
<!-- Body (collapsed by default) -->
<div id="{sec_id}_body" class="section-card-body" style="display:none;">
<div style="
font-size:13px; line-height:1.7; color:var(--text, #e5e7eb);
padding-top:12px; white-space:pre-wrap;
">{content_full}</div>
</div>
</div>'''
return f'''<div style="max-height:650px; overflow-y:auto; padding-right:4px;">
{cards}
</div>'''
# ══════════════════════════════════════════════════════════════
# INTERACTIVE CITATIONS (Floating Card on Click)
# ══════════════════════════════════════════════════════════════
def _build_docs_index(docs_df):
"""Build a lookup dict: author_year_key -> paper details."""
import pandas as pd
index = {}
if docs_df is None or (hasattr(docs_df, 'empty') and docs_df.empty):
return index
rows = docs_df.to_dict(orient="records") if hasattr(docs_df, 'to_dict') else []
for row in rows:
title = row.get("Título", row.get("title", ""))
authors_raw = row.get("Autores", row.get("authors", ""))
year = str(row.get("Año", row.get("year", "")))
doi = row.get("DOI", row.get("doi", ""))
source = row.get("Fuente", row.get("source", ""))
grade = row.get("GRADE", row.get("grade", ""))
pdf_url = row.get("PDF URL", row.get("pdf_url", ""))
# Extract surname(s)
if isinstance(authors_raw, list):
surnames = [a.split()[-1] for a in authors_raw[:3] if a]
authors_display = ", ".join(authors_raw[:3])
elif isinstance(authors_raw, str) and authors_raw:
parts = [a.strip() for a in authors_raw.split(",")]
surnames = [p.split()[-1] for p in parts[:3] if p]
authors_display = authors_raw
else:
surnames = []
authors_display = ""
# Build keys: "surname_year", "surname1_surname2_year" etc.
for s in surnames:
key = f"{s.lower()}_{year}"
if key not in index:
index[key] = {
"title": title, "authors": authors_display, "year": year,
"doi": doi, "source": source, "grade": grade, "pdf_url": pdf_url,
}
# Combined key for multi-author
if len(surnames) >= 2:
combined = "_".join(s.lower() for s in surnames[:2]) + f"_{year}"
index[combined] = {
"title": title, "authors": authors_display, "year": year,
"doi": doi, "source": source, "grade": grade, "pdf_url": pdf_url,
}
return index
def _latex_to_html(text):
"""Convert common LaTeX commands to HTML for browser rendering."""
if not text:
return text
# --- Structural commands ---
# \section{Title} -> <h2>Title</h2>
text = re.sub(r'\\section\*?\{(.+?)\}', r'<h2>\1</h2>', text)
# \subsection{Title} -> <h3>Title</h3>
text = re.sub(r'\\subsection\*?\{(.+?)\}', r'<h3>\1</h3>', text)
# \subsubsection{Title} -> <h4>Title</h4>
text = re.sub(r'\\subsubsection\*?\{(.+?)\}', r'<h4>\1</h4>', text)
# --- Inline formatting ---
# \textbf{bold} -> <strong>bold</strong>
text = re.sub(r'\\textbf\{(.+?)\}', r'<strong>\1</strong>', text)
# \textit{italic} -> <em>italic</em>
text = re.sub(r'\\textit\{(.+?)\}', r'<em>\1</em>', text)
# \emph{text} -> <em>text</em>
text = re.sub(r'\\emph\{(.+?)\}', r'<em>\1</em>', text)
# \underline{text} -> <u>text</u>
text = re.sub(r'\\underline\{(.+?)\}', r'<u>\1</u>', text)
# --- Fix model hallucinative curly braces for taxonomy ---
# Convert {Word} to *Word* for markdown italics, ignoring {{BIB:ID}} and existing LaTeX commands
text = re.sub(r'(?<![\\\{])\{([^{}\n]+)\}(?!\})', r'*\1*', text)
# --- List environments ---
# Capture blocks between "itemize" and "itemize"
def fix_itemize_block(match):
content = match.group(1).strip()
lines = content.split('\n')
fixed_lines = []
for line in lines:
line = line.strip()
if not line:
continue
if line.startswith('-'):
fixed_lines.append(f"\\item {line[1:].strip()}")
elif not line.startswith('\\item'):
fixed_lines.append(f"\\item {line}")
else:
fixed_lines.append(line)
return "\\begin{itemize}\n" + "\n".join(fixed_lines) + "\n\\end{itemize}"
text = re.sub(r'(?ims)^\s*itemize\s*$(.*?)(^\s*itemize\s*$)', fix_itemize_block, text)
# \begin{itemize}...\end{itemize}
text = re.sub(r'\\begin\{itemize\}', '<ul>', text)
text = re.sub(r'\\end\{itemize\}', '</ul>', text)
text = re.sub(r'\\item\s*', '<li>', text)
# Fix stray "itemize" text that might remain if not paired
text = re.sub(r'(?im)^\s*itemize\s*$', '', text)
# Fix math units where AI writes $$g/ml instead of \mu g/ml
text = text.replace('$$g/ml', '&micro;g/ml')
text = text.replace('$$g', '&micro;g')
# --- CATALOGO DE TRADUCCION CIENTIFICA PARA FRONTEND ---
# 1. Notacion cientifica (x10^n o x 10^{n})
text = re.sub(r'(?i)x\s*10\^\{([^}]+)\}', r'&times; 10<sup>\1</sup>', text)
text = re.sub(r'(?i)x\s*10\^([0-9\-]+)', r'&times; 10<sup>\1</sup>', text)
# 2. Quimica y Subindices comunes (CO2, H2O, NO3-)
# Busca una letra mayuscula (opcional minuscula) seguida de _ y un numero. Ejemplo: CO_2 -> CO<sub>2</sub>
text = re.sub(r'([A-Z][a-z]?)_([0-9]+)', r'\1<sub>\2</sub>', text)
# Variante para {}: CO_{2} -> CO<sub>2</sub>
text = re.sub(r'([A-Z][a-z]?)_\{([0-9]+)\}', r'\1<sub>\2</sub>', text)
# 3. Superindices aislados sin $ (e.g. m^2 o cm^{3})
text = re.sub(r'([a-zA-Z]+)\^\{([0-9\-]+)\}', r'\1<sup>\2</sup>', text)
text = re.sub(r'([a-zA-Z]+)\^([0-9\-]+)', r'\1<sup>\2</sup>', text)
# 4. Temperaturas (25 oC, 25oC, 25°C)
text = re.sub(r'\b([0-9]+)\s*[oO]C\b', r'\1 &deg;C', text)
# 5. Simbolos matematicos comunes escritos a mano
text = text.replace('+/-', '&plusmn;')
text = text.replace('>=', '&ge;')
text = text.replace('<=', '&le;')
# 6. Microgramos escritos con 'u' (ug/ml)
text = re.sub(r'\bug/ml\b', '&micro;g/ml', text)
text = re.sub(r'\bug/L\b', '&micro;g/L', text)
text = re.sub(r'\bug\b', '&micro;g', text)
# --------------------------------------------------------
text = re.sub(r'\\end\{enumerate\}', '</ol>', text)
text = re.sub(r'\\item\s*', '<li>', text)
# --- Escaped characters ---
text = text.replace(r'\%', '%')
text = text.replace(r'\&', '&amp;')
text = text.replace(r'\#', '#')
text = text.replace(r'\_', '_')
text = text.replace(r'\$', '$')
# --- Remove pure LaTeX boilerplate ---
text = re.sub(r'\\begin\{document\}', '', text)
text = re.sub(r'\\end\{document\}', '', text)
text = re.sub(r'\\begin\{abstract\}', '', text)
text = re.sub(r'\\end\{abstract\}', '', text)
text = re.sub(r'\\maketitle', '', text)
text = re.sub(r'\\documentclass\{[^}]*\}', '', text)
text = re.sub(r'\\usepackage\{[^}]*\}', '', text)
text = re.sub(r'\\title\{[^}]*\}', '', text)
text = re.sub(r'\\author\{[^}]*\}', '', text)
text = re.sub(r'\\date\{[^}]*\}', '', text)
# --- Citations: \cite{key} -> leave as-is for downstream processing ---
text = re.sub(r'\\cite\{([^}]+)\}', r'[\1]', text)
# --- Paragraph breaks: double newlines ---
text = re.sub(r'\n{2,}', '</p><p>', text)
# --- Clean leftover backslash commands that are not math ---
# But preserve $...$ and $$...$$ for MathJax
text = re.sub(r'\\(?:noindent|newpage|clearpage|vspace\{[^}]*\}|hspace\{[^}]*\}|par)\b', '', text)
return text
def _make_citations_interactive(report_md, docs_df):
"""Convert LaTeX/Markdown report to HTML with clickable [[n]] citations and MathJax math rendering."""
import markdown as md_lib
import json as _json
if not report_md:
return '<div style="color:#9ca3af; padding:20px;">Haz clic en el botón para ver el progreso en tiempo real...</div>'
# Build docs index
docs_index = _build_docs_index(docs_df)
# --- Phase 0: LaTeX to HTML pre-processing ---
processed = _latex_to_html(report_md)
# Convert remaining Markdown to HTML
try:
html_body = md_lib.markdown(
processed,
extensions=['tables', 'fenced_code', 'nl2br'],
)
except Exception:
html_body = processed.replace("\n\n", "</p><p>").replace("\n", "<br>")
html_body = f"<p>{html_body}</p>"
cite_id_counter = [0]
# 1. First pass: Replace [[n]] {{BIB:ID}} markers with interactive citations
bib_pattern = re.compile(r'(?:\[\[(\d+)\]\]\s*)?\{\{BIB:([\w\.\-/]+)\}\}')
def replace_bib(match):
idx_str = match.group(1)
bib_id = match.group(2)
# Try to resolve by index first
if idx_str and docs_df is not None and not docs_df.empty:
try:
idx = int(idx_str) - 1
if 0 <= idx < len(docs_df):
row = docs_df.iloc[idx]
autores = str(row.get("Autores", ""))
año = str(row.get("Año", ""))
parts = [a.strip() for a in autores.split(",")]
surnames = [p.split()[-1] for p in parts if p and "..." not in p]
if len(surnames) == 1:
cite_text = f"[{idx+1}]"
elif len(surnames) == 2:
cite_text = f"[{idx+1}]"
elif len(surnames) > 2:
cite_text = f"[{idx+1}]"
else:
cite_text = f"[{idx+1}]"
# Build tooltip with author info
if len(surnames) >= 1:
if len(surnames) == 1:
tooltip = f"{surnames[0]} ({año})"
elif len(surnames) == 2:
tooltip = f"{surnames[0]} y {surnames[1]} ({año})"
else:
tooltip = f"{surnames[0]} et al. ({año})"
else:
tooltip = f"Fuente {idx+1} ({año})"
cite_id_counter[0] += 1
cid = cite_id_counter[0]
import math
import base64
found = {k: ("" if (isinstance(v, float) and math.isnan(v)) else v) for k, v in row.to_dict().items()}
data_json = _json.dumps(found, ensure_ascii=False)
data_b64 = base64.b64encode(data_json.encode('utf-8')).decode('utf-8')
return f'<span class="cite-link" data-cite-b64="{data_b64}" onclick="showCiteCard(this, {cid})" id="cite_{cid}" title="{tooltip}">{cite_text}</span>'
except Exception:
pass
# Fallback: show the [[n]] as a simple superscript
if idx_str:
return f'<sup class="cite-inline">[{idx_str}]</sup>'
return ""
html_body = bib_pattern.sub(replace_bib, html_body)
# 1b. Also handle bare [[n]] without {{BIB:ID}} — common in some model outputs
bare_bracket_pattern = re.compile(r'\[\[(\d+)\]\]')
def replace_bare_bracket(match):
idx_str = match.group(1)
if docs_df is not None and not docs_df.empty:
try:
idx = int(idx_str) - 1
if 0 <= idx < len(docs_df):
row = docs_df.iloc[idx]
autores = str(row.get("Autores", ""))
año = str(row.get("Año", ""))
parts = [a.strip() for a in autores.split(",")]
surnames = [p.split()[-1] for p in parts if p and "..." not in p]
if len(surnames) >= 1:
if len(surnames) == 1:
tooltip = f"{surnames[0]} ({año})"
elif len(surnames) == 2:
tooltip = f"{surnames[0]} y {surnames[1]} ({año})"
else:
tooltip = f"{surnames[0]} et al. ({año})"
else:
tooltip = f"Fuente {idx+1}"
cite_id_counter[0] += 1
cid = cite_id_counter[0]
import math
import base64
found = {k: ("" if (isinstance(v, float) and math.isnan(v)) else v) for k, v in row.to_dict().items()}
data_json = _json.dumps(found, ensure_ascii=False)
data_b64 = base64.b64encode(data_json.encode('utf-8')).decode('utf-8')
return f'<span class="cite-link" data-cite-b64="{data_b64}" onclick="showCiteCard(this, {cid})" id="cite_{cid}" title="{tooltip}">[{idx_str}]</span>'
except Exception:
pass
return f'<sup>[{idx_str}]</sup>'
html_body = bare_bracket_pattern.sub(replace_bare_bracket, html_body)
# 2. Second pass: Find and wrap existing manual APA citations: (Author, Year)
citation_pattern = re.compile(
r'\(([A-ZÁÉÍÓÚÑ][a-záéíóúñ]+(?:\s*(?:&amp;|&|y|et\s+al\.?|,\s*[A-ZÁÉÍÓÚÑ][a-záéíóúñ]+))*)\s*,\s*(\d{4}|s\.f\.)\)'
)
def replace_citation(match):
full_match = match.group(0)
authors_part = match.group(1)
year_part = match.group(2)
author_names = re.split(r'\s*(?:&amp;|&|y|,)\s*', authors_part)
author_names = [a.strip().replace("et al.", "").strip() for a in author_names if a.strip()]
found = None
for a in author_names:
surname = a.split()[-1].lower() if a else ""
key = f"{surname}_{year_part}"
if key in docs_index:
found = docs_index[key]
break
if not found and len(author_names) >= 2:
combined = "_".join(a.split()[-1].lower() for a in author_names[:2]) + f"_{year_part}"
if combined in docs_index:
found = docs_index[combined]
if not found:
return f'<span class="cite-inline">{full_match}</span>'
cite_id_counter[0] += 1
cid = cite_id_counter[0]
import math
import base64
found_clean = {k: ("" if (isinstance(v, float) and math.isnan(v)) else v) for k, v in found.items()}
data_json = _json.dumps(found_clean, ensure_ascii=False)
data_b64 = base64.b64encode(data_json.encode('utf-8')).decode('utf-8')
return f'<span class="cite-link" data-cite-b64="{data_b64}" onclick="showCiteCard(this, {cid})" id="cite_{cid}">{full_match}</span>'
html_body = citation_pattern.sub(replace_citation, html_body)
# Build the floating card container + JS + MathJax
floating_card_js = FLOATING_CARD_JS
return f'''<div class="report-interactive" style="
font-family:'Inter',sans-serif; font-size:14px; line-height:1.75;
color:var(--text, #e5e7eb); max-height:700px; overflow-y:auto; padding:4px 8px 4px 4px;
">
<style>
.report-interactive h1 {{ font-size:1.5rem; font-weight:700; margin:1.2em 0 0.6em; color:#f3f4f6; border-bottom:1px solid rgba(255,255,255,0.08); padding-bottom:8px; }}
.report-interactive h2 {{ font-size:1.25rem; font-weight:600; margin:1em 0 0.5em; color:#e5e7eb; }}
.report-interactive h3 {{ font-size:1.1rem; font-weight:600; margin:0.8em 0 0.4em; color:#d1d5db; }}
.report-interactive h4 {{ font-size:1rem; font-weight:500; margin:0.6em 0 0.3em; color:#c084fc; }}
.report-interactive p {{ margin:0.5em 0; text-align:justify; font-size: 15px; }}
.report-interactive hr {{ border:none; border-top:1px solid rgba(255,255,255,0.06); margin:1.5em 0; }}
.report-interactive em {{ color:#c084fc; font-style: italic; }}
.report-interactive strong {{ color:#f3f4f6; }}
.report-interactive a {{ color:#818cf8; text-decoration:underline; }}
.report-interactive ul, .report-interactive ol {{ padding-left:1.5em; margin:0.5em 0; font-size: 15px; }}
.report-interactive li {{ margin:0.3em 0; }}
.report-interactive blockquote {{ margin:1em 0; padding:8px 16px; color:#9ca3af; font-style: italic; border-left: 3px solid rgba(255,255,255,0.2); }}
.cite-link {{
color:#a78bfa; cursor:pointer; font-weight:600;
border-bottom:1px dashed rgba(167,139,250,0.4);
transition:all 0.15s ease; padding:0 2px; border-radius:2px;
font-size:0.85em;
}}
.cite-link:hover {{
background:rgba(139,92,246,0.15); color:#c4b5fd;
border-bottom-color:rgba(167,139,250,0.7);
box-shadow:0 0 8px rgba(139,92,246,0.2);
}}
.cite-inline {{
color:#9ca3af; font-style:italic; font-size:0.85em;
}}
/* MathJax rendered equations */
.MathJax {{ font-size:1.05em !important; }}
</style>
{html_body}
{floating_card_js}
</div>'''
# ══════════════════════════════════════════════════════════════
# RESEARCH HANDLER
# ══════════════════════════════════════════════════════════════
async def research_handler(
    query, provider, search_model, synthesis_model, translation_model,
    profile, depth, iterations, include_validation, sources,
    enable_dme=True, synthesis_strategy="auto",
    year_start="", year_end="", university="",
    infinite_output=True, max_continuation=5,
    grade_mode="original", geo_context="Automático"
):
    """Run the research pipeline and stream UI updates (async generator).

    Every yielded value is an 8-tuple, always in this order:
    status HTML, progress HTML, rendered report HTML, documents DataFrame,
    section cards HTML, references, stats HTML, raw report markdown.

    Args:
        query: Research topic; blank input yields a single error tuple.
        provider: Key into ``PROVIDERS``; its ``env_key`` names the
            environment variable holding the API key.
        search_model, synthesis_model, translation_model: Forwarded to
            ``ResearchPipeline``.
        profile, include_validation, enable_dme, synthesis_strategy,
        grade_mode, geo_context, infinite_output: Forwarded unchanged to
            ``pipeline.run``.
        depth, iterations, max_continuation: Converted with ``int()`` and
            forwarded to ``pipeline.run``.
        sources: Search sources; falls back to ``["all"]`` when empty.
        year_start, year_end, university: Empty values are sent as ``None``.
    """
    import pandas as pd

    global _active_pipeline

    empty_df = pd.DataFrame(columns=["Título", "Autores", "Año", "DOI", "Fuente", "GRADE", "PDF URL"])
    ref_md = "_Sin referencias disponibles aún..._"
    stats_html = _build_stats_html("", empty_df)

    if not query or not query.strip():
        gr.Warning("Ingrese un tema de investigación")
        # Render through the same helper as every other report update: the
        # report output is an HTML component, so raw markdown would show as-is.
        yield (
            _build_status_html("error", "Sin consulta"), _build_progress_html(-1),
            _make_citations_interactive("**Error:** Ingrese un tema de investigación.", empty_df),
            empty_df, "", ref_md, stats_html, "",
        )
        return

    env_key = PROVIDERS.get(provider, {}).get("env_key", "")
    api_key = os.getenv(env_key, "") if env_key else ""
    if not api_key:
        shown_key = env_key or "?"
        gr.Warning(f"No hay API key para {provider}. Configure {shown_key} en .env")
        yield (
            _build_status_html("error", "API key faltante"), _build_progress_html(-1),
            _make_citations_interactive(
                f"**Error:** No hay API key para {provider}. Configure `{shown_key}` en .env",
                empty_df,
            ),
            empty_df, "", ref_md, stats_html, "",
        )
        return

    # Register the job in the database (only when the "admin" user exists).
    from backend.database.models import SessionLocal, User, Project, ResearchJob
    job_id = None
    db = SessionLocal()
    try:
        user = db.query(User).filter(User.username == "admin").first()
        if user:
            project = Project(title=f"Investigación: {query[:50]}", owner_id=user.id)
            db.add(project)
            db.commit()
            db_job = ResearchJob(project_id=project.id, query=query, status="running")
            db.add(db_job)
            db.commit()
            db.refresh(db_job)
            # Keep only the primary key so nothing touches the ORM object
            # after its session is closed.
            job_id = db_job.id
    finally:
        db.close()

    search_sources = sources if sources else ["all"]
    pipeline = ResearchPipeline(
        provider=provider, search_model=search_model,
        synthesis_model=synthesis_model, translation_model=translation_model,
        api_key=api_key,
    )
    _active_pipeline = pipeline

    accumulated_report = ""
    accumulated_df = empty_df
    current_phase = -1
    # Pre-bind the loop target: it is read after the loop and in the stop
    # handler, both of which can run before the pipeline yields anything.
    docs_df = None
    try:
        async for report_md, docs_df in pipeline.run(
            query=query.strip(), sources=search_sources, profile=profile,
            depth=int(depth), iterations=int(iterations),
            include_validation=include_validation,
            enable_dme=enable_dme, synthesis_strategy=synthesis_strategy,
            year_start=year_start or None, year_end=year_end or None,
            university=university or None, grade_mode=grade_mode,
            geo_context=geo_context,
            infinite_output=infinite_output,
            max_continuation_passes=int(max_continuation),
        ):
            accumulated_report = report_md
            if docs_df is not None and not docs_df.empty:
                accumulated_df = docs_df
            current_phase = _detect_phase(report_md)

            sections_map = _parse_sections_from_report(accumulated_report)
            last_key = list(sections_map.keys())[-1] if sections_map else ""
            extra = f"{len(accumulated_df)} docs" if len(accumulated_df) else ""
            if current_phase == 5 and last_key:
                extra = f"Redactando: {last_key}"

            progress_html = _build_progress_html(current_phase, extra)
            ref_md = _build_references_html(docs_df, accumulated_report)
            stats_html = _build_stats_html(accumulated_report, accumulated_df)
            sections_content = _build_section_cards_html(sections_map)
            paused_label = " ⏸️" if pipeline.is_paused else ""
            yield (
                _build_status_html("running", f"Fase {current_phase}{paused_label}"),
                progress_html, _make_citations_interactive(accumulated_report, accumulated_df), accumulated_df,
                sections_content, ref_md, stats_html, accumulated_report,
            )

        # Pipeline exhausted: emit the final "done" snapshot.
        sections_map = _parse_sections_from_report(accumulated_report)
        sections_content = _build_section_cards_html(sections_map, is_done=True)
        ref_md = _build_references_html(docs_df, accumulated_report)
        stats_html = _build_stats_html(accumulated_report, accumulated_df)
        yield (
            _build_status_html("done", f"{len(accumulated_df)} docs | {len(sections_map)} secciones"),
            _build_progress_html(7), _make_citations_interactive(accumulated_report, accumulated_df), accumulated_df,
            sections_content, ref_md, stats_html, accumulated_report,
        )

        if job_id is not None:
            from datetime import datetime
            db = SessionLocal()
            try:
                job = db.query(ResearchJob).get(job_id)
                if job:
                    job.status = "completed"
                    job.report_md = accumulated_report
                    job.completed_at = datetime.utcnow()
                    db.commit()
            finally:
                db.close()
    except (StopAsyncIteration, asyncio.CancelledError):
        # Pipeline was stopped by the user: show whatever was produced so far.
        # NOTE(review): the job row keeps status "running" on this path —
        # confirm whether a terminal status should be written here.
        sections_map = _parse_sections_from_report(accumulated_report)
        sections_content = _build_section_cards_html(sections_map, is_done=True)
        ref_md = _build_references_html(docs_df, accumulated_report)
        stats_html = _build_stats_html(accumulated_report, accumulated_df)
        yield (
            _build_status_html("error", "⛔ Detenido por el usuario"),
            _build_progress_html(current_phase, "Detenido"),
            _make_citations_interactive(accumulated_report + "\n\n---\n⛔ **Pipeline detenido por el usuario**", accumulated_df),
            accumulated_df, sections_content, ref_md, stats_html,
            accumulated_report,
        )
    except Exception as e:
        if job_id is not None:
            db = SessionLocal()
            try:
                job = db.query(ResearchJob).get(job_id)
                if job:
                    job.status = "error"
                    db.commit()
            finally:
                db.close()
        error_text = str(e)
        yield (
            _build_status_html("error", error_text[:60]),
            _build_progress_html(current_phase),
            _make_citations_interactive(f"**Error:** {error_text}", accumulated_df), accumulated_df, "", ref_md, stats_html,
            accumulated_report,
        )
    finally:
        # Clear the shared reference only if it still points at this run's
        # pipeline, so a run started meanwhile keeps its stop/pause controls.
        if _active_pipeline is pipeline:
            _active_pipeline = None
        await pipeline.close()
# ══════════════════════════════════════════════════════════════
# SUPER RESEARCH HANDLER
# ══════════════════════════════════════════════════════════════
async def super_research_handler(
    query, provider, search_model, synthesis_model, translation_model,
    profile, depth, rounds, include_validation, sources,
    enable_dme=True, synthesis_strategy="auto",
    year_start="", year_end="", university="",
    infinite_output=True, max_continuation=5,
    grade_mode="original", geo_context="Automático"
):
    """Run the multi-round ("super") research pipeline and stream UI updates.

    Async generator. Every yielded value is an 8-tuple, always in this order:
    status HTML, progress HTML, rendered report HTML, documents DataFrame,
    section cards HTML, references, stats HTML, raw report markdown.

    Args:
        query: Research topic; blank input yields a single error tuple.
        provider: Key into ``PROVIDERS``; its ``env_key`` names the
            environment variable holding the API key.
        search_model, synthesis_model, translation_model: Forwarded to
            ``ResearchPipeline``.
        profile, include_validation, enable_dme, synthesis_strategy,
        grade_mode, geo_context, infinite_output: Forwarded unchanged to
            ``pipeline.run``.
        depth, max_continuation: Converted with ``int()`` and forwarded.
        rounds: Converted with ``int()`` and passed to ``pipeline.run`` as
            ``iterations``.
        sources: Search sources; falls back to ``["all"]`` when empty.
        year_start, year_end, university: Empty values are sent as ``None``.
    """
    import pandas as pd

    global _active_pipeline

    empty_df = pd.DataFrame(columns=["Título", "Autores", "Año", "DOI", "Fuente", "GRADE", "PDF URL"])
    ref_md = "_Sin referencias disponibles aún..._"
    stats_html = _build_stats_html("", empty_df)

    if not query or not query.strip():
        gr.Warning("Ingrese un tema de investigación")
        # Render through the same helper as every other report update: the
        # report output is an HTML component, so raw markdown would show as-is.
        yield (
            _build_status_html("error", "Sin consulta"), _build_progress_html(-1),
            _make_citations_interactive("**Error:** Ingrese un tema de investigación.", empty_df),
            empty_df, "", ref_md, stats_html, "",
        )
        return

    env_key = PROVIDERS.get(provider, {}).get("env_key", "")
    api_key = os.getenv(env_key, "") if env_key else ""
    if not api_key:
        shown_key = env_key or "?"
        gr.Warning(f"No hay API key para {provider}. Configure {shown_key} en .env")
        yield (
            _build_status_html("error", "API key faltante"), _build_progress_html(-1),
            _make_citations_interactive(
                f"**Error:** No hay API key para {provider}. Configure `{shown_key}` en .env",
                empty_df,
            ),
            empty_df, "", ref_md, stats_html, "",
        )
        return

    # Register the job in the database (only when the "admin" user exists).
    from backend.database.models import SessionLocal, User, Project, ResearchJob
    job_id = None
    db = SessionLocal()
    try:
        user = db.query(User).filter(User.username == "admin").first()
        if user:
            project = Project(title=f"Super Inv: {query[:50]}", owner_id=user.id)
            db.add(project)
            db.commit()
            db_job = ResearchJob(project_id=project.id, query=query, status="running")
            db.add(db_job)
            db.commit()
            db.refresh(db_job)
            # Keep only the primary key so nothing touches the ORM object
            # after its session is closed.
            job_id = db_job.id
    finally:
        db.close()

    search_sources = sources if sources else ["all"]
    pipeline = ResearchPipeline(
        provider=provider, search_model=search_model,
        synthesis_model=synthesis_model, translation_model=translation_model,
        api_key=api_key,
    )
    _active_pipeline = pipeline

    accumulated_report = ""
    accumulated_df = empty_df
    current_phase = -1
    # Pre-bind the loop target: it is read after the loop and in the stop
    # handler, both of which can run before the pipeline yields anything.
    docs_df = None
    try:
        async for report_md, docs_df in pipeline.run(
            query=query.strip(), sources=search_sources, profile=profile,
            depth=int(depth), iterations=int(rounds),
            include_validation=include_validation,
            enable_dme=enable_dme, synthesis_strategy=synthesis_strategy,
            year_start=year_start or None, year_end=year_end or None,
            university=university or None, grade_mode=grade_mode,
            geo_context=geo_context,
            infinite_output=infinite_output,
            max_continuation_passes=int(max_continuation),
        ):
            accumulated_report = report_md
            if docs_df is not None and not docs_df.empty:
                accumulated_df = docs_df
            current_phase = _detect_phase(report_md)

            sections_map = _parse_sections_from_report(accumulated_report)
            last_key = list(sections_map.keys())[-1] if sections_map else ""
            extra = f"{len(accumulated_df)} docs" if len(accumulated_df) else ""
            if current_phase == 5 and last_key:
                extra = f"Redactando: {last_key}"

            progress_html = _build_progress_html(current_phase, extra)
            ref_md = _build_references_html(docs_df, accumulated_report)
            stats_html = _build_stats_html(accumulated_report, accumulated_df)
            sections_content = _build_section_cards_html(sections_map)
            paused_label = " ⏸️" if pipeline.is_paused else ""
            yield (
                _build_status_html("running", f"Fase {current_phase}{paused_label}"),
                progress_html, _make_citations_interactive(accumulated_report, accumulated_df), accumulated_df,
                sections_content, ref_md, stats_html, accumulated_report,
            )

        # Pipeline exhausted: emit the final "done" snapshot.
        sections_map = _parse_sections_from_report(accumulated_report)
        sections_content = _build_section_cards_html(sections_map, is_done=True)
        ref_md = _build_references_html(docs_df, accumulated_report)
        stats_html = _build_stats_html(accumulated_report, accumulated_df)
        yield (
            _build_status_html("done", f"{len(accumulated_df)} docs | {len(sections_map)} secciones"),
            _build_progress_html(7), _make_citations_interactive(accumulated_report, accumulated_df), accumulated_df,
            sections_content, ref_md, stats_html, accumulated_report,
        )

        if job_id is not None:
            from datetime import datetime
            db = SessionLocal()
            try:
                job = db.query(ResearchJob).get(job_id)
                if job:
                    job.status = "completed"
                    job.report_md = accumulated_report
                    job.completed_at = datetime.utcnow()
                    db.commit()
            finally:
                db.close()
    except (StopAsyncIteration, asyncio.CancelledError):
        # Pipeline was stopped by the user: show whatever was produced so far.
        # NOTE(review): the job row keeps status "running" on this path —
        # confirm whether a terminal status should be written here.
        sections_map = _parse_sections_from_report(accumulated_report)
        sections_content = _build_section_cards_html(sections_map, is_done=True)
        ref_md = _build_references_html(docs_df, accumulated_report)
        stats_html = _build_stats_html(accumulated_report, accumulated_df)
        yield (
            _build_status_html("error", "⛔ Detenido por el usuario"),
            _build_progress_html(current_phase, "Detenido"),
            _make_citations_interactive(accumulated_report + "\n\n---\n⛔ **Pipeline detenido por el usuario**", accumulated_df),
            accumulated_df, sections_content, ref_md, stats_html,
            accumulated_report,
        )
    except Exception as e:
        if job_id is not None:
            db = SessionLocal()
            try:
                job = db.query(ResearchJob).get(job_id)
                if job:
                    job.status = "error"
                    db.commit()
            finally:
                db.close()
        error_text = str(e)
        yield (
            _build_status_html("error", error_text[:60]),
            _build_progress_html(current_phase),
            _make_citations_interactive(f"**Error:** {error_text}", accumulated_df), accumulated_df, "", ref_md, stats_html,
            accumulated_report,
        )
    finally:
        # Clear the shared reference only if it still points at this run's
        # pipeline, so a run started meanwhile keeps its stop/pause controls.
        if _active_pipeline is pipeline:
            _active_pipeline = None
        await pipeline.close()
# ══════════════════════════════════════════════════════════════
# SÍNTESIS HANDLER
# ══════════════════════════════════════════════════════════════
async def synthesis_handler(
    query, docs_text, provider, search_model, synthesis_model,
    translation_model, profile, include_validation,
    enable_dme=True, synthesis_strategy="auto",
    grade_mode="original", geo_context="Automático",
):
    """Synthesize a report from user-supplied documents and stream UI updates.

    Async generator. Every yielded value is an 8-tuple, always in this order:
    status HTML, progress HTML, rendered report HTML, documents DataFrame
    (always the empty frame here), section cards HTML, references,
    stats HTML, raw report markdown.

    Args:
        query: Topic or title; blank input yields a single error tuple.
        docs_text: Pasted document list; blank input yields an error tuple.
        provider: Key into ``PROVIDERS``; its ``env_key`` names the
            environment variable holding the API key.
        search_model, synthesis_model, translation_model: Forwarded to
            ``ResearchPipeline``.
        profile, include_validation, enable_dme, synthesis_strategy,
        grade_mode, geo_context: Forwarded unchanged to ``pipeline.run``,
            which is called with ``sources=[]`` and ``iterations=0``.
    """
    import pandas as pd

    empty_df = pd.DataFrame(columns=["Título", "Autores", "Año", "DOI", "Fuente", "GRADE", "PDF URL"])
    ref_md = "_Sin referencias disponibles aún..._"
    stats_html = _build_stats_html("", empty_df)

    if not query or not query.strip():
        gr.Warning("Ingrese un tema/título")
        # The report output is an HTML component, so messages are rendered
        # through the same helper the other handlers use.
        yield (
            _build_status_html("error", "Sin consulta"), _build_progress_html(-1),
            _make_citations_interactive("**Error:** Ingrese un tema o título para la síntesis.", empty_df),
            empty_df, "", ref_md, stats_html, "",
        )
        return

    if not docs_text or not docs_text.strip():
        gr.Warning("Ingrese al menos 5 documentos")
        yield (
            _build_status_html("error", "Sin documentos"), _build_progress_html(-1),
            _make_citations_interactive("**Error:** Pegue la lista de documentos en el campo de texto.", empty_df),
            empty_df, "", ref_md, stats_html, "",
        )
        return

    env_key = PROVIDERS.get(provider, {}).get("env_key", "")
    api_key = os.getenv(env_key, "") if env_key else ""
    if not api_key:
        shown_key = env_key or "?"
        gr.Warning(f"No hay API key para {provider}. Configure {shown_key} en .env")
        yield (
            _build_status_html("error", "API key faltante"), _build_progress_html(-1),
            _make_citations_interactive(
                f"**Error:** No hay API key para {provider}. Configure `{shown_key}` en .env",
                empty_df,
            ),
            empty_df, "", ref_md, stats_html, "",
        )
        return

    pipeline = ResearchPipeline(
        provider=provider, search_model=search_model,
        synthesis_model=synthesis_model, translation_model=translation_model,
        api_key=api_key,
    )

    accumulated_report = ""
    current_phase = 0
    # Pre-bind the loop target: it is read after the loop, which can be
    # reached without the pipeline having yielded anything.
    docs_df = None
    try:
        async for report_md, docs_df in pipeline.run(
            query=query.strip(), sources=[], profile=profile,
            iterations=0, include_validation=include_validation,
            docs_text=docs_text, enable_dme=enable_dme,
            synthesis_strategy=synthesis_strategy,
            grade_mode=grade_mode, geo_context=geo_context,
        ):
            accumulated_report = report_md
            current_phase = _detect_phase(report_md)

            sections_map = _parse_sections_from_report(accumulated_report)
            sections_content = _build_section_cards_html(sections_map)
            ref_md = _build_references_html(docs_df, accumulated_report)
            stats_html = _build_stats_html(accumulated_report, empty_df)
            cite_df = empty_df if docs_df is None else docs_df
            yield (
                _build_status_html("running", "Sintetizando"),
                _build_progress_html(current_phase),
                _make_citations_interactive(accumulated_report, cite_df), empty_df,
                sections_content, ref_md, stats_html, accumulated_report,
            )

        # Pipeline exhausted: emit the final "done" snapshot.
        sections_map = _parse_sections_from_report(accumulated_report)
        sections_content = _build_section_cards_html(sections_map, is_done=True)
        ref_md = _build_references_html(docs_df, accumulated_report)
        stats_html = _build_stats_html(accumulated_report, empty_df)
        cite_df = empty_df if docs_df is None else docs_df
        yield (
            _build_status_html("done", "Síntesis completada"),
            _build_progress_html(7),
            _make_citations_interactive(accumulated_report, cite_df), empty_df,
            sections_content, ref_md, stats_html, accumulated_report,
        )
    except Exception as e:
        error_text = str(e)
        # Eight values, like every other yield: the click binding declares
        # eight outputs, and the last one carries the raw markdown so far.
        yield (
            _build_status_html("error", error_text[:60]),
            _build_progress_html(current_phase),
            _make_citations_interactive(f"**Error:** {error_text}", empty_df),
            empty_df, "", ref_md, stats_html,
            accumulated_report,
        )
    finally:
        await pipeline.close()
# ══════════════════════════════════════════════════════════════
# HELPER: Build a premium tab section (shared layout)
# ══════════════════════════════════════════════════════════════
def _build_research_panel(prefix, title, subtitle, btn_label, handler_fn, is_super=False):
"""Build the shared two-column research panel and return its components.

Must be called inside an open Gradio layout context, since it creates
components directly. The left column holds the status/progress widgets and
every input control; the right column holds the result tabs (report,
references, sections, stats, documents, graph, export).

Args:
prefix: Accepted but not referenced in this function's body.
title: Text shown as the banner heading.
subtitle: Text shown under the banner heading.
btn_label: Label of the main run button.
handler_fn: Accepted but not referenced in this function's body; the
caller binds the handler to the returned button itself.
is_super: Switches the banner icon and the iterations slider
(2-5 "Rondas", default 3, instead of 1-5 "Iteraciones", default 1).

Returns:
A 28-tuple: index 0 is the run button, indices 1-19 are the input
components, indices 20-27 are the output components
(status, progress, report, docs, sections, refs, stats, report_md_state).
"""
# ─── Header banner ───
gr.HTML(f'''
<div style="
display:flex; justify-content:space-between; align-items:center;
padding:14px 20px; margin-bottom:12px;
background:linear-gradient(135deg, rgba(139,92,246,0.08), rgba(99,102,241,0.04));
border:1px solid rgba(139,92,246,0.2); border-radius:14px;
">
<div style="display:flex; align-items:center; gap:12px;">
<div style="
width:40px; height:40px; border-radius:12px;
background:linear-gradient(135deg, #8b5cf6, #6366f1);
display:flex; align-items:center; justify-content:center;
font-size:20px; box-shadow:0 4px 15px rgba(139,92,246,0.3);
">{"🚀" if is_super else "🔬"}</div>
<div>
<div style="font-size:16px; font-weight:700; color:var(--text, #fff);">
{title}
</div>
<div style="font-size:11px; color:var(--text-muted, #9ca3af);">
{subtitle}
</div>
</div>
</div>
<div style="display:flex; gap:8px;">
<span style="
display:inline-flex; align-items:center; gap:5px;
padding:4px 12px; border-radius:20px; font-size:11px; font-weight:600;
background:rgba(139,92,246,0.1); border:1px solid rgba(139,92,246,0.3); color:#8b5cf6;
">Pipeline v2.0</span>
</div>
</div>
''')
with gr.Row():
# ─── LEFT: Controls ───
with gr.Column(scale=2):
status = gr.HTML(_build_status_html("idle"))
progress = gr.HTML(_build_progress_html(-1, "Esperando consulta..."))
gr.HTML('''<div class="section-header">💬 Consulta de investigación</div>''')
query = gr.Textbox(
label="",
placeholder="Ej: Impacto de la IA en la educación superior en Perú",
lines=3, show_label=False,
elem_classes=["glass-input-wrapper"]
)
with gr.Row():
prov = gr.Dropdown(
choices=list(PROVIDERS.keys()), value="mistral",
label="⚡ Proveedor IA", scale=1,
)
# The three model dropdowns start with the "mistral" model list, matching
# the provider dropdown's initial value.
with gr.Accordion("🤖 Modelos por Rol", open=False):
search_m = gr.Dropdown(
choices=PROVIDERS["mistral"]["models"],
value=DEFAULT_MODEL, label="🔍 Búsqueda",
)
synth_m = gr.Dropdown(
choices=PROVIDERS["mistral"]["models"],
value=DEFAULT_MODEL, label="📝 Síntesis",
)
trans_m = gr.Dropdown(
choices=PROVIDERS["mistral"]["models"],
value=DEFAULT_MODEL, label="🌐 Traducción",
)
# A provider change calls update_models, whose outputs are the three
# model dropdowns.
prov.change(
fn=update_models, inputs=[prov],
outputs=[search_m, synth_m, trans_m],
)
with gr.Accordion("📚 Parámetros de Búsqueda", open=False):
# Every group and individual source is pre-selected.
src = gr.CheckboxGroup(
choices=ALL_SOURCES, value=ALL_SOURCES, label="Fuentes", show_label=False,
)
gr.HTML('''
<div style="display:flex; gap:6px; flex-wrap:wrap; margin:6px 0;">
<span style="font-size:10px; padding:2px 8px; border-radius:6px; background:rgba(59,130,246,0.08); border:1px solid rgba(59,130,246,0.2); color:#3b82f6;">all = todas</span>
<span style="font-size:10px; padding:2px 8px; border-radius:6px; background:rgba(34,197,94,0.08); border:1px solid rgba(34,197,94,0.2); color:#22c55e;">latam = Latinoamérica</span>
<span style="font-size:10px; padding:2px 8px; border-radius:6px; background:rgba(168,85,247,0.08); border:1px solid rgba(168,85,247,0.2); color:#a855f7;">global = PubMed+ArXiv+OpenAlex</span>
</div>
''')
with gr.Row():
prof = gr.Dropdown(
choices=list(AGENT_PROFILES.keys()),
value="auto", label="🎭 Perfil",
)
dep = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="📏 Profundidad")
# Both branches bind the same name so the returned tuple has one slot for it.
if is_super:
iters = gr.Slider(minimum=2, maximum=5, value=3, step=1, label="🔄 Rondas")
else:
iters = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="🔄 Iteraciones")
with gr.Accordion("🔧 Opciones Avanzadas", open=False):
geo = gr.Textbox(value="Automático", label="📍 Contexto Geográfico (País/Universidad)", placeholder="Ej: Perú, Universidad Nacional del Santa")
val = gr.Checkbox(value=True, label="🔬 Validación de citas (ARA+)")
dme = gr.Checkbox(value=True, label="🔧 DME: Reparación + Enriquecimiento")
strat = gr.Radio(
choices=["lineal", "jerárquica", "auto"],
value="jerárquica", label="📐 Estrategia de Síntesis",
)
grade_mode = gr.Radio(
choices=["original", "keywords", "llm", "oxford", "hybrid"],
value="original", label="📊 Algoritmo GRADE",
info="original: Beta SX | keywords: Rápido | llm: IA Preciso | oxford: CEBM | hybrid: Mixto",
)
with gr.Row():
yr_s = gr.Textbox(label="📅 Año inicio", placeholder="2020")
yr_e = gr.Textbox(label="📅 Año fin", placeholder="2025")
uni = gr.Textbox(label="🏛️ Universidad", placeholder="Ej: UNMSM")
inf_out = gr.Checkbox(value=True, label="♾️ Output Infinito")
max_cont = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="🔁 Max Continuaciones")
# Main run button; its click event is bound by the caller, not here.
btn = gr.Button(
btn_label, variant="primary", size="lg",
elem_classes=["ejecutar-btn"]
)
# ─── Control Buttons (Stop/Pause/Resume) ───
with gr.Row():
pause_btn = gr.Button(
"⏸️ Pausar", size="sm", variant="secondary",
elem_classes=["control-btn-pause"]
)
resume_btn = gr.Button(
"▶️ Reanudar", size="sm", variant="secondary",
elem_classes=["control-btn-resume"]
)
stop_btn = gr.Button(
"⛔ Detener", size="sm", variant="stop",
elem_classes=["control-btn-stop"]
)
# ─── RIGHT: Results ───
with gr.Column(scale=3):
with gr.Tabs():
with gr.TabItem("📄 Informe"):
report = gr.HTML(_make_citations_interactive("", None))
with gr.TabItem("📚 Referencias"):
refs = gr.HTML("_Las referencias aparecerán durante la ejecución..._")
with gr.TabItem("📑 Secciones"):
sections = gr.HTML(_build_section_cards_html({}))
with gr.TabItem("📊 Estadísticas"):
stats = gr.HTML(_build_stats_html("", None))
with gr.TabItem("📋 Documentos"):
docs = gr.Dataframe(
headers=["Título", "Autores", "Año", "DOI", "Fuente", "GRADE", "PDF URL"],
label="Documentos Encontrados", wrap=True,
)
with gr.TabItem("🌐 Grafo"):
graph_btn = gr.Button("🌐 Generar Grafo de Relaciones", size="sm", elem_classes=["ejecutar-btn"])
graph_html = gr.HTML('''<div style="text-align:center; padding:30px; color:#6b7280;">
<div style="font-size:36px; margin-bottom:8px;">🌐</div>
<div style="font-size:13px;">Haz clic en el botón para generar el grafo.</div>
</div>''')
# The graph is generated on demand from the current documents table.
graph_btn.click(fn=_generate_graph_from_df, inputs=[docs], outputs=[graph_html])
# String state, initially empty; it is the last element of the returned
# tuple and feeds the Markdown/Word/ZIP exports and the chat tabs below.
report_md_state = gr.State("")
with gr.TabItem("📥 Exportar"):
gr.HTML('''<div style="padding:12px; background:rgba(99,102,241,0.06); border:1px solid rgba(99,102,241,0.2); border-radius:12px; margin-bottom:12px;">
<div style="font-size:14px; font-weight:600; color:#818cf8; margin-bottom:4px;">📥 Exportar Resultados</div>
<div style="font-size:11px; color:#9ca3af;">Descarga el informe y los documentos en distintos formatos.</div>
</div>''')
with gr.Row():
export_md_btn = gr.Button("📄 Markdown (.md)", size="sm", variant="secondary")
export_bib_btn = gr.Button("📚 BibTeX (.bib)", size="sm", variant="secondary")
with gr.Row():
export_docx_btn = gr.Button("📝 Word (.docx)", size="sm", variant="secondary")
export_zip_btn = gr.Button("📦 ZIP (Workspace)", size="sm", variant="primary")
export_file = gr.File(label="Archivo generado", visible=True)
from backend.tools.export_utils import export_markdown, export_bibtex, export_zip, export_docx
# Export callbacks: each clears the file output (value=None) when there is
# nothing to export, and falls back to a default name when the query is empty.
def _do_export_md(report_state, q):
if not report_state: return gr.update(value=None)
return export_markdown(report_state, q or "research")
def _do_export_bib(docs_df, q):
if docs_df is None or docs_df.empty: return gr.update(value=None)
return export_bibtex(docs_df, q or "references")
def _do_export_docx(report_state, q):
if not report_state: return gr.update(value=None)
path = export_docx(report_state, q or "research")
# A falsy result from export_docx also clears the file output.
return path if path else gr.update(value=None)
def _do_export_zip(report_state, docs_df, q):
if not report_state: return gr.update(value=None)
import pandas as pd
# export_zip is given an empty DataFrame rather than None.
if docs_df is None:
docs_df = pd.DataFrame()
return export_zip(report_state, docs_df, q or "research")
export_md_btn.click(fn=_do_export_md, inputs=[report_md_state, query], outputs=[export_file])
export_bib_btn.click(fn=_do_export_bib, inputs=[docs, query], outputs=[export_file])
export_docx_btn.click(fn=_do_export_docx, inputs=[report_md_state, query], outputs=[export_file])
export_zip_btn.click(fn=_do_export_zip, inputs=[report_md_state, docs, query], outputs=[export_file])
# Create chat tabs
from modules.chat_tab import create_chat_tabs
create_chat_tabs(report_md_state, docs, prov, synth_m)
# Wire control buttons
# Each control callback takes no inputs and writes only to the status widget.
stop_btn.click(fn=_control_stop, outputs=[status])
pause_btn.click(fn=_control_pause, outputs=[status])
resume_btn.click(fn=_control_resume, outputs=[status])
# Return all components needed for event binding
# Order matters: callers slice this tuple positionally
# (index 0 = button, 1-19 = inputs, 20-27 = outputs).
return (btn, query, prov, search_m, synth_m, trans_m, prof, dep, iters,
val, src, dme, strat, yr_s, yr_e, uni, inf_out, max_cont, grade_mode, geo,
status, progress, report, docs, sections, refs, stats, report_md_state)
# ══════════════════════════════════════════════════════════════
# UI TAB
# ══════════════════════════════════════════════════════════════
def create_research_tab():
"""Build the "Research" top-level tab with its three sub-tabs.

The Research and Super Research sub-tabs are produced by
_build_research_panel and bound to research_handler and
super_research_handler respectively. The Síntesis sub-tab is laid out
inline here and bound to synthesis_handler.
"""
with gr.Tab("🔬 Research", id="research"):
# Keyframe definitions (pulse, slideIn, fadeIn) injected once for this tab.
gr.HTML('''<style>
@keyframes pulse { 0%,100%{opacity:1} 50%{opacity:.6} }
@keyframes slideIn { from{opacity:0;transform:translateY(8px)} to{opacity:1;transform:translateY(0)} }
@keyframes fadeIn { from{opacity:0} to{opacity:1} }
</style>''')
with gr.Tabs():
# ─── RESEARCH ───
with gr.TabItem("🔬 Research"):
r = _build_research_panel(
"r", "Research Pipeline",
"Búsqueda iterativa + síntesis con IA en tiempo real",
"🚀 Ejecutar Research", research_handler, is_super=False
)
# r[0] is the run button; r[1:20] are the 19 inputs in the handler's
# positional parameter order; r[20:28] are the 8 outputs in yield order.
r[0].click(
fn=research_handler,
inputs=list(r[1:20]),
outputs=list(r[20:28]),
)
# ─── SUPER RESEARCH ───
with gr.TabItem("🚀 Super Research"):
s = _build_research_panel(
"s", "Super Research Pipeline",
"Investigación profunda multi-ronda con validación cruzada",
"⚡ Ejecutar Super Research", super_research_handler, is_super=True
)
# Same positional slicing as the Research sub-tab above.
s[0].click(
fn=super_research_handler,
inputs=list(s[1:20]),
outputs=list(s[20:28]),
)
# ─── SÍNTESIS ───
with gr.TabItem("📝 Síntesis"):
gr.HTML('''
<div style="
display:flex; justify-content:space-between; align-items:center;
padding:14px 20px; margin-bottom:12px;
background:linear-gradient(135deg, rgba(16,185,129,0.08), rgba(6,182,212,0.04));
border:1px solid rgba(16,185,129,0.2); border-radius:14px;
">
<div style="display:flex; align-items:center; gap:12px;">
<div style="
width:40px; height:40px; border-radius:12px;
background:linear-gradient(135deg, #10b981, #06b6d4);
display:flex; align-items:center; justify-content:center;
font-size:20px; box-shadow:0 4px 15px rgba(16,185,129,0.3);
">📝</div>
<div>
<div style="font-size:16px; font-weight:700; color:var(--text, #fff);">
Síntesis de Documentos
</div>
<div style="font-size:11px; color:var(--text-muted, #9ca3af);">
Generar informe a partir de documentos proporcionados
</div>
</div>
</div>
</div>
''')
with gr.Row():
# Left column: status widgets and input controls.
with gr.Column(scale=2):
y_status = gr.HTML(_build_status_html("idle"))
y_progress = gr.HTML(_build_progress_html(-1, "Esperando consulta..."))
gr.HTML('''<div class="section-header">💬 Tema / Título</div>''')
y_query = gr.Textbox(
label="", show_label=False,
placeholder="Ej: Marco teórico sobre gestión del conocimiento",
lines=2, elem_classes=["glass-input-wrapper"]
)
gr.HTML('''<div class="section-header" style="margin-top:8px;">📄 Documentos</div>''')
# Free-text document list; passed to synthesis_handler as docs_text.
y_docs = gr.Textbox(
label="", show_label=False,
placeholder="[1] García (2023) - Gestión del conocimiento en Perú\n[2] Smith (2022) - Knowledge management systems\n[3] López (2024) - Bases de datos académicas",
lines=8, elem_classes=["glass-input-wrapper"]
)
y_provider = gr.Dropdown(
choices=list(PROVIDERS.keys()), value="mistral",
label="⚡ Proveedor IA",
)
with gr.Accordion("🤖 Modelos por Rol", open=False):
y_search_model = gr.Dropdown(
choices=PROVIDERS["mistral"]["models"],
value=DEFAULT_MODEL, label="🔍 Búsqueda",
)
y_synthesis_model = gr.Dropdown(
choices=PROVIDERS["mistral"]["models"],
value=DEFAULT_MODEL, label="📝 Síntesis",
)
y_translation_model = gr.Dropdown(
choices=PROVIDERS["mistral"]["models"],
value=DEFAULT_MODEL, label="🌐 Traducción",
)
# A provider change calls update_models, whose outputs are the three
# model dropdowns.
y_provider.change(
fn=update_models, inputs=[y_provider],
outputs=[y_search_model, y_synthesis_model, y_translation_model],
)
with gr.Accordion("🔧 Opciones Avanzadas", open=False):
with gr.Row():
y_profile = gr.Dropdown(
choices=list(AGENT_PROFILES.keys()),
value="auto", label="🎭 Perfil",
)
y_validation = gr.Checkbox(value=True, label="🔬 Validación ARA+")
y_geo = gr.Textbox(value="Automático", label="📍 Contexto Geográfico (País/Universidad)", placeholder="Ej: Perú, Universidad Nacional del Santa")
y_enable_dme = gr.Checkbox(value=True, label="🔧 DME")
y_synthesis_strategy = gr.Radio(
choices=["lineal", "jerárquica", "auto"],
value="jerárquica", label="📐 Estrategia",
)
y_grade_mode = gr.Radio(
choices=["original", "keywords", "llm", "oxford", "hybrid"],
value="original", label="📊 Algoritmo GRADE",
)
y_btn = gr.Button(
"📝 Ejecutar Síntesis", variant="primary", size="lg",
elem_classes=["ejecutar-btn"]
)
# Right column: result tabs.
with gr.Column(scale=3):
with gr.Tabs():
with gr.TabItem("📄 Informe"):
y_report = gr.HTML(_make_citations_interactive("", None))
with gr.TabItem("📚 Referencias"):
y_refs = gr.Markdown("_Las referencias aparecerán aquí..._")
with gr.TabItem("📑 Secciones"):
y_sections = gr.HTML(_build_section_cards_html({}))
with gr.TabItem("📊 Estadísticas"):
y_stats = gr.HTML(_build_stats_html("", None))
with gr.TabItem("📋 Documentos"):
y_docs_out = gr.Dataframe(
headers=["Título", "Autores", "Año", "DOI", "Fuente", "GRADE", "PDF URL"],
label="Documentos", wrap=True,
)
# String state, initially empty; it is the eighth output of the click binding.
y_report_md_state = gr.State("")
# The 12 inputs follow synthesis_handler's positional parameter order and
# the 8 outputs follow the order of the tuples it yields.
y_btn.click(
fn=synthesis_handler,
inputs=[
y_query, y_docs, y_provider, y_search_model,
y_synthesis_model, y_translation_model, y_profile,
y_validation, y_enable_dme, y_synthesis_strategy,
y_grade_mode, y_geo,
],
outputs=[
y_status, y_progress, y_report, y_docs_out,
y_sections, y_refs, y_stats, y_report_md_state,
],
)