"""Journal recommender over a Scopus corpus, served through a Gradio UI.

Importing this module has side effects: it reads ``PARQUET_PATH``, stacks the
stored document embeddings into matrices, loads the E5 sentence-transformer
model and builds the Gradio ``demo`` object.  The SPECTER model is loaded
lazily on first use.
"""
import base64
import io
import json
import os
import tempfile
from string import Template

import gradio as gr
import networkx as nx
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from networkx.algorithms.community import greedy_modularity_communities
from sentence_transformers import SentenceTransformer

# =========================
# Config
# =========================
PARQUET_PATH = "scopus_corpus.parquet"  # use the enriched parquet if SPECTER embeddings were already generated
MODEL_NAME_E5 = "intfloat/multilingual-e5-small"  # fast retriever
MODEL_NAME_SPECTER = "allenai-specter"  # scientific-paper embeddings
qry_prefix = "query: "

# =========================
# Dataset loading
# =========================
table = pq.read_table(PARQUET_PATH)
df = table.to_pandas()

# E5 document embeddings, one row per row of ``df`` (same order).
# NOTE(review): dot products below are used as similarities, which presumes
# the stored vectors are L2-normalised — confirm against the export script.
embeddings = np.vstack(df["embedding"].to_list()).astype("float32")

# SPECTER document embeddings, only when the parquet carries them.
specter_embs = None
if "specter_embedding" in df.columns:
    specter_embs = np.vstack(df["specter_embedding"].to_list()).astype("float32")
SPECTER_AVAILABLE = specter_embs is not None

# =========================
# Models (E5 eager, SPECTER lazy)
# =========================
model_e5 = SentenceTransformer(MODEL_NAME_E5, device="cpu")
_model_specter = None


def get_specter():
    """Return the SPECTER model, loading it on the first call only."""
    global _model_specter
    if _model_specter is None:
        _model_specter = SentenceTransformer(MODEL_NAME_SPECTER, device="cpu")
    return _model_specter


# =========================
# Shared helpers
# =========================
def _combine_query(title: str, abstract: str) -> str:
    """Join title and abstract as ``"<title>. <abstract>"``; title alone if no abstract."""
    t = (title or "").strip()
    a = (abstract or "").strip()
    return t if not a else f"{t}. {a}"


def _parse_year(value):
    """Return ``value`` as an int, or ``None`` when it is empty or not an integer."""
    text = str(value or "").strip()
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        # Best effort: an unparsable bound is ignored rather than rejected.
        return None


def _year_positions(min_year, max_year) -> np.ndarray:
    """Return the positional row numbers of ``df`` inside the requested year range.

    Each bound is optional and parsed independently.  Rows without a year are
    excluded as soon as either bound is active.  Positions (not index labels)
    are returned so they can index the embedding matrices safely whatever
    index the parquet file restored.
    """
    y0 = _parse_year(min_year)
    y1 = _parse_year(max_year)
    keep = np.ones(len(df), dtype=bool)
    if y0 is not None:
        keep &= (df["year"].fillna(-1) >= y0).to_numpy(dtype=bool)
    if y1 is not None:
        keep &= (df["year"].fillna(99999) <= y1).to_numpy(dtype=bool)
    return np.flatnonzero(keep)


def _query_similarities(text: str, positions: np.ndarray,
                        use_specter: bool, alpha_e5: float) -> np.ndarray:
    """Score the documents at ``positions`` against ``text``.

    Uses E5 alone, or ``alpha_e5 * E5 + (1 - alpha_e5) * SPECTER`` when
    ``use_specter`` is set and SPECTER document embeddings are available.
    """
    q_e5 = model_e5.encode([qry_prefix + text], normalize_embeddings=True)[0].astype("float32")
    sims = embeddings[positions] @ q_e5
    if use_specter and specter_embs is not None:
        q_spc = get_specter().encode([text], normalize_embeddings=True)[0].astype("float32")
        alpha = float(alpha_e5)
        sims = alpha * sims + (1 - alpha) * (specter_embs[positions] @ q_spc)
    return sims


# =========================
# Recommendation (table)
# =========================
def recommend(query_title: str, query_abstract: str,
              k_articles: int = 300, top_n: int = 10,
              min_year: str = "", max_year: str = "",
              use_specter: bool = False, alpha_e5: float = 0.6):
    """Rank journals by how similar their articles are to the query.

    The ``k_articles`` most similar articles (optionally restricted by year)
    are grouped by journal; each journal is scored with a blend of mean
    similarity, best similarity and the log of the number of matching
    articles.  Returns a DataFrame with the ``top_n`` journals, or a
    one-column ``"Mensaje"`` DataFrame when the request cannot be served.
    """
    query = _combine_query(query_title, query_abstract)
    if len(query) < 5:
        return pd.DataFrame({"Mensaje": ["Escribe un título (≥ 5 caracteres)."]})

    # Optional year filter.
    sub_pos = _year_positions(min_year, max_year)
    if sub_pos.size == 0:
        return pd.DataFrame({"Mensaje": ["No hay artículos en el rango de años solicitado."]})

    sims = _query_similarities(query, sub_pos, use_specter, alpha_e5)

    # Top-k most similar articles (unordered; ordering is not needed here).
    k = min(int(k_articles), len(sub_pos))
    if k <= 0:
        return pd.DataFrame({"Mensaje": ["No hay artículos para comparar."]})
    top_local = np.argpartition(-sims, k - 1)[:k]
    # Fresh 0..k-1 labels so idxmax/.loc below pick exactly one row per journal.
    top_rows = df.iloc[sub_pos[top_local]].copy().reset_index(drop=True)
    top_rows["sim"] = sims[top_local]

    # Aggregate per journal.  groupby drops rows whose key is NaN by default,
    # which would discard every journal lacking an ISSN or eISSN, so missing
    # keys are replaced by an empty string first.
    grp_cols = ["source_title", "issn", "eissn"]
    for col in grp_cols:
        top_rows[col] = top_rows[col].astype(object).where(top_rows[col].notna(), "")

    best_idx = top_rows.groupby(grp_cols)["sim"].idxmax()
    agg = (top_rows.groupby(grp_cols)
           .agg(score=("sim", "mean"), best=("sim", "max"), n=("sim", "size"))
           .reset_index())

    # Details of each journal's best-matching article (only columns that exist).
    extra_cols = ["title", "doi", "link", "year", "Document Type", "Open Access"]
    extra_cols_present = [c for c in extra_cols if c in top_rows.columns]
    best_titles = top_rows.loc[best_idx, grp_cols + extra_cols_present].set_index(grp_cols)
    agg = agg.merge(best_titles, left_on=grp_cols, right_index=True, how="left")

    # Hybrid ranking.
    agg["rank"] = agg["score"] * 0.8 + agg["best"] * 0.2 + np.log1p(agg["n"]) * 0.02

    out = (
        agg.sort_values("rank", ascending=False)
        .head(int(top_n))
        .rename(columns={
            "source_title": "Revista",
            "issn": "ISSN",
            "eissn": "eISSN",
            "n": "#similitudes",
            "year": "Año",
            "score": "Score medio",
            "best": "Mejor similitud",
            "title": "Título del artículo",
            "doi": "DOI",
            "link": "Link",
        })
    )

    if "Año" in out.columns:
        # Missing years are shown as blanks rather than 0.
        out["Año"] = out["Año"].fillna(0).astype(int).replace(0, "")

    cols = ["Revista", "Año", "ISSN", "eISSN", "#similitudes", "Score medio", "Mejor similitud",
            "Título del artículo", "DOI", "Link", "Document Type", "Open Access"]
    out = out[[c for c in cols if c in out.columns]].copy()
    for score_col in ("Score medio", "Mejor similitud"):
        if score_col in out.columns:
            out[score_col] = out[score_col].round(3)
    return out


# =========================
# Interactive graph (vis-network in an iframe)
# =========================
def build_similarity_network_html(query_title: str, query_abstract: str,
                                  k_articles: int, min_year: str, max_year: str,
                                  use_specter: bool = False, alpha_e5: float = 0.6,
                                  top_nodes: int = 15, doc_edge_threshold: float = 0.35) -> str:
    """Build the HTML for the article-similarity network around the query.

    Selects the ``top_nodes`` articles most similar to the query (out of the
    ``k_articles`` best within the optional year range), computes pairwise
    article similarities, detects communities on the edges at or above
    ``doc_edge_threshold`` and serialises nodes, edges and options as JSON
    into the page template.  Returns a short message string when there is not
    enough data.

    NOTE(review): in the reviewed copy several string literals in this
    function (messages, tooltip, page template and the final return value)
    look as if their HTML markup was stripped.  They are reproduced exactly as
    seen; restore the markup from the original source before relying on this
    function's output.
    """
    qtxt = _combine_query(query_title, query_abstract)
    if len(qtxt) < 5:
        return "\n\nEscribe un título (≥ 5 caracteres).\n\n"

    # ---- Year filter ----
    sub_pos = _year_positions(min_year, max_year)
    if sub_pos.size == 0:
        return "\n\nNo hay artículos en el rango de años solicitado.\n\n"

    # ---- Similarity to the query (drives node size) ----
    ns = _query_similarities(qtxt, sub_pos, use_specter, alpha_e5)

    # ---- Top-k by similarity, then the best `top_nodes` of those ----
    too_few = "\n\nNo hay suficientes artículos para graficar la red.\n\n"
    k = min(int(k_articles), len(sub_pos))
    if k <= 0:
        return too_few
    top_local = np.argpartition(-ns, k - 1)[:k]
    order = top_local[np.argsort(-ns[top_local], kind="stable")][: int(top_nodes)]
    if len(order) < 2:
        return too_few

    node_pos = sub_pos[order]  # positions in df / embedding matrices
    top_rows = df.iloc[node_pos].copy()
    top_rows["sim_to_query"] = ns[order]

    # ---- Article–article similarities ----
    # E5 by default; SPECTER when enabled and available.
    if use_specter and specter_embs is not None:
        pair_mat = specter_embs[node_pos]
    else:
        pair_mat = embeddings[node_pos]
    pair_sims = pair_mat @ pair_mat.T

    # ---- Colour by year (teal gradient) ----
    years = top_rows["year"].fillna(0).astype(int).to_numpy()
    y_valid = years[years > 0]
    y_min, y_max = (int(y_valid.min()), int(y_valid.max())) if len(y_valid) else (2000, 2025)

    def teal_year_color(y: int) -> str:
        # Older (or unknown) years are lighter, newer years more saturated.
        t = 0.0 if (not y or y <= 0 or y_max == y_min) else (y - y_min) / (y_max - y_min)
        h = 170
        s = int(35 + 35 * t)
        l = int(85 - 30 * t)
        return f"hsl({h}, {s}%, {l}%)"

    # ---- Node ids: computed once so nodes, edges and communities agree ----
    def node_id(label, row) -> str:
        raw = row.get("eid", None)
        if raw is None or pd.isna(raw) or str(raw) == "":
            return str(label)  # fall back to the row's index label
        return str(raw)

    ids = [node_id(label, row) for label, row in top_rows.iterrows()]
    n = len(ids)
    threshold = float(doc_edge_threshold)
    pairs = [(i, j, float(pair_sims[i, j])) for i in range(n) for j in range(i + 1, n)]

    # ---- Communities (clusters) for the colour-by-community mode ----
    Gc = nx.Graph()
    Gc.add_nodes_from(ids)
    for i, j, w in pairs:
        if w >= threshold:
            Gc.add_edge(ids[i], ids[j], weight=w)

    if Gc.number_of_edges():
        comms = list(greedy_modularity_communities(Gc, weight="weight"))
    else:
        comms = [set(ids)]
    node2comm = {nid: ci for ci, c in enumerate(comms) for nid in c}

    def pastel_palette(k, s=60, l=65):
        return [f"hsl({int(360*i/k)}, {s}%, {l}%)" for i in range(max(1, k))]

    comm_colors = pastel_palette(len(comms))
    group_colors = {str(i): comm_colors[i] for i in range(len(comms))}

    # ---- Nodes / edges for vis.js ----
    ns_nodes = top_rows["sim_to_query"].to_numpy(dtype=float)
    smin, smax = (float(ns_nodes.min()), float(ns_nodes.max())) if ns_nodes.size else (0.0, 0.0)

    def node_size(sim):
        # Linear map of similarity onto [14, 54]; constant when all are equal.
        if smax <= smin:
            return 18
        return 14 + 40 * (float(sim) - smin) / (smax - smin)

    nodes, edges = [], []
    nodes.append({
        "id": "QUERY", "label": "Consulta", "title": qtxt,
        "shape": "star", "size": 46, "color": "#e45756",
        "font": {"size": 16, "strokeWidth": 6, "strokeColor": "#ffffff"}
    })

    for eid, (_, row) in zip(ids, top_rows.iterrows()):
        title = str(row.get("title", ""))[:160]
        journal = str(row.get("source_title", ""))[:120]
        year = int(row.get("year", 0)) if pd.notna(row.get("year", None)) else 0
        doi = str(row.get("doi", "")) or ""
        # NOTE(review): `link` is not used by the tooltip as seen below; the
        # trailing "Abrir" was presumably an anchor pointing at it — confirm.
        link = str(row.get("link", "")) or ""
        sim = float(row.get("sim_to_query", 0.0))

        label = (journal or title)[:40] or "Artículo"
        tooltip = (
            f"{title}\n"
            f"Revista: {journal}\n"
            f"Año: {year if year>0 else 'N/D'}\n"
            f"Similitud con consulta: {sim:.3f}\n"
            f"DOI: {doi}\n"
            f"Abrir"
        )

        group = str(node2comm.get(eid, 0))
        nodes.append({
            "id": eid, "label": label, "title": tooltip,
            "size": node_size(sim), "year": year, "group": group,
            "colorYear": teal_year_color(year),
            "font": {"size": 14, "strokeWidth": 6, "strokeColor": "#ffffff"}
        })
        edges.append({
            "from": "QUERY", "to": eid, "value": sim,
            "width": 1 + 6*max(0.0, sim),
            "color": {"color": "#9fb7b3"}, "smooth": True
        })

    # Every article pair becomes an edge; those below the threshold start hidden.
    for i, j, w in pairs:
        edges.append({
            "from": ids[i], "to": ids[j], "value": w,
            "width": max(0.8, 3.0*(w-0.2)),
            "hidden": w < doc_edge_threshold,
            "color": {"color": "#b9c7c5"}, "smooth": True
        })

    options = {
        "interaction": {
            "hover": True, "multiselect": True, "dragNodes": True,
            "navigationButtons": False,
            "keyboard": {"enabled": True, "bindToWindow": True}
        },
        "physics": {
            "enabled": True, "solver": "forceAtlas2Based",
            "forceAtlas2Based": {
                "avoidOverlap": 0.4, "gravitationalConstant": -45,
                "centralGravity": 0.015, "springLength": 135,
                "springConstant": 0.055, "damping": 0.45
            },
            "stabilization": {"iterations": 140}
        },
        "nodes": {
            "shape": "dot", "borderWidth": 1,
            "shadow": {"enabled": True, "size": 8, "x": 0, "y": 1}
        },
        "edges": {
            "smooth": {"type": "continuous"}, "selectionWidth": 2,
            "shadow": {"enabled": True, "size": 6, "x": 0, "y": 1}
        }
    }

    # NOTE(review): as seen, the template only references $YMIN and $YMAX;
    # the NODES/EDGES/OPTIONS/GROUPCOLORS/THRESH values passed to substitute()
    # are ignored.  The vis-network page markup appears to be missing.
    tmpl = Template(r"""
Años:
$YMIN – $YMAX
""")
    html = tmpl.substitute(
        NODES=json.dumps(nodes),
        EDGES=json.dumps(edges),
        OPTIONS=json.dumps(options),
        GROUPCOLORS=json.dumps(group_colors),
        YMIN=y_min,
        YMAX=y_max,
        THRESH=f"{doc_edge_threshold:.2f}",
    )
    b64 = base64.b64encode(html.encode("utf-8")).decode("ascii")
    # NOTE(review): the returned literal is empty in the reviewed copy, so
    # `b64` is never used and callers always receive "".  Presumably this was
    # an iframe embedding the base64 document — restore from the original.
    return (
        f''
    )


# =========================
# Excel export
# =========================
def to_excel_file(*args):
    """Run ``recommend(*args)`` and save the table as an Excel file.

    The file is written into a fresh temporary directory on every call so
    that concurrent sessions never overwrite each other's download.  Returns
    the path of the generated ``recomendaciones.xlsx``.
    """
    table_df = recommend(*args)
    out_dir = tempfile.mkdtemp(prefix="recomendaciones_")
    path = os.path.join(out_dir, "recomendaciones.xlsx")
    table_df.to_excel(path, index=False)
    return path


# =========================
# Gradio UI
# =========================
with gr.Blocks(title="Recomendador de Revistas (Scopus)") as demo:
    gr.Markdown("## Investigaciones UPTC")
    # NOTE(review): this text also looks stripped of markup (a list seems to
    # be missing after "Con esas similitudes:"); reproduced as seen.
    gr.Markdown(
        """
¿Qué es un recomendador de revistas?
Es una herramienta que, a partir del título (y opcionalmente el resumen) de tu investigación, calcula su representación semántica y la compara con artículos indexados en Scopus. Con esas similitudes: Nota: esta herramienta no reemplaza la evaluación editorial; es una guía para identificar revistas afines.
""",
        elem_id="about-recommender"
    )

    # --- Main input ---
    with gr.Row():
        query = gr.Textbox(
            label="Título de investigación",
            lines=2,
            placeholder="Ej.: Detección temprana de fallas en motores usando aprendizaje profundo…"
        )
    with gr.Row():
        query_abs = gr.Textbox(
            label="Resumen (opcional)",
            lines=6,
            placeholder="Escribe un resumen para mejorar la coincidencia semántica…"
        )

    # --- Year filters ---
    with gr.Row():
        min_year = gr.Textbox(label="Año mínimo (opcional)", placeholder="2019")
        max_year = gr.Textbox(label="Año máximo (opcional)", placeholder="2025")

    # --- Top-k and number of journals ---
    with gr.Row():
        k_articles = gr.Slider(50, 1000, value=300, step=50, label="Artículos considerados (top-k)")
        top_n = gr.Slider(5, 20, value=10, step=1, label="Nº de revistas a mostrar")

    # --- Blend with SPECTER ---
    with gr.Row():
        use_specter = gr.Checkbox(
            label="Fusionar con SPECTER (mejor afinidad científica)",
            value=SPECTER_AVAILABLE
        )
        alpha_e5 = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label="Peso E5 (1−α = SPECTER)")

    # --- Buttons: always below the blend controls ---
    with gr.Row():
        btn = gr.Button("Recomendar")
        btn_net = gr.Button("Ver red de similitud")

    # --- Outputs ---
    out = gr.Dataframe(
        row_count=10,
        wrap=True,
        column_widths=[180, 60, 90, 90, 90, 90, 90, 250, 120, 120, 120, 100],
        label="Revistas recomendadas"
    )

    # Excel download button below the table.
    with gr.Row():
        download_btn = gr.Button("Descargar tabla en Excel")
        download_file = gr.File(label="Archivo Excel generado")

    out_net_html = gr.HTML(label="Grafo interactivo (explorable)")

    # Inputs shared by every handler that calls `recommend`.
    recommend_inputs = [query, query_abs, k_articles, top_n,
                        min_year, max_year, use_specter, alpha_e5]

    download_btn.click(fn=to_excel_file, inputs=recommend_inputs, outputs=download_file)

    # --- Actions (declared after 'out' and 'out_net_html' exist) ---
    btn.click(fn=recommend, inputs=recommend_inputs, outputs=out)
    query.submit(fn=recommend, inputs=recommend_inputs, outputs=out)
    query_abs.submit(fn=recommend, inputs=recommend_inputs, outputs=out)

    btn_net.click(
        fn=lambda qt, qa, ka, y0, y1, us, a: build_similarity_network_html(
            qt, qa, ka, y0, y1, use_specter=us, alpha_e5=a,
            top_nodes=15, doc_edge_threshold=0.35
        ),
        inputs=[query, query_abs, k_articles, min_year, max_year, use_specter, alpha_e5],
        outputs=[out_net_html]
    )


# --- Exported for offline evaluation ---
def embed_text_e5(title: str, abstract: str = ""):
    """Return the normalised float32 E5 query embedding of title (+ abstract)."""
    txt = _combine_query(title, abstract)
    return model_e5.encode([qry_prefix + txt], normalize_embeddings=True)[0].astype("float32")


if __name__ == "__main__":
    demo.launch()