from __future__ import annotations import os from pathlib import Path from typing import List import pandas as pd import networkx as nx import streamlit as st import plotly.express as px import plotly.graph_objects as go from pyvis.network import Network import streamlit.components.v1 as components HF_REPO_ID = os.environ.get("HF_REPO_ID", "") HF_TOKEN = os.environ.get("HF_TOKEN", "") st.set_page_config(page_title="CitationHub", page_icon="πŸ“š", layout="wide") ALLOWED_INTENTS = [ "background","uses","similarities","motivation", "differences","future_work","extends", ] INTENT_COLORS = { "background":"#94a3b8","uses":"#22c55e","similarities":"#3b82f6", "motivation":"#f59e0b","differences":"#ef4444", "future_work":"#8b5cf6","extends":"#06b6d4", } NODE_COLORS = { "seed_paper":"#111827","citing_paper":"#dbeafe","citation_event":"#fde68a", "journal":"#ede9fe","author":"#fee2e2","affiliation":"#fae8ff", "city":"#cffafe","country":"#ffedd5","field":"#e0e7ff","intent":"#dcfce7", } NODE_TYPE_COLORS = { "seed_paper":"#111827","citing_paper":"#3b82f6","citation_event":"#f59e0b", "journal":"#8b5cf6","author":"#ef4444","affiliation":"#ec4899", "city":"#06b6d4","country":"#f97316","field":"#6366f1","intent":"#22c55e", } DEFAULT_DATA_DIR = Path(os.environ.get( "CITATIONHUB_DATA_DIR", r"C:\Users\user\OneDrive\바탕 ν™”λ©΄\Citehub_huggingface\data", )) def fmt_num(x): try: return f"{int(x):,}" except: return "-" def _hf_download(filename: str) -> str: from huggingface_hub import hf_hub_download return hf_hub_download( repo_id=HF_REPO_ID, repo_type="dataset", filename=f"data/{filename}", token=HF_TOKEN or None, ) def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame: if HF_REPO_ID: return pd.read_parquet(_hf_download(filename)) return pd.read_parquet(data_dir / filename) def plotly_network_fig( nodes_df: pd.DataFrame, edges_df: pd.DataFrame, title: str = "", height: int = 750, seed_node_ids: list | None = None, ) -> go.Figure: """SVG 기반 Plotly λ„€νŠΈμ›Œν¬ κ·Έλž˜ν”„ β€” ν™•λŒ€ν•΄λ„ μ„ λͺ….""" G = nx.Graph() node_meta: dict = {} for _, row in nodes_df.iterrows(): nid = str(row["node_id"]) G.add_node(nid) node_meta[nid] = row for _, row in edges_df.iterrows(): s, t = str(row["source"]), str(row["target"]) if s in node_meta and t in node_meta: G.add_edge(s, t, edge_type=row.get("edge_type", "")) if len(G.nodes) == 0: return go.Figure() k = max(1.5, 3.0 / (len(G.nodes) ** 0.4)) pos = nx.spring_layout(G, seed=42, k=k, iterations=60) # ── edges ───────────────────────────────── ex, ey = [], [] for src, tgt in G.edges(): x0, y0 = pos.get(src, (0, 0)) x1, y1 = pos.get(tgt, (0, 0)) ex += [x0, x1, None] ey += [y0, y1, None] traces: list[go.BaseTraceType] = [ go.Scatter( x=ex, y=ey, mode="lines", line=dict(width=0.8, color="#cbd5e1"), hoverinfo="none", showlegend=False, ) ] # ── nodes grouped by type ───────────────── for ntype, color in NODE_TYPE_COLORS.items(): subset = nodes_df[nodes_df["node_type"] == ntype] if subset.empty: continue xs, ys, hovers, texts = [], [], [], [] for _, row in subset.iterrows(): nid = str(row["node_id"]) if nid not in pos: continue x, y = pos[nid] xs.append(x); ys.append(y) label = str(row.get("label", ""))[:50] texts.append(label if ntype == "seed_paper" else "") hovers.append( f"{label}
" f"Type: {ntype}
" f"DOI: {row.get('doi','') or '-'}
" f"Pub: {row.get('publication_name','') or '-'}
" f"Group: {row.get('group','') or '-'}" ) is_seed = ntype == "seed_paper" traces.append(go.Scatter( x=xs, y=ys, mode="markers+text" if is_seed else "markers", text=texts, textposition="top center", hovertext=hovers, hoverinfo="text", name=ntype, marker=dict( size=20 if is_seed else 10, color=color, line=dict(width=1.5 if is_seed else 0.5, color="white"), symbol="circle", ), )) fig = go.Figure(data=traces) fig.update_layout( title=dict(text=title, font=dict(size=14)), showlegend=True, legend=dict(title="Node type", itemsizing="constant"), hovermode="closest", height=height, margin=dict(l=0, r=0, t=40 if title else 10, b=0), paper_bgcolor="white", plot_bgcolor="#f8fafc", xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), yaxis=dict(showgrid=False, zeroline=False, showticklabels=False), ) return fig def plotly_ontology_fig(height: int = 750) -> go.Figure: """CitationHub μ˜¨ν†¨λ‘œμ§€ ꡬ쑰 β€” Plotly SVG.""" node_defs = [ ("seed", "Top5PctCitedPaper", "seed_paper"), ("event", "CitationEvent", "citation_event"), ("citing", "CitingPaper", "citing_paper"), ("intent", "Intent", "intent"), ("journal", "Journal", "journal"), ("author", "Author", "author"), ("affiliation", "Affiliation", "affiliation"), ("city", "City", "city"), ("country", "Country", "country"), ("field", "Field", "field"), ] edge_defs = [ ("event","citing","hasCitingPaper"), ("event","seed","hasCitedPaper"), ("event","intent","hasPrimaryIntent"), ("seed","journal","publishedInJournal"), ("seed","author","hasAuthor"), ("seed","affiliation","hasAffiliation"), ("seed","city","locatedInCity"), ("seed","country","locatedInCountry"), ("seed","field","belongsToField"), ] G = nx.DiGraph() for nid, _, _ in node_defs: G.add_node(nid) for s, t, _ in edge_defs: G.add_edge(s, t) pos = nx.spring_layout(G, seed=7, k=2.5, iterations=80) # edges + edge labels ex, ey = [], [] ann = [] for s, t, lbl in edge_defs: x0, y0 = pos[s]; x1, y1 = pos[t] ex += [x0, x1, None]; ey += [y0, y1, None] mx, my = (x0+x1)/2, (y0+y1)/2 ann.append(dict(x=mx, y=my, text=f"{lbl}", showarrow=False, font=dict(size=9, color="#64748b"), bgcolor="rgba(255,255,255,0.7)")) traces: list[go.BaseTraceType] = [ go.Scatter(x=ex, y=ey, mode="lines", line=dict(width=1.2, color="#94a3b8"), hoverinfo="none", showlegend=False) ] for nid, label, ntype in node_defs: x, y = pos[nid] color = NODE_TYPE_COLORS.get(ntype, "#94a3b8") traces.append(go.Scatter( x=[x], y=[y], mode="markers+text", text=[label], textposition="top center", hoverinfo="text", hovertext=f"{label}
Type: {ntype}", name=label, showlegend=False, marker=dict(size=22, color=color, line=dict(width=1.5, color="white")), textfont=dict(size=11), )) fig = go.Figure(data=traces) fig.update_layout( showlegend=False, hovermode="closest", height=height, annotations=ann, margin=dict(l=0, r=0, t=10, b=0), paper_bgcolor="white", plot_bgcolor="#f8fafc", xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), yaxis=dict(showgrid=False, zeroline=False, showticklabels=False), ) return fig def inject_fullscreen(html: str) -> str: extra = """
πŸ–± Scroll: zoom  |  Drag: pan  |  Click node: info
""" return html.replace("", extra + "") # ── 메인 데이터 λ‘œλ“œ (11개) ──────────────────────────────────── @st.cache_data(show_spinner=False) def load_data(data_dir_str: str): d = None if HF_REPO_ID else Path(data_dir_str) seed_df = _read("seed_cited_papers_normalized.parquet", d) events_df = _read("citation_events_normalized.parquet", d) citing_df = _read("citing_papers_normalized.parquet", d) authors_df = _read("authors.parquet", d) affiliations_df = _read("affiliations.parquet", d) aff_geo_df = _read("affiliation_geo.parquet", d) cities_df = _read("cities.parquet", d) countries_df = _read("countries.parquet", d) fields_df = _read("fields.parquet", d) intents_df = _read("intents.parquet", d) journals_df = _read("journals.parquet", d) seed = pd.DataFrame({ "seed_paper_id": seed_df["seed_paper_id"], "doi": seed_df.get("doi", pd.Series(dtype=str)).fillna(""), "title": seed_df.get("title", pd.Series(dtype=str)).fillna(""), "journal": seed_df.get("publication_name", pd.Series(dtype=str)).fillna(""), "author": seed_df.get("creator", pd.Series(dtype=str)).fillna(""), "affiliation": seed_df.get("affilname", pd.Series(dtype=str)).fillna(""), "city": seed_df.get("affiliation_city", pd.Series(dtype=str)).fillna(""), "country": seed_df.get("affiliation_country", pd.Series(dtype=str)).fillna(""), "field": seed_df.get("group", pd.Series(dtype=str)).fillna(""), "citedby_count": pd.to_numeric(seed_df.get("citedby_count"), errors="coerce").fillna(0).astype(int), "author_id": seed_df.get("author_id", pd.Series(dtype=object)), "affiliation_id": seed_df.get("affiliation_id", pd.Series(dtype=object)), "country_id": seed_df.get("country_id", pd.Series(dtype=object)), "field_id": seed_df.get("field_id", pd.Series(dtype=object)), "journal_id": seed_df.get("journal_id", pd.Series(dtype=object)), }) for col in ["title","doi","journal","field","country"]: seed[f"{col}_lc"] = seed[col].astype(str).str.lower() seed = seed.sort_values(["citedby_count","title"], ascending=[False,True]).reset_index(drop=True) events = pd.DataFrame({ "citation_event_id": events_df["citation_event_id"], "seed_paper_id": events_df["cited_seed_paper_id"], "citing_paper_id": events_df["citing_paper_id"], "citing_title": events_df.get("citing_title", pd.Series(dtype=str)).fillna(""), "citing_doi": events_df.get("citing_doi", pd.Series(dtype=str)).fillna(""), "citing_year": pd.to_numeric(events_df.get("citing_year"), errors="coerce"), "citing_venue": events_df.get("citing_venue", pd.Series(dtype=str)).fillna(""), "primary_intent": events_df.get("primary_intent", pd.Series(dtype=str)).fillna(""), "contexts": events_df.get("contexts"), "context_count": pd.to_numeric(events_df.get("context_count"), errors="coerce").fillna(0).astype(int), "intent_count": pd.to_numeric(events_df.get("intent_count"), errors="coerce").fillna(0).astype(int), "is_influential": events_df.get("is_influential", pd.Series(dtype=bool)).fillna(False), "field_id": events_df.get("field_id", pd.Series(dtype=object)), }) events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True) citing = pd.DataFrame({ "citing_paper_id": citing_df["citing_paper_id"], "doi": citing_df.get("doi", pd.Series(dtype=str)).fillna(""), "title": citing_df.get("title", pd.Series(dtype=str)).fillna(""), "year": pd.to_numeric(citing_df.get("year"), errors="coerce"), "venue": citing_df.get("venue", pd.Series(dtype=str)).fillna(""), "oa_pdf": citing_df.get("oa_pdf",pd.Series(dtype=str)).fillna(""), }) filters = { "fields": sorted([x for x in seed["field"].dropna().astype(str).unique() if x]), "countries": sorted([x for x in seed["country"].dropna().astype(str).unique() if x]), "journals": sorted([x for x in seed["journal"].dropna().astype(str).unique() if x]), "intents": ALLOWED_INTENTS, "year_min": int(events["citing_year"].dropna().min()) if events["citing_year"].notna().any() else 2000, "year_max": int(events["citing_year"].dropna().max()) if events["citing_year"].notna().any() else 2025, } overview = { "seed_papers": int(len(seed)), "citation_events": int(len(events)), "citing_papers": int(events["citing_paper_id"].nunique()), "authors": int(len(authors_df)), "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()), "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()), "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()), "intents": len(ALLOWED_INTENTS), } return (seed, events, citing, filters, overview, authors_df, affiliations_df, aff_geo_df, cities_df, countries_df, fields_df, intents_df, journals_df) # ── KG 데이터: DuckDB λ°©μ‹μœΌλ‘œ 뢄리 λ‘œλ“œ ───────────────────── # kg_nodes : pandas 전체 λ‘œλ“œ (~160MB 파일, λ©”λͺ¨λ¦¬ ν—ˆμš© λ²”μœ„) # kg_edges : DuckDB둜 ν•„μš”ν•œ λ…Έλ“œμ˜ μ—£μ§€λ§Œ 쿼리 (전체 λ‘œλ“œ μ•ˆ 함) # enriched : DuckDB둜 집계 ν†΅κ³„λ§Œ 쿼리 (전체 λ‘œλ“œ μ•ˆ 함) @st.cache_data(show_spinner=False) def load_kg_nodes(data_dir_str: str) -> pd.DataFrame: """kg_nodes 전체 λ‘œλ“œ (3.4M rows, ~160MB 파일)""" d = None if HF_REPO_ID else Path(data_dir_str) return _read("kg_nodes.parquet", d) @st.cache_data(show_spinner=False) def get_parquet_path(filename: str, data_dir_str: str) -> str: """파일 경둜 λ°˜ν™˜ (HFλ©΄ 둜컬 μΊμ‹œμ— λ‹€μš΄λ‘œλ“œ ν›„ 경둜 λ°˜ν™˜)""" if HF_REPO_ID: return _hf_download(filename) # DuckDB용: μ—­μŠ¬λž˜μ‹œ β†’ μŠ¬λž˜μ‹œ λ³€ν™˜ return str(Path(data_dir_str) / filename).replace("\\", "/") @st.cache_data(show_spinner=False) def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame: """DuckDB: νŠΉμ • λ…Έλ“œμ˜ μ—£μ§€λ§Œ parquetμ—μ„œ λ°”λ‘œ 쿼리 (전체 λ‘œλ“œ μ—†μŒ)""" import duckdb safe_path = kg_edges_path.replace("\\", "/") safe_node = node_id.replace("'", "''") q = f""" SELECT source, target, edge_type FROM read_parquet('{safe_path}') WHERE source = '{safe_node}' OR target = '{safe_node}' LIMIT {int(max_edges)} """ return duckdb.execute(q).df() @st.cache_data(show_spinner=False) def query_enriched_stats(enriched_path: str): """DuckDB: enriched 전체 λ‘œλ“œ 없이 집계 ν†΅κ³„λ§Œ 쿼리""" import duckdb safe_path = enriched_path.replace("\\", "/") sem_df = duckdb.execute(f""" SELECT has_semantic_evidence, COUNT(*) AS count FROM read_parquet('{safe_path}') GROUP BY has_semantic_evidence """).df() field_df = duckdb.execute(f""" SELECT field_folder AS field, AVG(CAST(has_semantic_evidence AS INTEGER)) AS sem_ratio, COUNT(*) AS event_count FROM read_parquet('{safe_path}') GROUP BY field_folder ORDER BY sem_ratio DESC LIMIT 20 """).df() return sem_df, field_df @st.cache_data(show_spinner=False) def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame: """DuckDB: KG Explorer용 μž„μ˜ λ…Έλ“œ μ—£μ§€ 쿼리""" import duckdb safe_path = kg_edges_path.replace("\\", "/") safe_node = node_id.replace("'", "''") q = f""" SELECT source, target, edge_type FROM read_parquet('{safe_path}') WHERE source = '{safe_node}' OR target = '{safe_node}' LIMIT {int(max_edges)} """ return duckdb.execute(q).df() # ── 헬퍼 ─────────────────────────────────────────────────────── def filter_seed_papers(seed, q, fields, countries, journals): df = seed.copy() q = (q or "").strip().lower() if q: df = df[df["title_lc"].str.contains(q, na=False) | df["doi_lc"].str.contains(q, na=False)] if fields: df = df[df["field"].str.lower().isin({x.lower() for x in fields})] if countries: df = df[df["country"].str.lower().isin({x.lower() for x in countries})] if journals: df = df[df["journal"].str.lower().isin({x.lower() for x in journals})] return df.reset_index(drop=True) def event_subset(events, seed_paper_id, year_min, year_max): df = events[events["seed_paper_id"] == seed_paper_id].copy() df = df[df["citing_year"].fillna(-99999) >= year_min] df = df[df["citing_year"].fillna(99999) <= year_max] return df.reset_index(drop=True) def build_intent_summary(df): counts = df.groupby("primary_intent").size().to_dict() return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": [int(counts.get(i,0)) for i in ALLOWED_INTENTS]}) def build_context_rows(df, limit=20): rows = [] df = df.sort_values(["context_count","intent_count","citing_year"], ascending=[False,False,False], na_position="last") for _, row in df.iterrows(): ctx = row["contexts"] if isinstance(ctx, list) and ctx: for c in ctx[:2]: rows.append({"primary_intent": row["primary_intent"], "citing_title": row["citing_title"], "citing_doi": row["citing_doi"], "citing_year": None if pd.isna(row["citing_year"]) else int(row["citing_year"]), "context": c}) if len(rows) >= limit: break return pd.DataFrame(rows[:limit]) def build_citing_table(df, limit=30): if df.empty: return pd.DataFrame(columns=["citing_title","citing_year","primary_intent","context_count"]) return (df.sort_values(["context_count","intent_count","citing_year"], ascending=[False,False,False], na_position="last") [["citing_paper_id","citing_title","citing_doi","citing_year","primary_intent","context_count"]] .drop_duplicates(subset=["citing_paper_id"]).head(limit)) def get_cocited_papers(selected_seed_id, events, seed, top_n=15): """μ„ νƒλœ seed paperλ₯Ό μΈμš©ν•œ 논문듀이 ν•¨κ»˜ μΈμš©ν•œ λ‹€λ₯Έ seed papers""" citing_ids = events[events["seed_paper_id"] == selected_seed_id]["citing_paper_id"].unique() cocited = (events[events["citing_paper_id"].isin(citing_ids) & (events["seed_paper_id"] != selected_seed_id)] .groupby("seed_paper_id").size() .reset_index(name="co_citation_count") .sort_values("co_citation_count", ascending=False) .head(top_n)) return cocited.merge(seed[["seed_paper_id","title","field","journal","citedby_count"]], on="seed_paper_id", how="left") def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80): """μ„ νƒλœ seed paper의 KG 1-hop μ„œλΈŒκ·Έλž˜ν”„ λ°˜ν™˜""" node_id = f"seed:{seed_doi}" edges = kg_edges[(kg_edges["source"] == node_id) | (kg_edges["target"] == node_id)].head(max_edges) if edges.empty: return None, None all_node_ids = set(edges["source"].tolist()) | set(edges["target"].tolist()) nodes = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)] return nodes, edges def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60): """KG Explorer: μž„μ˜ λ…Έλ“œ κΈ°μ€€ μ„œλΈŒκ·Έλž˜ν”„""" edges = kg_edges[(kg_edges["source"] == search_node_id) | (kg_edges["target"] == search_node_id)].head(max_edges) if edges.empty: return None, None all_ids = set(edges["source"].tolist()) | set(edges["target"].tolist()) nodes = kg_nodes[kg_nodes["node_id"].isin(all_ids)] return nodes, edges # ── pyvis λΉŒλ” ───────────────────────────────────────────────── def pyvis_citation_graph(seed_row, events_df): net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True) sid = seed_row["seed_paper_id"] net.add_node(sid, label=seed_row["title"][:60], color="#111827", size=34, shape="dot", font={"color":"white"}) for _, row in events_df.sort_values(["context_count","intent_count"], ascending=False).head(40).iterrows(): cid = row["citing_paper_id"] net.add_node(cid, label=(row["citing_title"] or row["citing_doi"] or cid)[:60], color=NODE_COLORS["citing_paper"], size=18, shape="dot") ctx = (row["contexts"] or [])[0] if isinstance(row["contexts"], list) and row["contexts"] else "" yr = "" if pd.isna(row["citing_year"]) else int(row["citing_year"]) net.add_edge(cid, sid, label=row["primary_intent"], color=INTENT_COLORS.get(row["primary_intent"],"#94a3b8"), title=f"Intent: {row['primary_intent']}
Year: {yr}
{ctx}") net.barnes_hut() return inject_fullscreen(net.generate_html()) def pyvis_ontology(): net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True) for nid, label, typ in [ ("seed","Top5PctCitedPaper","seed_paper"),("event","CitationEvent","citation_event"), ("citing","CitingPaper","citing_paper"), ("intent","Intent","intent"), ("journal","Journal","journal"), ("author","Author","author"), ("affiliation","Affiliation","affiliation"),("city","City","city"), ("country","Country","country"), ("field","Field","field"), ]: net.add_node(nid, label=label, color=NODE_COLORS[typ], size=24) for s, t, l in [ ("event","citing","hasCitingPaper"),("event","seed","hasCitedPaper"), ("event","intent","hasPrimaryIntent"),("seed","journal","publishedInJournal"), ("seed","author","hasAuthor"), ("seed","affiliation","hasAffiliation"), ("seed","city","locatedInCity"), ("seed","country","locatedInCountry"), ("seed","field","belongsToField"), ]: net.add_edge(s, t, label=l) net.barnes_hut() return inject_fullscreen(net.generate_html()) def pyvis_from_kg(nodes_df, edges_df, height="780px"): """kg_nodes / kg_edges DataFrame으둜 pyvis κ·Έλž˜ν”„ 생성""" net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True) for _, row in nodes_df.iterrows(): ntype = row.get("node_type","") color = NODE_TYPE_COLORS.get(ntype,"#94a3b8") label = str(row.get("label",""))[:55] size = 30 if ntype == "seed_paper" else 16 font = {"color":"white"} if ntype == "seed_paper" else {} tooltip = f"Type: {ntype}
DOI: {row.get('doi','')}
Pub: {row.get('publication_name','')}" net.add_node(str(row["node_id"]), label=label, color=color, size=size, shape="dot", title=tooltip, font=font) for _, row in edges_df.iterrows(): net.add_edge(str(row["source"]), str(row["target"]), label=row.get("edge_type",""), color="#94a3b8") net.barnes_hut() return inject_fullscreen(net.generate_html()) # ═══════════════════════════════════════════════════════════════ # 메인 UI # ═══════════════════════════════════════════════════════════════ st.title("CitationHub") st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.") # ── Sidebar ──────────────────────────────────────────────────── with st.sidebar: st.subheader("Data source") if HF_REPO_ID: data_dir_val = "hf" st.caption(f"Hugging Face: {HF_REPO_ID}") else: data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR)) try: (seed, events, citing, filters, overview, authors_df, affiliations_df, aff_geo_df, cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val) st.success("Data loaded") except Exception as e: st.error(str(e)); st.stop() st.subheader("Search seed papers") q_input = st.text_input("Title or DOI") if "q_submit" not in st.session_state: st.session_state["q_submit"] = "" if st.button("Search", use_container_width=True): st.session_state["q_submit"] = q_input fields_sel = st.multiselect("Field", filters["fields"]) countries_sel = st.multiselect("Country", filters["countries"]) journals_sel = st.multiselect("Journal", filters["journals"][:200]) y_min = max(2000, filters["year_min"]) year_min, year_max = st.slider("Citing year", y_min, filters["year_max"], (y_min, filters["year_max"])) seed_filtered = filter_seed_papers(seed, st.session_state["q_submit"], fields_sel, countries_sel, journals_sel) st.subheader("Overview counts") c1, c2 = st.columns(2) c1.metric("Seed papers", fmt_num(overview["seed_papers"])) c2.metric("Citation events", fmt_num(overview["citation_events"])) c1.metric("Citing papers", fmt_num(overview["citing_papers"])) c2.metric("Authors", fmt_num(overview["authors"])) c1.metric("Countries", fmt_num(overview["countries"])) c2.metric("Fields", fmt_num(overview["fields"])) options = seed_filtered["seed_paper_id"].tolist() if not options: st.warning("No seed papers match the current search."); st.stop() current = st.session_state.get("selected_seed_id", options[0]) default_idx = options.index(current) if current in options else 0 selected_seed_id = st.selectbox( "Seed paper", options, index=default_idx, format_func=lambda sid: seed_filtered.loc[ seed_filtered["seed_paper_id"]==sid, "title"].iloc[0], ) st.session_state["selected_seed_id"] = selected_seed_id selected_seed = seed_filtered[seed_filtered["seed_paper_id"]==selected_seed_id].iloc[0] seed_events = event_subset(events, selected_seed_id, year_min, year_max) intent_summary = build_intent_summary(seed_events) contexts_df = build_context_rows(seed_events) citing_table = build_citing_table(seed_events) # ── νƒ­ ───────────────────────────────────────────────────────── (tab_overview, tab_cnet, tab_ontology, tab_kg_exp, tab_geo, tab_analytics) = st.tabs([ "Overview","Citation Network","Ontology", "Knowledge Graph","Geographic Map","Analytics", ]) # ═══ 1. OVERVIEW ═══════════════════════════════════════════════ with tab_overview: col1, col2 = st.columns(2) with col1: st.subheader("Seed paper detail") dc1, dc2 = st.columns(2) dc1.metric("Cited by", fmt_num(selected_seed["citedby_count"])) dc2.metric("Citation events", fmt_num(len(seed_events))) for label, key in [ ("Title","title"),("DOI","doi"),("Journal","journal"), ("Author","author"),("Affiliation","affiliation"), ("City","city"),("Country","country"),("Field","field"), ]: st.markdown(f"**{label}** \n{selected_seed[key] or '-'}") st.subheader("Related citing papers") st.dataframe(citing_table.rename(columns={ "citing_title":"Title","citing_year":"Year", "primary_intent":"Intent","context_count":"Contexts"}), use_container_width=True, hide_index=True) st.subheader("Co-cited seed papers") st.caption("같은 citing paper에 μ˜ν•΄ ν•¨κ»˜ 인용된 λ‹€λ₯Έ top 5% λ…Όλ¬Έλ“€") cocited = get_cocited_papers(selected_seed_id, events, seed) if cocited.empty: st.info("Co-cited papers not found.") else: st.dataframe(cocited.rename(columns={ "co_citation_count":"Co-citations","title":"Title", "field":"Field","citedby_count":"Cited by"}), use_container_width=True, hide_index=True) with col2: st.subheader("Intent distribution (selected paper)") fig = px.bar(intent_summary, x="intent", y="count", color="intent", color_discrete_map=INTENT_COLORS) fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count") st.plotly_chart(fig, use_container_width=True) st.subheader("Citation trend (selected paper)") trend = (seed_events.dropna(subset=["citing_year"]) .assign(citing_year=lambda df: df["citing_year"].astype(int)) .groupby("citing_year").size().reset_index(name="count")) if not trend.empty: st.plotly_chart( px.line(trend, x="citing_year", y="count", markers=True) .update_layout(xaxis_title="Year", yaxis_title="Citations"), use_container_width=True) st.subheader("CitationHub Intent Distribution") all_intents = events.groupby("primary_intent").size().to_dict() ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS, "count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]}) fig2 = px.bar(ai_df, x="intent", y="count", color="intent", color_discrete_map=INTENT_COLORS) fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count") st.plotly_chart(fig2, use_container_width=True) st.subheader("CitationHub Field Distribution") fd = (seed_filtered.groupby("field", dropna=False).size() .reset_index(name="count").sort_values("count", ascending=False).head(20)) fd["field"] = fd["field"].replace("","Unknown") st.plotly_chart( px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"), use_container_width=True) st.subheader("Citation contexts") if contexts_df.empty: st.info("No contexts available.") else: for _, row in contexts_df.iterrows(): st.markdown( f"""
{row['primary_intent']}
{row['citing_year'] or '-'} Β· {row['citing_title'] or row['citing_doi']}
{row['context']}
""", unsafe_allow_html=True) # ═══ 2. CITATION NETWORK ════════════════════════════════════════ with tab_cnet: st.subheader("Citation Network") st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen") if seed_events.empty: st.info("No citation network data for this seed paper.") else: components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True) # ═══ 3. ONTOLOGY ════════════════════════════════════════════════ with tab_ontology: st.subheader("CitationHub Ontology") st.plotly_chart(plotly_ontology_fig(height=750), use_container_width=True) # ═══ 4. KNOWLEDGE GRAPH (KG Explorer) ═══════════════════════════ with tab_kg_exp: st.subheader("KG Explorer") try: with st.spinner("Loading..."): kg_nodes_exp = load_kg_nodes(data_dir_val) kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val) # ── λ…Έλ“œ/μ—£μ§€ νƒ€μž… 뢄포 톡계 col_a, col_b = st.columns([1, 2]) with col_a: st.subheader("Node Types") nt = kg_nodes_exp["node_type"].value_counts().reset_index() nt.columns = ["node_type", "count"] st.dataframe(nt, use_container_width=True, hide_index=True) st.subheader("Edge Types") import duckdb as _ddb et = _ddb.execute(f""" SELECT edge_type, COUNT(*) AS count FROM read_parquet('{kg_edges_path}') GROUP BY edge_type ORDER BY count DESC """).df() st.dataframe(et, use_container_width=True, hide_index=True) with col_b: st.subheader("CitationHub KG Node Distribution") nt_fig = px.bar(nt, x="node_type", y="count", color="node_type", color_discrete_map=NODE_TYPE_COLORS) nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count") st.plotly_chart(nt_fig, use_container_width=True) # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types) st.markdown("---") st.subheader("Multi-Node Knowledge Graph") st.caption("All 10 node types and all edge types β€” 2-hop from top cited seed papers") n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds") edges_per_type = st.slider("Edges per type (max)", 3, 20, 8, key="kg_exp_edges_per_type") with st.spinner("Querying graph..."): # ── 1-hop: 인용수 μƒμœ„ seed papers κΈ°μ€€ λͺ¨λ“  μ—£μ§€ top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"] .sort_values("citedby_count", ascending=False) .head(n_seeds)) seed_ids = top_seeds["node_id"].tolist() if seed_ids: ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids) # 1-hop: seed paper와 μ—°κ²°λœ λͺ¨λ“  edge (journal, author, affiliation, city, # country, field, citation_event) hop1 = _ddb.execute(f""" WITH ranked AS ( SELECT source, target, edge_type, ROW_NUMBER() OVER ( PARTITION BY edge_type ORDER BY source ) AS rn FROM read_parquet('{kg_edges_path}') WHERE source IN ({ids_sql}) OR target IN ({ids_sql}) ) SELECT source, target, edge_type FROM ranked WHERE rn <= {int(edges_per_type)} """).df() # 2-hop: citation_event β†’ HAS_CITING_PAPER β†’ citing_paper # citation_event β†’ HAS_PRIMARY_INTENT β†’ intent event_ids = [ x for x in set(hop1["source"].tolist()) | set(hop1["target"].tolist()) if str(x).startswith("event:") ][:20] if event_ids: ev_sql = ", ".join(f"'{eid}'" for eid in event_ids) hop2 = _ddb.execute(f""" WITH ranked AS ( SELECT source, target, edge_type, ROW_NUMBER() OVER ( PARTITION BY edge_type ORDER BY source ) AS rn FROM read_parquet('{kg_edges_path}') WHERE (source IN ({ev_sql}) OR target IN ({ev_sql})) AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT') ) SELECT source, target, edge_type FROM ranked WHERE rn <= {int(edges_per_type)} """).df() exp_edges = pd.concat([hop1, hop2]).drop_duplicates( subset=["source", "target", "edge_type"] ) else: exp_edges = hop1 all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist()) exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)] c1, c2, c3, c4 = st.columns(4) c1.metric("Nodes", fmt_num(len(exp_nodes))) c2.metric("Edges", fmt_num(len(exp_edges))) c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique())) c4.metric("Edge types", fmt_num(exp_edges["edge_type"].nunique())) # 컀버리지 확인 ν‘œμ‹œ present_ntypes = sorted(exp_nodes["node_type"].unique().tolist()) present_etypes = sorted(exp_edges["edge_type"].unique().tolist()) all_10_ntypes = sorted(NODE_TYPE_COLORS.keys()) missing_nt = [t for t in all_10_ntypes if t not in present_ntypes] if missing_nt: st.caption(f"⚠ Node types not yet in graph: {', '.join(missing_nt)} " f"β€” try increasing 'Edges per type'") else: st.caption("βœ… All 10 node types represented") st.plotly_chart( plotly_network_fig(exp_nodes, exp_edges, height=800, seed_node_ids=seed_ids), use_container_width=True) except Exception as e: st.error(str(e)) # ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════ with tab_geo: st.subheader("Geographic Distribution of Seed Papers") country_cnt = (seed_filtered.groupby("country", dropna=False).size() .reset_index(name="count").rename(columns={"country":"country_name"})) country_cnt = country_cnt[country_cnt["country_name"].str.strip() != ""] if not country_cnt.empty: fig_map = px.choropleth(country_cnt, locations="country_name", locationmode="country names", color="count", hover_name="country_name", color_continuous_scale="Blues", title="Seed Papers by Country") fig_map.update_layout(geo=dict(showframe=False), height=500) st.plotly_chart(fig_map, use_container_width=True) st.subheader("Top Cities (Affiliation)") city_cnt = (seed_filtered.merge( aff_geo_df[["affiliation_name","city_name","country_name"]], left_on="affiliation", right_on="affiliation_name", how="left") .groupby(["country_name","city_name"], dropna=False).size() .reset_index(name="count").dropna(subset=["country_name"]) .sort_values("count", ascending=False).head(30)) if not city_cnt.empty: st.plotly_chart( px.bar(city_cnt, x="city_name", y="count", color="country_name", title="Top 30 Cities") .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40), use_container_width=True) st.subheader("Citation Trend over Time (selected paper)") trend2 = (seed_events.dropna(subset=["citing_year"]) .assign(citing_year=lambda df: df["citing_year"].astype(int)) .groupby("citing_year").size().reset_index(name="count")) if not trend2.empty: st.plotly_chart( px.line(trend2, x="citing_year", y="count", markers=True, title="Citations per Year") .update_layout(xaxis_title="Year", yaxis_title="Citations"), use_container_width=True) # ═══ 7. ANALYTICS ═══════════════════════════════════════════════ with tab_analytics: col_a, col_b = st.columns(2) with col_a: st.subheader("Top Authors") if "author_id" in seed.columns and not seed["author_id"].isna().all(): top_auth = (seed.explode("author_id") .merge(authors_df, on="author_id", how="left") .groupby("author_name").size() .reset_index(name="paper_count") .sort_values("paper_count", ascending=False).head(20)) else: top_auth = (seed["author"].value_counts() .reset_index().rename(columns={"author":"author_name","count":"paper_count"}) .head(20)) top_auth = top_auth[top_auth["author_name"].str.strip() != ""] st.plotly_chart( px.bar(top_auth, x="paper_count", y="author_name", orientation="h", title="Top 20 Authors") .update_layout(yaxis=dict(autorange="reversed"), xaxis_title="Seed Papers", yaxis_title=""), use_container_width=True) with col_b: st.subheader("Top Journals") top_jnl = (seed.groupby("journal").size() .reset_index(name="count").sort_values("count", ascending=False).head(20)) top_jnl = top_jnl[top_jnl["journal"].str.strip() != ""] st.plotly_chart( px.bar(top_jnl, x="count", y="journal", orientation="h", title="Top 20 Journals") .update_layout(yaxis=dict(autorange="reversed"), xaxis_title="Seed Papers", yaxis_title=""), use_container_width=True) st.markdown("---") col_c, col_d = st.columns(2) with col_c: st.subheader("CitationHub Field Γ— Intent Distribution Heatmap") fi = (seed[["seed_paper_id","field"]] .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner") .groupby(["field","primary_intent"]).size().reset_index(name="count")) if not fi.empty: pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0) st.plotly_chart( px.imshow(pivot, color_continuous_scale="Blues", title="CitationHub Field Γ— Intent Distribution Heatmap", aspect="auto") .update_layout(xaxis_title="Intent", yaxis_title="Field"), use_container_width=True) with col_d: st.subheader("Influential Citations (selected paper)") if "is_influential" in seed_events.columns: inf = seed_events["is_influential"].value_counts().reset_index() inf.columns = ["is_influential","count"] inf["label"] = inf["is_influential"].map({True:"Influential", False:"Non-influential"}) st.plotly_chart( px.pie(inf, names="label", values="count", title="Influential vs Non-influential"), use_container_width=True) # ── Intent Evolution over Years ──────────────────────────── st.markdown("---") st.subheader("CitationHub Intent Evolution over Years") st.caption("How citation intents have changed across all papers over time") intent_trend_raw = ( events.dropna(subset=["citing_year"]) .assign(year=lambda df: df["citing_year"].astype(int)) .query("year >= 2000") .groupby(["year", "primary_intent"]).size() .reset_index(name="count") ) if not intent_trend_raw.empty: st.plotly_chart( px.area( intent_trend_raw, x="year", y="count", color="primary_intent", color_discrete_map=INTENT_COLORS, labels={"primary_intent": "Intent", "count": "Citations", "year": "Year"}, ).update_layout( legend_title="Intent", xaxis_title="Year", yaxis_title="# Citations", hovermode="x unified", ), use_container_width=True, ) # ── Top Citing Venues ─────────────────────────────────────── st.markdown("---") col_v1, col_v2 = st.columns(2) with col_v1: st.subheader("Top Citing Venues") st.caption("Journals/conferences that cite seed papers most") venue_cnt = ( events[events["citing_venue"].str.strip() != ""] .groupby("citing_venue").size() .reset_index(name="count") .sort_values("count", ascending=False).head(20) ) if not venue_cnt.empty: st.plotly_chart( px.bar(venue_cnt, x="count", y="citing_venue", orientation="h", labels={"count": "Citations", "citing_venue": ""}) .update_layout(yaxis=dict(autorange="reversed"), xaxis_title="Citations", yaxis_title="", height=520), use_container_width=True, ) with col_v2: st.subheader("CitationHub Field Γ— Intent Distribution") st.caption("How each field uses citations differently (all fields)") fi_pct = ( seed[["seed_paper_id", "field"]] .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner") .groupby(["field", "primary_intent"]).size().reset_index(name="count") ) if not fi_pct.empty: totals = fi_pct.groupby("field")["count"].transform("sum") fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1) n_fields = fi_pct["field"].nunique() chart_height = max(520, n_fields * 28) st.plotly_chart( px.bar(fi_pct, x="pct", y="field", color="primary_intent", orientation="h", color_discrete_map=INTENT_COLORS, labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"}) .update_layout( barmode="stack", yaxis=dict(autorange="reversed", categoryorder="total ascending"), xaxis_title="% of citations", yaxis_title="", legend_title="Intent", height=chart_height, ), use_container_width=True, ) # ── Export ───────────────────────────────────────────────── st.markdown("---") st.subheader("Export Data") col_e1, col_e2, col_e3 = st.columns(3) with col_e1: csv_seed = seed_filtered[ ["title", "doi", "journal", "author", "country", "field", "citedby_count"] ].to_csv(index=False).encode("utf-8") st.download_button( "⬇ Seed Papers (CSV)", csv_seed, "seed_papers.csv", "text/csv", use_container_width=True, ) with col_e2: cite_export = seed_events[ ["citing_title", "citing_doi", "citing_year", "citing_venue", "primary_intent", "context_count", "is_influential"] ].rename(columns={ "citing_title": "title", "citing_doi": "doi", "citing_year": "year", "citing_venue": "venue", "primary_intent": "intent", "context_count": "contexts", "is_influential": "influential", }).to_csv(index=False).encode("utf-8") st.download_button( "⬇ Citation Events (CSV)", cite_export, "citation_events.csv", "text/csv", use_container_width=True, ) with col_e3: intent_csv = intent_summary.to_csv(index=False).encode("utf-8") st.download_button( "⬇ Intent Summary (CSV)", intent_csv, "intent_summary.csv", "text/csv", use_container_width=True, )