# Hugging Face Spaces page header captured with this file (Space status: Sleeping).
from __future__ import annotations

import html
import os
from pathlib import Path
from typing import List

import pandas as pd
import plotly.express as px
import streamlit as st
import streamlit.components.v1 as components
from pyvis.network import Network
# Hugging Face dataset settings, read from the environment. When HF_REPO_ID is
# empty the app reads parquet files from a local directory instead.
HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# NOTE(review): the page_icon literal looks mis-encoded (probably an emoji in
# the original source) - restore it from the upstream file.
st.set_page_config(page_title="CitationHub", page_icon="π", layout="wide")

# Citation intents the app keeps; events with any other primary intent are
# filtered out when the data is loaded.
ALLOWED_INTENTS = [
    "background",
    "uses",
    "similarities",
    "motivation",
    "differences",
    "future_work",
    "extends",
]

# Colour used for each intent in charts, badges and graph edges.
INTENT_COLORS = {
    "background": "#94a3b8",
    "uses": "#22c55e",
    "similarities": "#3b82f6",
    "motivation": "#f59e0b",
    "differences": "#ef4444",
    "future_work": "#8b5cf6",
    "extends": "#06b6d4",
}

# Node colours for the hand-built pyvis graphs (citation graph and ontology).
NODE_COLORS = {
    "seed_paper": "#111827",
    "citing_paper": "#dbeafe",
    "citation_event": "#fde68a",
    "journal": "#ede9fe",
    "author": "#fee2e2",
    "affiliation": "#fae8ff",
    "city": "#cffafe",
    "country": "#ffedd5",
    "field": "#e0e7ff",
    "intent": "#dcfce7",
}

# Node colours keyed by the node_type values of the knowledge-graph tables.
NODE_TYPE_COLORS = {
    "seed_paper": "#111827",
    "citing_paper": "#3b82f6",
    "citation_event": "#f59e0b",
    "journal": "#8b5cf6",
    "author": "#ef4444",
    "affiliation": "#ec4899",
    "city": "#06b6d4",
    "country": "#f97316",
    "field": "#6366f1",
    "intent": "#22c55e",
}

# Local parquet directory used when CITATIONHUB_DATA_DIR is not set.
# NOTE(review): the fallback is a developer-specific Windows path and part of it
# appears mis-encoded; consider a repository-relative default.
DEFAULT_DATA_DIR = Path(
    os.environ.get(
        "CITATIONHUB_DATA_DIR",
        r"C:\Users\user\OneDrive\λ°ν νλ©΄\Citehub_huggingface\data",
    )
)
def fmt_num(x):
    """Format *x* as an integer with thousands separators.

    Returns "-" when *x* cannot be converted with ``int()`` (for example
    ``None``, a non-numeric string, NaN or infinity).
    """
    try:
        return f"{int(x):,}"
    # Only conversion failures mean "no number"; anything else (such as
    # KeyboardInterrupt) must propagate instead of being hidden.
    except (TypeError, ValueError, OverflowError):
        return "-"
def _hf_download(filename: str) -> str:
    """Fetch ``data/<filename>`` from the Hugging Face dataset repo HF_REPO_ID.

    HF_TOKEN is passed as the access token when it is non-empty, otherwise
    ``None``. The result of ``hf_hub_download`` is returned unchanged.
    """
    # Imported lazily so the dependency is only needed in Hugging Face mode.
    from huggingface_hub import hf_hub_download

    token = HF_TOKEN if HF_TOKEN else None
    remote_path = f"data/{filename}"
    return hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=remote_path,
        token=token,
    )
def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
    """Read one parquet file into a DataFrame.

    When HF_REPO_ID is set the file is downloaded through ``_hf_download``;
    otherwise it is read from ``data_dir / filename``.

    Raises:
        ValueError: if HF_REPO_ID is empty and no ``data_dir`` was given.
    """
    if HF_REPO_ID:
        return pd.read_parquet(_hf_download(filename))
    if data_dir is None:
        # Previously this fell through to ``None / filename`` and failed with
        # an unhelpful TypeError.
        raise ValueError(
            f"Cannot read {filename!r}: no data directory given and HF_REPO_ID is not set."
        )
    return pd.read_parquet(Path(data_dir) / filename)
def inject_fullscreen(html: str) -> str:
    """Insert a fullscreen button and a usage hint before the closing body tag.

    The markup is inserted once, before the LAST ``</body>`` in *html*, so an
    earlier occurrence of that text (for example inside embedded graph data)
    is left untouched. If *html* has no ``</body>`` it is returned unchanged.
    """
    # NOTE(review): the button label and hint below contain mis-encoded
    # characters (probably icons in the original source).
    btn = """
<button onclick="var el=document.getElementById('mynetwork');
if(el){if(el.requestFullscreen)el.requestFullscreen();
else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
style="position:fixed;bottom:18px;right:18px;z-index:9999;
padding:8px 18px;background:#1e293b;color:white;border:none;
border-radius:8px;cursor:pointer;font-size:13px;
box-shadow:0 2px 8px rgba(0,0,0,0.35);">βΆ Fullscreen</button>
<div style="position:fixed;bottom:18px;left:18px;z-index:9999;font-size:12px;
color:#64748b;background:rgba(255,255,255,0.85);
padding:5px 10px;border-radius:6px;">
π± Scroll: zoom | Drag: pan | Click node: info</div>
"""
    head, closing_tag, tail = html.rpartition("</body>")
    if not closing_tag:
        return html
    return f"{head}{btn}{closing_tag}{tail}"
# -- Main data load (11 parquet files) --
def _text_column(df, name):
    """Return ``df[name]`` with NaN replaced by "", or an all-"" column if absent."""
    if name in df.columns:
        return df[name].fillna("")
    return pd.Series("", index=df.index, dtype=object)


def _numeric_column(df, name):
    """Return ``df[name]`` coerced to numbers (NaN on failure), or all-NaN if absent."""
    if name in df.columns:
        return pd.to_numeric(df[name], errors="coerce")
    return pd.Series(float("nan"), index=df.index)


def _optional_column(df, name):
    """Return ``df[name]`` unchanged, or an all-None object column if absent."""
    if name in df.columns:
        return df[name]
    return pd.Series(None, index=df.index, dtype=object)


@st.cache_data(show_spinner=False)
def load_data(data_dir_str: str):
    """Load and normalise the main CitationHub tables.

    Reads eleven parquet files through ``_read`` (from *data_dir_str*, or from
    the Hugging Face repo when HF_REPO_ID is set) and builds the frames the UI
    works with. Optional source columns that are missing are replaced by
    index-aligned defaults, so the derived frames never contain stray NaN
    where an empty string or zero is expected.

    The result is cached by Streamlit per *data_dir_str*, so widget-triggered
    reruns do not re-read the files.

    Returns:
        A 13-tuple ``(seed, events, citing, filters, overview, authors_df,
        affiliations_df, aff_geo_df, cities_df, countries_df, fields_df,
        intents_df, journals_df)``. ``filters`` and ``overview`` are dicts of
        filter options and headline counts.
    """
    d = None if HF_REPO_ID else Path(data_dir_str)
    seed_df = _read("seed_cited_papers_normalized.parquet", d)
    events_df = _read("citation_events_normalized.parquet", d)
    citing_df = _read("citing_papers_normalized.parquet", d)
    authors_df = _read("authors.parquet", d)
    affiliations_df = _read("affiliations.parquet", d)
    aff_geo_df = _read("affiliation_geo.parquet", d)
    cities_df = _read("cities.parquet", d)
    countries_df = _read("countries.parquet", d)
    fields_df = _read("fields.parquet", d)
    intents_df = _read("intents.parquet", d)
    journals_df = _read("journals.parquet", d)

    seed = pd.DataFrame({
        "seed_paper_id": seed_df["seed_paper_id"],
        "doi": _text_column(seed_df, "doi"),
        "title": _text_column(seed_df, "title"),
        "journal": _text_column(seed_df, "publication_name"),
        "author": _text_column(seed_df, "creator"),
        "affiliation": _text_column(seed_df, "affilname"),
        "city": _text_column(seed_df, "affiliation_city"),
        "country": _text_column(seed_df, "affiliation_country"),
        "field": _text_column(seed_df, "group"),
        "citedby_count": _numeric_column(seed_df, "citedby_count").fillna(0).astype(int),
        "author_id": _optional_column(seed_df, "author_id"),
        "affiliation_id": _optional_column(seed_df, "affiliation_id"),
        "country_id": _optional_column(seed_df, "country_id"),
        "field_id": _optional_column(seed_df, "field_id"),
        "journal_id": _optional_column(seed_df, "journal_id"),
    })
    # Lower-cased copies used for case-insensitive searching.
    for col in ["title", "doi", "journal", "field", "country"]:
        seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
    seed = seed.sort_values(
        ["citedby_count", "title"], ascending=[False, True]
    ).reset_index(drop=True)

    if "is_influential" in events_df.columns:
        is_influential = events_df["is_influential"].fillna(False)
    else:
        is_influential = pd.Series(False, index=events_df.index, dtype=bool)

    events = pd.DataFrame({
        "citation_event_id": events_df["citation_event_id"],
        "seed_paper_id": events_df["cited_seed_paper_id"],
        "citing_paper_id": events_df["citing_paper_id"],
        "citing_title": _text_column(events_df, "citing_title"),
        "citing_doi": _text_column(events_df, "citing_doi"),
        "citing_year": _numeric_column(events_df, "citing_year"),
        "citing_venue": _text_column(events_df, "citing_venue"),
        "primary_intent": _text_column(events_df, "primary_intent"),
        "contexts": _optional_column(events_df, "contexts"),
        "context_count": _numeric_column(events_df, "context_count").fillna(0).astype(int),
        "intent_count": _numeric_column(events_df, "intent_count").fillna(0).astype(int),
        "is_influential": is_influential,
        "field_id": _optional_column(events_df, "field_id"),
    })
    # Keep only events whose primary intent is one the app knows how to show.
    events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)

    citing = pd.DataFrame({
        "citing_paper_id": citing_df["citing_paper_id"],
        "doi": _text_column(citing_df, "doi"),
        "title": _text_column(citing_df, "title"),
        "year": _numeric_column(citing_df, "year"),
        "venue": _text_column(citing_df, "venue"),
        "oa_pdf": _text_column(citing_df, "oa_pdf"),
    })

    def _distinct(column):
        """Sorted non-empty distinct values of a seed column, as strings."""
        return sorted(x for x in seed[column].dropna().astype(str).unique() if x)

    years = events["citing_year"].dropna()
    filters = {
        "fields": _distinct("field"),
        "countries": _distinct("country"),
        "journals": _distinct("journal"),
        "intents": ALLOWED_INTENTS,
        # Fallback bounds used when no event carries a year.
        "year_min": int(years.min()) if not years.empty else 2000,
        "year_max": int(years.max()) if not years.empty else 2025,
    }
    overview = {
        "seed_papers": int(len(seed)),
        "citation_events": int(len(events)),
        "citing_papers": int(events["citing_paper_id"].nunique()),
        "authors": int(len(authors_df)),
        "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
        "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
        "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
        "intents": len(ALLOWED_INTENTS),
    }
    return (seed, events, citing, filters, overview,
            authors_df, affiliations_df, aff_geo_df,
            cities_df, countries_df, fields_df, intents_df, journals_df)
# -- KG + enriched data (loaded lazily, separately from the main tables) --
@st.cache_data(show_spinner=False)
def load_kg_data(data_dir_str: str):
    """Load the knowledge-graph node/edge tables and the enriched events table.

    Reads ``kg_nodes.parquet``, ``kg_edges.parquet`` and
    ``citation_events_enriched.parquet`` through ``_read``. The result is
    cached by Streamlit per *data_dir_str*, so the files are read once rather
    than on every rerun of the script.

    Returns:
        ``(kg_nodes, kg_edges, enriched)`` as DataFrames.
    """
    d = None if HF_REPO_ID else Path(data_dir_str)
    kg_nodes = _read("kg_nodes.parquet", d)
    kg_edges = _read("kg_edges.parquet", d)
    enriched = _read("citation_events_enriched.parquet", d)
    return kg_nodes, kg_edges, enriched
# -- Helpers --
def filter_seed_papers(seed, q, fields, countries, journals):
    """Filter the seed-paper table by free-text query and facet selections.

    Args:
        seed: DataFrame with ``title_lc``, ``doi_lc``, ``field``, ``country``
            and ``journal`` columns.
        q: Text matched case-insensitively as a literal substring of the title
            or DOI; ``None`` or blank disables the text filter.
        fields, countries, journals: Iterables of accepted values, compared
            case-insensitively; an empty or falsy value disables that facet.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    df = seed.copy()
    q = (q or "").strip().lower()
    if q:
        # regex=False: the query is user input, so characters such as "+" or
        # "(" must match literally instead of raising a regex error.
        title_hit = df["title_lc"].str.contains(q, regex=False, na=False)
        doi_hit = df["doi_lc"].str.contains(q, regex=False, na=False)
        df = df[title_hit | doi_hit]
    for column, wanted in (("field", fields), ("country", countries), ("journal", journals)):
        if wanted:
            lowered = {str(value).lower() for value in wanted}
            df = df[df[column].str.lower().isin(lowered)]
    return df.reset_index(drop=True)
def event_subset(events, seed_paper_id, year_min, year_max):
    """Return the events citing *seed_paper_id* whose year is in [year_min, year_max].

    Missing years are replaced by sentinels (-99999 for the lower-bound test,
    99999 for the upper-bound test) before comparing. The result is a new
    frame with a fresh 0..n-1 index.
    """
    for_seed = events[events["seed_paper_id"] == seed_paper_id].copy()
    years = for_seed["citing_year"]
    not_too_early = years.fillna(-99999) >= year_min
    not_too_late = years.fillna(99999) <= year_max
    return for_seed[not_too_early & not_too_late].reset_index(drop=True)
def build_intent_summary(df):
    """Count events per primary intent, one row for every entry of ALLOWED_INTENTS.

    Intents that do not occur in *df* get a count of 0. Rows follow the order
    of ALLOWED_INTENTS; columns are ``intent`` and ``count``.
    """
    per_intent = df.groupby("primary_intent").size().to_dict()
    intent_names = []
    intent_counts = []
    for name in ALLOWED_INTENTS:
        intent_names.append(name)
        intent_counts.append(int(per_intent.get(name, 0)))
    return pd.DataFrame({"intent": intent_names, "count": intent_counts})
def build_context_rows(df, limit=20):
    """Flatten citation contexts into at most *limit* display rows.

    Events are ordered by ``context_count``, ``intent_count`` and
    ``citing_year`` (all descending, missing values last). Up to two contexts
    are taken per event. ``contexts`` may be a list, a tuple or any object
    whose ``tolist()`` yields a list (such as the numpy arrays produced when
    parquet list columns are read); anything else is treated as "no contexts".

    Returns:
        DataFrame with columns ``primary_intent``, ``citing_title``,
        ``citing_doi``, ``citing_year`` and ``context``; an empty frame when
        no event has contexts.
    """
    ordered = df.sort_values(
        ["context_count", "intent_count", "citing_year"],
        ascending=[False, False, False],
        na_position="last",
    )
    rows = []
    for _, row in ordered.iterrows():
        if len(rows) >= limit:
            break
        contexts = row["contexts"]
        # Array-like containers are converted to plain lists; scalars that
        # happen to expose tolist() come back as non-lists and are skipped.
        to_list = getattr(contexts, "tolist", None)
        if callable(to_list):
            contexts = to_list()
        if not isinstance(contexts, (list, tuple)) or not contexts:
            continue
        year = None if pd.isna(row["citing_year"]) else int(row["citing_year"])
        for text in contexts[:2]:
            rows.append({
                "primary_intent": row["primary_intent"],
                "citing_title": row["citing_title"],
                "citing_doi": row["citing_doi"],
                "citing_year": year,
                "context": text,
            })
    return pd.DataFrame(rows[:limit])
def build_citing_table(df, limit=30):
    """Return up to *limit* distinct citing papers, most context-rich first.

    Events are ordered by ``context_count``, ``intent_count`` and
    ``citing_year`` (descending, missing last) and the first row per
    ``citing_paper_id`` is kept. The empty and non-empty results share the
    same six columns, so callers see one schema.
    """
    columns = [
        "citing_paper_id", "citing_title", "citing_doi",
        "citing_year", "primary_intent", "context_count",
    ]
    if df.empty:
        return pd.DataFrame(columns=columns)
    ranked = df.sort_values(
        ["context_count", "intent_count", "citing_year"],
        ascending=[False, False, False],
        na_position="last",
    )
    return ranked[columns].drop_duplicates(subset=["citing_paper_id"]).head(limit)
def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
    """Other seed papers cited by the same papers that cite the selected seed paper.

    Counts, per other seed paper, the events coming from papers that also
    cite *selected_seed_id*, keeps the *top_n* highest counts and attaches
    title, field, journal and citedby_count from *seed*.
    """
    cites_selected = events["seed_paper_id"] == selected_seed_id
    citing_ids = events[cites_selected]["citing_paper_id"].unique()

    from_same_citers = events["citing_paper_id"].isin(citing_ids)
    other_seed = events["seed_paper_id"] != selected_seed_id
    shared_events = events[from_same_citers & other_seed]

    counts = shared_events.groupby("seed_paper_id").size()
    counts = counts.reset_index(name="co_citation_count")
    top = counts.sort_values("co_citation_count", ascending=False).head(top_n)

    seed_info = seed[["seed_paper_id", "title", "field", "journal", "citedby_count"]]
    return top.merge(seed_info, on="seed_paper_id", how="left")
def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
    """Return the 1-hop knowledge-graph subgraph of the selected seed paper.

    The seed node id is ``"seed:" + seed_doi``. At most *max_edges* edges
    touching that node are kept, together with every node they reference.

    Returns:
        ``(nodes, edges)`` DataFrames, or ``(None, None)`` when no edge
        touches the seed node.
    """
    node_id = f"seed:{seed_doi}"
    is_source = kg_edges["source"] == node_id
    is_target = kg_edges["target"] == node_id
    edges = kg_edges[is_source | is_target].head(max_edges)
    if edges.empty:
        return None, None
    touched_ids = set(edges["source"].tolist())
    touched_ids.update(edges["target"].tolist())
    nodes = kg_nodes[kg_nodes["node_id"].isin(touched_ids)]
    return nodes, edges
def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
    """KG Explorer: subgraph around an arbitrary node.

    Keeps at most *max_edges* edges whose source or target is
    *search_node_id*, plus every node those edges reference.

    Returns:
        ``(nodes, edges)`` DataFrames, or ``(None, None)`` when the node has
        no edges.
    """
    touches_node = kg_edges["source"].eq(search_node_id) | kg_edges["target"].eq(search_node_id)
    edges = kg_edges[touches_node].head(max_edges)
    if edges.empty:
        return None, None
    endpoint_ids = set(edges["source"].tolist()).union(edges["target"].tolist())
    nodes = kg_nodes[kg_nodes["node_id"].isin(endpoint_ids)]
    return nodes, edges
# -- pyvis builders --
def pyvis_citation_graph(seed_row, events_df):
    """Build the citing -> cited network for one seed paper as an HTML string.

    The seed paper is drawn as a large dark node. The 40 events with the
    highest ``context_count`` / ``intent_count`` each add a citing-paper node
    and an edge to the seed, coloured and labelled by primary intent. The edge
    tooltip shows intent, year and the first citation context.

    Node ids and labels are converted to ``str`` so non-string ids or titles
    cannot break slicing. ``contexts`` may be a list, a tuple or an array-like
    whose ``tolist()`` yields a list.
    """
    net = Network(height="780px", width="100%", bgcolor="#ffffff",
                  font_color="#111827", directed=True)
    sid = str(seed_row["seed_paper_id"])
    net.add_node(sid, label=str(seed_row["title"])[:60], color="#111827",
                 size=34, shape="dot", font={"color": "white"})

    top_events = events_df.sort_values(
        ["context_count", "intent_count"], ascending=False
    ).head(40)
    for _, row in top_events.iterrows():
        cid = str(row["citing_paper_id"])
        # Fall back from title to DOI to the paper id for the node label.
        label = str(row["citing_title"] or row["citing_doi"] or cid)[:60]
        net.add_node(cid, label=label, color=NODE_COLORS["citing_paper"],
                     size=18, shape="dot")

        contexts = row["contexts"]
        to_list = getattr(contexts, "tolist", None)
        if callable(to_list):
            contexts = to_list()
        ctx = contexts[0] if isinstance(contexts, (list, tuple)) and contexts else ""

        yr = "" if pd.isna(row["citing_year"]) else int(row["citing_year"])
        intent = row["primary_intent"]
        net.add_edge(cid, sid, label=intent,
                     color=INTENT_COLORS.get(intent, "#94a3b8"),
                     title=f"Intent: {intent}<br>Year: {yr}<br>{ctx}")
    net.barnes_hut()
    return inject_fullscreen(net.generate_html())
def pyvis_ontology():
    """Build the fixed CitationHub ontology diagram as an HTML string.

    Ten concept nodes (coloured via NODE_COLORS) are connected by nine
    labelled relations; the fullscreen overlay is injected into the result.
    """
    concepts = [
        ("seed", "Top5PctCitedPaper", "seed_paper"),
        ("event", "CitationEvent", "citation_event"),
        ("citing", "CitingPaper", "citing_paper"),
        ("intent", "Intent", "intent"),
        ("journal", "Journal", "journal"),
        ("author", "Author", "author"),
        ("affiliation", "Affiliation", "affiliation"),
        ("city", "City", "city"),
        ("country", "Country", "country"),
        ("field", "Field", "field"),
    ]
    relations = [
        ("event", "citing", "hasCitingPaper"),
        ("event", "seed", "hasCitedPaper"),
        ("event", "intent", "hasPrimaryIntent"),
        ("seed", "journal", "publishedInJournal"),
        ("seed", "author", "hasAuthor"),
        ("seed", "affiliation", "hasAffiliation"),
        ("seed", "city", "locatedInCity"),
        ("seed", "country", "locatedInCountry"),
        ("seed", "field", "belongsToField"),
    ]

    graph = Network(height="780px", width="100%", bgcolor="#ffffff",
                    font_color="#111827", directed=True)
    for node_id, caption, node_type in concepts:
        graph.add_node(node_id, label=caption, color=NODE_COLORS[node_type], size=24)
    for source, target, relation in relations:
        graph.add_edge(source, target, label=relation)
    graph.barnes_hut()
    page = graph.generate_html()
    return inject_fullscreen(page)
def pyvis_from_kg(nodes_df, edges_df, height="780px"):
    """Build a pyvis graph from kg_nodes / kg_edges DataFrames and return its HTML.

    Nodes are coloured by ``node_type`` via NODE_TYPE_COLORS (grey when the
    type is unknown); seed papers are drawn larger with a white label. Labels
    are cut to 55 characters. Edges are labelled with ``edge_type``.
    """
    graph = Network(height=height, width="100%", bgcolor="#ffffff",
                    font_color="#111827", directed=True)

    for _, node in nodes_df.iterrows():
        ntype = node.get("node_type", "")
        is_seed = ntype == "seed_paper"
        tooltip = (
            f"Type: {ntype}<br>DOI: {node.get('doi','')}"
            f"<br>Pub: {node.get('publication_name','')}"
        )
        graph.add_node(
            str(node["node_id"]),
            label=str(node.get("label", ""))[:55],
            color=NODE_TYPE_COLORS.get(ntype, "#94a3b8"),
            size=30 if is_seed else 16,
            shape="dot",
            title=tooltip,
            font={"color": "white"} if is_seed else {},
        )

    for _, edge in edges_df.iterrows():
        graph.add_edge(
            str(edge["source"]),
            str(edge["target"]),
            label=edge.get("edge_type", ""),
            color="#94a3b8",
        )

    graph.barnes_hut()
    page = graph.generate_html()
    return inject_fullscreen(page)
# ---------------------------------------------------------------
# Main UI
# ---------------------------------------------------------------
st.title("CitationHub")
st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")

# -- Sidebar: data source, search and filter widgets, overview metrics --
# NOTE(review): indentation was lost in the extracted source. The sidebar block
# is assumed to end after the overview metrics, with the seed-paper selector in
# the main area - confirm against the original layout.
with st.sidebar:
    st.subheader("Data source")
    if HF_REPO_ID:
        data_dir_val = "hf"
        st.caption(f"Hugging Face: {HF_REPO_ID}")
    else:
        data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))

    try:
        (seed, events, citing, filters, overview,
         authors_df, affiliations_df, aff_geo_df,
         cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
        st.success("Data loaded")
    except Exception as e:
        # Nothing below can work without data: show the error and end this run.
        st.error(str(e))
        st.stop()

    st.subheader("Search seed papers")
    q_input = st.text_input("Title or DOI")
    # The query only takes effect when "Search" is pressed; the submitted
    # value is kept in session state across reruns.
    if "q_submit" not in st.session_state:
        st.session_state["q_submit"] = ""
    if st.button("Search", use_container_width=True):
        st.session_state["q_submit"] = q_input

    fields_sel = st.multiselect("Field", filters["fields"])
    countries_sel = st.multiselect("Country", filters["countries"])
    # Only the first 200 journals are offered as options.
    journals_sel = st.multiselect("Journal", filters["journals"][:200])
    y_min = max(2000, filters["year_min"])
    year_min, year_max = st.slider(
        "Citing year", y_min, filters["year_max"], (y_min, filters["year_max"])
    )
    seed_filtered = filter_seed_papers(
        seed, st.session_state["q_submit"], fields_sel, countries_sel, journals_sel
    )

    st.subheader("Overview counts")
    c1, c2 = st.columns(2)
    c1.metric("Seed papers", fmt_num(overview["seed_papers"]))
    c2.metric("Citation events", fmt_num(overview["citation_events"]))
    c1.metric("Citing papers", fmt_num(overview["citing_papers"]))
    c2.metric("Authors", fmt_num(overview["authors"]))
    c1.metric("Countries", fmt_num(overview["countries"]))
    c2.metric("Fields", fmt_num(overview["fields"]))

# -- Seed paper selection --
options = seed_filtered["seed_paper_id"].tolist()
if not options:
    st.warning("No seed papers match the current search.")
    st.stop()

# Keep the previous selection when it is still among the filtered options.
current = st.session_state.get("selected_seed_id", options[0])
default_idx = options.index(current) if current in options else 0

# One id -> title mapping built up front (first row wins for duplicate ids),
# instead of scanning the whole DataFrame once per option.
title_by_id = (
    seed_filtered.drop_duplicates(subset=["seed_paper_id"])
    .set_index("seed_paper_id")["title"]
    .to_dict()
)
selected_seed_id = st.selectbox(
    "Seed paper", options, index=default_idx,
    format_func=lambda sid: title_by_id[sid],
)
st.session_state["selected_seed_id"] = selected_seed_id
selected_seed = seed_filtered[seed_filtered["seed_paper_id"] == selected_seed_id].iloc[0]

# Frames derived from the selection; shared by several tabs below.
seed_events = event_subset(events, selected_seed_id, year_min, year_max)
intent_summary = build_intent_summary(seed_events)
contexts_df = build_context_rows(seed_events)
citing_table = build_citing_table(seed_events)
# -- Tabs --
tab_labels = [
    "Overview",
    "Citation Network",
    "Ontology",
    "Knowledge Graph",
    "KG Explorer",
    "Geographic Map",
    "Analytics",
]
(
    tab_overview,
    tab_cnet,
    tab_ontology,
    tab_kg,
    tab_kg_exp,
    tab_geo,
    tab_analytics,
) = st.tabs(tab_labels)
# --- 1. OVERVIEW ---
# NOTE(review): indentation was lost in the extracted source. The column
# boundaries below, and "Citation contexts" being full-width under the two
# columns, are a reconstruction - confirm against the original layout.
# NOTE(review): some literals here ("Β·", the co-cited caption) look
# mis-encoded; they are kept exactly as found.
with tab_overview:
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Seed paper detail")
        dc1, dc2 = st.columns(2)
        dc1.metric("Cited by", fmt_num(selected_seed["citedby_count"]))
        dc2.metric("Citation events", fmt_num(len(seed_events)))
        for label, key in [
            ("Title", "title"), ("DOI", "doi"), ("Journal", "journal"),
            ("Author", "author"), ("Affiliation", "affiliation"),
            ("City", "city"), ("Country", "country"), ("Field", "field"),
        ]:
            st.markdown(f"**{label}** \n{selected_seed[key] or '-'}")

        st.subheader("Related citing papers")
        st.dataframe(
            citing_table.rename(columns={
                "citing_title": "Title", "citing_year": "Year",
                "primary_intent": "Intent", "context_count": "Contexts"}),
            use_container_width=True, hide_index=True)

        st.subheader("Co-cited seed papers")
        st.caption("κ°μ citing paperμ μν΄ ν¨κ» μΈμ©λ λ€λ₯Έ top 5% λ Όλ¬Έλ€")
        cocited = get_cocited_papers(selected_seed_id, events, seed)
        if cocited.empty:
            st.info("Co-cited papers not found.")
        else:
            st.dataframe(
                cocited.rename(columns={
                    "co_citation_count": "Co-citations", "title": "Title",
                    "field": "Field", "citedby_count": "Cited by"}),
                use_container_width=True, hide_index=True)

    with col2:
        st.subheader("Intent distribution (selected paper)")
        fig = px.bar(intent_summary, x="intent", y="count", color="intent",
                     color_discrete_map=INTENT_COLORS)
        fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
        st.plotly_chart(fig, use_container_width=True)

        st.subheader("Citation trend (selected paper)")
        trend = (seed_events.dropna(subset=["citing_year"])
                 .assign(citing_year=lambda df: df["citing_year"].astype(int))
                 .groupby("citing_year").size().reset_index(name="count"))
        if not trend.empty:
            st.plotly_chart(
                px.line(trend, x="citing_year", y="count", markers=True)
                .update_layout(xaxis_title="Year", yaxis_title="Citations"),
                use_container_width=True)

        st.subheader("Field distribution")
        fd = (seed_filtered.groupby("field", dropna=False).size()
              .reset_index(name="count").sort_values("count", ascending=False).head(20))
        fd["field"] = fd["field"].replace("", "Unknown")
        st.plotly_chart(
            px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
            use_container_width=True)

    st.subheader("Citation contexts")
    if contexts_df.empty:
        st.info("No contexts available.")
    else:
        for _, row in contexts_df.iterrows():
            badge_color = INTENT_COLORS.get(row["primary_intent"], "#64748b")
            # The year column is float once any year is missing, so render it
            # as an integer and show "-" for NaN (NaN is truthy, so
            # `year or '-'` would print "nan").
            year = row["citing_year"]
            year_text = "-" if pd.isna(year) else str(int(year))
            # Everything interpolated below comes from the dataset and is
            # rendered as raw HTML, so it is escaped first.
            intent_text = html.escape(str(row["primary_intent"]))
            source_text = html.escape(str(row["citing_title"] or row["citing_doi"]))
            context_text = html.escape(str(row["context"]))
            st.markdown(
                f"""<div style="border:1px solid #e2e8f0;border-radius:14px;padding:12px;
margin-bottom:10px;background:#f8fafc;">
<div style="display:inline-block;background:{badge_color};
color:white;border-radius:999px;padding:4px 8px;font-size:12px;margin-bottom:6px;">
{intent_text}</div>
<div style="font-size:12px;color:#64748b;margin-bottom:6px;">
{year_text} Β· {source_text}</div>
<div>{context_text}</div></div>""",
                unsafe_allow_html=True)
# --- 2. CITATION NETWORK ---
# NOTE(review): the subheader and caption literals contain mis-encoded
# characters (probably an arrow and icons); kept exactly as found.
with tab_cnet:
    st.subheader("Citing β Cited Citation Network")
    st.caption("π± Scroll: zoom | Drag: pan | Click node: info | βΆ button: fullscreen")
    if not seed_events.empty:
        citation_graph_html = pyvis_citation_graph(selected_seed, seed_events)
        components.html(citation_graph_html, height=820, scrolling=True)
    else:
        st.info("No citation network data for this seed paper.")
# --- 3. ONTOLOGY ---
# The ontology diagram is static: it does not depend on the selected paper.
with tab_ontology:
    st.subheader("CitationHub Ontology")
    st.caption("π± Scroll: zoom | Drag: pan | Click node: info | βΆ button: fullscreen")
    ontology_html = pyvis_ontology()
    components.html(ontology_html, height=820, scrolling=True)
# --- 4. KNOWLEDGE GRAPH (actual KG data) ---
# NOTE(review): the Korean UI literals in this tab are mis-encoded in the
# extracted source; they are kept exactly as found.
with tab_kg:
    st.subheader("Knowledge Graph β Selected Seed Paper")
    st.caption("kg_nodes + kg_edges μ 체 λ°μ΄ν°μμ μ νλ seed paperμ 1-hop μλΈκ·Έλν")
    st.info("μλ λ²νΌμ λλ¬ KG λ°μ΄ν°λ₯Ό λ‘λνμΈμ (μ΅μ΄ 1ν, μ΄ν μΊμλ¨)")

    # The button only records that KG data was requested; the flag is shared
    # with the KG Explorer tab.
    if st.button("KG λ°μ΄ν° λ‘λ", key="kg_load"):
        st.session_state["kg_loaded"] = True

    if st.session_state.get("kg_loaded"):
        try:
            # The spinner now wraps the actual load; it used to wrap only the
            # flag assignment, so no progress was visible while files loaded.
            with st.spinner("kg_nodes / kg_edges / enriched λ‘λ© μ€ ..."):
                kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)

            seed_doi = selected_seed["doi"]
            if not seed_doi:
                st.warning("μ νλ seed paperμ DOIκ° μμ΄ KG μ‘°νκ° λΆκ°ν©λλ€.")
            else:
                nodes_sub, edges_sub = get_kg_subgraph(seed_doi, kg_nodes, kg_edges)
                if nodes_sub is None:
                    st.warning(f"KGμμ λ Έλλ₯Ό μ°Ύμ μ μμ΅λλ€. (DOI: {seed_doi})")
                else:
                    # Summary statistics
                    c1, c2, c3 = st.columns(3)
                    c1.metric("Nodes", fmt_num(len(nodes_sub)))
                    c2.metric("Edges", fmt_num(len(edges_sub)))
                    c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))

                    type_counts = nodes_sub["node_type"].value_counts().reset_index()
                    type_counts.columns = ["node_type", "count"]
                    st.plotly_chart(
                        px.bar(type_counts, x="node_type", y="count",
                               color="node_type",
                               color_discrete_map=NODE_TYPE_COLORS,
                               title="Node Type Distribution")
                        .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
                        use_container_width=True)

                    st.caption("π± Scroll: zoom | Drag: pan | Click node: info | βΆ button: fullscreen")
                    components.html(pyvis_from_kg(nodes_sub, edges_sub), height=820, scrolling=True)
        except Exception as e:
            # Tab-level boundary: report the failure without breaking other tabs.
            st.error(str(e))
# --- 5. KG EXPLORER ---
# NOTE(review): indentation was lost in the extracted source; the nesting below
# is reconstructed from data flow. The Korean UI literals are mis-encoded and
# kept exactly as found.
with tab_kg_exp:
    st.subheader("KG Explorer")
    st.caption("kg_nodes μ 체λ₯Ό νμνκ³ μμ λ Έλμ μ°κ²° κ΄κ³λ₯Ό μκ°νν©λλ€.")
    st.info("μλ λ²νΌμ λλ¬ KG λ°μ΄ν°λ₯Ό λ‘λνμΈμ (μ΅μ΄ 1ν, μ΄ν μΊμλ¨)")

    # The button only records that KG data was requested; the flag is shared
    # with the Knowledge Graph tab.
    if st.button("KG λ°μ΄ν° λ‘λ", key="kg_exp_load"):
        st.session_state["kg_loaded"] = True

    if st.session_state.get("kg_loaded"):
        try:
            # The spinner wraps the actual load rather than the flag assignment.
            with st.spinner("λ‘λ© μ€..."):
                kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)

            # -- Overall node / edge type distribution
            col_a, col_b = st.columns([1, 2])
            with col_a:
                st.subheader("Node Type Counts")
                nt = kg_nodes["node_type"].value_counts().reset_index()
                nt.columns = ["node_type", "count"]
                st.dataframe(nt, use_container_width=True, hide_index=True)

                st.subheader("Edge Type Counts")
                et = kg_edges["edge_type"].value_counts().reset_index()
                et.columns = ["edge_type", "count"]
                st.dataframe(et, use_container_width=True, hide_index=True)
            with col_b:
                st.subheader("Node Type Distribution")
                nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
                                color_discrete_map=NODE_TYPE_COLORS)
                nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
                st.plotly_chart(nt_fig, use_container_width=True)

            st.markdown("---")
            st.subheader("Node Search & Ego Network")
            exp_col1, exp_col2 = st.columns([1, 3])

            with exp_col1:
                # dropna(): a missing node_type would make sorted() compare
                # NaN with strings and raise.
                type_options = ["(all)"] + sorted(kg_nodes["node_type"].dropna().unique().tolist())
                sel_type = st.selectbox("Filter by node type", type_options)
                filtered_nodes = (kg_nodes if sel_type == "(all)"
                                  else kg_nodes[kg_nodes["node_type"] == sel_type])

                search_q = st.text_input("Search node label / DOI")
                if search_q:
                    # regex=False: the search text is user input and must match
                    # literally ("(" or "+" would otherwise raise).
                    label_hit = filtered_nodes["label"].str.contains(
                        search_q, case=False, regex=False, na=False)
                    doi_hit = filtered_nodes["doi"].str.contains(
                        search_q, case=False, regex=False, na=False)
                    filtered_nodes = filtered_nodes[label_hit | doi_hit]

                # Only the first 100 matches are offered in the selector.
                sample = filtered_nodes.head(100)
                node_options = sample["node_id"].tolist()
                if not node_options:
                    st.warning("κ²μ κ²°κ³Όκ° μμ΅λλ€.")
                else:
                    # str() guards against missing labels, which cannot be sliced.
                    label_by_id = (
                        sample.drop_duplicates(subset=["node_id"])
                        .set_index("node_id")["label"]
                        .to_dict()
                    )
                    sel_node_id = st.selectbox(
                        "Select node",
                        node_options,
                        format_func=lambda nid: str(label_by_id[nid])[:60],
                    )
                    sel_node_info = sample[sample["node_id"] == sel_node_id].iloc[0]
                    st.markdown(f"**Type**: {sel_node_info.get('node_type','')}")
                    st.markdown(f"**DOI**: {sel_node_info.get('doi','') or '-'}")
                    st.markdown(f"**Publication**: {sel_node_info.get('publication_name','') or '-'}")
                    st.markdown(f"**Group**: {sel_node_info.get('group','') or '-'}")
                    st.markdown(f"**Cited by**: {fmt_num(sel_node_info.get('citedby_count',''))}")

                    max_e = st.slider("Max edges shown", 20, 150, 60, key="kg_exp_max")
                    if st.button("Show ego network", key="kg_exp_show"):
                        exp_nodes, exp_edges = get_explorer_subgraph(
                            sel_node_id, kg_nodes, kg_edges, max_e)
                        if exp_nodes is None:
                            st.warning("μ°κ²°λ μ£μ§κ° μμ΅λλ€.")
                        else:
                            # Kept in session state so the graph survives reruns.
                            st.session_state["exp_nodes"] = exp_nodes
                            st.session_state["exp_edges"] = exp_edges

            with exp_col2:
                if "exp_nodes" in st.session_state:
                    en = st.session_state["exp_nodes"]
                    ee = st.session_state["exp_edges"]
                    st.caption(f"Nodes: {len(en)} | Edges: {len(ee)}")
                    st.caption("π± Scroll: zoom | Drag: pan | Click node: info | βΆ button: fullscreen")
                    components.html(pyvis_from_kg(en, ee, height="740px"), height=760, scrolling=True)
                else:
                    st.info("μΌμͺ½μμ λ Έλλ₯Ό μ ννκ³ 'Show ego network'λ₯Ό ν΄λ¦νμΈμ.")

            # -- Enriched insights
            st.markdown("---")
            st.subheader("Enriched Citation Insights")
            st.caption("citation_events_enriched: μλ―Έμ μ¦κ±°(semantic evidence) λΆμ")
            if "has_semantic_evidence" in enriched.columns:
                sem = enriched["has_semantic_evidence"].value_counts().reset_index()
                sem.columns = ["has_semantic_evidence", "count"]
                sem["label"] = sem["has_semantic_evidence"].map(
                    {True: "With evidence", False: "Without evidence"})
                st.plotly_chart(
                    px.pie(sem, names="label", values="count",
                           title="Semantic Evidence Coverage (all citation events)")
                    .update_layout(legend_title=""),
                    use_container_width=True)

                # Share of events with semantic evidence, per field
                if "field_folder" in enriched.columns:
                    field_sem = (enriched.groupby("field_folder")["has_semantic_evidence"]
                                 .mean().reset_index()
                                 .rename(columns={"has_semantic_evidence": "sem_ratio",
                                                  "field_folder": "field"})
                                 .sort_values("sem_ratio", ascending=False).head(20))
                    st.plotly_chart(
                        px.bar(field_sem, x="field", y="sem_ratio",
                               title="Semantic Evidence Rate by Field",
                               labels={"sem_ratio": "Evidence Rate", "field": "Field"})
                        .update_layout(xaxis_tickangle=-40),
                        use_container_width=True)
            else:
                st.info("has_semantic_evidence 컬λΌμ΄ μμ΅λλ€.")
        except Exception as e:
            # Tab-level boundary: report the failure without breaking other tabs.
            st.error(str(e))
# --- 6. GEOGRAPHIC MAP ---
with tab_geo:
    st.subheader("Geographic Distribution of Seed Papers")
    # Seed papers per country; rows with a blank country name are dropped.
    per_country = seed_filtered.groupby("country", dropna=False).size()
    country_cnt = per_country.reset_index(name="count").rename(
        columns={"country": "country_name"})
    has_country = country_cnt["country_name"].str.strip() != ""
    country_cnt = country_cnt[has_country]
    if not country_cnt.empty:
        fig_map = px.choropleth(
            country_cnt,
            locations="country_name",
            locationmode="country names",
            color="count",
            hover_name="country_name",
            color_continuous_scale="Blues",
            title="Seed Papers by Country",
        )
        fig_map.update_layout(geo=dict(showframe=False), height=500)
        st.plotly_chart(fig_map, use_container_width=True)

    st.subheader("Top Cities (Affiliation)")
    # City and country come from the affiliation_geo table, joined on the
    # affiliation name of each seed paper.
    geo_lookup = aff_geo_df[["affiliation_name", "city_name", "country_name"]]
    with_geo = seed_filtered.merge(
        geo_lookup, left_on="affiliation", right_on="affiliation_name", how="left")
    per_city = with_geo.groupby(["country_name", "city_name"], dropna=False).size()
    city_cnt = per_city.reset_index(name="count").dropna(subset=["country_name"])
    city_cnt = city_cnt.sort_values("count", ascending=False).head(30)
    if not city_cnt.empty:
        city_fig = px.bar(city_cnt, x="city_name", y="count", color="country_name",
                          title="Top 30 Cities")
        city_fig = city_fig.update_layout(
            xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40)
        st.plotly_chart(city_fig, use_container_width=True)

    st.subheader("Citation Trend over Time (selected paper)")
    dated_events = seed_events.dropna(subset=["citing_year"])
    dated_events = dated_events.assign(
        citing_year=lambda df: df["citing_year"].astype(int))
    trend2 = dated_events.groupby("citing_year").size().reset_index(name="count")
    if not trend2.empty:
        trend_fig = px.line(trend2, x="citing_year", y="count", markers=True,
                            title="Citations per Year")
        trend_fig = trend_fig.update_layout(xaxis_title="Year", yaxis_title="Citations")
        st.plotly_chart(trend_fig, use_container_width=True)
# --- 7. ANALYTICS ---
# NOTE(review): indentation was lost in the extracted source. "Intent Reference"
# is assumed to sit in the right-hand column and "Field Reference" full-width -
# confirm against the original layout. The heatmap subheader literal looks
# mis-encoded and is kept exactly as found.
with tab_analytics:
    col_a, col_b = st.columns(2)

    with col_a:
        st.subheader("Top Authors")
        if "author_id" in seed.columns and not seed["author_id"].isna().all():
            author_counts = (seed.explode("author_id")
                             .merge(authors_df, on="author_id", how="left")
                             .groupby("author_name").size()
                             .reset_index(name="paper_count")
                             .sort_values("paper_count", ascending=False))
        else:
            # rename_axis + reset_index(name=...) yields the same column names
            # on every pandas version, unlike renaming value_counts() output.
            author_counts = (seed["author"].value_counts()
                             .rename_axis("author_name")
                             .reset_index(name="paper_count"))
        # Drop blank names BEFORE taking the top 20 so a blank entry cannot
        # take one of the twenty slots.
        author_counts = author_counts[author_counts["author_name"].str.strip() != ""]
        top_auth = author_counts.head(20)
        st.plotly_chart(
            px.bar(top_auth, x="paper_count", y="author_name", orientation="h",
                   title="Top 20 Authors")
            .update_layout(yaxis=dict(autorange="reversed"),
                           xaxis_title="Seed Papers", yaxis_title=""),
            use_container_width=True)

    with col_b:
        st.subheader("Top Journals")
        journal_counts = seed.groupby("journal").size().reset_index(name="count")
        # Same ordering fix as for authors: filter blanks first, then rank.
        journal_counts = journal_counts[journal_counts["journal"].str.strip() != ""]
        top_jnl = journal_counts.sort_values("count", ascending=False).head(20)
        st.plotly_chart(
            px.bar(top_jnl, x="count", y="journal", orientation="h",
                   title="Top 20 Journals")
            .update_layout(yaxis=dict(autorange="reversed"),
                           xaxis_title="Seed Papers", yaxis_title=""),
            use_container_width=True)

    st.markdown("---")
    col_c, col_d = st.columns(2)

    with col_c:
        st.subheader("Field Γ Intent Heatmap")
        fi = (seed[["seed_paper_id", "field"]]
              .merge(events[["seed_paper_id", "primary_intent"]],
                     on="seed_paper_id", how="inner")
              .groupby(["field", "primary_intent"]).size().reset_index(name="count"))
        if not fi.empty:
            pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
            st.plotly_chart(
                px.imshow(pivot, color_continuous_scale="Blues",
                          title="Citation Intent by Field", aspect="auto")
                .update_layout(xaxis_title="Intent", yaxis_title="Field"),
                use_container_width=True)

    with col_d:
        st.subheader("Influential Citations (selected paper)")
        if "is_influential" in seed_events.columns:
            inf = seed_events["is_influential"].value_counts().reset_index()
            inf.columns = ["is_influential", "count"]
            inf["label"] = inf["is_influential"].map(
                {True: "Influential", False: "Non-influential"})
            st.plotly_chart(
                px.pie(inf, names="label", values="count",
                       title="Influential vs Non-influential"),
                use_container_width=True)

        st.subheader("Intent Reference")
        st.dataframe(intents_df, use_container_width=True, hide_index=True)

    st.markdown("---")
    st.subheader("Field Reference")
    st.dataframe(fields_df, use_container_width=True, hide_index=True)