Spaces:

Daniel0315
/

cithub_website

Sleeping

App Files Files Community

Daniel0315 committed on Mar 19

Commit

af9c904

verified ·

1 Parent(s): 41e4f48

Upload app.py

Browse files

Files changed (1) hide show

src/app.py +462 -329

src/app.py CHANGED Viewed

@@ -7,34 +7,32 @@ from typing import List
 import pandas as pd
 import streamlit as st
 import plotly.express as px
-import plotly.graph_objects as go
 from pyvis.network import Network
 import streamlit.components.v1 as components
 HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
-HF_TOKEN = os.environ.get("HF_TOKEN", "")
-st.set_page_config(
-    page_title="CitationHub",
-    page_icon="📚",
-    layout="wide",
-)
 ALLOWED_INTENTS = [
-    "background", "uses", "similarities", "motivation",
-    "differences", "future_work", "extends",
 ]
 INTENT_COLORS = {
-    "background": "#94a3b8", "uses": "#22c55e", "similarities": "#3b82f6",
-    "motivation": "#f59e0b", "differences": "#ef4444",
-    "future_work": "#8b5cf6", "extends": "#06b6d4",
 }
 NODE_COLORS = {
-    "seed_paper": "#111827", "citing_paper": "#dbeafe", "citation_event": "#fde68a",
-    "journal": "#ede9fe", "author": "#fee2e2", "affiliation": "#fae8ff",
-    "city": "#cffafe", "country": "#ffedd5", "field": "#e0e7ff", "intent": "#dcfce7",
 }
 DEFAULT_DATA_DIR = Path(os.environ.get(
@@ -44,10 +42,8 @@ DEFAULT_DATA_DIR = Path(os.environ.get(
 def fmt_num(x):
-    try:
-        return f"{int(x):,}"
-    except Exception:
-        return "-"
 def _hf_download(filename: str) -> str:
@@ -65,47 +61,39 @@ def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
 def inject_fullscreen(html: str) -> str:
-    """pyvis HTML에 전체화면 버튼을 주입합니다."""
     btn = """
-    <button
-      onclick="var el=document.getElementById('mynetwork');
-               if(el){if(el.requestFullscreen)el.requestFullscreen();
-               else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
       style="position:fixed;bottom:18px;right:18px;z-index:9999;
-             padding:8px 18px;background:#1e293b;color:white;
-             border:none;border-radius:8px;cursor:pointer;font-size:13px;
-             box-shadow:0 2px 8px rgba(0,0,0,0.35);">
-      ⛶ Fullscreen
-    </button>
-    <div style="position:fixed;bottom:18px;left:18px;z-index:9999;
-                font-size:12px;color:#64748b;background:rgba(255,255,255,0.85);
                 padding:5px 10px;border-radius:6px;">
-      🖱 Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info
-    </div>
     """
     return html.replace("</body>", btn + "</body>")
 @st.cache_data(show_spinner=False)
 def load_data(data_dir_str: str):
     d = None if HF_REPO_ID else Path(data_dir_str)
-    # --- 핵심 3개 (대용량) ---
-    seed_df = _read("seed_cited_papers_normalized.parquet", d)
-    events_df = _read("citation_events_normalized.parquet", d)
-    citing_df = _read("citing_papers_normalized.parquet", d)
-    # --- 참조 테이블 (소용량) ---
-    authors_df     = _read("authors.parquet", d)
     affiliations_df = _read("affiliations.parquet", d)
-    aff_geo_df     = _read("affiliation_geo.parquet", d)
-    cities_df      = _read("cities.parquet", d)
-    countries_df   = _read("countries.parquet", d)
-    fields_df      = _read("fields.parquet", d)
-    intents_df     = _read("intents.parquet", d)
-    journals_df    = _read("journals.parquet", d)
-    # --- seed 가공 ---
     seed = pd.DataFrame({
         "seed_paper_id":  seed_df["seed_paper_id"],
         "doi":            seed_df.get("doi", pd.Series(dtype=str)).fillna(""),
@@ -123,11 +111,10 @@ def load_data(data_dir_str: str):
         "field_id":       seed_df.get("field_id", pd.Series(dtype=object)),
         "journal_id":     seed_df.get("journal_id", pd.Series(dtype=object)),
     })
-    for col in ["title", "doi", "journal", "field", "country"]:
         seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
-    seed = seed.sort_values(["citedby_count", "title"], ascending=[False, True]).reset_index(drop=True)
-    # --- events 가공 ---
     events = pd.DataFrame({
         "citation_event_id": events_df["citation_event_id"],
         "seed_paper_id":     events_df["cited_seed_paper_id"],
@@ -145,14 +132,13 @@ def load_data(data_dir_str: str):
     })
     events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)
-    # --- citing 가공 ---
     citing = pd.DataFrame({
         "citing_paper_id": citing_df["citing_paper_id"],
-        "doi":   citing_df.get("doi", pd.Series(dtype=str)).fillna(""),
-        "title": citing_df.get("title", pd.Series(dtype=str)).fillna(""),
-        "year":  pd.to_numeric(citing_df.get("year"), errors="coerce"),
-        "venue": citing_df.get("venue", pd.Series(dtype=str)).fillna(""),
-        "oa_pdf": citing_df.get("oa_pdf", pd.Series(dtype=str)).fillna(""),
     })
     filters = {
@@ -163,35 +149,40 @@ def load_data(data_dir_str: str):
         "year_min":  int(events["citing_year"].dropna().min()) if events["citing_year"].notna().any() else 2000,
         "year_max":  int(events["citing_year"].dropna().max()) if events["citing_year"].notna().any() else 2025,
     }
     overview = {
-        "seed_papers":      int(len(seed)),
-        "citation_events":  int(len(events)),
-        "citing_papers":    int(events["citing_paper_id"].nunique()),
-        "journals":         int(seed["journal"].replace("", pd.NA).dropna().nunique()),
-        "countries":        int(seed["country"].replace("", pd.NA).dropna().nunique()),
-        "fields":           int(seed["field"].replace("", pd.NA).dropna().nunique()),
-        "intents":          len(ALLOWED_INTENTS),
-        "authors":          int(len(authors_df)),
     }
     return (seed, events, citing, filters, overview,
             authors_df, affiliations_df, aff_geo_df,
             cities_df, countries_df, fields_df, intents_df, journals_df)
-# ── 필터 헬퍼 ──────────────────────────────────────────────
 def filter_seed_papers(seed, q, fields, countries, journals):
     df = seed.copy()
     q = (q or "").strip().lower()
     if q:
         df = df[df["title_lc"].str.contains(q, na=False) | df["doi_lc"].str.contains(q, na=False)]
-    if fields:
-        df = df[df["field"].str.lower().isin({x.lower() for x in fields})]
-    if countries:
-        df = df[df["country"].str.lower().isin({x.lower() for x in countries})]
-    if journals:
-        df = df[df["journal"].str.lower().isin({x.lower() for x in journals})]
     return df.reset_index(drop=True)
@@ -204,111 +195,107 @@ def event_subset(events, seed_paper_id, year_min, year_max):
 def build_intent_summary(df):
     counts = df.groupby("primary_intent").size().to_dict()
-    return pd.DataFrame({
-        "intent": ALLOWED_INTENTS,
-        "count": [int(counts.get(i, 0)) for i in ALLOWED_INTENTS],
-    })
 def build_context_rows(df, limit=20):
     rows = []
-    df = df.sort_values(["context_count", "intent_count", "citing_year"],
-                        ascending=[False, False, False], na_position="last")
     for _, row in df.iterrows():
-        contexts = row["contexts"]
-        if isinstance(contexts, list) and contexts:
-            for ctx in contexts[:2]:
-                rows.append({
-                    "primary_intent": row["primary_intent"],
-                    "citing_title": row["citing_title"],
-                    "citing_doi": row["citing_doi"],
-                    "citing_year": None if pd.isna(row["citing_year"]) else int(row["citing_year"]),
-                    "context": ctx,
-                })
-        if len(rows) >= limit:
-            break
     return pd.DataFrame(rows[:limit])
 def build_citing_table(df, limit=30):
     if df.empty:
-        return pd.DataFrame(columns=["citing_title", "citing_year", "primary_intent", "context_count"])
-    return (
-        df.sort_values(["context_count", "intent_count", "citing_year"],
-                       ascending=[False, False, False], na_position="last")
-        [["citing_paper_id", "citing_title", "citing_doi", "citing_year", "primary_intent", "context_count"]]
-        .drop_duplicates(subset=["citing_paper_id"])
-        .head(limit)
-    )
-# ── pyvis 빌더 ─────────────────────────────────────────────
 def pyvis_citation_graph(seed_row, events_df):
     net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
     sid = seed_row["seed_paper_id"]
     net.add_node(sid, label=seed_row["title"][:60], color="#111827", size=34, shape="dot",
-                 font={"color": "white"})
-    for _, row in events_df.sort_values(["context_count", "intent_count"],
                                          ascending=False).head(40).iterrows():
         cid = row["citing_paper_id"]
         net.add_node(cid, label=(row["citing_title"] or row["citing_doi"] or cid)[:60],
                      color=NODE_COLORS["citing_paper"], size=18, shape="dot")
         ctx = (row["contexts"] or [])[0] if isinstance(row["contexts"], list) and row["contexts"] else ""
-        yr = "" if pd.isna(row["citing_year"]) else int(row["citing_year"])
         net.add_edge(cid, sid, label=row["primary_intent"],
-                     color=INTENT_COLORS.get(row["primary_intent"], "#94a3b8"),
                      title=f"Intent: {row['primary_intent']}<br>Year: {yr}<br>{ctx}")
     net.barnes_hut()
     return inject_fullscreen(net.generate_html())
-def pyvis_kg(seed_row, events_df):
-    net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
-    sid = seed_row["seed_paper_id"]
-    net.add_node(sid, label=seed_row["title"][:60], color=NODE_COLORS["seed_paper"],
-                 font={"color": "white"}, size=34, shape="dot")
-    for key, typ, rel in [
-        ("journal", "journal", "PUBLISHED_IN"), ("author", "author", "HAS_AUTHOR"),
-        ("affiliation", "affiliation", "HAS_AFFILIATION"), ("city", "city", "LOCATED_IN_CITY"),
-        ("country", "country", "LOCATED_IN_COUNTRY"), ("field", "field", "BELONGS_TO_FIELD"),
-    ]:
-        val = seed_row.get(key, "")
-        if val:
-            nid = f"{typ}:{val}"
-            net.add_node(nid, label=str(val)[:50], color=NODE_COLORS[typ], size=16)
-            net.add_edge(sid, nid, label=rel)
-    top = events_df.sort_values(["context_count", "intent_count"], ascending=False).head(20)
-    for intent, cnt in top.groupby("primary_intent").size().items():
-        iid = f"intent:{intent}"
-        net.add_node(iid, label=f"{intent} ({cnt})", color=NODE_COLORS["intent"], size=18)
-        net.add_edge(sid, iid, label="HAS_INTENT_CLUSTER")
-    for _, row in top.iterrows():
-        eid, cid = row["citation_event_id"], row["citing_paper_id"]
-        net.add_node(eid, label=row["primary_intent"], color=NODE_COLORS["citation_event"], size=14)
-        net.add_node(cid, label=(row["citing_title"] or row["citing_doi"] or cid)[:55],
-                     color=NODE_COLORS["citing_paper"], size=14)
-        net.add_edge(eid, sid, label="HAS_CITED_PAPER")
-        net.add_edge(eid, cid, label="HAS_CITING_PAPER")
-        net.add_edge(eid, f"intent:{row['primary_intent']}", label="HAS_PRIMARY_INTENT")
-    net.barnes_hut()
-    return inject_fullscreen(net.generate_html())
 def pyvis_ontology():
     net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
     for nid, label, typ in [
-        ("seed","Top5PctCitedPaper","seed_paper"), ("event","CitationEvent","citation_event"),
-        ("citing","CitingPaper","citing_paper"),   ("intent","Intent","intent"),
-        ("journal","Journal","journal"),            ("author","Author","author"),
         ("affiliation","Affiliation","affiliation"),("city","City","city"),
-        ("country","Country","country"),            ("field","Field","field"),
     ]:
         net.add_node(nid, label=label, color=NODE_COLORS[typ], size=24)
     for s, t, l in [
-        ("event","citing","hasCitingPaper"), ("event","seed","hasCitedPaper"),
-        ("event","intent","hasPrimaryIntent"), ("seed","journal","publishedInJournal"),
-        ("seed","author","hasAuthor"),         ("seed","affiliation","hasAffiliation"),
-        ("seed","city","locatedInCity"),        ("seed","country","locatedInCountry"),
         ("seed","field","belongsToField"),
     ]:
         net.add_edge(s, t, label=l)
@@ -316,10 +303,32 @@ def pyvis_ontology():
     return inject_fullscreen(net.generate_html())
-# ── 메인 UI ────────────────────────────────────────────────
 st.title("CitationHub")
-st.caption("Explore influential papers, their citation networks, and related research.")
 with st.sidebar:
     st.subheader("Data source")
     if HF_REPO_ID:
@@ -334,13 +343,11 @@ with st.sidebar:
          cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
         st.success("Data loaded")
     except Exception as e:
-        st.error(str(e))
-        st.stop()
     st.subheader("Search seed papers")
     q_input = st.text_input("Title or DOI")
-    if "q_submit" not in st.session_state:
-        st.session_state["q_submit"] = ""
     if st.button("Search", use_container_width=True):
         st.session_state["q_submit"] = q_input
@@ -355,61 +362,70 @@ with st.sidebar:
     st.subheader("Overview counts")
     c1, c2 = st.columns(2)
-    c1.metric("Seed papers",      fmt_num(overview["seed_papers"]))
-    c2.metric("Citation events",  fmt_num(overview["citation_events"]))
-    c1.metric("Citing papers",    fmt_num(overview["citing_papers"]))
-    c2.metric("Authors",          fmt_num(overview["authors"]))
-    c1.metric("Countries",        fmt_num(overview["countries"]))
-    c2.metric("Fields",           fmt_num(overview["fields"]))
     options = seed_filtered["seed_paper_id"].tolist()
     if not options:
-        st.warning("No seed papers match the current search.")
-        st.stop()
-    current = st.session_state.get("selected_seed_id", options[0])
     default_idx = options.index(current) if current in options else 0
     selected_seed_id = st.selectbox(
         "Seed paper", options, index=default_idx,
         format_func=lambda sid: seed_filtered.loc[
-            seed_filtered["seed_paper_id"] == sid, "title"].iloc[0],
     )
     st.session_state["selected_seed_id"] = selected_seed_id
-selected_seed = seed_filtered[seed_filtered["seed_paper_id"] == selected_seed_id].iloc[0]
-seed_events   = event_subset(events, selected_seed_id, year_min, year_max)
 intent_summary = build_intent_summary(seed_events)
 contexts_df    = build_context_rows(seed_events)
 citing_table   = build_citing_table(seed_events)
-# ── 탭 ────────────────────────────────────────────────────
 (tab_overview, tab_cnet, tab_ontology, tab_kg,
- tab_geo, tab_analytics) = st.tabs([
-    "Overview", "Citation Network", "Ontology", "Knowledge Graph",
-    "Geographic Map", "Analytics",
 ])
-# ─────────────────── 1. OVERVIEW ──────────────────────────
 with tab_overview:
     col1, col2 = st.columns(2)
     with col1:
         st.subheader("Seed paper detail")
-        st.columns(2)[0].metric("Cited by", fmt_num(selected_seed["citedby_count"]))
-        st.columns(2)[1].metric("Citation events", fmt_num(len(seed_events)))
         for label, key in [
-            ("Title","title"), ("DOI","doi"), ("Journal","journal"),
-            ("Author","author"), ("Affiliation","affiliation"),
-            ("City","city"), ("Country","country"), ("Field","field"),
         ]:
             st.markdown(f"**{label}**  \n{selected_seed[key] or '-'}")
         st.subheader("Related citing papers")
-        st.dataframe(
-            citing_table.rename(columns={
-                "citing_title":"Title","citing_year":"Year",
-                "primary_intent":"Intent","context_count":"Contexts",
-            }),
-            use_container_width=True, hide_index=True,
-        )
     with col2:
         st.subheader("Intent distribution (selected paper)")
@@ -418,23 +434,23 @@ with tab_overview:
         fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
         st.plotly_chart(fig, use_container_width=True)
         st.subheader("Field distribution")
         fd = (seed_filtered.groupby("field", dropna=False).size()
               .reset_index(name="count").sort_values("count", ascending=False).head(20))
-        fd["field"] = fd["field"].replace("", "Unknown")
         st.plotly_chart(
             px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
-            use_container_width=True,
-        )
-        st.subheader("Overall intent distribution")
-        all_intents = events.groupby("primary_intent").size().to_dict()
-        ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
-                               "count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]})
-        fig2 = px.bar(ai_df, x="intent", y="count", color="intent",
-                      color_discrete_map=INTENT_COLORS)
-        fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
-        st.plotly_chart(fig2, use_container_width=True)
     st.subheader("Citation contexts")
     if contexts_df.empty:
@@ -450,10 +466,10 @@ with tab_overview:
                 <div style="font-size:12px;color:#64748b;margin-bottom:6px;">
                 {row['citing_year'] or '-'} · {row['citing_title'] or row['citing_doi']}</div>
                 <div>{row['context']}</div></div>""",
-                unsafe_allow_html=True,
-            )
-# ─────────────────── 2. CITATION NETWORK ──────────────────
 with tab_cnet:
     st.subheader("Citing ↔ Cited Citation Network")
     st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
@@ -462,168 +478,285 @@ with tab_cnet:
     else:
         components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
-# ─────────────────── 3. ONTOLOGY ──────────────────────────
 with tab_ontology:
     st.subheader("CitationHub Ontology")
     st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
     components.html(pyvis_ontology(), height=820, scrolling=True)
-# ─────────────────── 4. KNOWLEDGE GRAPH ───────────────────
 with tab_kg:
     st.subheader("Knowledge Graph — Selected Seed Paper")
-    st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
-    if seed_events.empty:
-        st.info("No knowledge graph data for this seed paper.")
-    else:
-        components.html(pyvis_kg(selected_seed, seed_events), height=820, scrolling=True)
-# ─────────────────── 5. GEOGRAPHIC MAP ────────────────────
 with tab_geo:
     st.subheader("Geographic Distribution of Seed Papers")
-    # 국가별 seed paper 수
-    country_cnt = (
-        seed_filtered.groupby("country", dropna=False).size()
-        .reset_index(name="count")
-        .rename(columns={"country": "country_name"})
-    )
     country_cnt = country_cnt[country_cnt["country_name"].str.strip() != ""]
-    country_cnt = country_cnt.merge(countries_df, on="country_name", how="left")
     if not country_cnt.empty:
-        fig_map = px.choropleth(
-            country_cnt,
-            locations="country_name",
-            locationmode="country names",
-            color="count",
-            hover_name="country_name",
-            color_continuous_scale="Blues",
-            title="Seed Papers by Country",
-        )
         fig_map.update_layout(geo=dict(showframe=False), height=500)
         st.plotly_chart(fig_map, use_container_width=True)
-    # 도시별 분포 (affiliation_geo 활용)
-    st.subheader("Affiliation Geo Distribution")
-    city_cnt = (
-        seed_filtered.merge(
-            aff_geo_df[["affiliation_name", "city_name", "country_name"]],
-            left_on="affiliation", right_on="affiliation_name", how="left",
-        )
-        .groupby(["country_name","city_name"], dropna=False).size()
-        .reset_index(name="count")
-        .dropna(subset=["country_name"])
-        .sort_values("count", ascending=False)
-        .head(30)
-    )
     if not city_cnt.empty:
-        fig_city = px.bar(
-            city_cnt, x="city_name", y="count", color="country_name",
-            title="Top 30 Cities (Affiliation)",
-        )
-        fig_city.update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40)
-        st.plotly_chart(fig_city, use_container_width=True)
-    # 연도별 citing 추이 (국가 필터)
-    st.subheader("Citation Trend over Time")
-    year_trend = (
-        seed_events.groupby("citing_year").size()
-        .reset_index(name="count")
-        .dropna()
-    )
-    year_trend["citing_year"] = year_trend["citing_year"].astype(int)
-    if not year_trend.empty:
-        fig_trend = px.line(year_trend, x="citing_year", y="count",
-                            title="Citations per Year (selected seed paper)",
-                            markers=True)
-        fig_trend.update_layout(xaxis_title="Year", yaxis_title="Citations")
-        st.plotly_chart(fig_trend, use_container_width=True)
-# ─────────────────── 6. ANALYTICS ────────────────────────
 with tab_analytics:
     col_a, col_b = st.columns(2)
-    # ── 저자 랭킹
     with col_a:
-        st.subheader("Top Authors (by seed paper count)")
-        # seed_cited_papers_normalized에 author_id 있으면 join
         if "author_id" in seed.columns and not seed["author_id"].isna().all():
-            top_authors = (
-                seed.explode("author_id")
-                .merge(authors_df, on="author_id", how="left")
-                .groupby("author_name").size()
-                .reset_index(name="paper_count")
-                .sort_values("paper_count", ascending=False)
-                .head(20)
-            )
         else:
-            # creator 컬럼에서 직접 추출
-            top_authors = (
-                seed["author"].value_counts()
-                .reset_index()
-                .rename(columns={"author": "author_name", "count": "paper_count"})
-                .head(20)
-            )
-        top_authors = top_authors[top_authors["author_name"].str.strip() != ""]
-        fig_auth = px.bar(top_authors, x="paper_count", y="author_name",
-                          orientation="h", title="Top 20 Authors")
-        fig_auth.update_layout(yaxis=dict(autorange="reversed"),
-                               xaxis_title="Seed Papers", yaxis_title="")
-        st.plotly_chart(fig_auth, use_container_width=True)
-    # ── 저널 랭킹
     with col_b:
-        st.subheader("Top Journals (by seed paper count)")
-        top_journals = (
-            seed.groupby("journal").size()
-            .reset_index(name="count")
-            .sort_values("count", ascending=False)
-            .head(20)
-        )
-        top_journals = top_journals[top_journals["journal"].str.strip() != ""]
-        fig_jnl = px.bar(top_journals, x="count", y="journal",
-                         orientation="h", title="Top 20 Journals")
-        fig_jnl.update_layout(yaxis=dict(autorange="reversed"),
-                               xaxis_title="Seed Papers", yaxis_title="")
-        st.plotly_chart(fig_jnl, use_container_width=True)
     st.markdown("---")
     col_c, col_d = st.columns(2)
-    # ── 분야별 인용 의도 히트맵
     with col_c:
         st.subheader("Field × Intent Heatmap")
-        field_intent = (
-            seed[["seed_paper_id", "field"]]
-            .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
-            .groupby(["field", "primary_intent"]).size()
-            .reset_index(name="count")
-        )
-        if not field_intent.empty:
-            pivot = field_intent.pivot(index="field", columns="primary_intent", values="count").fillna(0)
-            fig_hm = px.imshow(pivot, color_continuous_scale="Blues",
-                               title="Citation Intent by Field",
-                               aspect="auto")
-            fig_hm.update_layout(xaxis_title="Intent", yaxis_title="Field")
-            st.plotly_chart(fig_hm, use_container_width=True)
-    # ── Influential citation 비율
     with col_d:
-        st.subheader("Influential Citations")
         if "is_influential" in seed_events.columns:
-            inf_cnt = seed_events["is_influential"].value_counts().reset_index()
-            inf_cnt.columns = ["is_influential", "count"]
-            inf_cnt["label"] = inf_cnt["is_influential"].map({True: "Influential", False: "Non-influential"})
-            fig_inf = px.pie(inf_cnt, names="label", values="count",
-                             title="Influential vs Non-influential (selected paper)")
-            st.plotly_chart(fig_inf, use_container_width=True)
-        else:
-            st.info("is_influential 컬럼이 없습니다.")
-    # ── Intent 상세 정보
-    st.subheader("Intent Reference Table")
-    st.dataframe(intents_df, use_container_width=True, hide_index=True)
-    # ── Fields 상세 정보
-    st.subheader("Field Reference Table")
     st.dataframe(fields_df, use_container_width=True, hide_index=True)

 import pandas as pd
 import streamlit as st
 import plotly.express as px
 from pyvis.network import Network
 import streamlit.components.v1 as components
 HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
+HF_TOKEN   = os.environ.get("HF_TOKEN", "")
+st.set_page_config(page_title="CitationHub", page_icon="📚", layout="wide")
 ALLOWED_INTENTS = [
+    "background","uses","similarities","motivation",
+    "differences","future_work","extends",
 ]
 INTENT_COLORS = {
+    "background":"#94a3b8","uses":"#22c55e","similarities":"#3b82f6",
+    "motivation":"#f59e0b","differences":"#ef4444",
+    "future_work":"#8b5cf6","extends":"#06b6d4",
 }
 NODE_COLORS = {
+    "seed_paper":"#111827","citing_paper":"#dbeafe","citation_event":"#fde68a",
+    "journal":"#ede9fe","author":"#fee2e2","affiliation":"#fae8ff",
+    "city":"#cffafe","country":"#ffedd5","field":"#e0e7ff","intent":"#dcfce7",
+}
+NODE_TYPE_COLORS = {
+    "seed_paper":"#111827","citing_paper":"#3b82f6","citation_event":"#f59e0b",
+    "journal":"#8b5cf6","author":"#ef4444","affiliation":"#ec4899",
+    "city":"#06b6d4","country":"#f97316","field":"#6366f1","intent":"#22c55e",
 }
 DEFAULT_DATA_DIR = Path(os.environ.get(
 def fmt_num(x):
     """Format ``x`` as an integer with thousands separators.

     Returns the string ``"-"`` when ``x`` cannot be converted by ``int()``.
     """
     try:
         return f"{int(x):,}"
     except (TypeError, ValueError, OverflowError):
         # Only conversion failures are treated as "no value":
         # None / NA -> TypeError, non-numeric text / NaN -> ValueError,
         # infinity -> OverflowError.  A bare ``except`` would also swallow
         # KeyboardInterrupt and SystemExit.
         return "-"
 def _hf_download(filename: str) -> str:
 def inject_fullscreen(html: str) -> str:
+    """Insert a fullscreen button and an interaction hint before ``</body>``.
+
+    The injected button's ``onclick`` looks up the element with id
+    ``mynetwork`` and requests fullscreen on it.  Every ``</body>`` occurrence
+    in ``html`` is replaced; if there is none, ``html`` is returned unchanged.
+    """
     btn = """
+    <button onclick="var el=document.getElementById('mynetwork');
+      if(el){if(el.requestFullscreen)el.requestFullscreen();
+      else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
       style="position:fixed;bottom:18px;right:18px;z-index:9999;
+             padding:8px 18px;background:#1e293b;color:white;border:none;
+             border-radius:8px;cursor:pointer;font-size:13px;
+             box-shadow:0 2px 8px rgba(0,0,0,0.35);">⛶ Fullscreen</button>
+    <div style="position:fixed;bottom:18px;left:18px;z-index:9999;font-size:12px;
+                color:#64748b;background:rgba(255,255,255,0.85);
                 padding:5px 10px;border-radius:6px;">
+      🖱 Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info</div>
     """
     return html.replace("</body>", btn + "</body>")
+# ── Main data load (11 parquet files) ─────────────────────────
 @st.cache_data(show_spinner=False)
 def load_data(data_dir_str: str):
     d = None if HF_REPO_ID else Path(data_dir_str)
+    seed_df    = _read("seed_cited_papers_normalized.parquet", d)
+    events_df  = _read("citation_events_normalized.parquet", d)
+    citing_df  = _read("citing_papers_normalized.parquet", d)
+    authors_df      = _read("authors.parquet", d)
     affiliations_df = _read("affiliations.parquet", d)
+    aff_geo_df      = _read("affiliation_geo.parquet", d)
+    cities_df       = _read("cities.parquet", d)
+    countries_df    = _read("countries.parquet", d)
+    fields_df       = _read("fields.parquet", d)
+    intents_df      = _read("intents.parquet", d)
+    journals_df     = _read("journals.parquet", d)
     seed = pd.DataFrame({
         "seed_paper_id":  seed_df["seed_paper_id"],
         "doi":            seed_df.get("doi", pd.Series(dtype=str)).fillna(""),
         "field_id":       seed_df.get("field_id", pd.Series(dtype=object)),
         "journal_id":     seed_df.get("journal_id", pd.Series(dtype=object)),
     })
+    for col in ["title","doi","journal","field","country"]:
         seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
+    seed = seed.sort_values(["citedby_count","title"], ascending=[False,True]).reset_index(drop=True)
     events = pd.DataFrame({
         "citation_event_id": events_df["citation_event_id"],
         "seed_paper_id":     events_df["cited_seed_paper_id"],
     })
     events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)
     citing = pd.DataFrame({
         "citing_paper_id": citing_df["citing_paper_id"],
+        "doi":    citing_df.get("doi",   pd.Series(dtype=str)).fillna(""),
+        "title":  citing_df.get("title", pd.Series(dtype=str)).fillna(""),
+        "year":   pd.to_numeric(citing_df.get("year"), errors="coerce"),
+        "venue":  citing_df.get("venue", pd.Series(dtype=str)).fillna(""),
+        "oa_pdf": citing_df.get("oa_pdf",pd.Series(dtype=str)).fillna(""),
     })
     filters = {
         "year_min":  int(events["citing_year"].dropna().min()) if events["citing_year"].notna().any() else 2000,
         "year_max":  int(events["citing_year"].dropna().max()) if events["citing_year"].notna().any() else 2025,
     }
     overview = {
+        "seed_papers":     int(len(seed)),
+        "citation_events": int(len(events)),
+        "citing_papers":   int(events["citing_paper_id"].nunique()),
+        "authors":         int(len(authors_df)),
+        "journals":        int(seed["journal"].replace("", pd.NA).dropna().nunique()),
+        "countries":       int(seed["country"].replace("", pd.NA).dropna().nunique()),
+        "fields":          int(seed["field"].replace("", pd.NA).dropna().nunique()),
+        "intents":         len(ALLOWED_INTENTS),
     }
     return (seed, events, citing, filters, overview,
             authors_df, affiliations_df, aff_geo_df,
             cities_df, countries_df, fields_df, intents_df, journals_df)
# ── KG + enriched data (separate loader) ──────────────────────
@st.cache_data(show_spinner=False)
def load_kg_data(data_dir_str: str):
    """Read the knowledge-graph node/edge tables and the enriched events.

    When ``HF_REPO_ID`` is set, ``None`` is passed to ``_read`` instead of a
    local directory; otherwise ``data_dir_str`` is wrapped in a ``Path``.

    Returns:
        A 3-tuple ``(kg_nodes, kg_edges, enriched)`` in that order.
    """
    source_dir = Path(data_dir_str) if not HF_REPO_ID else None
    tables = [
        _read(filename, source_dir)
        for filename in (
            "kg_nodes.parquet",
            "kg_edges.parquet",
            "citation_events_enriched.parquet",
        )
    ]
    return tuple(tables)
# ── Helpers ────────────────────────────────────────────────────
def filter_seed_papers(seed, q, fields, countries, journals):
    """Return the rows of ``seed`` matching a text query and facet filters.

    Args:
        seed: DataFrame with ``title_lc``, ``doi_lc``, ``field``, ``country``
            and ``journal`` columns.
        q: Free-text query; matched case-insensitively as a plain substring
            of the lower-cased title or DOI.  ``None``/blank disables it.
        fields, countries, journals: Iterables of accepted values, compared
            case-insensitively.  An empty/None value disables that filter.

    Returns:
        A filtered copy of ``seed`` with a fresh 0..n-1 index.
    """
    df = seed.copy()
    q = (q or "").strip().lower()
    if q:
        # regex=False: q is raw user input.  As a regex, "c++" or "(" raises
        # re.error and "." in a DOI would match any character.
        hit = (df["title_lc"].str.contains(q, regex=False, na=False)
               | df["doi_lc"].str.contains(q, regex=False, na=False))
        df = df[hit]
    for column, wanted in (("field", fields),
                           ("country", countries),
                           ("journal", journals)):
        if wanted:
            lowered = {value.lower() for value in wanted}
            df = df[df[column].str.lower().isin(lowered)]
    return df.reset_index(drop=True)
 def build_intent_summary(df):
     """Count rows of ``df`` per ``primary_intent`` for every allowed intent.

     Returns a DataFrame with columns ``intent`` and ``count``, one row per
     entry of ``ALLOWED_INTENTS`` in that order; intents absent from ``df``
     get a count of 0.
     """
     tally = df.groupby("primary_intent").size().to_dict()
     per_intent = [int(tally.get(name,0)) for name in ALLOWED_INTENTS]
     return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": per_intent})
 def build_context_rows(df, limit=20):
     """Flatten citation contexts into one row per context snippet.

     Rows of ``df`` are visited in descending order of ``context_count``,
     ``intent_count`` and ``citing_year`` (missing values last).  At most the
     first two contexts of each event are emitted, and at most ``limit`` rows
     are returned.

     ``contexts`` may be a list, a tuple, or an array-like exposing
     ``tolist()`` (list columns read from Parquet commonly arrive as arrays);
     any other value (None, NaN, plain strings) contributes no rows.

     Returns:
         DataFrame with ``primary_intent``, ``citing_title``, ``citing_doi``,
         ``citing_year`` (int or None) and ``context``; it has no columns
         when nothing was collected.
     """
     rows = []
     ordered = df.sort_values(["context_count","intent_count","citing_year"],
                              ascending=[False,False,False], na_position="last")
     for _, row in ordered.iterrows():
         ctx = row["contexts"]
         if not isinstance(ctx, (list, tuple)):
             # Array-valued cells are converted; scalars yield a non-list
             # from tolist() and are rejected just below.
             to_list = getattr(ctx, "tolist", None)
             ctx = to_list() if callable(to_list) else None
         if isinstance(ctx, (list, tuple)) and len(ctx) > 0:
             year = None if pd.isna(row["citing_year"]) else int(row["citing_year"])
             for snippet in ctx[:2]:
                 rows.append({"primary_intent": row["primary_intent"],
                              "citing_title": row["citing_title"],
                              "citing_doi": row["citing_doi"],
                              "citing_year": year,
                              "context": snippet})
         if len(rows) >= limit:
             break
     return pd.DataFrame(rows[:limit])
 def build_citing_table(df, limit=30):
     """Build the table of citing papers, one row per citing paper.

     Events are ranked by ``context_count``, ``intent_count`` and
     ``citing_year`` (descending, missing values last); the best-ranked event
     of each ``citing_paper_id`` is kept and the first ``limit`` rows returned.

     Args:
         df: Citation events for the selected seed paper.
         limit: Maximum number of rows returned.

     Returns:
         DataFrame with the same six columns whether or not ``df`` is empty,
         so downstream renaming/rendering sees a stable schema.
     """
     columns = ["citing_paper_id", "citing_title", "citing_doi",
                "citing_year", "primary_intent", "context_count"]
     if df.empty:
         return pd.DataFrame(columns=columns)
     ranked = df.sort_values(["context_count", "intent_count", "citing_year"],
                             ascending=[False, False, False], na_position="last")
     return ranked[columns].drop_duplicates(subset=["citing_paper_id"]).head(limit)
def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
    """Find other seed papers cited by the papers that cite the selected one.

    Args:
        selected_seed_id: ``seed_paper_id`` of the paper being inspected.
        events: Citation events with ``seed_paper_id`` and
            ``citing_paper_id`` columns.
        seed: Seed-paper table providing ``title``, ``field``, ``journal``
            and ``citedby_count`` per ``seed_paper_id``.
        top_n: Number of co-cited papers to keep.

    Returns:
        DataFrame sorted by ``co_citation_count`` (descending), holding the
        number of event rows per co-cited seed paper plus its metadata.
    """
    is_selected = events["seed_paper_id"] == selected_seed_id
    citing_ids = events.loc[is_selected, "citing_paper_id"].unique()
    # Events from those same citing papers that point at a different seed paper.
    from_same_citers = events["citing_paper_id"].isin(citing_ids)
    other_seed = events["seed_paper_id"] != selected_seed_id
    shared = events[from_same_citers & other_seed]
    counts = shared.groupby("seed_paper_id").size().reset_index(name="co_citation_count")
    top = counts.sort_values("co_citation_count", ascending=False).head(top_n)
    seed_info = seed[["seed_paper_id", "title", "field", "journal", "citedby_count"]]
    return top.merge(seed_info, on="seed_paper_id", how="left")
def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
    """Return the 1-hop KG subgraph around the selected seed paper.

    The seed paper's node id is ``f"seed:{seed_doi}"``. The lookup itself is
    the same ego-network query used by the KG Explorer, so it is delegated to
    ``get_explorer_subgraph`` instead of being duplicated here.

    Args:
        seed_doi: DOI of the seed paper.
        kg_nodes: Node table with a ``node_id`` column.
        kg_edges: Edge table with ``source`` and ``target`` columns.
        max_edges: Maximum number of incident edges to keep.

    Returns:
        ``(nodes, edges)`` DataFrames, or ``(None, None)`` when the node has
        no incident edges.
    """
    return get_explorer_subgraph(f"seed:{seed_doi}", kg_nodes, kg_edges, max_edges)
def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
    """Return the ego network (incident edges and their endpoints) of a node.

    Args:
        search_node_id: Id of the centre node.
        kg_nodes: Node table with a ``node_id`` column.
        kg_edges: Edge table with ``source`` and ``target`` columns.
        max_edges: Keep only the first ``max_edges`` incident edges, in
            table order.

    Returns:
        ``(nodes, edges)`` DataFrames, or ``(None, None)`` when no edge
        touches the node.
    """
    is_source = kg_edges["source"] == search_node_id
    is_target = kg_edges["target"] == search_node_id
    ego_edges = kg_edges[is_source | is_target].head(max_edges)
    if ego_edges.empty:
        return None, None
    endpoint_ids = {*ego_edges["source"].tolist(), *ego_edges["target"].tolist()}
    ego_nodes = kg_nodes[kg_nodes["node_id"].isin(endpoint_ids)]
    return ego_nodes, ego_edges
+# ── pyvis graph builders ───────────────────────────────────────
 def pyvis_citation_graph(seed_row, events_df):
     """Render the citing -> cited network of one seed paper as pyvis HTML.

     The seed paper is the dark central node. The 40 events with the highest
     ``context_count`` / ``intent_count`` each add a citing-paper node and an
     edge towards the seed, coloured and labelled by primary intent.

     Args:
         seed_row: Row of the selected seed paper (``seed_paper_id``, ``title``).
         events_df: Citation events of that seed paper.

     Returns:
         The generated HTML, passed through ``inject_fullscreen``.
     """
     graph = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
     seed_id = seed_row["seed_paper_id"]
     graph.add_node(seed_id, label=seed_row["title"][:60], color="#111827", size=34, shape="dot",
                    font={"color":"white"})
     top_events = events_df.sort_values(["context_count","intent_count"], ascending=False).head(40)
     for _, event in top_events.iterrows():
         citing_id = event["citing_paper_id"]
         # Fall back from title to DOI to the raw id for the node label.
         node_label = (event["citing_title"] or event["citing_doi"] or citing_id)[:60]
         graph.add_node(citing_id, label=node_label,
                        color=NODE_COLORS["citing_paper"], size=18, shape="dot")
         contexts = event["contexts"]
         if isinstance(contexts, list) and contexts:
             first_context = contexts[0]
         else:
             first_context = ""
         year = "" if pd.isna(event["citing_year"]) else int(event["citing_year"])
         intent = event["primary_intent"]
         graph.add_edge(citing_id, seed_id, label=intent,
                        color=INTENT_COLORS.get(intent,"#94a3b8"),
                        title=f"Intent: {intent}<br>Year: {year}<br>{first_context}")
     graph.barnes_hut()
     return inject_fullscreen(graph.generate_html())
 def pyvis_ontology():
     """Render the fixed CitationHub ontology (classes and relations) as pyvis HTML.

     Returns:
         The generated HTML, passed through ``inject_fullscreen``.
     """
     # (node id, displayed class name, key into NODE_COLORS)
     classes = (
         ("seed", "Top5PctCitedPaper", "seed_paper"),
         ("event", "CitationEvent", "citation_event"),
         ("citing", "CitingPaper", "citing_paper"),
         ("intent", "Intent", "intent"),
         ("journal", "Journal", "journal"),
         ("author", "Author", "author"),
         ("affiliation", "Affiliation", "affiliation"),
         ("city", "City", "city"),
         ("country", "Country", "country"),
         ("field", "Field", "field"),
     )
     # (source id, target id, relation label)
     relations = (
         ("event", "citing", "hasCitingPaper"),
         ("event", "seed", "hasCitedPaper"),
         ("event", "intent", "hasPrimaryIntent"),
         ("seed", "journal", "publishedInJournal"),
         ("seed", "author", "hasAuthor"),
         ("seed", "affiliation", "hasAffiliation"),
         ("seed", "city", "locatedInCity"),
         ("seed", "country", "locatedInCountry"),
         ("seed", "field", "belongsToField"),
     )
     graph = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
     for node_id, class_name, color_key in classes:
         graph.add_node(node_id, label=class_name, color=NODE_COLORS[color_key], size=24)
     for source, target, relation in relations:
         graph.add_edge(source, target, label=relation)
     return inject_fullscreen(graph.generate_html())
def pyvis_from_kg(nodes_df, edges_df, height="780px"):
    """Build a pyvis graph from KG node and edge DataFrames and return its HTML.

    Seed-paper nodes are drawn larger with a white label font; every node is
    coloured by ``node_type`` via ``NODE_TYPE_COLORS`` (grey when unknown).

    Args:
        nodes_df: Nodes with ``node_id`` and, optionally, ``node_type``,
            ``label``, ``doi`` and ``publication_name``.
        edges_df: Edges with ``source``, ``target`` and, optionally,
            ``edge_type``.
        height: CSS height of the graph canvas.

    Returns:
        The generated HTML, passed through ``inject_fullscreen``.
    """
    graph = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    for _, node in nodes_df.iterrows():
        node_type = node.get("node_type","")
        is_seed = node_type == "seed_paper"
        hover_text = (f"Type: {node_type}<br>DOI: {node.get('doi','')}"
                      f"<br>Pub: {node.get('publication_name','')}")
        graph.add_node(
            str(node["node_id"]),
            label=str(node.get("label",""))[:55],
            color=NODE_TYPE_COLORS.get(node_type,"#94a3b8"),
            size=30 if is_seed else 16,
            shape="dot",
            title=hover_text,
            font={"color":"white"} if is_seed else {},
        )
    for _, edge in edges_df.iterrows():
        graph.add_edge(str(edge["source"]), str(edge["target"]),
                       label=edge.get("edge_type",""), color="#94a3b8")
    graph.barnes_hut()
    return inject_fullscreen(graph.generate_html())
+# ═══════════════════════════════════════════════════════════════
+#  Main UI
+# ═══════════════════════════════════════════════════════════════
 st.title("CitationHub")
+st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
+# ── Sidebar ────────────────────────────────────────────────────
 with st.sidebar:
     st.subheader("Data source")
     if HF_REPO_ID:
          cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
         st.success("Data loaded")
     except Exception as e:
+        st.error(str(e)); st.stop()
     st.subheader("Search seed papers")
     q_input = st.text_input("Title or DOI")
+    if "q_submit" not in st.session_state: st.session_state["q_submit"] = ""
     if st.button("Search", use_container_width=True):
         st.session_state["q_submit"] = q_input
     st.subheader("Overview counts")
     c1, c2 = st.columns(2)
+    c1.metric("Seed papers",     fmt_num(overview["seed_papers"]))
+    c2.metric("Citation events", fmt_num(overview["citation_events"]))
+    c1.metric("Citing papers",   fmt_num(overview["citing_papers"]))
+    c2.metric("Authors",         fmt_num(overview["authors"]))
+    c1.metric("Countries",       fmt_num(overview["countries"]))
+    c2.metric("Fields",          fmt_num(overview["fields"]))
     options = seed_filtered["seed_paper_id"].tolist()
     if not options:
+        st.warning("No seed papers match the current search."); st.stop()
+    current     = st.session_state.get("selected_seed_id", options[0])
     default_idx = options.index(current) if current in options else 0
     selected_seed_id = st.selectbox(
         "Seed paper", options, index=default_idx,
         format_func=lambda sid: seed_filtered.loc[
+            seed_filtered["seed_paper_id"]==sid, "title"].iloc[0],
     )
     st.session_state["selected_seed_id"] = selected_seed_id
+selected_seed  = seed_filtered[seed_filtered["seed_paper_id"]==selected_seed_id].iloc[0]
+seed_events    = event_subset(events, selected_seed_id, year_min, year_max)
 intent_summary = build_intent_summary(seed_events)
 contexts_df    = build_context_rows(seed_events)
 citing_table   = build_citing_table(seed_events)
+# ── Tabs ───────────────────────────────────────────────────────
 (tab_overview, tab_cnet, tab_ontology, tab_kg,
+ tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
+    "Overview","Citation Network","Ontology",
+    "Knowledge Graph","KG Explorer","Geographic Map","Analytics",
 ])
+# ═══ 1. OVERVIEW ═══════════════════════════════════════════════
 with tab_overview:
     col1, col2 = st.columns(2)
     with col1:
         st.subheader("Seed paper detail")
+        dc1, dc2 = st.columns(2)
+        dc1.metric("Cited by",        fmt_num(selected_seed["citedby_count"]))
+        dc2.metric("Citation events", fmt_num(len(seed_events)))
         for label, key in [
+            ("Title","title"),("DOI","doi"),("Journal","journal"),
+            ("Author","author"),("Affiliation","affiliation"),
+            ("City","city"),("Country","country"),("Field","field"),
         ]:
             st.markdown(f"**{label}**  \n{selected_seed[key] or '-'}")
         st.subheader("Related citing papers")
+        st.dataframe(citing_table.rename(columns={
+            "citing_title":"Title","citing_year":"Year",
+            "primary_intent":"Intent","context_count":"Contexts"}),
+            use_container_width=True, hide_index=True)
+        st.subheader("Co-cited seed papers")
+        st.caption("같은 citing paper에 의해 함께 인용된 다른 top 5% 논문들")
+        cocited = get_cocited_papers(selected_seed_id, events, seed)
+        if cocited.empty:
+            st.info("Co-cited papers not found.")
+        else:
+            st.dataframe(cocited.rename(columns={
+                "co_citation_count":"Co-citations","title":"Title",
+                "field":"Field","citedby_count":"Cited by"}),
+                use_container_width=True, hide_index=True)
     with col2:
         st.subheader("Intent distribution (selected paper)")
         fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
         st.plotly_chart(fig, use_container_width=True)
+        st.subheader("Citation trend (selected paper)")
+        trend = (seed_events.dropna(subset=["citing_year"])
+                 .assign(citing_year=lambda df: df["citing_year"].astype(int))
+                 .groupby("citing_year").size().reset_index(name="count"))
+        if not trend.empty:
+            st.plotly_chart(
+                px.line(trend, x="citing_year", y="count", markers=True)
+                .update_layout(xaxis_title="Year", yaxis_title="Citations"),
+                use_container_width=True)
         st.subheader("Field distribution")
         fd = (seed_filtered.groupby("field", dropna=False).size()
               .reset_index(name="count").sort_values("count", ascending=False).head(20))
+        fd["field"] = fd["field"].replace("","Unknown")
         st.plotly_chart(
             px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
+            use_container_width=True)
     st.subheader("Citation contexts")
     if contexts_df.empty:
                 <div style="font-size:12px;color:#64748b;margin-bottom:6px;">
                 {row['citing_year'] or '-'} · {row['citing_title'] or row['citing_doi']}</div>
                 <div>{row['context']}</div></div>""",
+                unsafe_allow_html=True)
+# ═══ 2. CITATION NETWORK ════════════════════════════════════════
 with tab_cnet:
     st.subheader("Citing ↔ Cited Citation Network")
     st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
     else:
         components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
+# ═══ 3. ONTOLOGY ════════════════════════════════════════════════
 # Ontology tab: shows the class/relation diagram produced by pyvis_ontology().
 with tab_ontology:
     st.subheader("CitationHub Ontology")
     st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
     # The diagram HTML is regenerated on every script run and embedded in a
     # scrollable 820px-high component.
     components.html(pyvis_ontology(), height=820, scrolling=True)
+# ═══ 4. KNOWLEDGE GRAPH (actual KG data) ═════════════════════════
 # Knowledge Graph tab: 1-hop KG subgraph of the selected seed paper.
 # The KG tables are only loaded after the user asks for them; the choice is
 # remembered in st.session_state["kg_loaded"].
 with tab_kg:
     st.subheader("Knowledge Graph — Selected Seed Paper")
     st.caption("kg_nodes + kg_edges 전체 데이터에서 선택된 seed paper의 1-hop 서브그래프")
     st.info("아래 버튼을 눌러 KG 데이터를 로드하세요 (최초 1회, 이후 캐시됨)")
     if st.button("KG 데이터 로드", key="kg_load"):
         st.session_state["kg_loaded"] = True
     if st.session_state.get("kg_loaded"):
         try:
             # The spinner wraps the actual load: load_kg_data is declared with
             # show_spinner=False, so without this the first (uncached) load
             # would run with no progress indication at all.
             with st.spinner("kg_nodes / kg_edges / enriched 로딩 중 ..."):
                 kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)
             seed_doi = selected_seed["doi"]
             if not seed_doi:
                 st.warning("선택된 seed paper의 DOI가 없어 KG 조회가 불가합니다.")
             else:
                 nodes_sub, edges_sub = get_kg_subgraph(seed_doi, kg_nodes, kg_edges)
                 if nodes_sub is None:
                     st.warning(f"KG에서 노드를 찾을 수 없습니다. (DOI: {seed_doi})")
                 else:
                     # Summary statistics
                     c1, c2, c3 = st.columns(3)
                     c1.metric("Nodes", fmt_num(len(nodes_sub)))
                     c2.metric("Edges", fmt_num(len(edges_sub)))
                     c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))
                     type_counts = nodes_sub["node_type"].value_counts().reset_index()
                     type_counts.columns = ["node_type","count"]
                     st.plotly_chart(
                         px.bar(type_counts, x="node_type", y="count",
                                color="node_type",
                                color_discrete_map=NODE_TYPE_COLORS,
                                title="Node Type Distribution")
                         .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
                         use_container_width=True)
                     st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
                     components.html(pyvis_from_kg(nodes_sub, edges_sub), height=820, scrolling=True)
         except Exception as e:
             # UI boundary: surface any load/render failure in the tab instead
             # of crashing the whole app.
             st.error(str(e))
+# ═══ 5. KG EXPLORER ═════════════════════════════════════════════
# KG Explorer tab: whole-graph statistics, node search with an ego-network
# view, and semantic-evidence charts from the enriched citation events.
with tab_kg_exp:
    st.subheader("KG Explorer")
    st.caption("kg_nodes 전체를 탐색하고 임의 노드의 연결 관계를 시각화합니다.")
    st.info("아래 버튼을 눌러 KG 데이터를 로드하세요 (최초 1회, 이후 캐시됨)")
    if st.button("KG 데이터 로드", key="kg_exp_load"):
        st.session_state["kg_loaded"] = True
    if st.session_state.get("kg_loaded"):
        try:
            # Spinner around the real load (load_kg_data itself shows none).
            with st.spinner("로딩 중..."):
                kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)
            # ── Overall node / edge type distribution
            col_a, col_b = st.columns([1,2])
            with col_a:
                st.subheader("Node Type Counts")
                nt = kg_nodes["node_type"].value_counts().reset_index()
                nt.columns = ["node_type","count"]
                st.dataframe(nt, use_container_width=True, hide_index=True)
                st.subheader("Edge Type Counts")
                et = kg_edges["edge_type"].value_counts().reset_index()
                et.columns = ["edge_type","count"]
                st.dataframe(et, use_container_width=True, hide_index=True)
            with col_b:
                st.subheader("Node Type Distribution")
                nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
                                color_discrete_map=NODE_TYPE_COLORS)
                nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
                st.plotly_chart(nt_fig, use_container_width=True)
            st.markdown("---")
            st.subheader("Node Search & Ego Network")
            exp_col1, exp_col2 = st.columns([1,3])
            with exp_col1:
                type_options = ["(all)"] + sorted(kg_nodes["node_type"].unique().tolist())
                sel_type = st.selectbox("Filter by node type", type_options)
                filtered_nodes = (kg_nodes if sel_type == "(all)"
                                  else kg_nodes[kg_nodes["node_type"]==sel_type])
                search_q = st.text_input("Search node label / DOI")
                if search_q:
                    # regex=False: the query is free text (often a DOI), so
                    # "(", "+", "." etc. must match literally rather than be
                    # parsed as a regular expression (or raise re.error).
                    filtered_nodes = filtered_nodes[
                        filtered_nodes["label"].str.contains(search_q, case=False, na=False, regex=False) |
                        filtered_nodes["doi"].str.contains(search_q, case=False, na=False, regex=False)
                    ]
                # Only the first 100 matches are offered in the select box.
                sample = filtered_nodes.head(100)
                node_options = sample["node_id"].tolist()
                if not node_options:
                    st.warning("검색 결과가 없습니다.")
                else:
                    sel_node_id = st.selectbox(
                        "Select node",
                        node_options,
                        # str(): a missing (NaN) label must not break slicing.
                        format_func=lambda nid: str(
                            sample.loc[sample["node_id"]==nid,"label"].iloc[0])[:60],
                    )
                    sel_node_info = sample[sample["node_id"]==sel_node_id].iloc[0]
                    st.markdown(f"**Type**: {sel_node_info.get('node_type','')}")
                    st.markdown(f"**DOI**: {sel_node_info.get('doi','') or '-'}")
                    st.markdown(f"**Publication**: {sel_node_info.get('publication_name','') or '-'}")
                    st.markdown(f"**Group**: {sel_node_info.get('group','') or '-'}")
                    st.markdown(f"**Cited by**: {fmt_num(sel_node_info.get('citedby_count',''))}")
                    max_e = st.slider("Max edges shown", 20, 150, 60, key="kg_exp_max")
                    if st.button("Show ego network", key="kg_exp_show"):
                        exp_nodes, exp_edges = get_explorer_subgraph(sel_node_id, kg_nodes, kg_edges, max_e)
                        if exp_nodes is None:
                            st.warning("연결된 엣지가 없습니다.")
                            # Drop the previously shown graph so the right-hand
                            # pane does not keep displaying another node's
                            # ego network next to this warning.
                            st.session_state.pop("exp_nodes", None)
                            st.session_state.pop("exp_edges", None)
                        else:
                            st.session_state["exp_nodes"] = exp_nodes
                            st.session_state["exp_edges"] = exp_edges
            with exp_col2:
                if "exp_nodes" in st.session_state:
                    en = st.session_state["exp_nodes"]
                    ee = st.session_state["exp_edges"]
                    st.caption(f"Nodes: {len(en)}  |  Edges: {len(ee)}")
                    st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
                    components.html(pyvis_from_kg(en, ee, height="740px"), height=760, scrolling=True)
                else:
                    st.info("왼쪽에서 노드를 선택하고 'Show ego network'를 클릭하세요.")
            # ── Enriched insights
            st.markdown("---")
            st.subheader("Enriched Citation Insights")
            st.caption("citation_events_enriched: 의미적 증거(semantic evidence) 분석")
            if "has_semantic_evidence" in enriched.columns:
                sem = enriched["has_semantic_evidence"].value_counts().reset_index()
                sem.columns = ["has_semantic_evidence","count"]
                sem["label"] = sem["has_semantic_evidence"].map({True:"With evidence", False:"Without evidence"})
                st.plotly_chart(
                    px.pie(sem, names="label", values="count",
                           title="Semantic Evidence Coverage (all citation events)")
                    .update_layout(legend_title=""),
                    use_container_width=True)
                # Semantic-evidence rate per field
                if "field_folder" in enriched.columns:
                    field_sem = (enriched.groupby("field_folder")["has_semantic_evidence"]
                                 .mean().reset_index()
                                 .rename(columns={"has_semantic_evidence":"sem_ratio","field_folder":"field"})
                                 .sort_values("sem_ratio", ascending=False).head(20))
                    st.plotly_chart(
                        px.bar(field_sem, x="field", y="sem_ratio",
                               title="Semantic Evidence Rate by Field",
                               labels={"sem_ratio":"Evidence Rate","field":"Field"})
                        .update_layout(xaxis_tickangle=-40),
                        use_container_width=True)
            else:
                st.info("has_semantic_evidence 컬럼이 없습니다.")
        except Exception as e:
            # UI boundary: report the failure inside the tab.
            st.error(str(e))
+# ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
 # Geographic Map tab: country choropleth and top affiliation cities for the
 # filtered seed papers, plus the yearly citation trend of the selected paper.
 with tab_geo:
     st.subheader("Geographic Distribution of Seed Papers")
     country_cnt = (seed_filtered.groupby("country", dropna=False).size()
                    .reset_index(name="count").rename(columns={"country":"country_name"}))
     # groupby(dropna=False) keeps a group for missing countries; treat NaN
     # like a blank name so it is not sent to the choropleth as a location
     # (the cities chart below likewise drops rows without a country).
     has_country = country_cnt["country_name"].fillna("").astype(str).str.strip() != ""
     country_cnt = country_cnt[has_country]
     if not country_cnt.empty:
         fig_map = px.choropleth(country_cnt, locations="country_name",
                                 locationmode="country names", color="count",
                                 hover_name="country_name",
                                 color_continuous_scale="Blues",
                                 title="Seed Papers by Country")
         fig_map.update_layout(geo=dict(showframe=False), height=500)
         st.plotly_chart(fig_map, use_container_width=True)
     st.subheader("Top Cities (Affiliation)")
     # Attach city/country to each seed paper through its affiliation name.
     city_cnt = (seed_filtered.merge(
                     aff_geo_df[["affiliation_name","city_name","country_name"]],
                     left_on="affiliation", right_on="affiliation_name", how="left")
                 .groupby(["country_name","city_name"], dropna=False).size()
                 .reset_index(name="count").dropna(subset=["country_name"])
                 .sort_values("count", ascending=False).head(30))
     if not city_cnt.empty:
         st.plotly_chart(
             px.bar(city_cnt, x="city_name", y="count", color="country_name",
                    title="Top 30 Cities")
             .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
             use_container_width=True)
     st.subheader("Citation Trend over Time (selected paper)")
     # Events without a citing year are left out of the trend.
     trend2 = (seed_events.dropna(subset=["citing_year"])
               .assign(citing_year=lambda df: df["citing_year"].astype(int))
               .groupby("citing_year").size().reset_index(name="count"))
     if not trend2.empty:
         st.plotly_chart(
             px.line(trend2, x="citing_year", y="count", markers=True,
                     title="Citations per Year")
             .update_layout(xaxis_title="Year", yaxis_title="Citations"),
             use_container_width=True)
+# ═══ 7. ANALYTICS ═══════════════════════════════════════════════
 # Analytics tab: top authors / journals, field x intent heatmap, influential
 # citation share of the selected paper, and the intent / field reference tables.
 with tab_analytics:
     col_a, col_b = st.columns(2)
     with col_a:
         st.subheader("Top Authors")
         if "author_id" in seed.columns and not seed["author_id"].isna().all():
             top_auth = (seed.explode("author_id")
                         .merge(authors_df, on="author_id", how="left")
                         .groupby("author_name").size()
                         .reset_index(name="paper_count")
                         .sort_values("paper_count", ascending=False))
         else:
             # rename_axis + reset_index(name=...) names both columns
             # explicitly; the column names produced by a bare
             # value_counts().reset_index() differ between pandas versions.
             top_auth = (seed["author"].value_counts()
                         .rename_axis("author_name")
                         .reset_index(name="paper_count"))
         # Remove blank names before truncating so the chart really shows
         # (up to) 20 authors.
         top_auth = top_auth[top_auth["author_name"].str.strip() != ""].head(20)
         st.plotly_chart(
             px.bar(top_auth, x="paper_count", y="author_name", orientation="h",
                    title="Top 20 Authors")
             .update_layout(yaxis=dict(autorange="reversed"),
                            xaxis_title="Seed Papers", yaxis_title=""),
             use_container_width=True)
     with col_b:
         st.subheader("Top Journals")
         top_jnl = (seed.groupby("journal").size()
                    .reset_index(name="count").sort_values("count", ascending=False))
         # Same ordering as for authors: filter blanks first, then keep 20.
         top_jnl = top_jnl[top_jnl["journal"].str.strip() != ""].head(20)
         st.plotly_chart(
             px.bar(top_jnl, x="count", y="journal", orientation="h",
                    title="Top 20 Journals")
             .update_layout(yaxis=dict(autorange="reversed"),
                            xaxis_title="Seed Papers", yaxis_title=""),
             use_container_width=True)
     st.markdown("---")
     col_c, col_d = st.columns(2)
     with col_c:
         st.subheader("Field × Intent Heatmap")
         # Count citation events per (seed-paper field, primary intent).
         fi = (seed[["seed_paper_id","field"]]
               .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
               .groupby(["field","primary_intent"]).size().reset_index(name="count"))
         if not fi.empty:
             pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
             st.plotly_chart(
                 px.imshow(pivot, color_continuous_scale="Blues",
                           title="Citation Intent by Field", aspect="auto")
                 .update_layout(xaxis_title="Intent", yaxis_title="Field"),
                 use_container_width=True)
     with col_d:
         st.subheader("Influential Citations (selected paper)")
         if "is_influential" in seed_events.columns:
             inf = seed_events["is_influential"].value_counts().reset_index()
             inf.columns = ["is_influential","count"]
             inf["label"] = inf["is_influential"].map({True:"Influential", False:"Non-influential"})
             st.plotly_chart(
                 px.pie(inf, names="label", values="count",
                        title="Influential vs Non-influential"),
                 use_container_width=True)
         st.subheader("Intent Reference")
         st.dataframe(intents_df, use_container_width=True, hide_index=True)
     st.markdown("---")
     st.subheader("Field Reference")
     st.dataframe(fields_df, use_container_width=True, hide_index=True)