Spaces:

Daniel0315
/

cithub_website

Sleeping

File size: 39,147 Bytes

00ff4cf
 
 
 
41e4f48
00ff4cf
 
 
 
 
 
 
 
af9c904
00ff4cf
af9c904
00ff4cf
 
af9c904
 
00ff4cf
 
af9c904
 
 
00ff4cf
 
af9c904
 
 
 
 
 
 
 
00ff4cf
 
41e4f48
 
 
 
 
00ff4cf
 
af9c904
 
00ff4cf
 
41e4f48
 
 
 
 
 
00ff4cf
41e4f48
 
00ff4cf
41e4f48
 
 
 
 
 
af9c904
 
 
41e4f48
af9c904
 
 
 
 
41e4f48
af9c904
41e4f48
 
00ff4cf
41e4f48
af9c904
41e4f48
 
 
 
af9c904
 
 
 
41e4f48
af9c904
 
 
 
 
 
 
00ff4cf
41e4f48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00ff4cf
af9c904
00ff4cf
af9c904
00ff4cf
 
 
41e4f48
 
 
 
 
 
 
 
 
 
 
 
00ff4cf
 
 
 
 
af9c904
 
 
 
 
00ff4cf
 
 
41e4f48
 
 
 
 
 
00ff4cf
 
af9c904
 
 
 
 
 
 
 
00ff4cf
41e4f48
 
 
00ff4cf
 
af9c904
 
 
 
 
 
 
 
 
 
 
41e4f48
00ff4cf
 
 
 
af9c904
 
 
00ff4cf
 
 
41e4f48
00ff4cf
 
 
 
 
 
41e4f48
00ff4cf
af9c904
 
00ff4cf
 
41e4f48
00ff4cf
af9c904
 
00ff4cf
af9c904
 
 
 
 
 
 
 
 
00ff4cf
 
 
41e4f48
00ff4cf
af9c904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41e4f48
 
 
 
af9c904
 
41e4f48
00ff4cf
41e4f48
 
 
af9c904
41e4f48
af9c904
41e4f48
00ff4cf
41e4f48
 
 
 
 
 
af9c904
 
 
41e4f48
af9c904
41e4f48
00ff4cf
41e4f48
af9c904
 
 
 
41e4f48
 
00ff4cf
 
41e4f48
00ff4cf
 
af9c904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00ff4cf
af9c904
00ff4cf
af9c904
00ff4cf
 
 
41e4f48
 
00ff4cf
41e4f48
 
00ff4cf
41e4f48
 
 
00ff4cf
 
af9c904
00ff4cf
 
 
af9c904
00ff4cf
 
 
41e4f48
 
 
 
 
00ff4cf
41e4f48
 
00ff4cf
 
 
af9c904
 
 
 
 
 
00ff4cf
 
 
af9c904
 
41e4f48
00ff4cf
41e4f48
 
af9c904
00ff4cf
 
 
af9c904
 
00ff4cf
41e4f48
 
00ff4cf
af9c904
41e4f48
af9c904
 
 
41e4f48
00ff4cf
af9c904
 
00ff4cf
41e4f48
00ff4cf
41e4f48
af9c904
 
 
41e4f48
af9c904
 
 
41e4f48
 
00ff4cf
 
af9c904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00ff4cf
 
41e4f48
 
 
 
 
 
af9c904
 
 
 
 
 
 
 
 
 
41e4f48
 
 
af9c904
41e4f48
 
af9c904
41e4f48
 
00ff4cf
41e4f48
00ff4cf
 
 
41e4f48
 
 
 
 
 
 
 
af9c904
 
00ff4cf
af9c904
00ff4cf
41e4f48
 
00ff4cf
 
 
41e4f48
00ff4cf
af9c904
 
00ff4cf
41e4f48
 
 
00ff4cf
af9c904
 
00ff4cf
41e4f48
af9c904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41e4f48
 
 
af9c904
 
41e4f48
 
 
af9c904
 
 
 
 
41e4f48
 
 
af9c904
 
 
 
 
 
 
41e4f48
af9c904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41e4f48
 
 
 
af9c904
41e4f48
af9c904
 
 
 
 
41e4f48
af9c904
 
 
 
 
 
 
 
 
 
 
41e4f48
af9c904
 
 
 
 
 
 
 
 
 
41e4f48
 
 
 
 
 
af9c904
 
 
 
 
 
 
 
 
 
 
41e4f48
af9c904
41e4f48
af9c904
 
 
 
 
 
 
41e4f48
af9c904
 
41e4f48
af9c904
 
41e4f48

from __future__ import annotations

import os
from html import escape as html_escape
from pathlib import Path
from typing import List

import pandas as pd
import plotly.express as px
import streamlit as st
import streamlit.components.v1 as components
from pyvis.network import Network

# Hugging Face dataset settings, read once from the environment at import
# time. Both default to the empty string when the variable is unset.
HF_REPO_ID = os.getenv("HF_REPO_ID", "")
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Page-level Streamlit configuration.
st.set_page_config(
    page_title="CitationHub",
    page_icon="📚",
    layout="wide",
)

# Citation intents kept by the app; events with any other primary intent are
# filtered out when the data is loaded.
ALLOWED_INTENTS = [
    "background",
    "uses",
    "similarities",
    "motivation",
    "differences",
    "future_work",
    "extends",
]

# Colour per intent (bar charts, context badges, citation-graph edges).
INTENT_COLORS = {
    "background": "#94a3b8",
    "uses": "#22c55e",
    "similarities": "#3b82f6",
    "motivation": "#f59e0b",
    "differences": "#ef4444",
    "future_work": "#8b5cf6",
    "extends": "#06b6d4",
}

# Node colours used by the citation graph and the ontology diagram.
NODE_COLORS = {
    "seed_paper": "#111827",
    "citing_paper": "#dbeafe",
    "citation_event": "#fde68a",
    "journal": "#ede9fe",
    "author": "#fee2e2",
    "affiliation": "#fae8ff",
    "city": "#cffafe",
    "country": "#ffedd5",
    "field": "#e0e7ff",
    "intent": "#dcfce7",
}

# Node colours used when rendering knowledge-graph nodes and their charts.
NODE_TYPE_COLORS = {
    "seed_paper": "#111827",
    "citing_paper": "#3b82f6",
    "citation_event": "#f59e0b",
    "journal": "#8b5cf6",
    "author": "#ef4444",
    "affiliation": "#ec4899",
    "city": "#06b6d4",
    "country": "#f97316",
    "field": "#6366f1",
    "intent": "#22c55e",
}

# Local parquet directory; CITATIONHUB_DATA_DIR overrides the built-in path.
_FALLBACK_DATA_DIR = r"C:\Users\user\OneDrive\바탕 화면\Citehub_huggingface\data"
DEFAULT_DATA_DIR = Path(os.environ.get("CITATIONHUB_DATA_DIR", _FALLBACK_DATA_DIR))


def fmt_num(x):
    """Format *x* as an integer with thousands separators.

    Returns:
        The ``int(x)`` value rendered like ``"1,234,567"``, or ``"-"`` when
        *x* cannot be converted by ``int()`` (``None``, a blank or
        non-numeric string, NaN, infinity, ...).
    """
    try:
        return f"{int(x):,}"
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to the placeholder. A bare ``except``
        # would also swallow SystemExit / KeyboardInterrupt.
        return "-"


def _hf_download(filename: str) -> str:
    """Download ``data/<filename>`` from the HF_REPO_ID dataset repository.

    Returns whatever ``hf_hub_download`` returns for that file. An empty
    HF_TOKEN is passed on as ``None``.
    """
    # Imported inside the function, so huggingface_hub is only needed when
    # this code path actually runs.
    from huggingface_hub import hf_hub_download

    token = HF_TOKEN if HF_TOKEN else None
    remote_path = f"data/{filename}"
    return hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=remote_path,
        token=token,
    )


def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
    """Read one parquet file into a DataFrame.

    When HF_REPO_ID is set the file is fetched through ``_hf_download`` and
    *data_dir* is ignored; otherwise it is read from ``data_dir / filename``.
    """
    source = _hf_download(filename) if HF_REPO_ID else data_dir / filename
    return pd.read_parquet(source)


def inject_fullscreen(html: str) -> str:
    """Insert a fullscreen button and a usage hint before each ``</body>``.

    The button asks the element with id ``mynetwork`` to go fullscreen.
    Input without a ``</body>`` tag is returned unchanged.
    """
    closing_tag = "</body>"
    # Whitespace inside this literal is part of the emitted markup.
    overlay = """
    <button onclick="var el=document.getElementById('mynetwork');
      if(el){if(el.requestFullscreen)el.requestFullscreen();
      else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
      style="position:fixed;bottom:18px;right:18px;z-index:9999;
             padding:8px 18px;background:#1e293b;color:white;border:none;
             border-radius:8px;cursor:pointer;font-size:13px;
             box-shadow:0 2px 8px rgba(0,0,0,0.35);">⛶ Fullscreen</button>
    <div style="position:fixed;bottom:18px;left:18px;z-index:9999;font-size:12px;
                color:#64748b;background:rgba(255,255,255,0.85);
                padding:5px 10px;border-radius:6px;">
      🖱 Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info</div>
    """
    return html.replace(closing_tag, overlay + closing_tag)


# ── Main data load (11 parquet files) ──────────────────────────
@st.cache_data(show_spinner=False)
def load_data(data_dir_str: str):
    """Load the main parquet tables and derive the frames the UI works with.

    Args:
        data_dir_str: Local parquet directory. Not used for reading when
            HF_REPO_ID is set (``_read`` then downloads each file instead).

    Returns:
        A 13-tuple ``(seed, events, citing, filters, overview, authors_df,
        affiliations_df, aff_geo_df, cities_df, countries_df, fields_df,
        intents_df, journals_df)``. ``seed``, ``events`` and ``citing`` are
        rebuilt frames with the column names used by the app; ``filters`` and
        ``overview`` are plain dicts; the remaining items are the parquet
        tables as read.
    """
    # ``None`` tells ``_read`` nothing about a directory; it only uses the
    # directory when HF_REPO_ID is empty.
    d = None if HF_REPO_ID else Path(data_dir_str)

    seed_df    = _read("seed_cited_papers_normalized.parquet", d)
    events_df  = _read("citation_events_normalized.parquet", d)
    citing_df  = _read("citing_papers_normalized.parquet", d)
    authors_df      = _read("authors.parquet", d)
    affiliations_df = _read("affiliations.parquet", d)
    aff_geo_df      = _read("affiliation_geo.parquet", d)
    cities_df       = _read("cities.parquet", d)
    countries_df    = _read("countries.parquet", d)
    fields_df       = _read("fields.parquet", d)
    intents_df      = _read("intents.parquet", d)
    journals_df     = _read("journals.parquet", d)

    # Seed papers: source columns renamed to the app's names; text columns
    # have missing values replaced by "" and citedby_count is coerced to int.
    seed = pd.DataFrame({
        "seed_paper_id":  seed_df["seed_paper_id"],
        "doi":            seed_df.get("doi", pd.Series(dtype=str)).fillna(""),
        "title":          seed_df.get("title", pd.Series(dtype=str)).fillna(""),
        "journal":        seed_df.get("publication_name", pd.Series(dtype=str)).fillna(""),
        "author":         seed_df.get("creator", pd.Series(dtype=str)).fillna(""),
        "affiliation":    seed_df.get("affilname", pd.Series(dtype=str)).fillna(""),
        "city":           seed_df.get("affiliation_city", pd.Series(dtype=str)).fillna(""),
        "country":        seed_df.get("affiliation_country", pd.Series(dtype=str)).fillna(""),
        "field":          seed_df.get("group", pd.Series(dtype=str)).fillna(""),
        "citedby_count":  pd.to_numeric(seed_df.get("citedby_count"), errors="coerce").fillna(0).astype(int),
        "author_id":      seed_df.get("author_id", pd.Series(dtype=object)),
        "affiliation_id": seed_df.get("affiliation_id", pd.Series(dtype=object)),
        "country_id":     seed_df.get("country_id", pd.Series(dtype=object)),
        "field_id":       seed_df.get("field_id", pd.Series(dtype=object)),
        "journal_id":     seed_df.get("journal_id", pd.Series(dtype=object)),
    })
    # Lower-cased copies (``<col>_lc``) for case-insensitive matching.
    for col in ["title","doi","journal","field","country"]:
        seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
    # Most-cited first; ties broken by title.
    seed = seed.sort_values(["citedby_count","title"], ascending=[False,True]).reset_index(drop=True)

    # Citation events: one row per event linking a citing paper to a seed paper.
    events = pd.DataFrame({
        "citation_event_id": events_df["citation_event_id"],
        "seed_paper_id":     events_df["cited_seed_paper_id"],
        "citing_paper_id":   events_df["citing_paper_id"],
        "citing_title":      events_df.get("citing_title", pd.Series(dtype=str)).fillna(""),
        "citing_doi":        events_df.get("citing_doi", pd.Series(dtype=str)).fillna(""),
        "citing_year":       pd.to_numeric(events_df.get("citing_year"), errors="coerce"),
        "citing_venue":      events_df.get("citing_venue", pd.Series(dtype=str)).fillna(""),
        "primary_intent":    events_df.get("primary_intent", pd.Series(dtype=str)).fillna(""),
        "contexts":          events_df.get("contexts"),
        "context_count":     pd.to_numeric(events_df.get("context_count"), errors="coerce").fillna(0).astype(int),
        "intent_count":      pd.to_numeric(events_df.get("intent_count"), errors="coerce").fillna(0).astype(int),
        "is_influential":    events_df.get("is_influential", pd.Series(dtype=bool)).fillna(False),
        "field_id":          events_df.get("field_id", pd.Series(dtype=object)),
    })
    # Keep only events whose primary intent is one of ALLOWED_INTENTS.
    events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)

    # Citing papers: basic bibliographic columns.
    citing = pd.DataFrame({
        "citing_paper_id": citing_df["citing_paper_id"],
        "doi":    citing_df.get("doi",   pd.Series(dtype=str)).fillna(""),
        "title":  citing_df.get("title", pd.Series(dtype=str)).fillna(""),
        "year":   pd.to_numeric(citing_df.get("year"), errors="coerce"),
        "venue":  citing_df.get("venue", pd.Series(dtype=str)).fillna(""),
        "oa_pdf": citing_df.get("oa_pdf",pd.Series(dtype=str)).fillna(""),
    })

    # Choices for the filter widgets; the year range falls back to 2000-2025
    # when no event has a usable citing year.
    filters = {
        "fields":    sorted([x for x in seed["field"].dropna().astype(str).unique() if x]),
        "countries": sorted([x for x in seed["country"].dropna().astype(str).unique() if x]),
        "journals":  sorted([x for x in seed["journal"].dropna().astype(str).unique() if x]),
        "intents":   ALLOWED_INTENTS,
        "year_min":  int(events["citing_year"].dropna().min()) if events["citing_year"].notna().any() else 2000,
        "year_max":  int(events["citing_year"].dropna().max()) if events["citing_year"].notna().any() else 2025,
    }
    # Headline counts; empty strings are treated as missing before counting
    # distinct journals, countries and fields.
    overview = {
        "seed_papers":     int(len(seed)),
        "citation_events": int(len(events)),
        "citing_papers":   int(events["citing_paper_id"].nunique()),
        "authors":         int(len(authors_df)),
        "journals":        int(seed["journal"].replace("", pd.NA).dropna().nunique()),
        "countries":       int(seed["country"].replace("", pd.NA).dropna().nunique()),
        "fields":          int(seed["field"].replace("", pd.NA).dropna().nunique()),
        "intents":         len(ALLOWED_INTENTS),
    }
    return (seed, events, citing, filters, overview,
            authors_df, affiliations_df, aff_geo_df,
            cities_df, countries_df, fields_df, intents_df, journals_df)


# ── KG + enriched data (loaded separately, on demand) ─────────
@st.cache_data(show_spinner=False)
def load_kg_data(data_dir_str: str):
    """Read the knowledge-graph node/edge tables and the enriched events.

    Returns:
        ``(kg_nodes, kg_edges, enriched)`` as DataFrames, read through
        ``_read`` in that order.
    """
    base_dir = Path(data_dir_str) if not HF_REPO_ID else None
    file_names = (
        "kg_nodes.parquet",
        "kg_edges.parquet",
        "citation_events_enriched.parquet",
    )
    kg_nodes, kg_edges, enriched = [_read(name, base_dir) for name in file_names]
    return kg_nodes, kg_edges, enriched


# ── Helpers ────────────────────────────────────────────────────
def filter_seed_papers(seed, q, fields, countries, journals):
    """Return the rows of *seed* that match the search text and facet filters.

    Args:
        seed: DataFrame with ``title_lc``, ``doi_lc``, ``field``, ``country``
            and ``journal`` columns.
        q: Free-text query. It is stripped, lower-cased and matched as a
            literal substring of ``title_lc`` or ``doi_lc``. ``None`` or a
            blank string disables the text filter.
        fields: Accepted field values (case-insensitive); empty disables it.
        countries: Accepted country values (case-insensitive); empty disables it.
        journals: Accepted journal values (case-insensitive); empty disables it.

    Returns:
        The filtered rows as a new DataFrame with a fresh 0..n-1 index.
    """
    df = seed.copy()
    needle = (q or "").strip().lower()
    if needle:
        # regex=False: the query is user text, not a pattern. With the regex
        # default, a DOI such as "10.1016/s0140-6736(20)30183-5" never matches
        # itself and input like "c++" or "(" raises re.error.
        in_title = df["title_lc"].str.contains(needle, regex=False, na=False)
        in_doi = df["doi_lc"].str.contains(needle, regex=False, na=False)
        df = df[in_title | in_doi]

    facets = (("field", fields), ("country", countries), ("journal", journals))
    for column, wanted in facets:
        if wanted:
            accepted = {value.lower() for value in wanted}
            df = df[df[column].str.lower().isin(accepted)]
    return df.reset_index(drop=True)


def event_subset(events, seed_paper_id, year_min, year_max):
    """Select the events of one seed paper whose citing year is in range.

    Both year bounds are inclusive. Rows without a citing year are given a
    sentinel that fails the respective bound check.

    Returns:
        The matching rows as a DataFrame with a fresh 0..n-1 index.
    """
    for_seed = events[events["seed_paper_id"] == seed_paper_id].copy()
    years = for_seed["citing_year"]
    not_too_old = years.fillna(-99999) >= year_min
    not_too_new = years.fillna(99999) <= year_max
    in_range = for_seed[not_too_old & not_too_new]
    return in_range.reset_index(drop=True)


def build_intent_summary(df):
    """Count the events in *df* per primary intent.

    Returns:
        A DataFrame with one row per entry of ALLOWED_INTENTS, in that order,
        with columns ``intent`` and ``count`` (0 for intents not present).
    """
    per_intent = df.groupby("primary_intent").size().to_dict()
    tallies = []
    for intent_name in ALLOWED_INTENTS:
        tallies.append(int(per_intent.get(intent_name, 0)))
    return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": tallies})


def build_context_rows(df, limit=20):
    """Flatten citation contexts into one row per context snippet.

    Events are visited in descending order of ``context_count``,
    ``intent_count`` and ``citing_year``; at most the first two snippets of
    each event are used and collection stops once *limit* rows exist.

    Args:
        df: Events with ``contexts``, ``primary_intent``, ``citing_title``,
            ``citing_doi``, ``citing_year``, ``context_count`` and
            ``intent_count`` columns.
        limit: Maximum number of rows returned.

    Returns:
        DataFrame with columns ``primary_intent``, ``citing_title``,
        ``citing_doi``, ``citing_year`` and ``context`` (no columns at all
        when no snippet was found).
    """
    ordered = df.sort_values(["context_count", "intent_count", "citing_year"],
                             ascending=[False, False, False], na_position="last")
    rows = []
    for _, row in ordered.iterrows():
        ctx = row["contexts"]
        # List-typed parquet columns come back from pandas as numpy arrays,
        # which a plain ``isinstance(ctx, list)`` check silently skipped.
        # Accept any non-string sequence; scalars such as None/NaN yield [].
        if ctx is None or isinstance(ctx, (str, bytes, dict)):
            snippets = []
        else:
            try:
                snippets = list(ctx)
            except TypeError:
                snippets = []
        year = row["citing_year"]
        for snippet in snippets[:2]:
            rows.append({"primary_intent": row["primary_intent"],
                         "citing_title": row["citing_title"],
                         "citing_doi": row["citing_doi"],
                         "citing_year": None if pd.isna(year) else int(year),
                         "context": snippet})
        if len(rows) >= limit:
            break
    return pd.DataFrame(rows[:limit])


def build_citing_table(df, limit=30):
    """Build the table of citing papers, one row per citing paper.

    Events are ordered by ``context_count``, ``intent_count`` and
    ``citing_year`` (all descending); the first event of each
    ``citing_paper_id`` is kept and at most *limit* rows are returned.

    Returns:
        DataFrame with columns ``citing_paper_id``, ``citing_title``,
        ``citing_doi``, ``citing_year``, ``primary_intent`` and
        ``context_count``. An empty input yields an empty frame with the
        same columns.
    """
    columns = ["citing_paper_id", "citing_title", "citing_doi",
               "citing_year", "primary_intent", "context_count"]
    if df.empty:
        # Same schema as the populated branch, so the rendered table does not
        # change shape depending on whether any events exist.
        return pd.DataFrame(columns=columns)
    ranked = df.sort_values(["context_count", "intent_count", "citing_year"],
                            ascending=[False, False, False], na_position="last")
    return ranked[columns].drop_duplicates(subset=["citing_paper_id"]).head(limit)


def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
    """Find other seed papers cited by the papers that cite the selected one.

    Returns:
        Up to *top_n* rows ordered by descending ``co_citation_count``, with
        ``title``, ``field``, ``journal`` and ``citedby_count`` left-joined
        from *seed* on ``seed_paper_id``.
    """
    cites_selected = events["seed_paper_id"] == selected_seed_id
    citing_ids = events[cites_selected]["citing_paper_id"].unique()

    # Events from those same citing papers that point at a different seed.
    shares_citer = events["citing_paper_id"].isin(citing_ids)
    other_seed = events["seed_paper_id"] != selected_seed_id
    co_events = events[shares_citer & other_seed]

    counts = co_events.groupby("seed_paper_id").size().reset_index(name="co_citation_count")
    top = counts.sort_values("co_citation_count", ascending=False).head(top_n)

    seed_info = seed[["seed_paper_id", "title", "field", "journal", "citedby_count"]]
    return top.merge(seed_info, on="seed_paper_id", how="left")


def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
    """Return the 1-hop knowledge-graph neighbourhood of a seed paper.

    The seed's node id is ``"seed:<seed_doi>"``. Only the first *max_edges*
    edges touching that node are kept.

    Returns:
        ``(nodes, edges)`` DataFrames, or ``(None, None)`` when no edge
        touches the node.
    """
    anchor = f"seed:{seed_doi}"
    touches_anchor = (kg_edges["source"] == anchor) | (kg_edges["target"] == anchor)
    edges = kg_edges[touches_anchor].head(max_edges)
    if edges.empty:
        return None, None

    endpoint_ids = set(edges["source"].tolist())
    endpoint_ids.update(edges["target"].tolist())
    nodes = kg_nodes[kg_nodes["node_id"].isin(endpoint_ids)]
    return nodes, edges


def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
    """KG Explorer: return the edges touching an arbitrary node and their nodes.

    Only the first *max_edges* edges with *search_node_id* as source or
    target are kept.

    Returns:
        ``(nodes, edges)`` DataFrames, or ``(None, None)`` when the node has
        no edges.
    """
    as_source = kg_edges["source"] == search_node_id
    as_target = kg_edges["target"] == search_node_id
    edges = kg_edges[as_source | as_target].head(max_edges)
    if edges.empty:
        return None, None

    endpoint_ids = {*edges["source"].tolist(), *edges["target"].tolist()}
    nodes = kg_nodes[kg_nodes["node_id"].isin(endpoint_ids)]
    return nodes, edges


# ── pyvis builders ─────────────────────────────────────────────
def pyvis_citation_graph(seed_row, events_df):
    """Render the citing-paper → seed-paper graph as an HTML page.

    The seed paper is the central node. Up to 40 events (highest
    ``context_count`` / ``intent_count`` first) each add a citing-paper node
    and an edge labelled and coloured by the event's primary intent. The
    edge tooltip shows the intent, the citing year and the first context
    snippet, if any.

    Args:
        seed_row: Row with ``seed_paper_id`` and ``title``.
        events_df: Events of that seed paper.

    Returns:
        The pyvis HTML with the fullscreen overlay injected.
    """
    net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    sid = seed_row["seed_paper_id"]
    net.add_node(sid, label=seed_row["title"][:60], color="#111827", size=34, shape="dot",
                 font={"color":"white"})
    top_events = events_df.sort_values(["context_count","intent_count"],
                                       ascending=False).head(40)
    for _, row in top_events.iterrows():
        cid = row["citing_paper_id"]
        net.add_node(cid, label=(row["citing_title"] or row["citing_doi"] or cid)[:60],
                     color=NODE_COLORS["citing_paper"], size=18, shape="dot")

        # List-typed parquet columns arrive as numpy arrays, which the former
        # ``isinstance(..., list)`` test rejected, leaving every tooltip
        # without its context. Accept any non-string sequence instead;
        # scalars such as None/NaN yield no snippet.
        raw_ctx = row["contexts"]
        if raw_ctx is None or isinstance(raw_ctx, (str, bytes, dict)):
            snippets = []
        else:
            try:
                snippets = list(raw_ctx)
            except TypeError:
                snippets = []
        ctx = snippets[0] if snippets else ""

        yr  = "" if pd.isna(row["citing_year"]) else int(row["citing_year"])
        net.add_edge(cid, sid, label=row["primary_intent"],
                     color=INTENT_COLORS.get(row["primary_intent"],"#94a3b8"),
                     title=f"Intent: {row['primary_intent']}<br>Year: {yr}<br>{ctx}")
    net.barnes_hut()
    return inject_fullscreen(net.generate_html())


def pyvis_ontology():
    """Render the fixed CitationHub ontology diagram as an HTML page.

    Returns:
        The pyvis HTML with the fullscreen overlay injected.
    """
    # (node id, displayed class name, NODE_COLORS key)
    classes = (
        ("seed", "Top5PctCitedPaper", "seed_paper"),
        ("event", "CitationEvent", "citation_event"),
        ("citing", "CitingPaper", "citing_paper"),
        ("intent", "Intent", "intent"),
        ("journal", "Journal", "journal"),
        ("author", "Author", "author"),
        ("affiliation", "Affiliation", "affiliation"),
        ("city", "City", "city"),
        ("country", "Country", "country"),
        ("field", "Field", "field"),
    )
    # (source node id, target node id, edge label)
    relations = (
        ("event", "citing", "hasCitingPaper"),
        ("event", "seed", "hasCitedPaper"),
        ("event", "intent", "hasPrimaryIntent"),
        ("seed", "journal", "publishedInJournal"),
        ("seed", "author", "hasAuthor"),
        ("seed", "affiliation", "hasAffiliation"),
        ("seed", "city", "locatedInCity"),
        ("seed", "country", "locatedInCountry"),
        ("seed", "field", "belongsToField"),
    )

    net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    for node_id, class_name, color_key in classes:
        net.add_node(node_id, label=class_name, color=NODE_COLORS[color_key], size=24)
    for src, dst, predicate in relations:
        net.add_edge(src, dst, label=predicate)
    net.barnes_hut()
    return inject_fullscreen(net.generate_html())


def pyvis_from_kg(nodes_df, edges_df, height="780px"):
    """Build a pyvis graph page from kg_nodes / kg_edges style DataFrames.

    Args:
        nodes_df: Rows with ``node_id`` plus optional ``node_type``,
            ``label``, ``doi`` and ``publication_name``.
        edges_df: Rows with ``source``, ``target`` and optional ``edge_type``.
        height: CSS height handed to pyvis.

    Returns:
        The pyvis HTML with the fullscreen overlay injected.
    """
    net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)

    for _, node in nodes_df.iterrows():
        ntype = node.get("node_type", "")
        # Seed papers are drawn larger, with white label text.
        if ntype == "seed_paper":
            node_size, node_font = 30, {"color": "white"}
        else:
            node_size, node_font = 16, {}
        hover = f"Type: {ntype}<br>DOI: {node.get('doi','')}<br>Pub: {node.get('publication_name','')}"
        net.add_node(
            str(node["node_id"]),
            label=str(node.get("label", ""))[:55],
            color=NODE_TYPE_COLORS.get(ntype, "#94a3b8"),
            size=node_size,
            shape="dot",
            title=hover,
            font=node_font,
        )

    for _, edge in edges_df.iterrows():
        net.add_edge(
            str(edge["source"]),
            str(edge["target"]),
            label=edge.get("edge_type", ""),
            color="#94a3b8",
        )

    net.barnes_hut()
    return inject_fullscreen(net.generate_html())


# ═══════════════════════════════════════════════════════════════
#  Main UI
# ═══════════════════════════════════════════════════════════════
# Page heading shown above the tabs.
st.title("CitationHub")
st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")

# ── Sidebar ────────────────────────────────────────────────────
# Data source, search and filter widgets, overview metrics and the seed-paper
# picker. Defines data_dir_val, the loaded tables, year_min/year_max,
# seed_filtered and selected_seed_id for the rest of the script.
with st.sidebar:
    st.subheader("Data source")
    if HF_REPO_ID:
        data_dir_val = "hf"
        st.caption(f"Hugging Face: {HF_REPO_ID}")
    else:
        data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))

    # Any load failure is shown in the sidebar and ends this script run.
    try:
        (seed, events, citing, filters, overview,
         authors_df, affiliations_df, aff_geo_df,
         cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
        st.success("Data loaded")
    except Exception as e:
        st.error(str(e))
        st.stop()

    st.subheader("Search seed papers")
    q_input = st.text_input("Title or DOI")
    # The query only takes effect when "Search" is pressed; the submitted
    # value is kept in session state across reruns.
    if "q_submit" not in st.session_state:
        st.session_state["q_submit"] = ""
    if st.button("Search", use_container_width=True):
        st.session_state["q_submit"] = q_input

    fields_sel    = st.multiselect("Field", filters["fields"])
    countries_sel = st.multiselect("Country", filters["countries"])
    # Only the first 200 journals (alphabetical) are offered as choices.
    journals_sel  = st.multiselect("Journal", filters["journals"][:200])
    y_min = max(2000, filters["year_min"])
    year_min, year_max = st.slider("Citing year", y_min, filters["year_max"], (y_min, filters["year_max"]))

    seed_filtered = filter_seed_papers(seed, st.session_state["q_submit"],
                                       fields_sel, countries_sel, journals_sel)

    st.subheader("Overview counts")
    c1, c2 = st.columns(2)
    c1.metric("Seed papers",     fmt_num(overview["seed_papers"]))
    c2.metric("Citation events", fmt_num(overview["citation_events"]))
    c1.metric("Citing papers",   fmt_num(overview["citing_papers"]))
    c2.metric("Authors",         fmt_num(overview["authors"]))
    c1.metric("Countries",       fmt_num(overview["countries"]))
    c2.metric("Fields",          fmt_num(overview["fields"]))

    options = seed_filtered["seed_paper_id"].tolist()
    if not options:
        st.warning("No seed papers match the current search.")
        st.stop()

    # id -> title, built once. The selectbox formats every option on each
    # rerun, so a DataFrame mask scan per option made rendering quadratic in
    # the number of seed papers. drop_duplicates keeps the first title per
    # id, matching the earlier ``.iloc[0]`` lookup.
    seed_titles = (seed_filtered.drop_duplicates(subset=["seed_paper_id"])
                   .set_index("seed_paper_id")["title"].to_dict())

    # Keep the previous selection when it is still among the options.
    current     = st.session_state.get("selected_seed_id", options[0])
    default_idx = options.index(current) if current in options else 0
    selected_seed_id = st.selectbox(
        "Seed paper", options, index=default_idx,
        format_func=lambda sid: seed_titles.get(sid, str(sid)),
    )
    st.session_state["selected_seed_id"] = selected_seed_id

# Row of the seed paper picked in the sidebar, and the tables derived from
# its citation events within the selected year range.
selected_seed  = seed_filtered[seed_filtered["seed_paper_id"]==selected_seed_id].iloc[0]
seed_events    = event_subset(events, selected_seed_id, year_min, year_max)
intent_summary = build_intent_summary(seed_events)
contexts_df    = build_context_rows(seed_events)
citing_table   = build_citing_table(seed_events)

# ── Tabs ───────────────────────────────────────────────────────
# One container per section, in display order.
(tab_overview, tab_cnet, tab_ontology, tab_kg,
 tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
    "Overview","Citation Network","Ontology",
    "Knowledge Graph","KG Explorer","Geographic Map","Analytics",
])


# ═══ 1. OVERVIEW ═══════════════════════════════════════════════
# Left column: details of the selected seed paper, its citing papers and
# co-cited seed papers. Right column: intent, trend and field charts.
# Below both: one card per citation context.
with tab_overview:
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Seed paper detail")
        dc1, dc2 = st.columns(2)
        dc1.metric("Cited by",        fmt_num(selected_seed["citedby_count"]))
        dc2.metric("Citation events", fmt_num(len(seed_events)))
        for label, key in [
            ("Title","title"),("DOI","doi"),("Journal","journal"),
            ("Author","author"),("Affiliation","affiliation"),
            ("City","city"),("Country","country"),("Field","field"),
        ]:
            st.markdown(f"**{label}**  \n{selected_seed[key] or '-'}")

        st.subheader("Related citing papers")
        st.dataframe(citing_table.rename(columns={
            "citing_title":"Title","citing_year":"Year",
            "primary_intent":"Intent","context_count":"Contexts"}),
            use_container_width=True, hide_index=True)

        st.subheader("Co-cited seed papers")
        st.caption("같은 citing paper에 의해 함께 인용된 다른 top 5% 논문들")
        cocited = get_cocited_papers(selected_seed_id, events, seed)
        if cocited.empty:
            st.info("Co-cited papers not found.")
        else:
            st.dataframe(cocited.rename(columns={
                "co_citation_count":"Co-citations","title":"Title",
                "field":"Field","citedby_count":"Cited by"}),
                use_container_width=True, hide_index=True)

    with col2:
        st.subheader("Intent distribution (selected paper)")
        fig = px.bar(intent_summary, x="intent", y="count", color="intent",
                     color_discrete_map=INTENT_COLORS)
        fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
        st.plotly_chart(fig, use_container_width=True)

        st.subheader("Citation trend (selected paper)")
        # Events per citing year; events without a year are left out.
        trend = (seed_events.dropna(subset=["citing_year"])
                 .assign(citing_year=lambda df: df["citing_year"].astype(int))
                 .groupby("citing_year").size().reset_index(name="count"))
        if not trend.empty:
            st.plotly_chart(
                px.line(trend, x="citing_year", y="count", markers=True)
                .update_layout(xaxis_title="Year", yaxis_title="Citations"),
                use_container_width=True)

        st.subheader("Field distribution")
        # Top 20 fields among the currently filtered seed papers.
        fd = (seed_filtered.groupby("field", dropna=False).size()
              .reset_index(name="count").sort_values("count", ascending=False).head(20))
        fd["field"] = fd["field"].replace("","Unknown")
        st.plotly_chart(
            px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
            use_container_width=True)

    st.subheader("Citation contexts")
    if contexts_df.empty:
        st.info("No contexts available.")
    else:
        for _, row in contexts_df.iterrows():
            # The card is rendered with unsafe_allow_html, so every value that
            # comes from the dataset is escaped first; raw "<" or "&" in a
            # title or context would otherwise be parsed as markup.
            badge_color = INTENT_COLORS.get(row['primary_intent'],'#64748b')
            intent_txt  = html_escape(str(row['primary_intent']))
            source_txt  = html_escape(str(row['citing_title'] or row['citing_doi']))
            context_txt = html_escape(str(row['context']))
            # The year column turns into floats when any year is missing:
            # show whole numbers and "-" rather than "2020.0" / "nan".
            year_val = row['citing_year']
            year_txt = '-' if pd.isna(year_val) or not year_val else int(year_val)
            st.markdown(
                f"""<div style="border:1px solid #e2e8f0;border-radius:14px;padding:12px;
                margin-bottom:10px;background:#f8fafc;">
                <div style="display:inline-block;background:{badge_color};
                color:white;border-radius:999px;padding:4px 8px;font-size:12px;margin-bottom:6px;">
                {intent_txt}</div>
                <div style="font-size:12px;color:#64748b;margin-bottom:6px;">
                {year_txt} · {source_txt}</div>
                <div>{context_txt}</div></div>""",
                unsafe_allow_html=True)


# ═══ 2. CITATION NETWORK ════════════════════════════════════════
# Interactive graph of the selected seed paper and its citing papers.
with tab_cnet:
    st.subheader("Citing ↔ Cited Citation Network")
    st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
    if seed_events.empty:
        st.info("No citation network data for this seed paper.")
    else:
        # The component is given more height than the 780px graph canvas.
        components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)


# ═══ 3. ONTOLOGY ════════════════════════════════════════════════
# Fixed schema diagram; it does not depend on the selected seed paper.
with tab_ontology:
    st.subheader("CitationHub Ontology")
    st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
    components.html(pyvis_ontology(), height=820, scrolling=True)


# ═══ 4. KNOWLEDGE GRAPH (actual KG data) ═════════════════════════
# 1-hop subgraph of the selected seed paper, taken from kg_nodes / kg_edges.
# The KG tables are only read after the user presses the load button.
with tab_kg:
    st.subheader("Knowledge Graph — Selected Seed Paper")
    st.caption("kg_nodes + kg_edges 전체 데이터에서 선택된 seed paper의 1-hop 서브그래프")
    st.info("아래 버튼을 눌러 KG 데이터를 로드하세요 (최초 1회, 이후 캐시됨)")

    # The button only records the request; the flag is shared with the
    # KG Explorer tab through session state.
    if st.button("KG 데이터 로드", key="kg_load"):
        st.session_state["kg_loaded"] = True

    if st.session_state.get("kg_loaded"):
        try:
            # Show the spinner around the actual read. It used to wrap only
            # the flag assignment above, so the slow first load ran with no
            # visible progress indicator.
            with st.spinner("kg_nodes / kg_edges / enriched 로딩 중 ..."):
                kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)
            seed_doi = selected_seed["doi"]
            if not seed_doi:
                st.warning("선택된 seed paper의 DOI가 없어 KG 조회가 불가합니다.")
            else:
                nodes_sub, edges_sub = get_kg_subgraph(seed_doi, kg_nodes, kg_edges)
                if nodes_sub is None:
                    st.warning(f"KG에서 노드를 찾을 수 없습니다. (DOI: {seed_doi})")
                else:
                    # Summary statistics of the subgraph.
                    c1, c2, c3 = st.columns(3)
                    c1.metric("Nodes", fmt_num(len(nodes_sub)))
                    c2.metric("Edges", fmt_num(len(edges_sub)))
                    c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))

                    type_counts = nodes_sub["node_type"].value_counts().reset_index()
                    type_counts.columns = ["node_type","count"]
                    st.plotly_chart(
                        px.bar(type_counts, x="node_type", y="count",
                               color="node_type",
                               color_discrete_map=NODE_TYPE_COLORS,
                               title="Node Type Distribution")
                        .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
                        use_container_width=True)

                    st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
                    components.html(pyvis_from_kg(nodes_sub, edges_sub), height=820, scrolling=True)
        except Exception as e:
            # Tab-level boundary: report the problem instead of crashing the page.
            st.error(str(e))


# ═══ 5. KG EXPLORER ═════════════════════════════════════════════
with tab_kg_exp:
    st.subheader("KG Explorer")
    st.caption("kg_nodes 전체를 탐색하고 임의 노드의 연결 관계를 시각화합니다.")
    st.info("아래 버튼을 눌러 KG 데이터를 로드하세요 (최초 1회, 이후 캐시됨)")

    if st.button("KG 데이터 로드", key="kg_exp_load"):
        with st.spinner("로딩 중..."):
            st.session_state["kg_loaded"] = True

    if st.session_state.get("kg_loaded"):
        try:
            kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)

            # ── 전체 노드 타입 분포
            col_a, col_b = st.columns([1,2])
            with col_a:
                st.subheader("Node Type Counts")
                nt = kg_nodes["node_type"].value_counts().reset_index()
                nt.columns = ["node_type","count"]
                st.dataframe(nt, use_container_width=True, hide_index=True)

                st.subheader("Edge Type Counts")
                et = kg_edges["edge_type"].value_counts().reset_index()
                et.columns = ["edge_type","count"]
                st.dataframe(et, use_container_width=True, hide_index=True)

            with col_b:
                st.subheader("Node Type Distribution")
                nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
                                color_discrete_map=NODE_TYPE_COLORS)
                nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
                st.plotly_chart(nt_fig, use_container_width=True)

            st.markdown("---")
            st.subheader("Node Search & Ego Network")
            exp_col1, exp_col2 = st.columns([1,3])
            with exp_col1:
                type_options = ["(all)"] + sorted(kg_nodes["node_type"].unique().tolist())
                sel_type = st.selectbox("Filter by node type", type_options)
                filtered_nodes = (kg_nodes if sel_type == "(all)"
                                  else kg_nodes[kg_nodes["node_type"]==sel_type])
                search_q = st.text_input("Search node label / DOI")
                if search_q:
                    filtered_nodes = filtered_nodes[
                        filtered_nodes["label"].str.contains(search_q, case=False, na=False) |
                        filtered_nodes["doi"].str.contains(search_q, case=False, na=False)
                    ]

                sample = filtered_nodes.head(100)
                node_options = sample["node_id"].tolist()
                if not node_options:
                    st.warning("검색 결과가 없습니다.")
                else:
                    sel_node_id = st.selectbox(
                        "Select node",
                        node_options,
                        format_func=lambda nid: sample.loc[sample["node_id"]==nid,"label"].iloc[0][:60],
                    )
                    sel_node_info = sample[sample["node_id"]==sel_node_id].iloc[0]
                    st.markdown(f"**Type**: {sel_node_info.get('node_type','')}")
                    st.markdown(f"**DOI**: {sel_node_info.get('doi','') or '-'}")
                    st.markdown(f"**Publication**: {sel_node_info.get('publication_name','') or '-'}")
                    st.markdown(f"**Group**: {sel_node_info.get('group','') or '-'}")
                    st.markdown(f"**Cited by**: {fmt_num(sel_node_info.get('citedby_count',''))}")

                    max_e = st.slider("Max edges shown", 20, 150, 60, key="kg_exp_max")

                    if st.button("Show ego network", key="kg_exp_show"):
                        exp_nodes, exp_edges = get_explorer_subgraph(sel_node_id, kg_nodes, kg_edges, max_e)
                        if exp_nodes is None:
                            st.warning("연결된 엣지가 없습니다.")
                        else:
                            st.session_state["exp_nodes"] = exp_nodes
                            st.session_state["exp_edges"] = exp_edges

            with exp_col2:
                if "exp_nodes" in st.session_state:
                    en = st.session_state["exp_nodes"]
                    ee = st.session_state["exp_edges"]
                    st.caption(f"Nodes: {len(en)}  |  Edges: {len(ee)}")
                    st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
                    components.html(pyvis_from_kg(en, ee, height="740px"), height=760, scrolling=True)
                else:
                    st.info("왼쪽에서 노드를 선택하고 'Show ego network'를 클릭하세요.")

            # ── Enriched insights
            st.markdown("---")
            st.subheader("Enriched Citation Insights")
            st.caption("citation_events_enriched: 의미적 증거(semantic evidence) 분석")
            if "has_semantic_evidence" in enriched.columns:
                sem = enriched["has_semantic_evidence"].value_counts().reset_index()
                sem.columns = ["has_semantic_evidence","count"]
                sem["label"] = sem["has_semantic_evidence"].map({True:"With evidence", False:"Without evidence"})
                st.plotly_chart(
                    px.pie(sem, names="label", values="count",
                           title="Semantic Evidence Coverage (all citation events)")
                    .update_layout(legend_title=""),
                    use_container_width=True)

                # Semantic-evidence rate per field
                if "field_folder" in enriched.columns:
                    field_sem = (enriched.groupby("field_folder")["has_semantic_evidence"]
                                 .mean().reset_index()
                                 .rename(columns={"has_semantic_evidence":"sem_ratio","field_folder":"field"})
                                 .sort_values("sem_ratio", ascending=False).head(20))
                    st.plotly_chart(
                        px.bar(field_sem, x="field", y="sem_ratio",
                               title="Semantic Evidence Rate by Field",
                               labels={"sem_ratio":"Evidence Rate","field":"Field"})
                        .update_layout(xaxis_tickangle=-40),
                        use_container_width=True)
            else:
                st.info("has_semantic_evidence 컬럼이 없습니다.")

        except Exception as e:
            st.error(str(e))


# ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
with tab_geo:
    # Geographic tab: a country choropleth, a top-cities bar chart and a
    # citations-per-year line chart, rendered top to bottom.
    st.subheader("Geographic Distribution of Seed Papers")

    # Seed papers per country.
    country_cnt = (seed_filtered.groupby("country", dropna=False).size()
                   .reset_index(name="count").rename(columns={"country":"country_name"}))
    # Drop both missing and blank labels. A plain `str.strip() != ""` test
    # keeps NaN rows (NaN compares unequal to "") and raises when the column
    # is not string-typed, so the mask is built explicitly.
    country_names = country_cnt["country_name"]
    has_country = country_names.notna() & (country_names.astype(str).str.strip() != "")
    country_cnt = country_cnt[has_country]

    if not country_cnt.empty:
        fig_map = px.choropleth(country_cnt, locations="country_name",
                                locationmode="country names", color="count",
                                hover_name="country_name",
                                color_continuous_scale="Blues",
                                title="Seed Papers by Country")
        fig_map.update_layout(geo=dict(showframe=False), height=500)
        st.plotly_chart(fig_map, use_container_width=True)

    st.subheader("Top Cities (Affiliation)")
    # Keep one geo row per affiliation: repeated affiliation_name rows in the
    # lookup would fan out the left merge and count one seed paper many times.
    aff_lookup = (aff_geo_df[["affiliation_name","city_name","country_name"]]
                  .drop_duplicates(subset="affiliation_name"))
    city_cnt = (seed_filtered.merge(
                    aff_lookup,
                    left_on="affiliation", right_on="affiliation_name", how="left")
                .groupby(["country_name","city_name"], dropna=False).size()
                .reset_index(name="count")
                # Papers whose affiliation has no geo match carry no country.
                .dropna(subset=["country_name"])
                .sort_values("count", ascending=False).head(30))
    if not city_cnt.empty:
        st.plotly_chart(
            px.bar(city_cnt, x="city_name", y="count", color="country_name",
                   title="Top 30 Cities")
            .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
            use_container_width=True)

    st.subheader("Citation Trend over Time (selected paper)")
    # Events without a citing year are dropped; the year is cast to int so the
    # x axis shows whole years.
    trend2 = (seed_events.dropna(subset=["citing_year"])
              .assign(citing_year=lambda df: df["citing_year"].astype(int))
              .groupby("citing_year").size().reset_index(name="count"))
    if not trend2.empty:
        st.plotly_chart(
            px.line(trend2, x="citing_year", y="count", markers=True,
                    title="Citations per Year")
            .update_layout(xaxis_title="Year", yaxis_title="Citations"),
            use_container_width=True)


# ═══ 7. ANALYTICS ═══════════════════════════════════════════════
with tab_analytics:
    # Analytics tab: top authors / journals, a field-by-intent heatmap, the
    # influential-citation split, and the intent / field reference tables.
    col_a, col_b = st.columns(2)

    with col_a:
        st.subheader("Top Authors")
        if "author_id" in seed.columns and not seed["author_id"].isna().all():
            # One row per (paper, author id), resolved to a name via authors_df.
            author_cnt = (seed.explode("author_id")
                          .merge(authors_df, on="author_id", how="left")
                          .groupby("author_name").size()
                          .reset_index(name="paper_count")
                          .sort_values("paper_count", ascending=False))
        else:
            # Fallback on the raw "author" column. rename_axis/reset_index
            # names both columns explicitly, so the result does not depend on
            # how the installed pandas labels value_counts() output.
            author_cnt = (seed["author"].value_counts()
                          .rename_axis("author_name")
                          .reset_index(name="paper_count"))
        # Remove blank names BEFORE truncating so the chart keeps 20 entries.
        named_author = author_cnt["author_name"].astype(str).str.strip() != ""
        top_auth = author_cnt[named_author].head(20)
        st.plotly_chart(
            px.bar(top_auth, x="paper_count", y="author_name", orientation="h",
                   title="Top 20 Authors")
            .update_layout(yaxis=dict(autorange="reversed"),
                           xaxis_title="Seed Papers", yaxis_title=""),
            use_container_width=True)

    with col_b:
        st.subheader("Top Journals")
        journal_cnt = (seed.groupby("journal").size()
                       .reset_index(name="count").sort_values("count", ascending=False))
        # Same ordering as for authors: filter blanks first, then take the top 20.
        named_journal = journal_cnt["journal"].astype(str).str.strip() != ""
        top_jnl = journal_cnt[named_journal].head(20)
        st.plotly_chart(
            px.bar(top_jnl, x="count", y="journal", orientation="h",
                   title="Top 20 Journals")
            .update_layout(yaxis=dict(autorange="reversed"),
                           xaxis_title="Seed Papers", yaxis_title=""),
            use_container_width=True)

    st.markdown("---")
    col_c, col_d = st.columns(2)

    with col_c:
        st.subheader("Field × Intent Heatmap")
        # Count citation events per (field, primary intent) pair.
        fi = (seed[["seed_paper_id","field"]]
              .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
              .groupby(["field","primary_intent"]).size().reset_index(name="count"))
        if not fi.empty:
            # Pairs that never occur are shown as 0 rather than left empty.
            pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
            st.plotly_chart(
                px.imshow(pivot, color_continuous_scale="Blues",
                          title="Citation Intent by Field", aspect="auto")
                .update_layout(xaxis_title="Intent", yaxis_title="Field"),
                use_container_width=True)

    with col_d:
        st.subheader("Influential Citations (selected paper)")
        if "is_influential" in seed_events.columns:
            inf = seed_events["is_influential"].value_counts().reset_index()
            # Assign column names positionally; independent of pandas version.
            inf.columns = ["is_influential","count"]
            inf["label"] = inf["is_influential"].map({True:"Influential", False:"Non-influential"})
            st.plotly_chart(
                px.pie(inf, names="label", values="count",
                       title="Influential vs Non-influential"),
                use_container_width=True)

        st.subheader("Intent Reference")
        st.dataframe(intents_df, use_container_width=True, hide_index=True)

    st.markdown("---")
    st.subheader("Field Reference")
    st.dataframe(fields_df, use_container_width=True, hide_index=True)