cithub_website / src /app.py
Daniel0315's picture
Upload app.py
af9c904 verified
raw
history blame
39.1 kB
from __future__ import annotations

import html
import os
from pathlib import Path
from typing import List

import pandas as pd
import plotly.express as px
import streamlit as st
import streamlit.components.v1 as components
from pyvis.network import Network
# Hugging Face dataset repo id. When non-empty, parquet files are downloaded
# from it; when empty, they are read from a local directory.
HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
# Optional access token passed to hf_hub_download (an empty value is sent as None).
HF_TOKEN = os.environ.get("HF_TOKEN", "")
st.set_page_config(page_title="CitationHub", page_icon="πŸ“š", layout="wide")
# Citation intents the app works with; load_data drops events whose
# primary_intent is not in this list.
ALLOWED_INTENTS = [
"background","uses","similarities","motivation",
"differences","future_work","extends",
]
# Colour per intent, used for intent charts, badges and citation-graph edges.
INTENT_COLORS = {
"background":"#94a3b8","uses":"#22c55e","similarities":"#3b82f6",
"motivation":"#f59e0b","differences":"#ef4444",
"future_work":"#8b5cf6","extends":"#06b6d4",
}
# Node colours used by pyvis_citation_graph and pyvis_ontology.
NODE_COLORS = {
"seed_paper":"#111827","citing_paper":"#dbeafe","citation_event":"#fde68a",
"journal":"#ede9fe","author":"#fee2e2","affiliation":"#fae8ff",
"city":"#cffafe","country":"#ffedd5","field":"#e0e7ff","intent":"#dcfce7",
}
# Colour per KG node type, used by pyvis_from_kg and the node-type bar charts.
NODE_TYPE_COLORS = {
"seed_paper":"#111827","citing_paper":"#3b82f6","citation_event":"#f59e0b",
"journal":"#8b5cf6","author":"#ef4444","affiliation":"#ec4899",
"city":"#06b6d4","country":"#f97316","field":"#6366f1","intent":"#22c55e",
}
# Default local parquet directory; overridable through CITATIONHUB_DATA_DIR.
# NOTE(review): the fallback is a machine-specific Windows path.
DEFAULT_DATA_DIR = Path(os.environ.get(
"CITATIONHUB_DATA_DIR",
r"C:\Users\user\OneDrive\바탕 ν™”λ©΄\Citehub_huggingface\data",
))
def fmt_num(x):
    """Format *x* as an integer with thousands separators.

    Returns ``"-"`` when *x* cannot be converted with ``int()`` (for example
    ``None``, NaN, infinity or non-numeric text).
    """
    try:
        return f"{int(x):,}"
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: NaN or non-numeric text;
        # OverflowError: +/- infinity. Anything else (KeyboardInterrupt,
        # SystemExit, ...) is no longer swallowed.
        return "-"
def _hf_download(filename: str) -> str:
    """Fetch ``data/<filename>`` from the configured Hugging Face dataset repo.

    Returns whatever ``hf_hub_download`` returns for that file.
    """
    # Imported on first use rather than at module import time.
    from huggingface_hub import hf_hub_download

    access_token = HF_TOKEN or None  # an empty token is passed as None
    remote_name = f"data/{filename}"
    return hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=remote_name,
        token=access_token,
    )
def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
    """Load one parquet file as a DataFrame.

    When ``HF_REPO_ID`` is set the file is fetched through ``_hf_download``
    and *data_dir* is ignored; otherwise it is read from
    ``data_dir / filename``.

    Raises:
        ValueError: if ``HF_REPO_ID`` is empty and *data_dir* is ``None``.
    """
    if HF_REPO_ID:
        return pd.read_parquet(_hf_download(filename))
    if data_dir is None:
        # Without this guard the code evaluated ``None / filename`` and
        # failed with an unhelpful TypeError.
        raise ValueError(
            f"Cannot read {filename!r}: HF_REPO_ID is not set and no data_dir was given."
        )
    return pd.read_parquet(Path(data_dir) / filename)
def inject_fullscreen(html: str) -> str:
    """Add a fullscreen button and a usage hint to a rendered graph page.

    The overlay markup is inserted once, immediately before the last
    ``</body>`` in *html*. Earlier occurrences of that text (for example
    inside embedded node data) are left untouched. If *html* contains no
    ``</body>`` the overlay is appended at the end.
    """
    btn = """
<button onclick="var el=document.getElementById('mynetwork');
if(el){if(el.requestFullscreen)el.requestFullscreen();
else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
style="position:fixed;bottom:18px;right:18px;z-index:9999;
padding:8px 18px;background:#1e293b;color:white;border:none;
border-radius:8px;cursor:pointer;font-size:13px;
box-shadow:0 2px 8px rgba(0,0,0,0.35);">β›Ά Fullscreen</button>
<div style="position:fixed;bottom:18px;left:18px;z-index:9999;font-size:12px;
color:#64748b;background:rgba(255,255,255,0.85);
padding:5px 10px;border-radius:6px;">
πŸ–± Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info</div>
"""
    # rpartition splits on the LAST closing tag, so the overlay is added once.
    head, closing_tag, tail = html.rpartition("</body>")
    if not closing_tag:
        return html + btn
    return head + btn + closing_tag + tail
# ── Main data load (11 parquet files) ──────────────────────────
def _column_or_default(df, name, default):
    """Return ``df[name]``, or a Series of *default* on ``df.index`` when the column is absent."""
    if name in df.columns:
        return df[name]
    return pd.Series(default, index=df.index, dtype=object)


@st.cache_data(show_spinner=False)
def load_data(data_dir_str: str):
    """Load the eleven core parquet tables and derive the frames the UI uses.

    Args:
        data_dir_str: Local parquet directory. It is not used for reading
            when ``HF_REPO_ID`` is set.

    Returns:
        A 13-tuple ``(seed, events, citing, filters, overview, authors_df,
        affiliations_df, aff_geo_df, cities_df, countries_df, fields_df,
        intents_df, journals_df)``. ``seed``, ``events`` and ``citing`` are
        normalised frames built here; ``filters`` and ``overview`` are dicts
        of sidebar options and headline counts; the rest are the raw tables.
    """
    d = None if HF_REPO_ID else Path(data_dir_str)
    seed_df = _read("seed_cited_papers_normalized.parquet", d)
    events_df = _read("citation_events_normalized.parquet", d)
    citing_df = _read("citing_papers_normalized.parquet", d)
    authors_df = _read("authors.parquet", d)
    affiliations_df = _read("affiliations.parquet", d)
    aff_geo_df = _read("affiliation_geo.parquet", d)
    cities_df = _read("cities.parquet", d)
    countries_df = _read("countries.parquet", d)
    fields_df = _read("fields.parquet", d)
    intents_df = _read("intents.parquet", d)
    journals_df = _read("journals.parquet", d)

    # Optional columns: a missing column becomes a per-row default. An empty
    # fallback Series would align to NaN for every row, and pd.to_numeric on
    # a missing column returns a scalar that has no .fillna().
    def text(df, name):
        return _column_or_default(df, name, "").fillna("")

    def count(df, name):
        return pd.to_numeric(_column_or_default(df, name, 0), errors="coerce").fillna(0).astype(int)

    def number(df, name):
        return pd.to_numeric(_column_or_default(df, name, None), errors="coerce")

    def raw(df, name):
        return _column_or_default(df, name, None)

    seed = pd.DataFrame({
        "seed_paper_id": seed_df["seed_paper_id"],
        "doi": text(seed_df, "doi"),
        "title": text(seed_df, "title"),
        "journal": text(seed_df, "publication_name"),
        "author": text(seed_df, "creator"),
        "affiliation": text(seed_df, "affilname"),
        "city": text(seed_df, "affiliation_city"),
        "country": text(seed_df, "affiliation_country"),
        "field": text(seed_df, "group"),
        "citedby_count": count(seed_df, "citedby_count"),
        "author_id": raw(seed_df, "author_id"),
        "affiliation_id": raw(seed_df, "affiliation_id"),
        "country_id": raw(seed_df, "country_id"),
        "field_id": raw(seed_df, "field_id"),
        "journal_id": raw(seed_df, "journal_id"),
    })
    # Lower-cased copies used for case-insensitive search.
    for col in ["title", "doi", "journal", "field", "country"]:
        seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
    seed = seed.sort_values(["citedby_count", "title"], ascending=[False, True]).reset_index(drop=True)

    events = pd.DataFrame({
        "citation_event_id": events_df["citation_event_id"],
        "seed_paper_id": events_df["cited_seed_paper_id"],
        "citing_paper_id": events_df["citing_paper_id"],
        "citing_title": text(events_df, "citing_title"),
        "citing_doi": text(events_df, "citing_doi"),
        "citing_year": number(events_df, "citing_year"),
        "citing_venue": text(events_df, "citing_venue"),
        "primary_intent": text(events_df, "primary_intent"),
        "contexts": events_df.get("contexts"),
        "context_count": count(events_df, "context_count"),
        "intent_count": count(events_df, "intent_count"),
        "is_influential": _column_or_default(events_df, "is_influential", False).fillna(False),
        "field_id": raw(events_df, "field_id"),
    })
    # Only events with a recognised primary intent are kept.
    events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)

    citing = pd.DataFrame({
        "citing_paper_id": citing_df["citing_paper_id"],
        "doi": text(citing_df, "doi"),
        "title": text(citing_df, "title"),
        "year": number(citing_df, "year"),
        "venue": text(citing_df, "venue"),
        "oa_pdf": text(citing_df, "oa_pdf"),
    })

    has_years = events["citing_year"].notna().any()
    filters = {
        "fields": sorted([x for x in seed["field"].dropna().astype(str).unique() if x]),
        "countries": sorted([x for x in seed["country"].dropna().astype(str).unique() if x]),
        "journals": sorted([x for x in seed["journal"].dropna().astype(str).unique() if x]),
        "intents": ALLOWED_INTENTS,
        # 2000 / 2025 are the fallbacks when no event has a year.
        "year_min": int(events["citing_year"].dropna().min()) if has_years else 2000,
        "year_max": int(events["citing_year"].dropna().max()) if has_years else 2025,
    }
    overview = {
        "seed_papers": int(len(seed)),
        "citation_events": int(len(events)),
        "citing_papers": int(events["citing_paper_id"].nunique()),
        "authors": int(len(authors_df)),
        "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
        "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
        "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
        "intents": len(ALLOWED_INTENTS),
    }
    return (seed, events, citing, filters, overview,
            authors_df, affiliations_df, aff_geo_df,
            cities_df, countries_df, fields_df, intents_df, journals_df)
# ── KG + enriched data (loaded separately, on demand) ──────────
@st.cache_data(show_spinner=False)
def load_kg_data(data_dir_str: str):
    """Read the knowledge-graph node, edge and enriched-event parquet files.

    Returns:
        ``(kg_nodes, kg_edges, enriched)`` as DataFrames.
    """
    # No local directory is passed on when a Hugging Face repo is configured.
    base_dir = Path(data_dir_str) if not HF_REPO_ID else None
    file_names = (
        "kg_nodes.parquet",
        "kg_edges.parquet",
        "citation_events_enriched.parquet",
    )
    kg_nodes, kg_edges, enriched = [_read(name, base_dir) for name in file_names]
    return kg_nodes, kg_edges, enriched
# ── Helpers ────────────────────────────────────────────────────
def filter_seed_papers(seed, q, fields, countries, journals):
    """Filter seed papers by a free-text query and optional facet selections.

    Args:
        seed: Frame with ``title_lc``, ``doi_lc``, ``field``, ``country`` and
            ``journal`` columns.
        q: Query matched case-insensitively as a literal substring of the
            title or DOI; ``None`` or blank disables the text filter.
        fields, countries, journals: Collections of accepted values, compared
            case-insensitively; an empty or ``None`` collection disables
            that facet.

    Returns:
        A new frame with a fresh 0..n-1 index; *seed* is not modified.
    """
    df = seed.copy()
    q = (q or "").strip().lower()
    if q:
        # regex=False: the query is user text, not a pattern. As a regex,
        # "c++" or an unbalanced "(" raises and "." matches any character.
        title_hit = df["title_lc"].str.contains(q, regex=False, na=False)
        doi_hit = df["doi_lc"].str.contains(q, regex=False, na=False)
        df = df[title_hit | doi_hit]
    for column, wanted in (("field", fields), ("country", countries), ("journal", journals)):
        if wanted:
            lowered = {x.lower() for x in wanted}
            df = df[df[column].str.lower().isin(lowered)]
    return df.reset_index(drop=True)
def event_subset(events, seed_paper_id, year_min, year_max):
    """Return the events of one seed paper whose citing year is in [year_min, year_max].

    Events without a citing year are excluded. The result has a fresh
    0..n-1 index.
    """
    for_seed = events.loc[events["seed_paper_id"] == seed_paper_id]
    years = for_seed["citing_year"]
    # Each sentinel is chosen so that a missing year fails its bound check.
    not_too_early = years.fillna(-99999) >= year_min
    not_too_late = years.fillna(99999) <= year_max
    return for_seed.loc[not_too_early & not_too_late].reset_index(drop=True)
def build_intent_summary(df):
    """Count events per allowed intent.

    Returns a frame with columns ``intent`` and ``count``, one row per entry
    of ``ALLOWED_INTENTS`` in that order; intents absent from *df* get 0.
    """
    observed = df.groupby("primary_intent").size().to_dict()
    per_intent = []
    for intent in ALLOWED_INTENTS:
        per_intent.append(int(observed.get(intent, 0)))
    return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": per_intent})
def _context_list(value):
    """Return the citation contexts in *value* as a list ([] when there are none)."""
    if isinstance(value, (list, tuple)):
        return list(value)
    # List-typed parquet columns are normally read back by pandas/pyarrow as
    # 1-D numpy arrays, which a plain isinstance(..., list) check rejects.
    if hasattr(value, "tolist") and getattr(value, "ndim", 0) == 1:
        return value.tolist()
    return []


def build_context_rows(df, limit=20):
    """Flatten citation contexts into one row per context snippet.

    Events are ordered by context count, intent count and citing year (all
    descending, missing values last). At most two contexts are taken per
    event and at most *limit* rows are returned.

    Returns:
        A frame with columns ``primary_intent``, ``citing_title``,
        ``citing_doi``, ``citing_year`` and ``context``; it has no columns
        at all when no context was found.
    """
    rows = []
    ordered = df.sort_values(["context_count", "intent_count", "citing_year"],
                             ascending=[False, False, False], na_position="last")
    for _, row in ordered.iterrows():
        year = None if pd.isna(row["citing_year"]) else int(row["citing_year"])
        for c in _context_list(row["contexts"])[:2]:
            rows.append({"primary_intent": row["primary_intent"],
                         "citing_title": row["citing_title"],
                         "citing_doi": row["citing_doi"],
                         "citing_year": year,
                         "context": c})
        if len(rows) >= limit:
            break
    return pd.DataFrame(rows[:limit])
def build_citing_table(df, limit=30):
    """Return up to *limit* distinct citing papers, strongest events first.

    Events are ordered by context count, intent count and citing year (all
    descending, missing values last); the first event of each citing paper
    is kept. An empty input yields an empty frame with the same columns as
    a non-empty result.
    """
    columns = ["citing_paper_id", "citing_title", "citing_doi",
               "citing_year", "primary_intent", "context_count"]
    if df.empty:
        # Same schema as the populated branch (it used to omit two columns).
        return pd.DataFrame(columns=columns)
    ranked = df.sort_values(["context_count", "intent_count", "citing_year"],
                            ascending=[False, False, False], na_position="last")
    return ranked[columns].drop_duplicates(subset=["citing_paper_id"]).head(limit)
def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
    """Rank the other seed papers cited by the papers that cite the selected seed paper.

    Returns the *top_n* seed papers by ``co_citation_count`` (descending),
    left-joined with their title, field, journal and cited-by count.
    """
    is_selected = events["seed_paper_id"] == selected_seed_id
    citing_ids = events.loc[is_selected, "citing_paper_id"].unique()
    # Events from those same citing papers that point at a different seed paper.
    shared = events[events["citing_paper_id"].isin(citing_ids) & ~is_selected]
    counts = shared.groupby("seed_paper_id").size().reset_index(name="co_citation_count")
    top = counts.sort_values("co_citation_count", ascending=False).head(top_n)
    seed_info = seed[["seed_paper_id", "title", "field", "journal", "citedby_count"]]
    return top.merge(seed_info, on="seed_paper_id", how="left")
def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
    """Return the 1-hop KG subgraph around the selected seed paper.

    The seed node id is ``"seed:<doi>"``. At most *max_edges* edges touching
    it are kept, together with the nodes at their endpoints.

    Returns:
        ``(nodes, edges)`` DataFrames, or ``(None, None)`` when no edge
        touches the seed node.
    """
    node_id = f"seed:{seed_doi}"
    touches_seed = kg_edges["source"].eq(node_id) | kg_edges["target"].eq(node_id)
    edges = kg_edges.loc[touches_seed].head(max_edges)
    if edges.empty:
        return None, None
    endpoint_ids = set(edges["source"].tolist()).union(edges["target"].tolist())
    nodes = kg_nodes.loc[kg_nodes["node_id"].isin(endpoint_ids)]
    return nodes, edges
def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
    """KG Explorer: return the subgraph around an arbitrary node.

    Keeps at most *max_edges* edges whose source or target is
    *search_node_id*, plus the nodes at their endpoints.

    Returns:
        ``(nodes, edges)`` DataFrames, or ``(None, None)`` when the node has
        no edges.
    """
    as_source = kg_edges["source"] == search_node_id
    as_target = kg_edges["target"] == search_node_id
    edges = kg_edges[as_source | as_target].head(max_edges)
    if edges.empty:
        return None, None
    all_ids = set(edges["source"].tolist())
    all_ids.update(edges["target"].tolist())
    nodes = kg_nodes[kg_nodes["node_id"].isin(all_ids)]
    return nodes, edges
# ── pyvis builders ─────────────────────────────────────────────
def pyvis_citation_graph(seed_row, events_df):
    """Build the HTML of a citing -> cited network for one seed paper.

    The seed paper is the central node. The 40 events with the highest
    context and intent counts each add a citing-paper node and an edge to
    the seed, labelled and coloured by primary intent, with intent, year
    and first context in the edge tooltip.

    Returns:
        The pyvis page HTML after ``inject_fullscreen`` has been applied.
    """
    net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    sid = seed_row["seed_paper_id"]
    net.add_node(sid, label=seed_row["title"][:60], color="#111827", size=34, shape="dot",
                 font={"color": "white"})
    top_events = events_df.sort_values(["context_count", "intent_count"], ascending=False).head(40)
    for _, row in top_events.iterrows():
        cid = row["citing_paper_id"]
        # str(): the id is the last-resort label and may not be a string,
        # in which case slicing it would fail.
        label = str(row["citing_title"] or row["citing_doi"] or cid)[:60]
        net.add_node(cid, label=label, color=NODE_COLORS["citing_paper"], size=18, shape="dot")
        # List-typed parquet columns are normally read back as 1-D numpy
        # arrays, so accept those as well as plain lists.
        contexts = row["contexts"]
        if isinstance(contexts, (list, tuple)):
            contexts = list(contexts)
        elif hasattr(contexts, "tolist") and getattr(contexts, "ndim", 0) == 1:
            contexts = contexts.tolist()
        else:
            contexts = []
        ctx = contexts[0] if contexts else ""
        yr = "" if pd.isna(row["citing_year"]) else int(row["citing_year"])
        net.add_edge(cid, sid, label=row["primary_intent"],
                     color=INTENT_COLORS.get(row["primary_intent"], "#94a3b8"),
                     title=f"Intent: {row['primary_intent']}<br>Year: {yr}<br>{ctx}")
    net.barnes_hut()
    return inject_fullscreen(net.generate_html())
def pyvis_ontology():
    """Build the HTML of the fixed CitationHub ontology diagram.

    Ten class nodes, coloured through ``NODE_COLORS``, are linked by nine
    labelled relations. Returns the page HTML after ``inject_fullscreen``.
    """
    # (node id, displayed class name, NODE_COLORS key)
    class_nodes = (
        ("seed", "Top5PctCitedPaper", "seed_paper"),
        ("event", "CitationEvent", "citation_event"),
        ("citing", "CitingPaper", "citing_paper"),
        ("intent", "Intent", "intent"),
        ("journal", "Journal", "journal"),
        ("author", "Author", "author"),
        ("affiliation", "Affiliation", "affiliation"),
        ("city", "City", "city"),
        ("country", "Country", "country"),
        ("field", "Field", "field"),
    )
    # (source id, target id, relation label)
    relations = (
        ("event", "citing", "hasCitingPaper"),
        ("event", "seed", "hasCitedPaper"),
        ("event", "intent", "hasPrimaryIntent"),
        ("seed", "journal", "publishedInJournal"),
        ("seed", "author", "hasAuthor"),
        ("seed", "affiliation", "hasAffiliation"),
        ("seed", "city", "locatedInCity"),
        ("seed", "country", "locatedInCountry"),
        ("seed", "field", "belongsToField"),
    )
    net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    for node_id, class_name, color_key in class_nodes:
        net.add_node(node_id, label=class_name, color=NODE_COLORS[color_key], size=24)
    for source_id, target_id, relation in relations:
        net.add_edge(source_id, target_id, label=relation)
    net.barnes_hut()
    page = net.generate_html()
    return inject_fullscreen(page)
def pyvis_from_kg(nodes_df, edges_df, height="780px"):
    """Build pyvis graph HTML from kg_nodes / kg_edges style DataFrames.

    Nodes are coloured by ``node_type`` through ``NODE_TYPE_COLORS``; seed
    papers are drawn larger with white label text. Edges are labelled with
    their ``edge_type``. Returns the page HTML after ``inject_fullscreen``.
    """
    net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    for _, row in nodes_df.iterrows():
        ntype = row.get("node_type", "")
        is_seed = ntype == "seed_paper"
        tooltip = f"Type: {ntype}<br>DOI: {row.get('doi','')}<br>Pub: {row.get('publication_name','')}"
        net.add_node(
            str(row["node_id"]),
            label=str(row.get("label", ""))[:55],
            color=NODE_TYPE_COLORS.get(ntype, "#94a3b8"),
            size=30 if is_seed else 16,
            shape="dot",
            title=tooltip,
            font={"color": "white"} if is_seed else {},
        )
    for _, edge in edges_df.iterrows():
        source_id = str(edge["source"])
        target_id = str(edge["target"])
        net.add_edge(source_id, target_id, label=edge.get("edge_type", ""), color="#94a3b8")
    net.barnes_hut()
    page = net.generate_html()
    return inject_fullscreen(page)
# ═══════════════════════════════════════════════════════════════
# Main UI
# ═══════════════════════════════════════════════════════════════
st.title("CitationHub")
st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
# ── Sidebar ────────────────────────────────────────────────────
with st.sidebar:
st.subheader("Data source")
# Source is the Hugging Face repo when HF_REPO_ID is set, otherwise a local
# parquet directory the user can edit.
if HF_REPO_ID:
# Placeholder argument: load_data does not build a path from it when
# HF_REPO_ID is set.
data_dir_val = "hf"
st.caption(f"Hugging Face: {HF_REPO_ID}")
else:
data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
# Load all core tables; on any failure show the message and stop this run.
try:
(seed, events, citing, filters, overview,
authors_df, affiliations_df, aff_geo_df,
cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
st.success("Data loaded")
except Exception as e:
st.error(str(e)); st.stop()
st.subheader("Search seed papers")
q_input = st.text_input("Title or DOI")
# The typed query only takes effect when "Search" is pressed; the submitted
# value is kept in session state.
if "q_submit" not in st.session_state: st.session_state["q_submit"] = ""
if st.button("Search", use_container_width=True):
st.session_state["q_submit"] = q_input
fields_sel = st.multiselect("Field", filters["fields"])
countries_sel = st.multiselect("Country", filters["countries"])
# Only the first 200 journal names are offered.
journals_sel = st.multiselect("Journal", filters["journals"][:200])
# The year slider never starts before 2000.
# NOTE(review): if filters["year_max"] is not greater than y_min the slider
# range is empty or degenerate - confirm how st.slider handles that.
y_min = max(2000, filters["year_min"])
year_min, year_max = st.slider("Citing year", y_min, filters["year_max"], (y_min, filters["year_max"]))
seed_filtered = filter_seed_papers(seed, st.session_state["q_submit"],
fields_sel, countries_sel, journals_sel)
# Headline counts computed by load_data over the whole dataset.
st.subheader("Overview counts")
c1, c2 = st.columns(2)
c1.metric("Seed papers", fmt_num(overview["seed_papers"]))
c2.metric("Citation events", fmt_num(overview["citation_events"]))
c1.metric("Citing papers", fmt_num(overview["citing_papers"]))
c2.metric("Authors", fmt_num(overview["authors"]))
c1.metric("Countries", fmt_num(overview["countries"]))
c2.metric("Fields", fmt_num(overview["fields"]))
# Seed-paper picker: the options are the ids left after search and filtering.
options = seed_filtered["seed_paper_id"].tolist()
if not options:
    st.warning("No seed papers match the current search.")
    st.stop()
# Keep the previous selection when it is still among the options.
current = st.session_state.get("selected_seed_id", options[0])
default_idx = options.index(current) if current in options else 0
# id -> title map built once. The label function is called for every option,
# so scanning the DataFrame inside it made rendering quadratic. The first
# title per id is kept, matching the previous ``.iloc[0]`` lookup.
title_by_id = (seed_filtered.drop_duplicates(subset=["seed_paper_id"])
               .set_index("seed_paper_id")["title"].to_dict())
selected_seed_id = st.selectbox(
    "Seed paper", options, index=default_idx,
    format_func=lambda sid: title_by_id[sid],
)
st.session_state["selected_seed_id"] = selected_seed_id
selected_seed = seed_filtered[seed_filtered["seed_paper_id"] == selected_seed_id].iloc[0]
# Frames derived from the selection; shared by several tabs below.
seed_events = event_subset(events, selected_seed_id, year_min, year_max)
intent_summary = build_intent_summary(seed_events)
contexts_df = build_context_rows(seed_events)
citing_table = build_citing_table(seed_events)
# ── Tabs ───────────────────────────────────────────────────────
(tab_overview, tab_cnet, tab_ontology, tab_kg,
 tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
    "Overview", "Citation Network", "Ontology",
    "Knowledge Graph", "KG Explorer", "Geographic Map", "Analytics",
])
# ═══ 1. OVERVIEW ═══════════════════════════════════════════════
# Details of the selected seed paper, its citing papers, co-cited seed
# papers, and intent / trend / field charts.
with tab_overview:
col1, col2 = st.columns(2)
with col1:
st.subheader("Seed paper detail")
dc1, dc2 = st.columns(2)
dc1.metric("Cited by", fmt_num(selected_seed["citedby_count"]))
# Number of events for this paper inside the selected year range.
dc2.metric("Citation events", fmt_num(len(seed_events)))
# One markdown line per attribute; empty values are shown as "-".
for label, key in [
("Title","title"),("DOI","doi"),("Journal","journal"),
("Author","author"),("Affiliation","affiliation"),
("City","city"),("Country","country"),("Field","field"),
]:
st.markdown(f"**{label}** \n{selected_seed[key] or '-'}")
st.subheader("Related citing papers")
st.dataframe(citing_table.rename(columns={
"citing_title":"Title","citing_year":"Year",
"primary_intent":"Intent","context_count":"Contexts"}),
use_container_width=True, hide_index=True)
st.subheader("Co-cited seed papers")
st.caption("같은 citing paper에 μ˜ν•΄ ν•¨κ»˜ 인용된 λ‹€λ₯Έ top 5% λ…Όλ¬Έλ“€")
# Co-citations are computed over all events, not only the selected year range.
cocited = get_cocited_papers(selected_seed_id, events, seed)
if cocited.empty:
st.info("Co-cited papers not found.")
else:
st.dataframe(cocited.rename(columns={
"co_citation_count":"Co-citations","title":"Title",
"field":"Field","citedby_count":"Cited by"}),
use_container_width=True, hide_index=True)
with col2:
st.subheader("Intent distribution (selected paper)")
fig = px.bar(intent_summary, x="intent", y="count", color="intent",
color_discrete_map=INTENT_COLORS)
fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
st.plotly_chart(fig, use_container_width=True)
st.subheader("Citation trend (selected paper)")
# Events per citing year; rows without a year are dropped first.
trend = (seed_events.dropna(subset=["citing_year"])
.assign(citing_year=lambda df: df["citing_year"].astype(int))
.groupby("citing_year").size().reset_index(name="count"))
if not trend.empty:
st.plotly_chart(
px.line(trend, x="citing_year", y="count", markers=True)
.update_layout(xaxis_title="Year", yaxis_title="Citations"),
use_container_width=True)
st.subheader("Field distribution")
# Top 20 fields among the currently filtered seed papers; a blank field
# is displayed as "Unknown".
fd = (seed_filtered.groupby("field", dropna=False).size()
.reset_index(name="count").sort_values("count", ascending=False).head(20))
fd["field"] = fd["field"].replace("","Unknown")
st.plotly_chart(
px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
use_container_width=True)
# One card per citation context: intent badge, year and source, then the text.
st.subheader("Citation contexts")
if contexts_df.empty:
    st.info("No contexts available.")
else:
    for _, row in contexts_df.iterrows():
        # Titles, DOIs and context snippets come from the data files, so
        # they are escaped before being placed into raw HTML.
        source_label = html.escape(str(row['citing_title'] or row['citing_doi']))
        context_text = html.escape(str(row['context']))
        # The year column is float when any year is missing; show it as an
        # integer, and "-" rather than "nan" when it is missing.
        year_label = '-' if pd.isna(row['citing_year']) else str(int(row['citing_year']))
        st.markdown(
            f"""<div style="border:1px solid #e2e8f0;border-radius:14px;padding:12px;
margin-bottom:10px;background:#f8fafc;">
<div style="display:inline-block;background:{INTENT_COLORS.get(row['primary_intent'],'#64748b')};
color:white;border-radius:999px;padding:4px 8px;font-size:12px;margin-bottom:6px;">
{row['primary_intent']}</div>
<div style="font-size:12px;color:#64748b;margin-bottom:6px;">
{year_label} Β· {source_label}</div>
<div>{context_text}</div></div>""",
            unsafe_allow_html=True)
# ═══ 2. CITATION NETWORK ════════════════════════════════════════
# Interactive pyvis graph of the selected seed paper and its citing papers.
with tab_cnet:
st.subheader("Citing ↔ Cited Citation Network")
st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
if seed_events.empty:
st.info("No citation network data for this seed paper.")
else:
# The frame is taller (820) than the 780px graph canvas inside it.
components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
# ═══ 3. ONTOLOGY ════════════════════════════════════════════════
# Fixed schema diagram; it does not depend on the selected seed paper.
with tab_ontology:
st.subheader("CitationHub Ontology")
st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
components.html(pyvis_ontology(), height=820, scrolling=True)
# ═══ 4. KNOWLEDGE GRAPH (actual KG data) ════════════════════════
# 1-hop subgraph of the selected seed paper, taken from kg_nodes / kg_edges.
with tab_kg:
st.subheader("Knowledge Graph β€” Selected Seed Paper")
st.caption("kg_nodes + kg_edges 전체 λ°μ΄ν„°μ—μ„œ μ„ νƒλœ seed paper의 1-hop μ„œλΈŒκ·Έλž˜ν”„")
st.info("μ•„λž˜ λ²„νŠΌμ„ 눌러 KG 데이터λ₯Ό λ‘œλ“œν•˜μ„Έμš” (졜초 1회, 이후 μΊμ‹œλ¨)")
# The button only sets a session flag; the KG files are read by
# load_kg_data below once the flag is set.
# NOTE(review): the spinner therefore wraps only the flag assignment, not the load.
if st.button("KG 데이터 λ‘œλ“œ", key="kg_load"):
with st.spinner("kg_nodes / kg_edges / enriched λ‘œλ”© 쀑 ..."):
st.session_state["kg_loaded"] = True
if st.session_state.get("kg_loaded"):
try:
kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)
# The KG node id is derived from the DOI, so a seed paper without one
# cannot be looked up.
seed_doi = selected_seed["doi"]
if not seed_doi:
st.warning("μ„ νƒλœ seed paper의 DOIκ°€ μ—†μ–΄ KG μ‘°νšŒκ°€ λΆˆκ°€ν•©λ‹ˆλ‹€.")
else:
nodes_sub, edges_sub = get_kg_subgraph(seed_doi, kg_nodes, kg_edges)
if nodes_sub is None:
st.warning(f"KGμ—μ„œ λ…Έλ“œλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. (DOI: {seed_doi})")
else:
# Statistics
c1, c2, c3 = st.columns(3)
c1.metric("Nodes", fmt_num(len(nodes_sub)))
c2.metric("Edges", fmt_num(len(edges_sub)))
c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))
# Columns are assigned by position after reset_index.
type_counts = nodes_sub["node_type"].value_counts().reset_index()
type_counts.columns = ["node_type","count"]
st.plotly_chart(
px.bar(type_counts, x="node_type", y="count",
color="node_type",
color_discrete_map=NODE_TYPE_COLORS,
title="Node Type Distribution")
.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
use_container_width=True)
st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
components.html(pyvis_from_kg(nodes_sub, edges_sub), height=820, scrolling=True)
# Any failure in this tab is reported in place; the rest of the page still runs.
except Exception as e:
st.error(str(e))
# ═══ 5. KG EXPLORER ═════════════════════════════════════════════
# Whole-graph statistics, node search with an ego-network view, and charts
# over the enriched citation events.
with tab_kg_exp:
st.subheader("KG Explorer")
st.caption("kg_nodes 전체λ₯Ό νƒμƒ‰ν•˜κ³  μž„μ˜ λ…Έλ“œμ˜ μ—°κ²° 관계λ₯Ό μ‹œκ°ν™”ν•©λ‹ˆλ‹€.")
st.info("μ•„λž˜ λ²„νŠΌμ„ 눌러 KG 데이터λ₯Ό λ‘œλ“œν•˜μ„Έμš” (졜초 1회, 이후 μΊμ‹œλ¨)")
# Sets the same "kg_loaded" session flag as the Knowledge Graph tab.
if st.button("KG 데이터 λ‘œλ“œ", key="kg_exp_load"):
with st.spinner("λ‘œλ”© 쀑..."):
st.session_state["kg_loaded"] = True
if st.session_state.get("kg_loaded"):
try:
kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)
# ── Overall node type distribution
col_a, col_b = st.columns([1,2])
with col_a:
st.subheader("Node Type Counts")
nt = kg_nodes["node_type"].value_counts().reset_index()
nt.columns = ["node_type","count"]
st.dataframe(nt, use_container_width=True, hide_index=True)
st.subheader("Edge Type Counts")
et = kg_edges["edge_type"].value_counts().reset_index()
et.columns = ["edge_type","count"]
st.dataframe(et, use_container_width=True, hide_index=True)
with col_b:
st.subheader("Node Type Distribution")
nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
color_discrete_map=NODE_TYPE_COLORS)
nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
st.plotly_chart(nt_fig, use_container_width=True)
st.markdown("---")
st.subheader("Node Search & Ego Network")
exp_col1, exp_col2 = st.columns([1,3])
with exp_col1:
type_options = ["(all)"] + sorted(kg_nodes["node_type"].unique().tolist())
sel_type = st.selectbox("Filter by node type", type_options)
filtered_nodes = (kg_nodes if sel_type == "(all)"
else kg_nodes[kg_nodes["node_type"]==sel_type])
search_q = st.text_input("Search node label / DOI")
# NOTE(review): str.contains is called without regex=False, so search_q is
# interpreted as a regular expression; text such as "c++" or "(" raises and
# is reported by the except below.
if search_q:
filtered_nodes = filtered_nodes[
filtered_nodes["label"].str.contains(search_q, case=False, na=False) |
filtered_nodes["doi"].str.contains(search_q, case=False, na=False)
]
# Only the first 100 matching nodes are offered in the select box.
sample = filtered_nodes.head(100)
node_options = sample["node_id"].tolist()
if not node_options:
st.warning("검색 κ²°κ³Όκ°€ μ—†μŠ΅λ‹ˆλ‹€.")
else:
# NOTE(review): the label is sliced directly, so a non-string (missing)
# label would raise here - confirm labels are always strings.
sel_node_id = st.selectbox(
"Select node",
node_options,
format_func=lambda nid: sample.loc[sample["node_id"]==nid,"label"].iloc[0][:60],
)
sel_node_info = sample[sample["node_id"]==sel_node_id].iloc[0]
st.markdown(f"**Type**: {sel_node_info.get('node_type','')}")
st.markdown(f"**DOI**: {sel_node_info.get('doi','') or '-'}")
st.markdown(f"**Publication**: {sel_node_info.get('publication_name','') or '-'}")
st.markdown(f"**Group**: {sel_node_info.get('group','') or '-'}")
st.markdown(f"**Cited by**: {fmt_num(sel_node_info.get('citedby_count',''))}")
max_e = st.slider("Max edges shown", 20, 150, 60, key="kg_exp_max")
if st.button("Show ego network", key="kg_exp_show"):
exp_nodes, exp_edges = get_explorer_subgraph(sel_node_id, kg_nodes, kg_edges, max_e)
if exp_nodes is None:
st.warning("μ—°κ²°λœ μ—£μ§€κ°€ μ—†μŠ΅λ‹ˆλ‹€.")
else:
# Stored in session state; read back below when the graph is rendered.
st.session_state["exp_nodes"] = exp_nodes
st.session_state["exp_edges"] = exp_edges
with exp_col2:
if "exp_nodes" in st.session_state:
en = st.session_state["exp_nodes"]
ee = st.session_state["exp_edges"]
st.caption(f"Nodes: {len(en)} | Edges: {len(ee)}")
st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
components.html(pyvis_from_kg(en, ee, height="740px"), height=760, scrolling=True)
else:
st.info("μ™Όμͺ½μ—μ„œ λ…Έλ“œλ₯Ό μ„ νƒν•˜κ³  'Show ego network'λ₯Ό ν΄λ¦­ν•˜μ„Έμš”.")
# ── Enriched insights
st.markdown("---")
st.subheader("Enriched Citation Insights")
st.caption("citation_events_enriched: 의미적 증거(semantic evidence) 뢄석")
if "has_semantic_evidence" in enriched.columns:
sem = enriched["has_semantic_evidence"].value_counts().reset_index()
sem.columns = ["has_semantic_evidence","count"]
sem["label"] = sem["has_semantic_evidence"].map({True:"With evidence", False:"Without evidence"})
st.plotly_chart(
px.pie(sem, names="label", values="count",
title="Semantic Evidence Coverage (all citation events)")
.update_layout(legend_title=""),
use_container_width=True)
# Semantic evidence rate per field (top 20 by rate)
if "field_folder" in enriched.columns:
field_sem = (enriched.groupby("field_folder")["has_semantic_evidence"]
.mean().reset_index()
.rename(columns={"has_semantic_evidence":"sem_ratio","field_folder":"field"})
.sort_values("sem_ratio", ascending=False).head(20))
st.plotly_chart(
px.bar(field_sem, x="field", y="sem_ratio",
title="Semantic Evidence Rate by Field",
labels={"sem_ratio":"Evidence Rate","field":"Field"})
.update_layout(xaxis_tickangle=-40),
use_container_width=True)
else:
st.info("has_semantic_evidence 컬럼이 μ—†μŠ΅λ‹ˆλ‹€.")
# Any failure in this tab is reported in place; the rest of the page still runs.
except Exception as e:
st.error(str(e))
# ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
# Country choropleth and top cities for the filtered seed papers, plus the
# yearly citation trend of the selected paper.
with tab_geo:
st.subheader("Geographic Distribution of Seed Papers")
country_cnt = (seed_filtered.groupby("country", dropna=False).size()
.reset_index(name="count").rename(columns={"country":"country_name"}))
# Blank country names are left out of the map.
country_cnt = country_cnt[country_cnt["country_name"].str.strip() != ""]
if not country_cnt.empty:
# Countries are matched to map regions by name.
fig_map = px.choropleth(country_cnt, locations="country_name",
locationmode="country names", color="count",
hover_name="country_name",
color_continuous_scale="Blues",
title="Seed Papers by Country")
fig_map.update_layout(geo=dict(showframe=False), height=500)
st.plotly_chart(fig_map, use_container_width=True)
st.subheader("Top Cities (Affiliation)")
# Seed papers are joined to affiliation_geo on the affiliation name; rows
# without a matched country are dropped before taking the top 30.
# NOTE(review): an affiliation name occurring several times in aff_geo_df
# would multiply seed rows in this left join - confirm the names are unique.
city_cnt = (seed_filtered.merge(
aff_geo_df[["affiliation_name","city_name","country_name"]],
left_on="affiliation", right_on="affiliation_name", how="left")
.groupby(["country_name","city_name"], dropna=False).size()
.reset_index(name="count").dropna(subset=["country_name"])
.sort_values("count", ascending=False).head(30))
if not city_cnt.empty:
st.plotly_chart(
px.bar(city_cnt, x="city_name", y="count", color="country_name",
title="Top 30 Cities")
.update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
use_container_width=True)
st.subheader("Citation Trend over Time (selected paper)")
# Same per-year aggregation as the trend chart on the Overview tab.
trend2 = (seed_events.dropna(subset=["citing_year"])
.assign(citing_year=lambda df: df["citing_year"].astype(int))
.groupby("citing_year").size().reset_index(name="count"))
if not trend2.empty:
st.plotly_chart(
px.line(trend2, x="citing_year", y="count", markers=True,
title="Citations per Year")
.update_layout(xaxis_title="Year", yaxis_title="Citations"),
use_container_width=True)
# ═══ 7. ANALYTICS ═══════════════════════════════════════════════
# Dataset-wide rankings (authors, journals), a field x intent heatmap, the
# influential-citation share of the selected paper, and reference tables.
with tab_analytics:
col_a, col_b = st.columns(2)
with col_a:
st.subheader("Top Authors")
# Prefer joining author ids to the authors table; fall back to counting the
# seed frame's "author" text column when no author_id is present.
if "author_id" in seed.columns and not seed["author_id"].isna().all():
top_auth = (seed.explode("author_id")
.merge(authors_df, on="author_id", how="left")
.groupby("author_name").size()
.reset_index(name="paper_count")
.sort_values("paper_count", ascending=False).head(20))
else:
# NOTE(review): this rename assumes value_counts().reset_index() yields
# columns "author" and "count"; confirm the installed pandas version
# names them that way.
top_auth = (seed["author"].value_counts()
.reset_index().rename(columns={"author":"author_name","count":"paper_count"})
.head(20))
top_auth = top_auth[top_auth["author_name"].str.strip() != ""]
st.plotly_chart(
px.bar(top_auth, x="paper_count", y="author_name", orientation="h",
title="Top 20 Authors")
.update_layout(yaxis=dict(autorange="reversed"),
xaxis_title="Seed Papers", yaxis_title=""),
use_container_width=True)
with col_b:
st.subheader("Top Journals")
# The blank journal is removed after head(20), so fewer than 20 bars may remain.
top_jnl = (seed.groupby("journal").size()
.reset_index(name="count").sort_values("count", ascending=False).head(20))
top_jnl = top_jnl[top_jnl["journal"].str.strip() != ""]
st.plotly_chart(
px.bar(top_jnl, x="count", y="journal", orientation="h",
title="Top 20 Journals")
.update_layout(yaxis=dict(autorange="reversed"),
xaxis_title="Seed Papers", yaxis_title=""),
use_container_width=True)
st.markdown("---")
col_c, col_d = st.columns(2)
with col_c:
st.subheader("Field Γ— Intent Heatmap")
# Event counts per (field, primary intent) over all seed papers and events.
fi = (seed[["seed_paper_id","field"]]
.merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
.groupby(["field","primary_intent"]).size().reset_index(name="count"))
if not fi.empty:
pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
st.plotly_chart(
px.imshow(pivot, color_continuous_scale="Blues",
title="Citation Intent by Field", aspect="auto")
.update_layout(xaxis_title="Intent", yaxis_title="Field"),
use_container_width=True)
with col_d:
st.subheader("Influential Citations (selected paper)")
if "is_influential" in seed_events.columns:
inf = seed_events["is_influential"].value_counts().reset_index()
inf.columns = ["is_influential","count"]
inf["label"] = inf["is_influential"].map({True:"Influential", False:"Non-influential"})
st.plotly_chart(
px.pie(inf, names="label", values="count",
title="Influential vs Non-influential"),
use_container_width=True)
# Raw reference tables as loaded from intents.parquet and fields.parquet.
st.subheader("Intent Reference")
st.dataframe(intents_df, use_container_width=True, hide_index=True)
st.markdown("---")
st.subheader("Field Reference")
st.dataframe(fields_df, use_container_width=True, hide_index=True)