Daniel0315 committed on
Commit
af9c904
·
verified ·
1 Parent(s): 41e4f48

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +462 -329
src/app.py CHANGED
@@ -7,34 +7,32 @@ from typing import List
7
  import pandas as pd
8
  import streamlit as st
9
  import plotly.express as px
10
- import plotly.graph_objects as go
11
  from pyvis.network import Network
12
  import streamlit.components.v1 as components
13
 
14
  HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
15
- HF_TOKEN = os.environ.get("HF_TOKEN", "")
16
 
17
- st.set_page_config(
18
- page_title="CitationHub",
19
- page_icon="πŸ“š",
20
- layout="wide",
21
- )
22
 
23
  ALLOWED_INTENTS = [
24
- "background", "uses", "similarities", "motivation",
25
- "differences", "future_work", "extends",
26
  ]
27
-
28
  INTENT_COLORS = {
29
- "background": "#94a3b8", "uses": "#22c55e", "similarities": "#3b82f6",
30
- "motivation": "#f59e0b", "differences": "#ef4444",
31
- "future_work": "#8b5cf6", "extends": "#06b6d4",
32
  }
33
-
34
  NODE_COLORS = {
35
- "seed_paper": "#111827", "citing_paper": "#dbeafe", "citation_event": "#fde68a",
36
- "journal": "#ede9fe", "author": "#fee2e2", "affiliation": "#fae8ff",
37
- "city": "#cffafe", "country": "#ffedd5", "field": "#e0e7ff", "intent": "#dcfce7",
 
 
 
 
 
38
  }
39
 
40
  DEFAULT_DATA_DIR = Path(os.environ.get(
@@ -44,10 +42,8 @@ DEFAULT_DATA_DIR = Path(os.environ.get(
44
 
45
 
46
  def fmt_num(x):
47
- try:
48
- return f"{int(x):,}"
49
- except Exception:
50
- return "-"
51
 
52
 
53
  def _hf_download(filename: str) -> str:
@@ -65,47 +61,39 @@ def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
65
 
66
 
67
  def inject_fullscreen(html: str) -> str:
68
- """pyvis HTML에 전체화면 λ²„νŠΌμ„ μ£Όμž…ν•©λ‹ˆλ‹€."""
69
  btn = """
70
- <button
71
- onclick="var el=document.getElementById('mynetwork');
72
- if(el){if(el.requestFullscreen)el.requestFullscreen();
73
- else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
74
  style="position:fixed;bottom:18px;right:18px;z-index:9999;
75
- padding:8px 18px;background:#1e293b;color:white;
76
- border:none;border-radius:8px;cursor:pointer;font-size:13px;
77
- box-shadow:0 2px 8px rgba(0,0,0,0.35);">
78
- β›Ά Fullscreen
79
- </button>
80
- <div style="position:fixed;bottom:18px;left:18px;z-index:9999;
81
- font-size:12px;color:#64748b;background:rgba(255,255,255,0.85);
82
  padding:5px 10px;border-radius:6px;">
83
- πŸ–± Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info
84
- </div>
85
  """
86
  return html.replace("</body>", btn + "</body>")
87
 
88
 
 
89
  @st.cache_data(show_spinner=False)
90
  def load_data(data_dir_str: str):
91
  d = None if HF_REPO_ID else Path(data_dir_str)
92
 
93
- # --- 핡심 3개 (λŒ€μš©λŸ‰) ---
94
- seed_df = _read("seed_cited_papers_normalized.parquet", d)
95
- events_df = _read("citation_events_normalized.parquet", d)
96
- citing_df = _read("citing_papers_normalized.parquet", d)
97
-
98
- # --- μ°Έμ‘° ν…Œμ΄λΈ” (μ†Œμš©λŸ‰) ---
99
- authors_df = _read("authors.parquet", d)
100
  affiliations_df = _read("affiliations.parquet", d)
101
- aff_geo_df = _read("affiliation_geo.parquet", d)
102
- cities_df = _read("cities.parquet", d)
103
- countries_df = _read("countries.parquet", d)
104
- fields_df = _read("fields.parquet", d)
105
- intents_df = _read("intents.parquet", d)
106
- journals_df = _read("journals.parquet", d)
107
-
108
- # --- seed 가곡 ---
109
  seed = pd.DataFrame({
110
  "seed_paper_id": seed_df["seed_paper_id"],
111
  "doi": seed_df.get("doi", pd.Series(dtype=str)).fillna(""),
@@ -123,11 +111,10 @@ def load_data(data_dir_str: str):
123
  "field_id": seed_df.get("field_id", pd.Series(dtype=object)),
124
  "journal_id": seed_df.get("journal_id", pd.Series(dtype=object)),
125
  })
126
- for col in ["title", "doi", "journal", "field", "country"]:
127
  seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
128
- seed = seed.sort_values(["citedby_count", "title"], ascending=[False, True]).reset_index(drop=True)
129
 
130
- # --- events 가곡 ---
131
  events = pd.DataFrame({
132
  "citation_event_id": events_df["citation_event_id"],
133
  "seed_paper_id": events_df["cited_seed_paper_id"],
@@ -145,14 +132,13 @@ def load_data(data_dir_str: str):
145
  })
146
  events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)
147
 
148
- # --- citing 가곡 ---
149
  citing = pd.DataFrame({
150
  "citing_paper_id": citing_df["citing_paper_id"],
151
- "doi": citing_df.get("doi", pd.Series(dtype=str)).fillna(""),
152
- "title": citing_df.get("title", pd.Series(dtype=str)).fillna(""),
153
- "year": pd.to_numeric(citing_df.get("year"), errors="coerce"),
154
- "venue": citing_df.get("venue", pd.Series(dtype=str)).fillna(""),
155
- "oa_pdf": citing_df.get("oa_pdf", pd.Series(dtype=str)).fillna(""),
156
  })
157
 
158
  filters = {
@@ -163,35 +149,40 @@ def load_data(data_dir_str: str):
163
  "year_min": int(events["citing_year"].dropna().min()) if events["citing_year"].notna().any() else 2000,
164
  "year_max": int(events["citing_year"].dropna().max()) if events["citing_year"].notna().any() else 2025,
165
  }
166
-
167
  overview = {
168
- "seed_papers": int(len(seed)),
169
- "citation_events": int(len(events)),
170
- "citing_papers": int(events["citing_paper_id"].nunique()),
171
- "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
172
- "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
173
- "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
174
- "intents": len(ALLOWED_INTENTS),
175
- "authors": int(len(authors_df)),
176
  }
177
-
178
  return (seed, events, citing, filters, overview,
179
  authors_df, affiliations_df, aff_geo_df,
180
  cities_df, countries_df, fields_df, intents_df, journals_df)
181
 
182
 
183
- # ── ν•„ν„° 헬퍼 ──────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
184
  def filter_seed_papers(seed, q, fields, countries, journals):
185
  df = seed.copy()
186
  q = (q or "").strip().lower()
187
  if q:
188
  df = df[df["title_lc"].str.contains(q, na=False) | df["doi_lc"].str.contains(q, na=False)]
189
- if fields:
190
- df = df[df["field"].str.lower().isin({x.lower() for x in fields})]
191
- if countries:
192
- df = df[df["country"].str.lower().isin({x.lower() for x in countries})]
193
- if journals:
194
- df = df[df["journal"].str.lower().isin({x.lower() for x in journals})]
195
  return df.reset_index(drop=True)
196
 
197
 
@@ -204,111 +195,107 @@ def event_subset(events, seed_paper_id, year_min, year_max):
204
 
205
  def build_intent_summary(df):
206
  counts = df.groupby("primary_intent").size().to_dict()
207
- return pd.DataFrame({
208
- "intent": ALLOWED_INTENTS,
209
- "count": [int(counts.get(i, 0)) for i in ALLOWED_INTENTS],
210
- })
211
 
212
 
213
  def build_context_rows(df, limit=20):
214
  rows = []
215
- df = df.sort_values(["context_count", "intent_count", "citing_year"],
216
- ascending=[False, False, False], na_position="last")
217
  for _, row in df.iterrows():
218
- contexts = row["contexts"]
219
- if isinstance(contexts, list) and contexts:
220
- for ctx in contexts[:2]:
221
- rows.append({
222
- "primary_intent": row["primary_intent"],
223
- "citing_title": row["citing_title"],
224
- "citing_doi": row["citing_doi"],
225
- "citing_year": None if pd.isna(row["citing_year"]) else int(row["citing_year"]),
226
- "context": ctx,
227
- })
228
- if len(rows) >= limit:
229
- break
230
  return pd.DataFrame(rows[:limit])
231
 
232
 
233
  def build_citing_table(df, limit=30):
234
  if df.empty:
235
- return pd.DataFrame(columns=["citing_title", "citing_year", "primary_intent", "context_count"])
236
- return (
237
- df.sort_values(["context_count", "intent_count", "citing_year"],
238
- ascending=[False, False, False], na_position="last")
239
- [["citing_paper_id", "citing_title", "citing_doi", "citing_year", "primary_intent", "context_count"]]
240
- .drop_duplicates(subset=["citing_paper_id"])
241
- .head(limit)
242
- )
243
-
244
-
245
- # ── pyvis λΉŒλ” ─────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  def pyvis_citation_graph(seed_row, events_df):
247
  net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
248
  sid = seed_row["seed_paper_id"]
249
  net.add_node(sid, label=seed_row["title"][:60], color="#111827", size=34, shape="dot",
250
- font={"color": "white"})
251
- for _, row in events_df.sort_values(["context_count", "intent_count"],
252
  ascending=False).head(40).iterrows():
253
  cid = row["citing_paper_id"]
254
  net.add_node(cid, label=(row["citing_title"] or row["citing_doi"] or cid)[:60],
255
  color=NODE_COLORS["citing_paper"], size=18, shape="dot")
256
  ctx = (row["contexts"] or [])[0] if isinstance(row["contexts"], list) and row["contexts"] else ""
257
- yr = "" if pd.isna(row["citing_year"]) else int(row["citing_year"])
258
  net.add_edge(cid, sid, label=row["primary_intent"],
259
- color=INTENT_COLORS.get(row["primary_intent"], "#94a3b8"),
260
  title=f"Intent: {row['primary_intent']}<br>Year: {yr}<br>{ctx}")
261
  net.barnes_hut()
262
  return inject_fullscreen(net.generate_html())
263
 
264
 
265
- def pyvis_kg(seed_row, events_df):
266
- net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
267
- sid = seed_row["seed_paper_id"]
268
- net.add_node(sid, label=seed_row["title"][:60], color=NODE_COLORS["seed_paper"],
269
- font={"color": "white"}, size=34, shape="dot")
270
- for key, typ, rel in [
271
- ("journal", "journal", "PUBLISHED_IN"), ("author", "author", "HAS_AUTHOR"),
272
- ("affiliation", "affiliation", "HAS_AFFILIATION"), ("city", "city", "LOCATED_IN_CITY"),
273
- ("country", "country", "LOCATED_IN_COUNTRY"), ("field", "field", "BELONGS_TO_FIELD"),
274
- ]:
275
- val = seed_row.get(key, "")
276
- if val:
277
- nid = f"{typ}:{val}"
278
- net.add_node(nid, label=str(val)[:50], color=NODE_COLORS[typ], size=16)
279
- net.add_edge(sid, nid, label=rel)
280
- top = events_df.sort_values(["context_count", "intent_count"], ascending=False).head(20)
281
- for intent, cnt in top.groupby("primary_intent").size().items():
282
- iid = f"intent:{intent}"
283
- net.add_node(iid, label=f"{intent} ({cnt})", color=NODE_COLORS["intent"], size=18)
284
- net.add_edge(sid, iid, label="HAS_INTENT_CLUSTER")
285
- for _, row in top.iterrows():
286
- eid, cid = row["citation_event_id"], row["citing_paper_id"]
287
- net.add_node(eid, label=row["primary_intent"], color=NODE_COLORS["citation_event"], size=14)
288
- net.add_node(cid, label=(row["citing_title"] or row["citing_doi"] or cid)[:55],
289
- color=NODE_COLORS["citing_paper"], size=14)
290
- net.add_edge(eid, sid, label="HAS_CITED_PAPER")
291
- net.add_edge(eid, cid, label="HAS_CITING_PAPER")
292
- net.add_edge(eid, f"intent:{row['primary_intent']}", label="HAS_PRIMARY_INTENT")
293
- net.barnes_hut()
294
- return inject_fullscreen(net.generate_html())
295
-
296
-
297
  def pyvis_ontology():
298
  net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
299
  for nid, label, typ in [
300
- ("seed","Top5PctCitedPaper","seed_paper"), ("event","CitationEvent","citation_event"),
301
- ("citing","CitingPaper","citing_paper"), ("intent","Intent","intent"),
302
- ("journal","Journal","journal"), ("author","Author","author"),
303
  ("affiliation","Affiliation","affiliation"),("city","City","city"),
304
- ("country","Country","country"), ("field","Field","field"),
305
  ]:
306
  net.add_node(nid, label=label, color=NODE_COLORS[typ], size=24)
307
  for s, t, l in [
308
- ("event","citing","hasCitingPaper"), ("event","seed","hasCitedPaper"),
309
- ("event","intent","hasPrimaryIntent"), ("seed","journal","publishedInJournal"),
310
- ("seed","author","hasAuthor"), ("seed","affiliation","hasAffiliation"),
311
- ("seed","city","locatedInCity"), ("seed","country","locatedInCountry"),
312
  ("seed","field","belongsToField"),
313
  ]:
314
  net.add_edge(s, t, label=l)
@@ -316,10 +303,32 @@ def pyvis_ontology():
316
  return inject_fullscreen(net.generate_html())
317
 
318
 
319
- # ── 메인 UI ────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  st.title("CitationHub")
321
- st.caption("Explore influential papers, their citation networks, and related research.")
322
 
 
323
  with st.sidebar:
324
  st.subheader("Data source")
325
  if HF_REPO_ID:
@@ -334,13 +343,11 @@ with st.sidebar:
334
  cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
335
  st.success("Data loaded")
336
  except Exception as e:
337
- st.error(str(e))
338
- st.stop()
339
 
340
  st.subheader("Search seed papers")
341
  q_input = st.text_input("Title or DOI")
342
- if "q_submit" not in st.session_state:
343
- st.session_state["q_submit"] = ""
344
  if st.button("Search", use_container_width=True):
345
  st.session_state["q_submit"] = q_input
346
 
@@ -355,61 +362,70 @@ with st.sidebar:
355
 
356
  st.subheader("Overview counts")
357
  c1, c2 = st.columns(2)
358
- c1.metric("Seed papers", fmt_num(overview["seed_papers"]))
359
- c2.metric("Citation events", fmt_num(overview["citation_events"]))
360
- c1.metric("Citing papers", fmt_num(overview["citing_papers"]))
361
- c2.metric("Authors", fmt_num(overview["authors"]))
362
- c1.metric("Countries", fmt_num(overview["countries"]))
363
- c2.metric("Fields", fmt_num(overview["fields"]))
364
 
365
  options = seed_filtered["seed_paper_id"].tolist()
366
  if not options:
367
- st.warning("No seed papers match the current search.")
368
- st.stop()
369
- current = st.session_state.get("selected_seed_id", options[0])
370
  default_idx = options.index(current) if current in options else 0
371
  selected_seed_id = st.selectbox(
372
  "Seed paper", options, index=default_idx,
373
  format_func=lambda sid: seed_filtered.loc[
374
- seed_filtered["seed_paper_id"] == sid, "title"].iloc[0],
375
  )
376
  st.session_state["selected_seed_id"] = selected_seed_id
377
 
378
- selected_seed = seed_filtered[seed_filtered["seed_paper_id"] == selected_seed_id].iloc[0]
379
- seed_events = event_subset(events, selected_seed_id, year_min, year_max)
380
  intent_summary = build_intent_summary(seed_events)
381
  contexts_df = build_context_rows(seed_events)
382
  citing_table = build_citing_table(seed_events)
383
 
384
- # ── νƒ­ ────────────────────────────────────────────────────
385
  (tab_overview, tab_cnet, tab_ontology, tab_kg,
386
- tab_geo, tab_analytics) = st.tabs([
387
- "Overview", "Citation Network", "Ontology", "Knowledge Graph",
388
- "Geographic Map", "Analytics",
389
  ])
390
 
391
- # ─────────────────── 1. OVERVIEW ──────────────────────────
 
392
  with tab_overview:
393
  col1, col2 = st.columns(2)
394
  with col1:
395
  st.subheader("Seed paper detail")
396
- st.columns(2)[0].metric("Cited by", fmt_num(selected_seed["citedby_count"]))
397
- st.columns(2)[1].metric("Citation events", fmt_num(len(seed_events)))
 
398
  for label, key in [
399
- ("Title","title"), ("DOI","doi"), ("Journal","journal"),
400
- ("Author","author"), ("Affiliation","affiliation"),
401
- ("City","city"), ("Country","country"), ("Field","field"),
402
  ]:
403
  st.markdown(f"**{label}** \n{selected_seed[key] or '-'}")
404
 
405
  st.subheader("Related citing papers")
406
- st.dataframe(
407
- citing_table.rename(columns={
408
- "citing_title":"Title","citing_year":"Year",
409
- "primary_intent":"Intent","context_count":"Contexts",
410
- }),
411
- use_container_width=True, hide_index=True,
412
- )
 
 
 
 
 
 
 
 
413
 
414
  with col2:
415
  st.subheader("Intent distribution (selected paper)")
@@ -418,23 +434,23 @@ with tab_overview:
418
  fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
419
  st.plotly_chart(fig, use_container_width=True)
420
 
 
 
 
 
 
 
 
 
 
 
421
  st.subheader("Field distribution")
422
  fd = (seed_filtered.groupby("field", dropna=False).size()
423
  .reset_index(name="count").sort_values("count", ascending=False).head(20))
424
- fd["field"] = fd["field"].replace("", "Unknown")
425
  st.plotly_chart(
426
  px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
427
- use_container_width=True,
428
- )
429
-
430
- st.subheader("Overall intent distribution")
431
- all_intents = events.groupby("primary_intent").size().to_dict()
432
- ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
433
- "count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]})
434
- fig2 = px.bar(ai_df, x="intent", y="count", color="intent",
435
- color_discrete_map=INTENT_COLORS)
436
- fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
437
- st.plotly_chart(fig2, use_container_width=True)
438
 
439
  st.subheader("Citation contexts")
440
  if contexts_df.empty:
@@ -450,10 +466,10 @@ with tab_overview:
450
  <div style="font-size:12px;color:#64748b;margin-bottom:6px;">
451
  {row['citing_year'] or '-'} Β· {row['citing_title'] or row['citing_doi']}</div>
452
  <div>{row['context']}</div></div>""",
453
- unsafe_allow_html=True,
454
- )
455
 
456
- # ─────────────────── 2. CITATION NETWORK ──────────────────
457
  with tab_cnet:
458
  st.subheader("Citing ↔ Cited Citation Network")
459
  st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
@@ -462,168 +478,285 @@ with tab_cnet:
462
  else:
463
  components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
464
 
465
- # ─────────────────── 3. ONTOLOGY ──────────────────────────
 
466
  with tab_ontology:
467
  st.subheader("CitationHub Ontology")
468
  st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
469
  components.html(pyvis_ontology(), height=820, scrolling=True)
470
 
471
- # ─────────────────── 4. KNOWLEDGE GRAPH ───────────────────
 
472
  with tab_kg:
473
  st.subheader("Knowledge Graph β€” Selected Seed Paper")
474
- st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
475
- if seed_events.empty:
476
- st.info("No knowledge graph data for this seed paper.")
477
- else:
478
- components.html(pyvis_kg(selected_seed, seed_events), height=820, scrolling=True)
479
-
480
- # ─────────────────── 5. GEOGRAPHIC MAP ────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
  with tab_geo:
482
  st.subheader("Geographic Distribution of Seed Papers")
483
 
484
- # ꡭ가별 seed paper 수
485
- country_cnt = (
486
- seed_filtered.groupby("country", dropna=False).size()
487
- .reset_index(name="count")
488
- .rename(columns={"country": "country_name"})
489
- )
490
  country_cnt = country_cnt[country_cnt["country_name"].str.strip() != ""]
491
- country_cnt = country_cnt.merge(countries_df, on="country_name", how="left")
492
 
493
  if not country_cnt.empty:
494
- fig_map = px.choropleth(
495
- country_cnt,
496
- locations="country_name",
497
- locationmode="country names",
498
- color="count",
499
- hover_name="country_name",
500
- color_continuous_scale="Blues",
501
- title="Seed Papers by Country",
502
- )
503
  fig_map.update_layout(geo=dict(showframe=False), height=500)
504
  st.plotly_chart(fig_map, use_container_width=True)
505
 
506
- # λ„μ‹œλ³„ 뢄포 (affiliation_geo ν™œμš©)
507
- st.subheader("Affiliation Geo Distribution")
508
- city_cnt = (
509
- seed_filtered.merge(
510
- aff_geo_df[["affiliation_name", "city_name", "country_name"]],
511
- left_on="affiliation", right_on="affiliation_name", how="left",
512
- )
513
- .groupby(["country_name","city_name"], dropna=False).size()
514
- .reset_index(name="count")
515
- .dropna(subset=["country_name"])
516
- .sort_values("count", ascending=False)
517
- .head(30)
518
- )
519
  if not city_cnt.empty:
520
- fig_city = px.bar(
521
- city_cnt, x="city_name", y="count", color="country_name",
522
- title="Top 30 Cities (Affiliation)",
523
- )
524
- fig_city.update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40)
525
- st.plotly_chart(fig_city, use_container_width=True)
526
-
527
- # 연도별 citing 좔이 (κ΅­κ°€ ν•„ν„°)
528
- st.subheader("Citation Trend over Time")
529
- year_trend = (
530
- seed_events.groupby("citing_year").size()
531
- .reset_index(name="count")
532
- .dropna()
533
- )
534
- year_trend["citing_year"] = year_trend["citing_year"].astype(int)
535
- if not year_trend.empty:
536
- fig_trend = px.line(year_trend, x="citing_year", y="count",
537
- title="Citations per Year (selected seed paper)",
538
- markers=True)
539
- fig_trend.update_layout(xaxis_title="Year", yaxis_title="Citations")
540
- st.plotly_chart(fig_trend, use_container_width=True)
541
-
542
- # ─────────────────── 6. ANALYTICS ────────────────────────
543
  with tab_analytics:
544
  col_a, col_b = st.columns(2)
545
 
546
- # ── μ €μž λž­ν‚Ή
547
  with col_a:
548
- st.subheader("Top Authors (by seed paper count)")
549
- # seed_cited_papers_normalized에 author_id 있으면 join
550
  if "author_id" in seed.columns and not seed["author_id"].isna().all():
551
- top_authors = (
552
- seed.explode("author_id")
553
- .merge(authors_df, on="author_id", how="left")
554
- .groupby("author_name").size()
555
- .reset_index(name="paper_count")
556
- .sort_values("paper_count", ascending=False)
557
- .head(20)
558
- )
559
  else:
560
- # creator μ»¬λŸΌμ—μ„œ 직접 μΆ”μΆœ
561
- top_authors = (
562
- seed["author"].value_counts()
563
- .reset_index()
564
- .rename(columns={"author": "author_name", "count": "paper_count"})
565
- .head(20)
566
- )
567
- top_authors = top_authors[top_authors["author_name"].str.strip() != ""]
568
- fig_auth = px.bar(top_authors, x="paper_count", y="author_name",
569
- orientation="h", title="Top 20 Authors")
570
- fig_auth.update_layout(yaxis=dict(autorange="reversed"),
571
- xaxis_title="Seed Papers", yaxis_title="")
572
- st.plotly_chart(fig_auth, use_container_width=True)
573
-
574
- # ── 저널 λž­ν‚Ή
575
  with col_b:
576
- st.subheader("Top Journals (by seed paper count)")
577
- top_journals = (
578
- seed.groupby("journal").size()
579
- .reset_index(name="count")
580
- .sort_values("count", ascending=False)
581
- .head(20)
582
- )
583
- top_journals = top_journals[top_journals["journal"].str.strip() != ""]
584
- fig_jnl = px.bar(top_journals, x="count", y="journal",
585
- orientation="h", title="Top 20 Journals")
586
- fig_jnl.update_layout(yaxis=dict(autorange="reversed"),
587
- xaxis_title="Seed Papers", yaxis_title="")
588
- st.plotly_chart(fig_jnl, use_container_width=True)
589
 
590
  st.markdown("---")
591
  col_c, col_d = st.columns(2)
592
 
593
- # ── 뢄야별 인용 μ˜λ„ 히트맡
594
  with col_c:
595
  st.subheader("Field Γ— Intent Heatmap")
596
- field_intent = (
597
- seed[["seed_paper_id", "field"]]
598
- .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
599
- .groupby(["field", "primary_intent"]).size()
600
- .reset_index(name="count")
601
- )
602
- if not field_intent.empty:
603
- pivot = field_intent.pivot(index="field", columns="primary_intent", values="count").fillna(0)
604
- fig_hm = px.imshow(pivot, color_continuous_scale="Blues",
605
- title="Citation Intent by Field",
606
- aspect="auto")
607
- fig_hm.update_layout(xaxis_title="Intent", yaxis_title="Field")
608
- st.plotly_chart(fig_hm, use_container_width=True)
609
-
610
- # ── Influential citation λΉ„μœ¨
611
  with col_d:
612
- st.subheader("Influential Citations")
613
  if "is_influential" in seed_events.columns:
614
- inf_cnt = seed_events["is_influential"].value_counts().reset_index()
615
- inf_cnt.columns = ["is_influential", "count"]
616
- inf_cnt["label"] = inf_cnt["is_influential"].map({True: "Influential", False: "Non-influential"})
617
- fig_inf = px.pie(inf_cnt, names="label", values="count",
618
- title="Influential vs Non-influential (selected paper)")
619
- st.plotly_chart(fig_inf, use_container_width=True)
620
- else:
621
- st.info("is_influential 컬럼이 μ—†μŠ΅λ‹ˆλ‹€.")
622
 
623
- # ── Intent 상세 정보
624
- st.subheader("Intent Reference Table")
625
- st.dataframe(intents_df, use_container_width=True, hide_index=True)
626
 
627
- # ── Fields 상세 정보
628
- st.subheader("Field Reference Table")
629
  st.dataframe(fields_df, use_container_width=True, hide_index=True)
 
7
  import pandas as pd
8
  import streamlit as st
9
  import plotly.express as px
 
10
  from pyvis.network import Network
11
  import streamlit.components.v1 as components
12
 
# Repository id read from the environment (presumably a Hugging Face repo).
# When it is non-empty, load_data()/load_kg_data() pass no local directory to _read().
HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
# Access token read from the environment; its consumer is outside this view.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

st.set_page_config(page_title="CitationHub", page_icon="📚", layout="wide")

# Citation intents the app keeps; load_data() drops events whose
# primary_intent is not in this list.
ALLOWED_INTENTS = [
    "background",
    "uses",
    "similarities",
    "motivation",
    "differences",
    "future_work",
    "extends",
]

# Hex color per citation intent.
INTENT_COLORS = {
    "background": "#94a3b8",
    "uses": "#22c55e",
    "similarities": "#3b82f6",
    "motivation": "#f59e0b",
    "differences": "#ef4444",
    "future_work": "#8b5cf6",
    "extends": "#06b6d4",
}

# Hex fill color per graph node type (light palette, dark seed node).
NODE_COLORS = {
    "seed_paper": "#111827",
    "citing_paper": "#dbeafe",
    "citation_event": "#fde68a",
    "journal": "#ede9fe",
    "author": "#fee2e2",
    "affiliation": "#fae8ff",
    "city": "#cffafe",
    "country": "#ffedd5",
    "field": "#e0e7ff",
    "intent": "#dcfce7",
}

# Second palette keyed by the same node types (more saturated colors).
# NOTE(review): its consumer is outside this view.
NODE_TYPE_COLORS = {
    "seed_paper": "#111827",
    "citing_paper": "#3b82f6",
    "citation_event": "#f59e0b",
    "journal": "#8b5cf6",
    "author": "#ef4444",
    "affiliation": "#ec4899",
    "city": "#06b6d4",
    "country": "#f97316",
    "field": "#6366f1",
    "intent": "#22c55e",
}
37
 
38
  DEFAULT_DATA_DIR = Path(os.environ.get(
 
42
 
43
 
def fmt_num(x):
    """Format *x* as an integer with thousands separators.

    Returns "-" when *x* cannot be converted to an int (e.g. ``None``,
    a non-numeric string, NaN or infinity).
    """
    try:
        return f"{int(x):,}"
    # Only conversion failures are expected here; a bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError, OverflowError):
        return "-"
 
 
47
 
48
 
49
  def _hf_download(filename: str) -> str:
 
61
 
62
 
def inject_fullscreen(html: str) -> str:
    """Add a fullscreen button and a usage hint to a generated graph page.

    The button requests fullscreen on the element with id ``mynetwork``
    (presumably the pyvis network container -- confirm against the generated
    HTML). The overlay is inserted immediately before the *last* ``</body>``
    tag, so a literal ``</body>`` appearing earlier in the page data is left
    untouched; if the page has no ``</body>`` at all, the overlay is appended
    instead of being silently dropped.

    Args:
        html: Complete HTML document as a string.

    Returns:
        The HTML with the overlay markup added exactly once.
    """
    btn = """
<button onclick="var el=document.getElementById('mynetwork');
  if(el){if(el.requestFullscreen)el.requestFullscreen();
  else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
  style="position:fixed;bottom:18px;right:18px;z-index:9999;
  padding:8px 18px;background:#1e293b;color:white;border:none;
  border-radius:8px;cursor:pointer;font-size:13px;
  box-shadow:0 2px 8px rgba(0,0,0,0.35);">⛶ Fullscreen</button>
<div style="position:fixed;bottom:18px;left:18px;z-index:9999;font-size:12px;
  color:#64748b;background:rgba(255,255,255,0.85);
  padding:5px 10px;border-radius:6px;">
  🖱 Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info</div>
"""
    head, closing_tag, tail = html.rpartition("</body>")
    if not closing_tag:
        # No closing body tag: still deliver the controls.
        return html + btn
    return f"{head}{btn}{closing_tag}{tail}"
78
 
79
 
80
+ # ── 메인 데이터 λ‘œλ“œ (11개) ────────────────────────────────────
81
  @st.cache_data(show_spinner=False)
82
  def load_data(data_dir_str: str):
83
  d = None if HF_REPO_ID else Path(data_dir_str)
84
 
85
+ seed_df = _read("seed_cited_papers_normalized.parquet", d)
86
+ events_df = _read("citation_events_normalized.parquet", d)
87
+ citing_df = _read("citing_papers_normalized.parquet", d)
88
+ authors_df = _read("authors.parquet", d)
 
 
 
89
  affiliations_df = _read("affiliations.parquet", d)
90
+ aff_geo_df = _read("affiliation_geo.parquet", d)
91
+ cities_df = _read("cities.parquet", d)
92
+ countries_df = _read("countries.parquet", d)
93
+ fields_df = _read("fields.parquet", d)
94
+ intents_df = _read("intents.parquet", d)
95
+ journals_df = _read("journals.parquet", d)
96
+
 
97
  seed = pd.DataFrame({
98
  "seed_paper_id": seed_df["seed_paper_id"],
99
  "doi": seed_df.get("doi", pd.Series(dtype=str)).fillna(""),
 
111
  "field_id": seed_df.get("field_id", pd.Series(dtype=object)),
112
  "journal_id": seed_df.get("journal_id", pd.Series(dtype=object)),
113
  })
114
+ for col in ["title","doi","journal","field","country"]:
115
  seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
116
+ seed = seed.sort_values(["citedby_count","title"], ascending=[False,True]).reset_index(drop=True)
117
 
 
118
  events = pd.DataFrame({
119
  "citation_event_id": events_df["citation_event_id"],
120
  "seed_paper_id": events_df["cited_seed_paper_id"],
 
132
  })
133
  events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)
134
 
 
135
  citing = pd.DataFrame({
136
  "citing_paper_id": citing_df["citing_paper_id"],
137
+ "doi": citing_df.get("doi", pd.Series(dtype=str)).fillna(""),
138
+ "title": citing_df.get("title", pd.Series(dtype=str)).fillna(""),
139
+ "year": pd.to_numeric(citing_df.get("year"), errors="coerce"),
140
+ "venue": citing_df.get("venue", pd.Series(dtype=str)).fillna(""),
141
+ "oa_pdf": citing_df.get("oa_pdf",pd.Series(dtype=str)).fillna(""),
142
  })
143
 
144
  filters = {
 
149
  "year_min": int(events["citing_year"].dropna().min()) if events["citing_year"].notna().any() else 2000,
150
  "year_max": int(events["citing_year"].dropna().max()) if events["citing_year"].notna().any() else 2025,
151
  }
 
152
  overview = {
153
+ "seed_papers": int(len(seed)),
154
+ "citation_events": int(len(events)),
155
+ "citing_papers": int(events["citing_paper_id"].nunique()),
156
+ "authors": int(len(authors_df)),
157
+ "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
158
+ "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
159
+ "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
160
+ "intents": len(ALLOWED_INTENTS),
161
  }
 
162
  return (seed, events, citing, filters, overview,
163
  authors_df, affiliations_df, aff_geo_df,
164
  cities_df, countries_df, fields_df, intents_df, journals_df)
165
 
166
 
# ── KG + enriched data (loaded separately, on demand) ─────────────
@st.cache_data(show_spinner=False)
def load_kg_data(data_dir_str: str):
    """Read the knowledge-graph tables and the enriched citation events.

    Args:
        data_dir_str: Local data directory; ignored (``None`` is passed to
            ``_read``) when ``HF_REPO_ID`` is set.

    Returns:
        Tuple ``(kg_nodes, kg_edges, enriched)`` as returned by ``_read`` for
        ``kg_nodes.parquet``, ``kg_edges.parquet`` and
        ``citation_events_enriched.parquet``, read in that order.
    """
    source_dir = Path(data_dir_str) if not HF_REPO_ID else None
    table_files = (
        "kg_nodes.parquet",
        "kg_edges.parquet",
        "citation_events_enriched.parquet",
    )
    kg_nodes, kg_edges, enriched = [_read(name, source_dir) for name in table_files]
    return kg_nodes, kg_edges, enriched
175
+
176
+
177
+ # ── 헬퍼 ───────────────────────────────────────────────────────
def filter_seed_papers(seed, q, fields, countries, journals):
    """Filter the seed-paper table by free-text query and facet selections.

    Args:
        seed: DataFrame with at least ``title_lc``, ``doi_lc``, ``field``,
            ``country`` and ``journal`` columns.
        q: Free-text query matched case-insensitively as a literal substring
            of the title or DOI; ``None``/blank disables the text filter.
        fields: Field names to keep (case-insensitive); empty/None = no filter.
        countries: Country names to keep (case-insensitive); empty/None = no filter.
        journals: Journal names to keep (case-insensitive); empty/None = no filter.

    Returns:
        A new DataFrame with the matching rows and a fresh 0..n-1 index.
    """
    df = seed.copy()
    q = (q or "").strip().lower()
    if q:
        # regex=False: the query is user text (DOIs and titles routinely contain
        # ".", "(", "+"), so interpreting it as a pattern either raises
        # re.error or matches unintended rows.
        in_title = df["title_lc"].str.contains(q, regex=False, na=False)
        in_doi = df["doi_lc"].str.contains(q, regex=False, na=False)
        df = df[in_title | in_doi]
    facets = (("field", fields), ("country", countries), ("journal", journals))
    for column, wanted in facets:
        if wanted:
            lowered = {x.lower() for x in wanted}
            df = df[df[column].str.lower().isin(lowered)]
    return df.reset_index(drop=True)
187
 
188
 
 
195
 
def build_intent_summary(df):
    """Count events per intent, one row per entry of ``ALLOWED_INTENTS``.

    Returns a DataFrame with columns ``intent`` and ``count``; intents absent
    from ``df["primary_intent"]`` get a count of 0.
    """
    per_intent = df.groupby("primary_intent").size().to_dict()
    tallies = [
        int(per_intent[name]) if name in per_intent else 0
        for name in ALLOWED_INTENTS
    ]
    return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": tallies})
 
 
200
 
201
 
def build_context_rows(df, limit=20):
    """Flatten citation events into at most ``limit`` context rows.

    Events are ranked by ``context_count``, ``intent_count`` and
    ``citing_year`` (all descending, missing values last). Each event whose
    ``contexts`` value is a non-empty list contributes its first two contexts.

    Returns:
        DataFrame with columns ``primary_intent``, ``citing_title``,
        ``citing_doi``, ``citing_year`` (int or None) and ``context``; an
        empty DataFrame when no event has contexts.
    """
    ranked = df.sort_values(
        by=["context_count", "intent_count", "citing_year"],
        ascending=False,
        na_position="last",
    )
    collected = []
    for _, event in ranked.iterrows():
        snippets = event["contexts"]
        if not (isinstance(snippets, list) and snippets):
            continue
        year = event["citing_year"]
        year = None if pd.isna(year) else int(year)
        collected.extend(
            {
                "primary_intent": event["primary_intent"],
                "citing_title": event["citing_title"],
                "citing_doi": event["citing_doi"],
                "citing_year": year,
                "context": snippet,
            }
            for snippet in snippets[:2]
        )
        if len(collected) >= limit:
            break
    # The last event may overshoot by one row; trim to the limit.
    return pd.DataFrame(collected[:limit])
217
 
218
 
def build_citing_table(df, limit=30):
    """Return the top citing papers for the given citation events.

    Events are ranked by ``context_count``, ``intent_count`` and
    ``citing_year`` (all descending, missing values last); only the
    best-ranked event per ``citing_paper_id`` is kept.

    Args:
        df: Citation events for one seed paper.
        limit: Maximum number of rows to return.

    Returns:
        DataFrame with columns ``citing_paper_id``, ``citing_title``,
        ``citing_doi``, ``citing_year``, ``primary_intent`` and
        ``context_count``. The empty case returns the same six columns so
        callers see one schema regardless of whether any events exist.
    """
    columns = [
        "citing_paper_id",
        "citing_title",
        "citing_doi",
        "citing_year",
        "primary_intent",
        "context_count",
    ]
    if df.empty:
        return pd.DataFrame(columns=columns)
    ranked = df.sort_values(
        ["context_count", "intent_count", "citing_year"],
        ascending=[False, False, False],
        na_position="last",
    )
    return ranked[columns].drop_duplicates(subset=["citing_paper_id"]).head(limit)
226
+
227
+
228
def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
    """Find other seed papers cited by the papers that cite the selected one.

    Args:
        selected_seed_id: ``seed_paper_id`` of the paper being inspected.
        events: Citation events with ``seed_paper_id`` and
            ``citing_paper_id`` columns.
        seed: Seed-paper table providing ``title``, ``field``, ``journal``
            and ``citedby_count`` for each ``seed_paper_id``.
        top_n: Number of co-cited papers to keep.

    Returns:
        DataFrame with ``seed_paper_id``, ``co_citation_count`` and the seed
        metadata columns, ordered by ``co_citation_count`` descending.
    """
    cites_selected = events[events["seed_paper_id"] == selected_seed_id]
    citing_ids = cites_selected["citing_paper_id"].unique()

    # Events from those same citing papers that point at a different seed.
    from_same_citers = events["citing_paper_id"].isin(citing_ids)
    other_seed = events["seed_paper_id"] != selected_seed_id
    shared = events[from_same_citers & other_seed]

    counts = shared.groupby("seed_paper_id").size().reset_index(name="co_citation_count")
    top = counts.sort_values("co_citation_count", ascending=False).head(top_n)

    metadata = seed[["seed_paper_id", "title", "field", "journal", "citedby_count"]]
    return top.merge(metadata, on="seed_paper_id", how="left")
239
+
240
+
241
def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
    """Return the 1-hop knowledge-graph subgraph around a seed paper.

    The seed paper's node id is built as ``"seed:" + seed_doi``; the lookup
    itself is shared with ``get_explorer_subgraph`` so both views select
    edges and nodes the same way.

    Args:
        seed_doi: DOI of the selected seed paper.
        kg_nodes: Node table with a ``node_id`` column.
        kg_edges: Edge table with ``source`` and ``target`` columns.
        max_edges: Maximum number of edges kept.

    Returns:
        ``(nodes, edges)`` DataFrames, or ``(None, None)`` when no edge
        touches the seed node.
    """
    return get_explorer_subgraph(f"seed:{seed_doi}", kg_nodes, kg_edges, max_edges)


def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
    """Return the 1-hop subgraph around an arbitrary knowledge-graph node.

    Args:
        search_node_id: Id of the centre node, as stored in ``kg_edges``.
        kg_nodes: Node table with a ``node_id`` column.
        kg_edges: Edge table with ``source`` and ``target`` columns.
        max_edges: Maximum number of edges kept (the first ones in table
            order).

    Returns:
        ``(nodes, edges)`` where ``edges`` are the rows of ``kg_edges`` that
        start or end at the node and ``nodes`` are the rows of ``kg_nodes``
        for every endpoint of those edges; ``(None, None)`` when the node has
        no edges.
    """
    touches_node = (kg_edges["source"] == search_node_id) | (
        kg_edges["target"] == search_node_id
    )
    edges = kg_edges[touches_node].head(max_edges)
    if edges.empty:
        return None, None
    endpoint_ids = set(edges["source"]) | set(edges["target"])
    nodes = kg_nodes[kg_nodes["node_id"].isin(endpoint_ids)]
    return nodes, edges
262
+
263
+
264
+ # ── pyvis builders ─────────────────────────────────────────────
265
def pyvis_citation_graph(seed_row, events_df):
    """Render the citing-papers -> seed-paper graph as pyvis HTML.

    Args:
        seed_row: Row of the selected seed paper; ``seed_paper_id`` and
            ``title`` are read.
        events_df: Citation events of that seed paper. The 40 events ranked
            highest by ``context_count`` then ``intent_count`` are drawn.

    Returns:
        The HTML produced by pyvis, passed through ``inject_fullscreen``.
    """
    net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)

    seed_id = seed_row["seed_paper_id"]
    net.add_node(
        seed_id,
        label=seed_row["title"][:60],
        color="#111827",
        size=34,
        shape="dot",
        font={"color": "white"},
    )

    ranked = events_df.sort_values(["context_count", "intent_count"], ascending=False)
    for _, event in ranked.head(40).iterrows():
        citing_id = event["citing_paper_id"]
        # Fall back from title to DOI to the raw id for the node caption.
        caption = event["citing_title"] or event["citing_doi"] or citing_id
        net.add_node(citing_id, label=caption[:60],
                     color=NODE_COLORS["citing_paper"], size=18, shape="dot")

        contexts = event["contexts"]
        if isinstance(contexts, list) and contexts:
            ctx = contexts[0]
        else:
            ctx = ""
        yr = "" if pd.isna(event["citing_year"]) else int(event["citing_year"])

        # Edge direction is citing -> cited, coloured by citation intent.
        net.add_edge(
            citing_id,
            seed_id,
            label=event["primary_intent"],
            color=INTENT_COLORS.get(event["primary_intent"], "#94a3b8"),
            title=f"Intent: {event['primary_intent']}<br>Year: {yr}<br>{ctx}",
        )

    net.barnes_hut()
    return inject_fullscreen(net.generate_html())
282
 
283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  def pyvis_ontology():
285
  net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
286
  for nid, label, typ in [
287
+ ("seed","Top5PctCitedPaper","seed_paper"),("event","CitationEvent","citation_event"),
288
+ ("citing","CitingPaper","citing_paper"), ("intent","Intent","intent"),
289
+ ("journal","Journal","journal"), ("author","Author","author"),
290
  ("affiliation","Affiliation","affiliation"),("city","City","city"),
291
+ ("country","Country","country"), ("field","Field","field"),
292
  ]:
293
  net.add_node(nid, label=label, color=NODE_COLORS[typ], size=24)
294
  for s, t, l in [
295
+ ("event","citing","hasCitingPaper"),("event","seed","hasCitedPaper"),
296
+ ("event","intent","hasPrimaryIntent"),("seed","journal","publishedInJournal"),
297
+ ("seed","author","hasAuthor"), ("seed","affiliation","hasAffiliation"),
298
+ ("seed","city","locatedInCity"), ("seed","country","locatedInCountry"),
299
  ("seed","field","belongsToField"),
300
  ]:
301
  net.add_edge(s, t, label=l)
 
303
  return inject_fullscreen(net.generate_html())
304
 
305
 
306
def pyvis_from_kg(nodes_df, edges_df, height="780px"):
    """Build an interactive pyvis graph from knowledge-graph node/edge tables.

    Args:
        nodes_df: Node rows; ``node_id`` is required, while ``node_type``,
            ``label``, ``doi`` and ``publication_name`` are read when present.
        edges_df: Edge rows with ``source`` and ``target``; ``edge_type`` is
            used as the edge label when present.
        height: CSS height of the pyvis canvas.

    Returns:
        The HTML produced by pyvis, passed through ``inject_fullscreen``.
    """
    net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)

    known_ids = set()
    for _, row in nodes_df.iterrows():
        ntype = row.get("node_type", "")
        is_seed = ntype == "seed_paper"
        node_id = str(row["node_id"])
        tooltip = f"Type: {ntype}<br>DOI: {row.get('doi','')}<br>Pub: {row.get('publication_name','')}"
        net.add_node(
            node_id,
            label=str(row.get("label", ""))[:55],
            color=NODE_TYPE_COLORS.get(ntype, "#94a3b8"),
            size=30 if is_seed else 16,
            shape="dot",
            title=tooltip,
            # Seed nodes get a white label font.
            font={"color": "white"} if is_seed else {},
        )
        known_ids.add(node_id)

    for _, row in edges_df.iterrows():
        source = str(row["source"])
        target = str(row["target"])
        # pyvis refuses an edge whose endpoint was never added. An edge can
        # reference an id that has no row in nodes_df, so draw a neutral
        # placeholder instead of aborting the whole render.
        for endpoint in (source, target):
            if endpoint not in known_ids:
                net.add_node(endpoint, label=endpoint[:55], color="#94a3b8",
                             size=16, shape="dot")
                known_ids.add(endpoint)
        net.add_edge(source, target, label=row.get("edge_type", ""), color="#94a3b8")

    net.barnes_hut()
    return inject_fullscreen(net.generate_html())
323
+
324
+
325
+ # ═══════════════════════════════════════════════════════════════
326
+ # Main UI
327
+ # ═══════════════════════════════════════════════════════════════
328
  st.title("CitationHub")
329
+ st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
330
 
331
+ # ── Sidebar ────────────────────────────────────────────────────
332
  with st.sidebar:
333
  st.subheader("Data source")
334
  if HF_REPO_ID:
 
343
  cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
344
  st.success("Data loaded")
345
  except Exception as e:
346
+ st.error(str(e)); st.stop()
 
347
 
348
  st.subheader("Search seed papers")
349
  q_input = st.text_input("Title or DOI")
350
+ if "q_submit" not in st.session_state: st.session_state["q_submit"] = ""
 
351
  if st.button("Search", use_container_width=True):
352
  st.session_state["q_submit"] = q_input
353
 
 
362
 
363
  st.subheader("Overview counts")
364
  c1, c2 = st.columns(2)
365
+ c1.metric("Seed papers", fmt_num(overview["seed_papers"]))
366
+ c2.metric("Citation events", fmt_num(overview["citation_events"]))
367
+ c1.metric("Citing papers", fmt_num(overview["citing_papers"]))
368
+ c2.metric("Authors", fmt_num(overview["authors"]))
369
+ c1.metric("Countries", fmt_num(overview["countries"]))
370
+ c2.metric("Fields", fmt_num(overview["fields"]))
371
 
372
  options = seed_filtered["seed_paper_id"].tolist()
373
  if not options:
374
+ st.warning("No seed papers match the current search."); st.stop()
375
+ current = st.session_state.get("selected_seed_id", options[0])
 
376
  default_idx = options.index(current) if current in options else 0
377
  selected_seed_id = st.selectbox(
378
  "Seed paper", options, index=default_idx,
379
  format_func=lambda sid: seed_filtered.loc[
380
+ seed_filtered["seed_paper_id"]==sid, "title"].iloc[0],
381
  )
382
  st.session_state["selected_seed_id"] = selected_seed_id
383
 
384
+ selected_seed = seed_filtered[seed_filtered["seed_paper_id"]==selected_seed_id].iloc[0]
385
+ seed_events = event_subset(events, selected_seed_id, year_min, year_max)
386
  intent_summary = build_intent_summary(seed_events)
387
  contexts_df = build_context_rows(seed_events)
388
  citing_table = build_citing_table(seed_events)
389
 
390
+ # ── νƒ­ ─────────────────────────────────────────────────────────
391
  (tab_overview, tab_cnet, tab_ontology, tab_kg,
392
+ tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
393
+ "Overview","Citation Network","Ontology",
394
+ "Knowledge Graph","KG Explorer","Geographic Map","Analytics",
395
  ])
396
 
397
+
398
+ # ═══ 1. OVERVIEW ═══════════════════════════════════════════════
399
  with tab_overview:
400
  col1, col2 = st.columns(2)
401
  with col1:
402
  st.subheader("Seed paper detail")
403
+ dc1, dc2 = st.columns(2)
404
+ dc1.metric("Cited by", fmt_num(selected_seed["citedby_count"]))
405
+ dc2.metric("Citation events", fmt_num(len(seed_events)))
406
  for label, key in [
407
+ ("Title","title"),("DOI","doi"),("Journal","journal"),
408
+ ("Author","author"),("Affiliation","affiliation"),
409
+ ("City","city"),("Country","country"),("Field","field"),
410
  ]:
411
  st.markdown(f"**{label}** \n{selected_seed[key] or '-'}")
412
 
413
  st.subheader("Related citing papers")
414
+ st.dataframe(citing_table.rename(columns={
415
+ "citing_title":"Title","citing_year":"Year",
416
+ "primary_intent":"Intent","context_count":"Contexts"}),
417
+ use_container_width=True, hide_index=True)
418
+
419
+ st.subheader("Co-cited seed papers")
420
+ st.caption("같은 citing paper에 μ˜ν•΄ ν•¨κ»˜ 인용된 λ‹€λ₯Έ top 5% λ…Όλ¬Έλ“€")
421
+ cocited = get_cocited_papers(selected_seed_id, events, seed)
422
+ if cocited.empty:
423
+ st.info("Co-cited papers not found.")
424
+ else:
425
+ st.dataframe(cocited.rename(columns={
426
+ "co_citation_count":"Co-citations","title":"Title",
427
+ "field":"Field","citedby_count":"Cited by"}),
428
+ use_container_width=True, hide_index=True)
429
 
430
  with col2:
431
  st.subheader("Intent distribution (selected paper)")
 
434
  fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
435
  st.plotly_chart(fig, use_container_width=True)
436
 
437
+ st.subheader("Citation trend (selected paper)")
438
+ trend = (seed_events.dropna(subset=["citing_year"])
439
+ .assign(citing_year=lambda df: df["citing_year"].astype(int))
440
+ .groupby("citing_year").size().reset_index(name="count"))
441
+ if not trend.empty:
442
+ st.plotly_chart(
443
+ px.line(trend, x="citing_year", y="count", markers=True)
444
+ .update_layout(xaxis_title="Year", yaxis_title="Citations"),
445
+ use_container_width=True)
446
+
447
  st.subheader("Field distribution")
448
  fd = (seed_filtered.groupby("field", dropna=False).size()
449
  .reset_index(name="count").sort_values("count", ascending=False).head(20))
450
+ fd["field"] = fd["field"].replace("","Unknown")
451
  st.plotly_chart(
452
  px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
453
+ use_container_width=True)
 
 
 
 
 
 
 
 
 
 
454
 
455
  st.subheader("Citation contexts")
456
  if contexts_df.empty:
 
466
  <div style="font-size:12px;color:#64748b;margin-bottom:6px;">
467
  {row['citing_year'] or '-'} Β· {row['citing_title'] or row['citing_doi']}</div>
468
  <div>{row['context']}</div></div>""",
469
+ unsafe_allow_html=True)
470
+
471
 
472
+ # ═══ 2. CITATION NETWORK ════════════════════════════════════════
473
  with tab_cnet:
474
  st.subheader("Citing ↔ Cited Citation Network")
475
  st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
 
478
  else:
479
  components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
480
 
481
+
482
+ # ═══ 3. ONTOLOGY ════════════════════════════════════════════════
483
  with tab_ontology:
484
  st.subheader("CitationHub Ontology")
485
  st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
486
  components.html(pyvis_ontology(), height=820, scrolling=True)
487
 
488
+
489
+ # ═══ 4. KNOWLEDGE GRAPH (μ‹€μ œ KG 데이터) ═════════════════════════
490
  with tab_kg:
491
  st.subheader("Knowledge Graph β€” Selected Seed Paper")
492
+ st.caption("kg_nodes + kg_edges 전체 λ°μ΄ν„°μ—μ„œ μ„ νƒλœ seed paper의 1-hop μ„œλΈŒκ·Έλž˜ν”„")
493
+ st.info("μ•„λž˜ λ²„νŠΌμ„ 눌러 KG 데이터λ₯Ό λ‘œλ“œν•˜μ„Έμš” (졜초 1회, 이후 μΊμ‹œλ¨)")
494
+
495
+ if st.button("KG 데이터 λ‘œλ“œ", key="kg_load"):
496
+ with st.spinner("kg_nodes / kg_edges / enriched λ‘œλ”© 쀑 ..."):
497
+ st.session_state["kg_loaded"] = True
498
+
499
+ if st.session_state.get("kg_loaded"):
500
+ try:
501
+ kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)
502
+ seed_doi = selected_seed["doi"]
503
+ if not seed_doi:
504
+ st.warning("μ„ νƒλœ seed paper의 DOIκ°€ μ—†μ–΄ KG μ‘°νšŒκ°€ λΆˆκ°€ν•©λ‹ˆλ‹€.")
505
+ else:
506
+ nodes_sub, edges_sub = get_kg_subgraph(seed_doi, kg_nodes, kg_edges)
507
+ if nodes_sub is None:
508
+ st.warning(f"KGμ—μ„œ λ…Έλ“œλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. (DOI: {seed_doi})")
509
+ else:
510
+ # 톡계
511
+ c1, c2, c3 = st.columns(3)
512
+ c1.metric("Nodes", fmt_num(len(nodes_sub)))
513
+ c2.metric("Edges", fmt_num(len(edges_sub)))
514
+ c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))
515
+
516
+ type_counts = nodes_sub["node_type"].value_counts().reset_index()
517
+ type_counts.columns = ["node_type","count"]
518
+ st.plotly_chart(
519
+ px.bar(type_counts, x="node_type", y="count",
520
+ color="node_type",
521
+ color_discrete_map=NODE_TYPE_COLORS,
522
+ title="Node Type Distribution")
523
+ .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
524
+ use_container_width=True)
525
+
526
+ st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
527
+ components.html(pyvis_from_kg(nodes_sub, edges_sub), height=820, scrolling=True)
528
+ except Exception as e:
529
+ st.error(str(e))
530
+
531
+
532
+ # ═══ 5. KG EXPLORER ═════════════════════════════════════════════
533
+ with tab_kg_exp:
534
+ st.subheader("KG Explorer")
535
+ st.caption("kg_nodes 전체λ₯Ό νƒμƒ‰ν•˜κ³  μž„μ˜ λ…Έλ“œμ˜ μ—°κ²° 관계λ₯Ό μ‹œκ°ν™”ν•©λ‹ˆλ‹€.")
536
+ st.info("μ•„λž˜ λ²„νŠΌμ„ 눌러 KG 데이터λ₯Ό λ‘œλ“œν•˜μ„Έμš” (졜초 1회, 이후 μΊμ‹œλ¨)")
537
+
538
+ if st.button("KG 데이터 λ‘œλ“œ", key="kg_exp_load"):
539
+ with st.spinner("λ‘œλ”© 쀑..."):
540
+ st.session_state["kg_loaded"] = True
541
+
542
+ if st.session_state.get("kg_loaded"):
543
+ try:
544
+ kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)
545
+
546
+ # ── 전체 λ…Έλ“œ νƒ€μž… 뢄포
547
+ col_a, col_b = st.columns([1,2])
548
+ with col_a:
549
+ st.subheader("Node Type Counts")
550
+ nt = kg_nodes["node_type"].value_counts().reset_index()
551
+ nt.columns = ["node_type","count"]
552
+ st.dataframe(nt, use_container_width=True, hide_index=True)
553
+
554
+ st.subheader("Edge Type Counts")
555
+ et = kg_edges["edge_type"].value_counts().reset_index()
556
+ et.columns = ["edge_type","count"]
557
+ st.dataframe(et, use_container_width=True, hide_index=True)
558
+
559
+ with col_b:
560
+ st.subheader("Node Type Distribution")
561
+ nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
562
+ color_discrete_map=NODE_TYPE_COLORS)
563
+ nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
564
+ st.plotly_chart(nt_fig, use_container_width=True)
565
+
566
+ st.markdown("---")
567
+ st.subheader("Node Search & Ego Network")
568
+ exp_col1, exp_col2 = st.columns([1,3])
569
+ with exp_col1:
570
+ type_options = ["(all)"] + sorted(kg_nodes["node_type"].unique().tolist())
571
+ sel_type = st.selectbox("Filter by node type", type_options)
572
+ filtered_nodes = (kg_nodes if sel_type == "(all)"
573
+ else kg_nodes[kg_nodes["node_type"]==sel_type])
574
+ search_q = st.text_input("Search node label / DOI")
575
+ if search_q:
576
+ filtered_nodes = filtered_nodes[
577
+ filtered_nodes["label"].str.contains(search_q, case=False, na=False) |
578
+ filtered_nodes["doi"].str.contains(search_q, case=False, na=False)
579
+ ]
580
+
581
+ sample = filtered_nodes.head(100)
582
+ node_options = sample["node_id"].tolist()
583
+ if not node_options:
584
+ st.warning("검색 κ²°κ³Όκ°€ μ—†μŠ΅λ‹ˆλ‹€.")
585
+ else:
586
+ sel_node_id = st.selectbox(
587
+ "Select node",
588
+ node_options,
589
+ format_func=lambda nid: sample.loc[sample["node_id"]==nid,"label"].iloc[0][:60],
590
+ )
591
+ sel_node_info = sample[sample["node_id"]==sel_node_id].iloc[0]
592
+ st.markdown(f"**Type**: {sel_node_info.get('node_type','')}")
593
+ st.markdown(f"**DOI**: {sel_node_info.get('doi','') or '-'}")
594
+ st.markdown(f"**Publication**: {sel_node_info.get('publication_name','') or '-'}")
595
+ st.markdown(f"**Group**: {sel_node_info.get('group','') or '-'}")
596
+ st.markdown(f"**Cited by**: {fmt_num(sel_node_info.get('citedby_count',''))}")
597
+
598
+ max_e = st.slider("Max edges shown", 20, 150, 60, key="kg_exp_max")
599
+
600
+ if st.button("Show ego network", key="kg_exp_show"):
601
+ exp_nodes, exp_edges = get_explorer_subgraph(sel_node_id, kg_nodes, kg_edges, max_e)
602
+ if exp_nodes is None:
603
+ st.warning("μ—°κ²°λœ μ—£μ§€κ°€ μ—†μŠ΅λ‹ˆλ‹€.")
604
+ else:
605
+ st.session_state["exp_nodes"] = exp_nodes
606
+ st.session_state["exp_edges"] = exp_edges
607
+
608
+ with exp_col2:
609
+ if "exp_nodes" in st.session_state:
610
+ en = st.session_state["exp_nodes"]
611
+ ee = st.session_state["exp_edges"]
612
+ st.caption(f"Nodes: {len(en)} | Edges: {len(ee)}")
613
+ st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
614
+ components.html(pyvis_from_kg(en, ee, height="740px"), height=760, scrolling=True)
615
+ else:
616
+ st.info("μ™Όμͺ½μ—μ„œ λ…Έλ“œλ₯Ό μ„ νƒν•˜κ³  'Show ego network'λ₯Ό ν΄λ¦­ν•˜μ„Έμš”.")
617
+
618
+ # ── Enriched μΈμ‚¬μ΄νŠΈ
619
+ st.markdown("---")
620
+ st.subheader("Enriched Citation Insights")
621
+ st.caption("citation_events_enriched: 의미적 증거(semantic evidence) 뢄석")
622
+ if "has_semantic_evidence" in enriched.columns:
623
+ sem = enriched["has_semantic_evidence"].value_counts().reset_index()
624
+ sem.columns = ["has_semantic_evidence","count"]
625
+ sem["label"] = sem["has_semantic_evidence"].map({True:"With evidence", False:"Without evidence"})
626
+ st.plotly_chart(
627
+ px.pie(sem, names="label", values="count",
628
+ title="Semantic Evidence Coverage (all citation events)")
629
+ .update_layout(legend_title=""),
630
+ use_container_width=True)
631
+
632
+ # 뢄야별 semantic evidence λΉ„μœ¨
633
+ if "field_folder" in enriched.columns:
634
+ field_sem = (enriched.groupby("field_folder")["has_semantic_evidence"]
635
+ .mean().reset_index()
636
+ .rename(columns={"has_semantic_evidence":"sem_ratio","field_folder":"field"})
637
+ .sort_values("sem_ratio", ascending=False).head(20))
638
+ st.plotly_chart(
639
+ px.bar(field_sem, x="field", y="sem_ratio",
640
+ title="Semantic Evidence Rate by Field",
641
+ labels={"sem_ratio":"Evidence Rate","field":"Field"})
642
+ .update_layout(xaxis_tickangle=-40),
643
+ use_container_width=True)
644
+ else:
645
+ st.info("has_semantic_evidence 컬럼이 μ—†μŠ΅λ‹ˆλ‹€.")
646
+
647
+ except Exception as e:
648
+ st.error(str(e))
649
+
650
+
651
+ # ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
652
  with tab_geo:
653
  st.subheader("Geographic Distribution of Seed Papers")
654
 
655
+ country_cnt = (seed_filtered.groupby("country", dropna=False).size()
656
+ .reset_index(name="count").rename(columns={"country":"country_name"}))
 
 
 
 
657
  country_cnt = country_cnt[country_cnt["country_name"].str.strip() != ""]
 
658
 
659
  if not country_cnt.empty:
660
+ fig_map = px.choropleth(country_cnt, locations="country_name",
661
+ locationmode="country names", color="count",
662
+ hover_name="country_name",
663
+ color_continuous_scale="Blues",
664
+ title="Seed Papers by Country")
 
 
 
 
665
  fig_map.update_layout(geo=dict(showframe=False), height=500)
666
  st.plotly_chart(fig_map, use_container_width=True)
667
 
668
+ st.subheader("Top Cities (Affiliation)")
669
+ city_cnt = (seed_filtered.merge(
670
+ aff_geo_df[["affiliation_name","city_name","country_name"]],
671
+ left_on="affiliation", right_on="affiliation_name", how="left")
672
+ .groupby(["country_name","city_name"], dropna=False).size()
673
+ .reset_index(name="count").dropna(subset=["country_name"])
674
+ .sort_values("count", ascending=False).head(30))
 
 
 
 
 
 
675
  if not city_cnt.empty:
676
+ st.plotly_chart(
677
+ px.bar(city_cnt, x="city_name", y="count", color="country_name",
678
+ title="Top 30 Cities")
679
+ .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
680
+ use_container_width=True)
681
+
682
+ st.subheader("Citation Trend over Time (selected paper)")
683
+ trend2 = (seed_events.dropna(subset=["citing_year"])
684
+ .assign(citing_year=lambda df: df["citing_year"].astype(int))
685
+ .groupby("citing_year").size().reset_index(name="count"))
686
+ if not trend2.empty:
687
+ st.plotly_chart(
688
+ px.line(trend2, x="citing_year", y="count", markers=True,
689
+ title="Citations per Year")
690
+ .update_layout(xaxis_title="Year", yaxis_title="Citations"),
691
+ use_container_width=True)
692
+
693
+
694
+ # ═══ 7. ANALYTICS ═══════════════════════════════════════════════
 
 
 
 
695
  with tab_analytics:
696
  col_a, col_b = st.columns(2)
697
 
 
698
  with col_a:
699
+ st.subheader("Top Authors")
 
700
  if "author_id" in seed.columns and not seed["author_id"].isna().all():
701
+ top_auth = (seed.explode("author_id")
702
+ .merge(authors_df, on="author_id", how="left")
703
+ .groupby("author_name").size()
704
+ .reset_index(name="paper_count")
705
+ .sort_values("paper_count", ascending=False).head(20))
 
 
 
706
  else:
707
+ top_auth = (seed["author"].value_counts()
708
+ .reset_index().rename(columns={"author":"author_name","count":"paper_count"})
709
+ .head(20))
710
+ top_auth = top_auth[top_auth["author_name"].str.strip() != ""]
711
+ st.plotly_chart(
712
+ px.bar(top_auth, x="paper_count", y="author_name", orientation="h",
713
+ title="Top 20 Authors")
714
+ .update_layout(yaxis=dict(autorange="reversed"),
715
+ xaxis_title="Seed Papers", yaxis_title=""),
716
+ use_container_width=True)
717
+
 
 
 
 
718
  with col_b:
719
+ st.subheader("Top Journals")
720
+ top_jnl = (seed.groupby("journal").size()
721
+ .reset_index(name="count").sort_values("count", ascending=False).head(20))
722
+ top_jnl = top_jnl[top_jnl["journal"].str.strip() != ""]
723
+ st.plotly_chart(
724
+ px.bar(top_jnl, x="count", y="journal", orientation="h",
725
+ title="Top 20 Journals")
726
+ .update_layout(yaxis=dict(autorange="reversed"),
727
+ xaxis_title="Seed Papers", yaxis_title=""),
728
+ use_container_width=True)
 
 
 
729
 
730
  st.markdown("---")
731
  col_c, col_d = st.columns(2)
732
 
 
733
  with col_c:
734
  st.subheader("Field Γ— Intent Heatmap")
735
+ fi = (seed[["seed_paper_id","field"]]
736
+ .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
737
+ .groupby(["field","primary_intent"]).size().reset_index(name="count"))
738
+ if not fi.empty:
739
+ pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
740
+ st.plotly_chart(
741
+ px.imshow(pivot, color_continuous_scale="Blues",
742
+ title="Citation Intent by Field", aspect="auto")
743
+ .update_layout(xaxis_title="Intent", yaxis_title="Field"),
744
+ use_container_width=True)
745
+
 
 
 
 
746
  with col_d:
747
+ st.subheader("Influential Citations (selected paper)")
748
  if "is_influential" in seed_events.columns:
749
+ inf = seed_events["is_influential"].value_counts().reset_index()
750
+ inf.columns = ["is_influential","count"]
751
+ inf["label"] = inf["is_influential"].map({True:"Influential", False:"Non-influential"})
752
+ st.plotly_chart(
753
+ px.pie(inf, names="label", values="count",
754
+ title="Influential vs Non-influential"),
755
+ use_container_width=True)
 
756
 
757
+ st.subheader("Intent Reference")
758
+ st.dataframe(intents_df, use_container_width=True, hide_index=True)
 
759
 
760
+ st.markdown("---")
761
+ st.subheader("Field Reference")
762
  st.dataframe(fields_df, use_container_width=True, hide_index=True)