Daniel0315 committed on
Commit
41e4f48
·
verified ·
1 Parent(s): 48b93fd

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +438 -314
src/app.py CHANGED
@@ -2,17 +2,15 @@ from __future__ import annotations
2
 
3
  import os
4
  from pathlib import Path
5
- from typing import Dict, List, Tuple
6
 
7
  import pandas as pd
8
  import streamlit as st
9
  import plotly.express as px
10
- import networkx as nx
11
  from pyvis.network import Network
12
  import streamlit.components.v1 as components
13
 
14
- # Hugging Face: Space 배포 시. Space Secrets에 HF_TOKEN, HF_REPO_ID 설정 (env로 주입됨)
15
- # HF_REPO_ID 예: "username/citationhub-data" (Dataset repo 이름)
16
  HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
17
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
18
 
@@ -23,44 +21,27 @@ st.set_page_config(
23
  )
24
 
25
  ALLOWED_INTENTS = [
26
- "background",
27
- "uses",
28
- "similarities",
29
- "motivation",
30
- "differences",
31
- "future_work",
32
- "extends",
33
  ]
34
 
35
  INTENT_COLORS = {
36
- "background": "#94a3b8",
37
- "uses": "#22c55e",
38
- "similarities": "#3b82f6",
39
- "motivation": "#f59e0b",
40
- "differences": "#ef4444",
41
- "future_work": "#8b5cf6",
42
- "extends": "#06b6d4",
43
  }
44
 
45
  NODE_COLORS = {
46
- "seed_paper": "#111827",
47
- "citing_paper": "#dbeafe",
48
- "citation_event": "#fde68a",
49
- "journal": "#ede9fe",
50
- "author": "#fee2e2",
51
- "affiliation": "#fae8ff",
52
- "city": "#cffafe",
53
- "country": "#ffedd5",
54
- "field": "#e0e7ff",
55
- "intent": "#dcfce7",
56
  }
57
 
58
- DEFAULT_DATA_DIR = Path(
59
- os.environ.get(
60
- "CITATIONHUB_DATA_DIR",
61
- r"C:\Users\user\OneDrive\바탕 ν™”λ©΄\citationhub_v1_ontology_ready",
62
- )
63
- )
64
 
65
  def fmt_num(x):
66
  try:
@@ -68,134 +49,171 @@ def fmt_num(x):
68
  except Exception:
69
  return "-"
70
 
71
- def _load_from_hf():
72
- """Hugging Face Dataset에서 Parquet 다운로드 후 로드 (Space 배포용)"""
73
- try:
74
- from huggingface_hub import hf_hub_download
75
- except ImportError:
76
- raise ImportError("huggingface_hubκ°€ ν•„μš”ν•©λ‹ˆλ‹€. pip install huggingface_hub")
77
- if not HF_REPO_ID:
78
- raise ValueError("HF_REPO_IDκ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. (예: username/citationhub-data)")
79
- token = HF_TOKEN or None # None이면 public repo, 있으면 private 인증
80
- seed_path = hf_hub_download(repo_id=HF_REPO_ID, repo_type="dataset", filename="data/seed_cited_papers_normalized.parquet", token=token)
81
- events_path = hf_hub_download(repo_id=HF_REPO_ID, repo_type="dataset", filename="data/citation_events_normalized.parquet", token=token)
82
- citing_path = hf_hub_download(repo_id=HF_REPO_ID, repo_type="dataset", filename="data/citing_papers_normalized.parquet", token=token)
83
- return pd.read_parquet(seed_path), pd.read_parquet(events_path), pd.read_parquet(citing_path)
84
 
 
 
 
 
 
 
85
 
86
- @st.cache_data(show_spinner=False)
87
- def load_data(data_dir_str: str):
88
- # Hugging Face 모드: HF_REPO_ID가 설정되어 있으면 Dataset에서 로드
89
  if HF_REPO_ID:
90
- seed_df, events_df, citing_df = _load_from_hf()
91
- else:
92
- data_dir = Path(data_dir_str)
93
- seed_path = data_dir / "seed_cited_papers_normalized.parquet"
94
- events_path = data_dir / "citation_events_normalized.parquet"
95
- citing_path = data_dir / "citing_papers_normalized.parquet"
96
- missing = [str(p) for p in [seed_path, events_path, citing_path] if not p.exists()]
97
- if missing:
98
- raise FileNotFoundError(f"Missing parquet files: {missing}")
99
- seed_df = pd.read_parquet(seed_path)
100
- events_df = pd.read_parquet(events_path)
101
- citing_df = pd.read_parquet(citing_path)
 
 
 
 
 
 
 
 
 
 
 
 
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  seed = pd.DataFrame({
104
- "seed_paper_id": seed_df["seed_paper_id"],
105
- "doi": seed_df.get("doi", "").fillna(""),
106
- "title": seed_df.get("title", "").fillna(""),
107
- "journal": seed_df.get("publication_name", "").fillna(""),
108
- "author": seed_df.get("creator", "").fillna(""),
109
- "affiliation": seed_df.get("affilname", "").fillna(""),
110
- "city": seed_df.get("affiliation_city", "").fillna(""),
111
- "country": seed_df.get("affiliation_country", "").fillna(""),
112
- "field": seed_df.get("group", "").fillna(""),
113
- "citedby_count": pd.to_numeric(seed_df.get("citedby_count"), errors="coerce").fillna(0).astype(int),
 
 
 
 
 
114
  })
115
  for col in ["title", "doi", "journal", "field", "country"]:
116
  seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
117
  seed = seed.sort_values(["citedby_count", "title"], ascending=[False, True]).reset_index(drop=True)
118
 
 
119
  events = pd.DataFrame({
120
  "citation_event_id": events_df["citation_event_id"],
121
- "seed_paper_id": events_df["cited_seed_paper_id"],
122
- "citing_paper_id": events_df["citing_paper_id"],
123
- "citing_title": events_df.get("citing_title", "").fillna(""),
124
- "citing_doi": events_df.get("citing_doi", "").fillna(""),
125
- "citing_year": pd.to_numeric(events_df.get("citing_year"), errors="coerce"),
126
- "primary_intent": events_df.get("primary_intent", "").fillna(""),
127
- "contexts": events_df.get("contexts"),
128
- "context_count": pd.to_numeric(events_df.get("context_count"), errors="coerce").fillna(0).astype(int),
129
- "intent_count": pd.to_numeric(events_df.get("intent_count"), errors="coerce").fillna(0).astype(int),
 
 
 
130
  })
131
  events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)
132
 
 
133
  citing = pd.DataFrame({
134
  "citing_paper_id": citing_df["citing_paper_id"],
135
- "doi": citing_df.get("doi", "").fillna(""),
136
- "title": citing_df.get("title", "").fillna(""),
137
- "year": pd.to_numeric(citing_df.get("year"), errors="coerce"),
138
- "venue": citing_df.get("venue", "").fillna(""),
139
- "oa_pdf": citing_df.get("oa_pdf", "").fillna(""),
140
  })
141
 
142
  filters = {
143
- "fields": sorted([x for x in seed["field"].dropna().astype(str).unique().tolist() if x]),
144
- "countries": sorted([x for x in seed["country"].dropna().astype(str).unique().tolist() if x]),
145
- "journals": sorted([x for x in seed["journal"].dropna().astype(str).unique().tolist() if x]),
146
- "intents": ALLOWED_INTENTS,
147
- "year_min": int(events["citing_year"].dropna().min()) if events["citing_year"].notna().any() else 2000,
148
- "year_max": int(events["citing_year"].dropna().max()) if events["citing_year"].notna().any() else 2025,
149
  }
150
 
151
  overview = {
152
- "seed_papers": int(len(seed)),
153
- "citation_events": int(len(events)),
154
- "citing_papers": int(events["citing_paper_id"].nunique()),
155
- "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
156
- "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
157
- "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
158
- "intents": len(ALLOWED_INTENTS),
 
159
  }
160
 
161
- return seed, events, citing, filters, overview
 
 
162
 
163
 
164
- def filter_seed_papers(seed: pd.DataFrame, q: str, fields: List[str], countries: List[str], journals: List[str]):
 
165
  df = seed.copy()
166
  q = (q or "").strip().lower()
167
  if q:
168
  df = df[df["title_lc"].str.contains(q, na=False) | df["doi_lc"].str.contains(q, na=False)]
169
  if fields:
170
- wanted = {x.lower() for x in fields}
171
- df = df[df["field"].str.lower().isin(wanted)]
172
  if countries:
173
- wanted = {x.lower() for x in countries}
174
- df = df[df["country"].str.lower().isin(wanted)]
175
  if journals:
176
- wanted = {x.lower() for x in journals}
177
- df = df[df["journal"].str.lower().isin(wanted)]
178
  return df.reset_index(drop=True)
179
 
180
 
181
- def event_subset(events: pd.DataFrame, seed_paper_id: str, year_min: int, year_max: int):
182
  df = events[events["seed_paper_id"] == seed_paper_id].copy()
183
  df = df[df["citing_year"].fillna(-99999) >= year_min]
184
  df = df[df["citing_year"].fillna(99999) <= year_max]
185
  return df.reset_index(drop=True)
186
 
187
 
188
- def build_intent_summary(df: pd.DataFrame):
189
  counts = df.groupby("primary_intent").size().to_dict()
190
  return pd.DataFrame({
191
  "intent": ALLOWED_INTENTS,
192
- "count": [int(counts.get(intent, 0)) for intent in ALLOWED_INTENTS]
193
  })
194
 
195
 
196
- def build_context_rows(df: pd.DataFrame, limit: int = 20):
197
  rows = []
198
- df = df.sort_values(["context_count", "intent_count", "citing_year"], ascending=[False, False, False], na_position="last")
 
199
  for _, row in df.iterrows():
200
  contexts = row["contexts"]
201
  if isinstance(contexts, list) and contexts:
@@ -212,123 +230,108 @@ def build_context_rows(df: pd.DataFrame, limit: int = 20):
212
  return pd.DataFrame(rows[:limit])
213
 
214
 
215
- def build_citing_table(df: pd.DataFrame, limit: int = 30):
216
  if df.empty:
217
  return pd.DataFrame(columns=["citing_title", "citing_year", "primary_intent", "context_count"])
218
- out = (
219
- df.sort_values(["context_count", "intent_count", "citing_year"], ascending=[False, False, False], na_position="last")
 
220
  [["citing_paper_id", "citing_title", "citing_doi", "citing_year", "primary_intent", "context_count"]]
221
  .drop_duplicates(subset=["citing_paper_id"])
222
  .head(limit)
223
  )
224
- return out
225
 
226
 
227
- def pyvis_html_from_citation_graph(seed_row: pd.Series, events_df: pd.DataFrame):
228
- net = Network(height="1100px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
229
- seed_id = seed_row["seed_paper_id"]
230
- net.add_node(seed_id, label=seed_row["title"][:60], color=INTENT_COLORS.get("background", "#111827"), size=34, shape="dot")
231
-
232
- df = events_df.sort_values(["context_count", "intent_count"], ascending=[False, False]).head(40)
233
- for _, row in df.iterrows():
 
234
  cid = row["citing_paper_id"]
235
- citing_label = (row["citing_title"] or row["citing_doi"] or cid)[:60]
236
- net.add_node(cid, label=citing_label, color=NODE_COLORS["citing_paper"], size=18, shape="dot")
237
- context = None
238
- if isinstance(row["contexts"], list) and row["contexts"]:
239
- context = row["contexts"][0]
240
- title = f"Intent: {row['primary_intent']}<br>Year: {'' if pd.isna(row['citing_year']) else int(row['citing_year'])}<br>{context or ''}"
241
- net.add_edge(cid, seed_id, label=row["primary_intent"], color=INTENT_COLORS.get(row["primary_intent"], "#94a3b8"), title=title)
242
  net.barnes_hut()
243
- return net.generate_html()
244
-
245
-
246
- def pyvis_html_from_kg(seed_row: pd.Series, events_df: pd.DataFrame):
247
- net = Network(height="1100px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
248
- seed_id = seed_row["seed_paper_id"]
249
- net.add_node(seed_id, label=seed_row["title"][:60], color=NODE_COLORS["seed_paper"], font={"color": "white"}, size=34, shape="dot")
250
-
251
- meta_map = [
252
- ("journal", "journal", "PUBLISHED_IN"),
253
- ("author", "author", "HAS_AUTHOR"),
254
- ("affiliation", "affiliation", "HAS_AFFILIATION"),
255
- ("city", "city", "LOCATED_IN_CITY"),
256
- ("country", "country", "LOCATED_IN_COUNTRY"),
257
- ("field", "field", "BELONGS_TO_FIELD"),
258
- ]
259
- for key, typ, rel in meta_map:
260
  val = seed_row.get(key, "")
261
  if val:
262
  nid = f"{typ}:{val}"
263
  net.add_node(nid, label=str(val)[:50], color=NODE_COLORS[typ], size=16)
264
- net.add_edge(seed_id, nid, label=rel)
265
-
266
- top_events = events_df.sort_values(["context_count", "intent_count"], ascending=[False, False]).head(20)
267
- intent_counts = top_events.groupby("primary_intent").size().to_dict()
268
- for intent, count in intent_counts.items():
269
  iid = f"intent:{intent}"
270
- net.add_node(iid, label=f"{intent} ({count})", color=NODE_COLORS["intent"], size=18)
271
- net.add_edge(seed_id, iid, label="HAS_INTENT_CLUSTER")
272
-
273
- for _, row in top_events.iterrows():
274
- eid = row["citation_event_id"]
275
- cid = row["citing_paper_id"]
276
  net.add_node(eid, label=row["primary_intent"], color=NODE_COLORS["citation_event"], size=14)
277
- net.add_node(cid, label=(row["citing_title"] or row["citing_doi"] or cid)[:55], color=NODE_COLORS["citing_paper"], size=14)
278
- net.add_edge(eid, seed_id, label="HAS_CITED_PAPER")
 
279
  net.add_edge(eid, cid, label="HAS_CITING_PAPER")
280
  net.add_edge(eid, f"intent:{row['primary_intent']}", label="HAS_PRIMARY_INTENT")
281
-
282
  net.barnes_hut()
283
- return net.generate_html()
284
-
285
-
286
- def pyvis_html_from_ontology():
287
- net = Network(height="1100px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
288
- nodes = [
289
- ("seed", "Top5PctCitedPaper", "seed_paper"),
290
- ("event", "CitationEvent", "citation_event"),
291
- ("citing", "CitingPaper", "citing_paper"),
292
- ("intent", "Intent", "intent"),
293
- ("journal", "Journal", "journal"),
294
- ("author", "Author", "author"),
295
- ("affiliation", "Affiliation", "affiliation"),
296
- ("city", "City", "city"),
297
- ("country", "Country", "country"),
298
- ("field", "Field", "field"),
299
- ]
300
- for nid, label, typ in nodes:
301
  net.add_node(nid, label=label, color=NODE_COLORS[typ], size=24)
302
- edges = [
303
- ("event", "citing", "hasCitingPaper"),
304
- ("event", "seed", "hasCitedPaper"),
305
- ("event", "intent", "hasPrimaryIntent"),
306
- ("seed", "journal", "publishedInJournal"),
307
- ("seed", "author", "hasAuthor"),
308
- ("seed", "affiliation", "hasAffiliation"),
309
- ("seed", "city", "locatedInCity"),
310
- ("seed", "country", "locatedInCountry"),
311
- ("seed", "field", "belongsToField"),
312
- ]
313
- for s, t, l in edges:
314
  net.add_edge(s, t, label=l)
315
  net.barnes_hut()
316
- return net.generate_html()
317
 
318
 
319
- # ---------- UI ----------
320
  st.title("CitationHub")
321
  st.caption("Explore influential papers, their citation networks, and related research.")
322
 
323
  with st.sidebar:
324
  st.subheader("Data source")
325
  if HF_REPO_ID:
326
- data_dir = "hf"
327
- st.caption(f"Loading from Hugging Face: {HF_REPO_ID}")
328
  else:
329
- data_dir = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
 
330
  try:
331
- seed, events, citing, filters, overview = load_data(data_dir)
 
 
332
  st.success("Data loaded")
333
  except Exception as e:
334
  st.error(str(e))
@@ -341,165 +344,286 @@ with st.sidebar:
341
  if st.button("Search", use_container_width=True):
342
  st.session_state["q_submit"] = q_input
343
 
344
- fields = st.multiselect("Field", filters["fields"])
345
- countries = st.multiselect("Country", filters["countries"])
346
- journals = st.multiselect("Journal", filters["journals"][:200])
347
- display_year_min = max(2000, filters["year_min"])
348
- year_min, year_max = st.slider(
349
- "Citing year",
350
- display_year_min,
351
- filters["year_max"],
352
- (display_year_min, filters["year_max"]),
353
- )
354
 
355
- seed_filtered = filter_seed_papers(seed, st.session_state["q_submit"], fields, countries, journals)
 
356
 
357
  st.subheader("Overview counts")
358
  c1, c2 = st.columns(2)
359
- c1.metric("Seed papers", fmt_num(overview["seed_papers"]))
360
- c2.metric("Events", fmt_num(overview["citation_events"]))
361
- c1.metric("Citing papers", fmt_num(overview["citing_papers"]))
362
- c2.metric("Intents", fmt_num(overview["intents"]))
 
 
363
 
364
  options = seed_filtered["seed_paper_id"].tolist()
365
  if not options:
366
  st.warning("No seed papers match the current search.")
367
  st.stop()
368
-
369
- default_idx = 0
370
  current = st.session_state.get("selected_seed_id", options[0])
371
- if current in options:
372
- default_idx = options.index(current)
373
  selected_seed_id = st.selectbox(
374
- "Seed paper records",
375
- options,
376
- index=default_idx,
377
- format_func=lambda sid: seed_filtered.loc[seed_filtered["seed_paper_id"] == sid, "title"].iloc[0],
378
  )
379
  st.session_state["selected_seed_id"] = selected_seed_id
380
 
381
  selected_seed = seed_filtered[seed_filtered["seed_paper_id"] == selected_seed_id].iloc[0]
382
- seed_events = event_subset(events, selected_seed_id, year_min, year_max)
383
  intent_summary = build_intent_summary(seed_events)
384
- contexts_df = build_context_rows(seed_events, limit=20)
385
- citing_df = build_citing_table(seed_events, limit=30)
386
 
387
- tab_overview, tab_cnet, tab_ontology, tab_kg = st.tabs(["Overview", "Citation network", "Ontology", "Knowledge graph"])
 
 
 
 
 
388
 
 
389
  with tab_overview:
390
- col1, col2 = st.columns([1, 1])
391
-
392
  with col1:
393
- st.subheader("Selected seed paper detail")
394
- detail_cols = st.columns(2)
395
- detail_cols[0].metric("Cited by count", fmt_num(selected_seed["citedby_count"]))
396
- detail_cols[1].metric("Related citation events", fmt_num(len(seed_events)))
397
-
398
- st.markdown(f"**Title** \n{selected_seed['title']}")
399
- st.markdown(f"**DOI** \n{selected_seed['doi'] or '-'}")
400
- st.markdown(f"**Journal** \n{selected_seed['journal'] or '-'}")
401
- st.markdown(f"**Author** \n{selected_seed['author'] or '-'}")
402
- st.markdown(f"**Affiliation** \n{selected_seed['affiliation'] or '-'}")
403
- st.markdown(f"**City** \n{selected_seed['city'] or '-'}")
404
- st.markdown(f"**Country** \n{selected_seed['country'] or '-'}")
405
- st.markdown(f"**Field** \n{selected_seed['field'] or '-'}")
406
 
407
  st.subheader("Related citing papers")
408
  st.dataframe(
409
- citing_df.rename(columns={
410
- "citing_title": "Title",
411
- "citing_year": "Year",
412
- "primary_intent": "Intent",
413
- "context_count": "Contexts",
414
  }),
415
- use_container_width=True,
416
- hide_index=True,
417
  )
418
 
419
  with col2:
420
- st.subheader("Selected seed paper intent distribution")
421
- fig_intent = px.bar(intent_summary, x="intent", y="count", color="intent", color_discrete_map=INTENT_COLORS)
422
- fig_intent.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
423
- st.plotly_chart(fig_intent, use_container_width=True)
424
-
425
- st.subheader("CitationHub field distribution")
426
- field_dist = seed_filtered.groupby("field", dropna=False).size().reset_index(name="count").sort_values("count", ascending=False).head(20)
427
- field_dist["field"] = field_dist["field"].replace("", "Unknown")
428
- fig_field = px.bar(field_dist, x="field", y="count")
429
- fig_field.update_layout(xaxis_title="", yaxis_title="Count")
430
- st.plotly_chart(fig_field, use_container_width=True)
431
-
432
- st.subheader("CitationHub intent distribution")
433
- all_intent_counts = events.groupby("primary_intent").size().to_dict()
434
- all_intent_df = pd.DataFrame({"intent": ALLOWED_INTENTS, "count": [int(all_intent_counts.get(i, 0)) for i in ALLOWED_INTENTS]})
435
- fig_all_intent = px.bar(all_intent_df, x="intent", y="count", color="intent", color_discrete_map=INTENT_COLORS)
436
- fig_all_intent.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
437
- st.plotly_chart(fig_all_intent, use_container_width=True)
438
-
439
- st.subheader("Selected seed paper contexts")
 
 
 
 
 
440
  if contexts_df.empty:
441
- st.info("No contexts available for this seed paper.")
442
  else:
443
  for _, row in contexts_df.iterrows():
444
  st.markdown(
445
- f"""
446
- <div style="border:1px solid #e2e8f0;border-radius:14px;padding:12px;margin-bottom:10px;background:#f8fafc;">
447
- <div style="display:inline-block;background:{INTENT_COLORS.get(row['primary_intent'], '#64748b')};color:white;border-radius:999px;padding:4px 8px;font-size:12px;margin-bottom:6px;">{row['primary_intent']}</div>
448
- <div style="font-size:12px;color:#64748b;margin-bottom:6px;">{row['citing_year'] or '-'} Β· {row['citing_title'] or row['citing_doi']}</div>
449
- <div>{row['context']}</div>
450
- </div>
451
- """,
 
452
  unsafe_allow_html=True,
453
  )
454
 
 
455
  with tab_cnet:
456
- st.subheader("Citing ↔ cited citation network visualization")
457
-
458
- cnet_expand = st.toggle("Expand citation network view", value=False, key="cnet_expand")
459
- cnet_height = st.slider(
460
- "Citation network height",
461
- min_value=700,
462
- max_value=1800,
463
- value=1400 if cnet_expand else 900,
464
- step=100,
465
- key="cnet_height",
466
- )
467
-
468
  if seed_events.empty:
469
  st.info("No citation network data for this seed paper.")
470
  else:
471
- html = pyvis_html_from_citation_graph(selected_seed, seed_events)
472
- components.html(html, height=cnet_height, scrolling=True)
473
 
 
474
  with tab_ontology:
475
- st.subheader("CitationHub ontology overview")
476
-
477
- ontology_expand = st.toggle("Expand ontology view", value=False, key="ontology_expand")
478
- ontology_height = st.slider(
479
- "Ontology graph height",
480
- min_value=700,
481
- max_value=1800,
482
- value=1400 if ontology_expand else 900,
483
- step=100,
484
- key="ontology_height",
485
- )
486
-
487
- components.html(pyvis_html_from_ontology(), height=ontology_height, scrolling=True)
488
 
 
489
  with tab_kg:
490
- st.subheader("Knowledge graph for the selected seed paper")
491
-
492
- kg_expand = st.toggle("Expand knowledge graph view", value=False, key="kg_expand")
493
- kg_height = st.slider(
494
- "Knowledge graph height",
495
- min_value=700,
496
- max_value=1800,
497
- value=1400 if kg_expand else 900,
498
- step=100,
499
- key="kg_height",
500
- )
501
-
502
  if seed_events.empty:
503
  st.info("No knowledge graph data for this seed paper.")
504
  else:
505
- components.html(pyvis_html_from_kg(selected_seed, seed_events), height=kg_height, scrolling=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  import os
4
  from pathlib import Path
5
+ from typing import List
6
 
7
  import pandas as pd
8
  import streamlit as st
9
  import plotly.express as px
10
+ import plotly.graph_objects as go
11
  from pyvis.network import Network
12
  import streamlit.components.v1 as components
13
 
 
 
14
  HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
15
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
16
 
 
21
  )
22
 
23
  ALLOWED_INTENTS = [
24
+ "background", "uses", "similarities", "motivation",
25
+ "differences", "future_work", "extends",
 
 
 
 
 
26
  ]
27
 
28
  INTENT_COLORS = {
29
+ "background": "#94a3b8", "uses": "#22c55e", "similarities": "#3b82f6",
30
+ "motivation": "#f59e0b", "differences": "#ef4444",
31
+ "future_work": "#8b5cf6", "extends": "#06b6d4",
 
 
 
 
32
  }
33
 
34
  NODE_COLORS = {
35
+ "seed_paper": "#111827", "citing_paper": "#dbeafe", "citation_event": "#fde68a",
36
+ "journal": "#ede9fe", "author": "#fee2e2", "affiliation": "#fae8ff",
37
+ "city": "#cffafe", "country": "#ffedd5", "field": "#e0e7ff", "intent": "#dcfce7",
 
 
 
 
 
 
 
38
  }
39
 
40
+ DEFAULT_DATA_DIR = Path(os.environ.get(
41
+ "CITATIONHUB_DATA_DIR",
42
+ r"C:\Users\user\OneDrive\바탕 ν™”λ©΄\Citehub_huggingface\data",
43
+ ))
44
+
 
45
 
46
  def fmt_num(x):
47
  try:
 
49
  except Exception:
50
  return "-"
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
+ def _hf_download(filename: str) -> str:
54
+ from huggingface_hub import hf_hub_download
55
+ return hf_hub_download(
56
+ repo_id=HF_REPO_ID, repo_type="dataset",
57
+ filename=f"data/{filename}", token=HF_TOKEN or None,
58
+ )
59
 
60
+
61
+ def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
 
62
  if HF_REPO_ID:
63
+ return pd.read_parquet(_hf_download(filename))
64
+ return pd.read_parquet(data_dir / filename)
65
+
66
+
67
+ def inject_fullscreen(html: str) -> str:
68
+ """pyvis HTML에 전체화면 버튼을 주입합니다."""
69
+ btn = """
70
+ <button
71
+ onclick="var el=document.getElementById('mynetwork');
72
+ if(el){if(el.requestFullscreen)el.requestFullscreen();
73
+ else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
74
+ style="position:fixed;bottom:18px;right:18px;z-index:9999;
75
+ padding:8px 18px;background:#1e293b;color:white;
76
+ border:none;border-radius:8px;cursor:pointer;font-size:13px;
77
+ box-shadow:0 2px 8px rgba(0,0,0,0.35);">
78
+ β›Ά Fullscreen
79
+ </button>
80
+ <div style="position:fixed;bottom:18px;left:18px;z-index:9999;
81
+ font-size:12px;color:#64748b;background:rgba(255,255,255,0.85);
82
+ padding:5px 10px;border-radius:6px;">
83
+ πŸ–± Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info
84
+ </div>
85
+ """
86
+ return html.replace("</body>", btn + "</body>")
87
 
88
+
89
+ @st.cache_data(show_spinner=False)
90
+ def load_data(data_dir_str: str):
91
+ d = None if HF_REPO_ID else Path(data_dir_str)
92
+
93
+ # --- 핵심 3개 (대용량) ---
94
+ seed_df = _read("seed_cited_papers_normalized.parquet", d)
95
+ events_df = _read("citation_events_normalized.parquet", d)
96
+ citing_df = _read("citing_papers_normalized.parquet", d)
97
+
98
+ # --- 참조 테이블 (소용량) ---
99
+ authors_df = _read("authors.parquet", d)
100
+ affiliations_df = _read("affiliations.parquet", d)
101
+ aff_geo_df = _read("affiliation_geo.parquet", d)
102
+ cities_df = _read("cities.parquet", d)
103
+ countries_df = _read("countries.parquet", d)
104
+ fields_df = _read("fields.parquet", d)
105
+ intents_df = _read("intents.parquet", d)
106
+ journals_df = _read("journals.parquet", d)
107
+
108
+ # --- seed 가공 ---
109
  seed = pd.DataFrame({
110
+ "seed_paper_id": seed_df["seed_paper_id"],
111
+ "doi": seed_df.get("doi", pd.Series(dtype=str)).fillna(""),
112
+ "title": seed_df.get("title", pd.Series(dtype=str)).fillna(""),
113
+ "journal": seed_df.get("publication_name", pd.Series(dtype=str)).fillna(""),
114
+ "author": seed_df.get("creator", pd.Series(dtype=str)).fillna(""),
115
+ "affiliation": seed_df.get("affilname", pd.Series(dtype=str)).fillna(""),
116
+ "city": seed_df.get("affiliation_city", pd.Series(dtype=str)).fillna(""),
117
+ "country": seed_df.get("affiliation_country", pd.Series(dtype=str)).fillna(""),
118
+ "field": seed_df.get("group", pd.Series(dtype=str)).fillna(""),
119
+ "citedby_count": pd.to_numeric(seed_df.get("citedby_count"), errors="coerce").fillna(0).astype(int),
120
+ "author_id": seed_df.get("author_id", pd.Series(dtype=object)),
121
+ "affiliation_id": seed_df.get("affiliation_id", pd.Series(dtype=object)),
122
+ "country_id": seed_df.get("country_id", pd.Series(dtype=object)),
123
+ "field_id": seed_df.get("field_id", pd.Series(dtype=object)),
124
+ "journal_id": seed_df.get("journal_id", pd.Series(dtype=object)),
125
  })
126
  for col in ["title", "doi", "journal", "field", "country"]:
127
  seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
128
  seed = seed.sort_values(["citedby_count", "title"], ascending=[False, True]).reset_index(drop=True)
129
 
130
+ # --- events 가공 ---
131
  events = pd.DataFrame({
132
  "citation_event_id": events_df["citation_event_id"],
133
+ "seed_paper_id": events_df["cited_seed_paper_id"],
134
+ "citing_paper_id": events_df["citing_paper_id"],
135
+ "citing_title": events_df.get("citing_title", pd.Series(dtype=str)).fillna(""),
136
+ "citing_doi": events_df.get("citing_doi", pd.Series(dtype=str)).fillna(""),
137
+ "citing_year": pd.to_numeric(events_df.get("citing_year"), errors="coerce"),
138
+ "citing_venue": events_df.get("citing_venue", pd.Series(dtype=str)).fillna(""),
139
+ "primary_intent": events_df.get("primary_intent", pd.Series(dtype=str)).fillna(""),
140
+ "contexts": events_df.get("contexts"),
141
+ "context_count": pd.to_numeric(events_df.get("context_count"), errors="coerce").fillna(0).astype(int),
142
+ "intent_count": pd.to_numeric(events_df.get("intent_count"), errors="coerce").fillna(0).astype(int),
143
+ "is_influential": events_df.get("is_influential", pd.Series(dtype=bool)).fillna(False),
144
+ "field_id": events_df.get("field_id", pd.Series(dtype=object)),
145
  })
146
  events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)
147
 
148
+ # --- citing 가공 ---
149
  citing = pd.DataFrame({
150
  "citing_paper_id": citing_df["citing_paper_id"],
151
+ "doi": citing_df.get("doi", pd.Series(dtype=str)).fillna(""),
152
+ "title": citing_df.get("title", pd.Series(dtype=str)).fillna(""),
153
+ "year": pd.to_numeric(citing_df.get("year"), errors="coerce"),
154
+ "venue": citing_df.get("venue", pd.Series(dtype=str)).fillna(""),
155
+ "oa_pdf": citing_df.get("oa_pdf", pd.Series(dtype=str)).fillna(""),
156
  })
157
 
158
  filters = {
159
+ "fields": sorted([x for x in seed["field"].dropna().astype(str).unique() if x]),
160
+ "countries": sorted([x for x in seed["country"].dropna().astype(str).unique() if x]),
161
+ "journals": sorted([x for x in seed["journal"].dropna().astype(str).unique() if x]),
162
+ "intents": ALLOWED_INTENTS,
163
+ "year_min": int(events["citing_year"].dropna().min()) if events["citing_year"].notna().any() else 2000,
164
+ "year_max": int(events["citing_year"].dropna().max()) if events["citing_year"].notna().any() else 2025,
165
  }
166
 
167
  overview = {
168
+ "seed_papers": int(len(seed)),
169
+ "citation_events": int(len(events)),
170
+ "citing_papers": int(events["citing_paper_id"].nunique()),
171
+ "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
172
+ "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
173
+ "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
174
+ "intents": len(ALLOWED_INTENTS),
175
+ "authors": int(len(authors_df)),
176
  }
177
 
178
+ return (seed, events, citing, filters, overview,
179
+ authors_df, affiliations_df, aff_geo_df,
180
+ cities_df, countries_df, fields_df, intents_df, journals_df)
181
 
182
 
183
+ # ── ν•„ν„° 헬퍼 ──────────────────────────────────────────────
184
+ def filter_seed_papers(seed, q, fields, countries, journals):
185
  df = seed.copy()
186
  q = (q or "").strip().lower()
187
  if q:
188
  df = df[df["title_lc"].str.contains(q, na=False) | df["doi_lc"].str.contains(q, na=False)]
189
  if fields:
190
+ df = df[df["field"].str.lower().isin({x.lower() for x in fields})]
 
191
  if countries:
192
+ df = df[df["country"].str.lower().isin({x.lower() for x in countries})]
 
193
  if journals:
194
+ df = df[df["journal"].str.lower().isin({x.lower() for x in journals})]
 
195
  return df.reset_index(drop=True)
196
 
197
 
198
+ def event_subset(events, seed_paper_id, year_min, year_max):
199
  df = events[events["seed_paper_id"] == seed_paper_id].copy()
200
  df = df[df["citing_year"].fillna(-99999) >= year_min]
201
  df = df[df["citing_year"].fillna(99999) <= year_max]
202
  return df.reset_index(drop=True)
203
 
204
 
205
+ def build_intent_summary(df):
206
  counts = df.groupby("primary_intent").size().to_dict()
207
  return pd.DataFrame({
208
  "intent": ALLOWED_INTENTS,
209
+ "count": [int(counts.get(i, 0)) for i in ALLOWED_INTENTS],
210
  })
211
 
212
 
213
+ def build_context_rows(df, limit=20):
214
  rows = []
215
+ df = df.sort_values(["context_count", "intent_count", "citing_year"],
216
+ ascending=[False, False, False], na_position="last")
217
  for _, row in df.iterrows():
218
  contexts = row["contexts"]
219
  if isinstance(contexts, list) and contexts:
 
230
  return pd.DataFrame(rows[:limit])
231
 
232
 
233
+ def build_citing_table(df, limit=30):
234
  if df.empty:
235
  return pd.DataFrame(columns=["citing_title", "citing_year", "primary_intent", "context_count"])
236
+ return (
237
+ df.sort_values(["context_count", "intent_count", "citing_year"],
238
+ ascending=[False, False, False], na_position="last")
239
  [["citing_paper_id", "citing_title", "citing_doi", "citing_year", "primary_intent", "context_count"]]
240
  .drop_duplicates(subset=["citing_paper_id"])
241
  .head(limit)
242
  )
 
243
 
244
 
245
+ # ── pyvis λΉŒλ” ─────────────────────────────────────────────
246
def pyvis_citation_graph(seed_row, events_df):
    """Render the citing -> cited network around one seed paper as HTML.

    The seed paper is the central node. The 40 events ranked highest by
    ``context_count`` then ``intent_count`` each contribute a citing-paper
    node and an edge to the seed, coloured and labelled by primary intent.

    Args:
        seed_row: Row of the selected seed paper (``seed_paper_id``, ``title``).
        events_df: Citation events of that seed paper.

    Returns:
        The pyvis HTML document after passing through ``inject_fullscreen``.
    """
    graph = Network(height="780px", width="100%", bgcolor="#ffffff",
                    font_color="#111827", directed=True)

    seed_id = seed_row["seed_paper_id"]
    graph.add_node(seed_id, label=seed_row["title"][:60], color="#111827",
                   size=34, shape="dot", font={"color": "white"})

    strongest = events_df.sort_values(
        ["context_count", "intent_count"], ascending=False
    ).head(40)

    for _, event in strongest.iterrows():
        citing_id = event["citing_paper_id"]
        # Label falls back from title to DOI to the raw paper id.
        caption = event["citing_title"] or event["citing_doi"] or citing_id
        graph.add_node(citing_id, label=caption[:60],
                       color=NODE_COLORS["citing_paper"], size=18, shape="dot")

        # Only the first context snippet is shown in the edge tooltip.
        snippets = event["contexts"]
        first_context = snippets[0] if isinstance(snippets, list) and snippets else ""

        if pd.isna(event["citing_year"]):
            year_text = ""
        else:
            year_text = int(event["citing_year"])

        intent = event["primary_intent"]
        graph.add_edge(
            citing_id, seed_id,
            label=intent,
            color=INTENT_COLORS.get(intent, "#94a3b8"),
            title=f"Intent: {intent}<br>Year: {year_text}<br>{first_context}",
        )

    graph.barnes_hut()
    return inject_fullscreen(graph.generate_html())
263
+
264
+
265
def pyvis_kg(seed_row, events_df):
    """Render the knowledge graph of one seed paper as HTML.

    The graph contains the seed paper, one node per non-empty metadata
    attribute (journal, author, affiliation, city, country, field), one
    cluster node per primary intent, and, for the 20 events ranked highest by
    ``context_count`` then ``intent_count``, a citation-event node linked to
    the seed paper, the citing paper and its intent cluster.

    Args:
        seed_row: Row of the selected seed paper.
        events_df: Citation events of that seed paper.

    Returns:
        The pyvis HTML document after passing through ``inject_fullscreen``.
    """
    graph = Network(height="780px", width="100%", bgcolor="#ffffff",
                    font_color="#111827", directed=True)

    seed_id = seed_row["seed_paper_id"]
    graph.add_node(seed_id, label=seed_row["title"][:60],
                   color=NODE_COLORS["seed_paper"],
                   font={"color": "white"}, size=34, shape="dot")

    # The attribute name doubles as the node type and the NODE_COLORS key.
    attribute_relations = [
        ("journal", "PUBLISHED_IN"),
        ("author", "HAS_AUTHOR"),
        ("affiliation", "HAS_AFFILIATION"),
        ("city", "LOCATED_IN_CITY"),
        ("country", "LOCATED_IN_COUNTRY"),
        ("field", "BELONGS_TO_FIELD"),
    ]
    for attribute, relation in attribute_relations:
        value = seed_row.get(attribute, "")
        if not value:
            continue
        node_id = f"{attribute}:{value}"
        graph.add_node(node_id, label=str(value)[:50],
                       color=NODE_COLORS[attribute], size=16)
        graph.add_edge(seed_id, node_id, label=relation)

    strongest = events_df.sort_values(
        ["context_count", "intent_count"], ascending=False
    ).head(20)

    # Intent cluster nodes are added first so the event edges below can
    # reference them by id.
    intent_sizes = strongest.groupby("primary_intent").size()
    for intent, event_count in intent_sizes.items():
        cluster_id = f"intent:{intent}"
        graph.add_node(cluster_id, label=f"{intent} ({event_count})",
                       color=NODE_COLORS["intent"], size=18)
        graph.add_edge(seed_id, cluster_id, label="HAS_INTENT_CLUSTER")

    for _, event in strongest.iterrows():
        event_id = event["citation_event_id"]
        citing_id = event["citing_paper_id"]
        intent = event["primary_intent"]
        citing_label = event["citing_title"] or event["citing_doi"] or citing_id

        graph.add_node(event_id, label=intent,
                       color=NODE_COLORS["citation_event"], size=14)
        graph.add_node(citing_id, label=citing_label[:55],
                       color=NODE_COLORS["citing_paper"], size=14)
        graph.add_edge(event_id, seed_id, label="HAS_CITED_PAPER")
        graph.add_edge(event_id, citing_id, label="HAS_CITING_PAPER")
        graph.add_edge(event_id, f"intent:{intent}", label="HAS_PRIMARY_INTENT")

    graph.barnes_hut()
    return inject_fullscreen(graph.generate_html())
295
+
296
+
297
def pyvis_ontology():
    """Render the static CitationHub ontology (classes and relations) as HTML.

    Returns:
        The pyvis HTML document after passing through ``inject_fullscreen``.
    """
    # (node id, displayed class name, NODE_COLORS key)
    ontology_classes = (
        ("seed", "Top5PctCitedPaper", "seed_paper"),
        ("event", "CitationEvent", "citation_event"),
        ("citing", "CitingPaper", "citing_paper"),
        ("intent", "Intent", "intent"),
        ("journal", "Journal", "journal"),
        ("author", "Author", "author"),
        ("affiliation", "Affiliation", "affiliation"),
        ("city", "City", "city"),
        ("country", "Country", "country"),
        ("field", "Field", "field"),
    )
    # (source node id, target node id, property name)
    ontology_properties = (
        ("event", "citing", "hasCitingPaper"),
        ("event", "seed", "hasCitedPaper"),
        ("event", "intent", "hasPrimaryIntent"),
        ("seed", "journal", "publishedInJournal"),
        ("seed", "author", "hasAuthor"),
        ("seed", "affiliation", "hasAffiliation"),
        ("seed", "city", "locatedInCity"),
        ("seed", "country", "locatedInCountry"),
        ("seed", "field", "belongsToField"),
    )

    graph = Network(height="780px", width="100%", bgcolor="#ffffff",
                    font_color="#111827", directed=True)
    for node_id, class_name, color_key in ontology_classes:
        graph.add_node(node_id, label=class_name,
                       color=NODE_COLORS[color_key], size=24)
    for source, target, property_name in ontology_properties:
        graph.add_edge(source, target, label=property_name)

    graph.barnes_hut()
    return inject_fullscreen(graph.generate_html())
317
 
318
 
319
# ── Main UI ────────────────────────────────────────────────
320
  st.title("CitationHub")
321
  st.caption("Explore influential papers, their citation networks, and related research.")
322
 
323
  with st.sidebar:
324
  st.subheader("Data source")
325
  if HF_REPO_ID:
326
+ data_dir_val = "hf"
327
+ st.caption(f"Hugging Face: {HF_REPO_ID}")
328
  else:
329
+ data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
330
+
331
  try:
332
+ (seed, events, citing, filters, overview,
333
+ authors_df, affiliations_df, aff_geo_df,
334
+ cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
335
  st.success("Data loaded")
336
  except Exception as e:
337
  st.error(str(e))
 
344
  if st.button("Search", use_container_width=True):
345
  st.session_state["q_submit"] = q_input
346
 
347
+ fields_sel = st.multiselect("Field", filters["fields"])
348
+ countries_sel = st.multiselect("Country", filters["countries"])
349
+ journals_sel = st.multiselect("Journal", filters["journals"][:200])
350
+ y_min = max(2000, filters["year_min"])
351
+ year_min, year_max = st.slider("Citing year", y_min, filters["year_max"], (y_min, filters["year_max"]))
 
 
 
 
 
352
 
353
+ seed_filtered = filter_seed_papers(seed, st.session_state["q_submit"],
354
+ fields_sel, countries_sel, journals_sel)
355
 
356
  st.subheader("Overview counts")
357
  c1, c2 = st.columns(2)
358
+ c1.metric("Seed papers", fmt_num(overview["seed_papers"]))
359
+ c2.metric("Citation events", fmt_num(overview["citation_events"]))
360
+ c1.metric("Citing papers", fmt_num(overview["citing_papers"]))
361
+ c2.metric("Authors", fmt_num(overview["authors"]))
362
+ c1.metric("Countries", fmt_num(overview["countries"]))
363
+ c2.metric("Fields", fmt_num(overview["fields"]))
364
 
365
  options = seed_filtered["seed_paper_id"].tolist()
366
  if not options:
367
  st.warning("No seed papers match the current search.")
368
  st.stop()
 
 
369
  current = st.session_state.get("selected_seed_id", options[0])
370
+ default_idx = options.index(current) if current in options else 0
 
371
  selected_seed_id = st.selectbox(
372
+ "Seed paper", options, index=default_idx,
373
+ format_func=lambda sid: seed_filtered.loc[
374
+ seed_filtered["seed_paper_id"] == sid, "title"].iloc[0],
 
375
  )
376
  st.session_state["selected_seed_id"] = selected_seed_id
377
 
378
# Data derived from the currently selected seed paper; every tab reads these.
_selected_mask = seed_filtered["seed_paper_id"] == selected_seed_id
selected_seed = seed_filtered.loc[_selected_mask].iloc[0]
seed_events = event_subset(
    events,
    seed_paper_id=selected_seed_id,
    year_min=year_min,
    year_max=year_max,
)
intent_summary = build_intent_summary(seed_events)
contexts_df = build_context_rows(seed_events)
citing_table = build_citing_table(seed_events)

# ── Tabs ───────────────────────────────────────────────────
_tab_labels = [
    "Overview",
    "Citation Network",
    "Ontology",
    "Knowledge Graph",
    "Geographic Map",
    "Analytics",
]
tab_overview, tab_cnet, tab_ontology, tab_kg, tab_geo, tab_analytics = st.tabs(_tab_labels)
390
 
391
+ # ─────────────────── 1. OVERVIEW ──────────────────────────
392
with tab_overview:
    # Local import: only the context cards below need HTML escaping.
    from html import escape as _escape_html

    col1, col2 = st.columns(2)

    # Left column: metadata of the selected seed paper and its citing papers.
    with col1:
        st.subheader("Seed paper detail")
        # Create the metric row once. Calling st.columns(2) per metric opens
        # a new row each time, which leaves the two metrics staggered.
        metric_left, metric_right = st.columns(2)
        metric_left.metric("Cited by", fmt_num(selected_seed["citedby_count"]))
        metric_right.metric("Citation events", fmt_num(len(seed_events)))
        for label, key in [
            ("Title", "title"), ("DOI", "doi"), ("Journal", "journal"),
            ("Author", "author"), ("Affiliation", "affiliation"),
            ("City", "city"), ("Country", "country"), ("Field", "field"),
        ]:
            # Two trailing spaces before the newline make a Markdown hard
            # line break, so the value sits below its label.
            st.markdown(f"**{label}**  \n{selected_seed[key] or '-'}")

        st.subheader("Related citing papers")
        st.dataframe(
            citing_table.rename(columns={
                "citing_title": "Title", "citing_year": "Year",
                "primary_intent": "Intent", "context_count": "Contexts",
            }),
            use_container_width=True, hide_index=True,
        )

    # Right column: intent/field charts and the citation context cards.
    with col2:
        st.subheader("Intent distribution (selected paper)")
        fig = px.bar(intent_summary, x="intent", y="count", color="intent",
                     color_discrete_map=INTENT_COLORS)
        fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
        st.plotly_chart(fig, use_container_width=True)

        st.subheader("Field distribution")
        # Missing and blank fields are merged into one "Unknown" bar.
        field_labels = seed_filtered["field"].fillna("").replace("", "Unknown")
        fd = (seed_filtered.assign(field=field_labels)
              .groupby("field").size()
              .reset_index(name="count")
              .sort_values("count", ascending=False)
              .head(20))
        st.plotly_chart(
            px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
            use_container_width=True,
        )

        st.subheader("Overall intent distribution")
        all_intents = events.groupby("primary_intent").size().to_dict()
        ai_df = pd.DataFrame({
            "intent": ALLOWED_INTENTS,
            "count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS],
        })
        fig2 = px.bar(ai_df, x="intent", y="count", color="intent",
                      color_discrete_map=INTENT_COLORS)
        fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
        st.plotly_chart(fig2, use_container_width=True)

        st.subheader("Citation contexts")
        if contexts_df.empty:
            st.info("No contexts available.")
        else:
            for _, row in contexts_df.iterrows():
                intent = row["primary_intent"]
                badge_color = INTENT_COLORS.get(intent, "#64748b")
                year = row["citing_year"] or "-"
                source = row["citing_title"] or row["citing_doi"]
                # Titles and context snippets come from the dataset and may
                # contain "<", "&" or quotes. They are escaped because the
                # card is rendered with unsafe_allow_html=True.
                card = (
                    '<div style="border:1px solid #e2e8f0;border-radius:14px;padding:12px;'
                    'margin-bottom:10px;background:#f8fafc;">'
                    f'<div style="display:inline-block;background:{badge_color};'
                    'color:white;border-radius:999px;padding:4px 8px;font-size:12px;'
                    'margin-bottom:6px;">'
                    f'{_escape_html(str(intent))}</div>'
                    '<div style="font-size:12px;color:#64748b;margin-bottom:6px;">'
                    f'{_escape_html(str(year))} · {_escape_html(str(source))}</div>'
                    f'<div>{_escape_html(str(row["context"]))}</div></div>'
                )
                st.markdown(card, unsafe_allow_html=True)
455
 
456
+ # ─────────────────── 2. CITATION NETWORK ──────────────────
457
with tab_cnet:
    # Interactive pyvis graph of the papers citing the selected seed paper.
    st.subheader("Citing ↔ Cited Citation Network")
    st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
    if seed_events.empty:
        st.info("No citation network data for this seed paper.")
    else:
        components.html(
            pyvis_citation_graph(selected_seed, seed_events),
            height=820,
            scrolling=True,
        )
 
464
 
465
+ # ─────────────────── 3. ONTOLOGY ──────────────────────────
466
with tab_ontology:
    # Static class/property diagram; it does not depend on the selection.
    st.subheader("CitationHub Ontology")
    st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
    components.html(pyvis_ontology(), height=820, scrolling=True)
 
 
 
 
 
 
 
 
 
 
470
 
471
+ # ─────────────────── 4. KNOWLEDGE GRAPH ───────────────────
472
with tab_kg:
    # Knowledge graph built from the selected seed paper and its events.
    st.subheader("Knowledge Graph — Selected Seed Paper")
    st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
    if seed_events.empty:
        st.info("No knowledge graph data for this seed paper.")
    else:
        components.html(
            pyvis_kg(selected_seed, seed_events),
            height=820,
            scrolling=True,
        )
479
+
480
+ # ─────────────────── 5. GEOGRAPHIC MAP ────────────────────
481
with tab_geo:
    st.subheader("Geographic Distribution of Seed Papers")

    # Number of seed papers per country (current sidebar filters applied).
    country_cnt = (
        seed_filtered.groupby("country", dropna=False).size()
        .reset_index(name="count")
        .rename(columns={"country": "country_name"})
    )
    # Drop blank AND missing country names. The groupby keeps NaN groups
    # (dropna=False), and a plain `.str.strip() != ""` test would let them
    # through because NaN != "" evaluates to True.
    has_country = country_cnt["country_name"].fillna("").astype(str).str.strip() != ""
    country_cnt = country_cnt[has_country]

    if not country_cnt.empty:
        # Attach the country reference columns. The lookup is de-duplicated
        # first so the left join cannot multiply the per-country rows.
        country_cnt = country_cnt.merge(
            countries_df.drop_duplicates(subset=["country_name"]),
            on="country_name", how="left",
        )
        fig_map = px.choropleth(
            country_cnt,
            locations="country_name",
            locationmode="country names",
            color="count",
            hover_name="country_name",
            color_continuous_scale="Blues",
            title="Seed Papers by Country",
        )
        fig_map.update_layout(geo=dict(showframe=False), height=500)
        st.plotly_chart(fig_map, use_container_width=True)

    # Distribution by city, resolved through the affiliation_geo table.
    st.subheader("Affiliation Geo Distribution")
    city_cnt = (
        seed_filtered.merge(
            aff_geo_df[["affiliation_name", "city_name", "country_name"]],
            left_on="affiliation", right_on="affiliation_name", how="left",
        )
        .groupby(["country_name", "city_name"], dropna=False).size()
        .reset_index(name="count")
        # Rows without a country are affiliations the geo table did not match.
        .dropna(subset=["country_name"])
        .sort_values("count", ascending=False)
        .head(30)
    )
    if not city_cnt.empty:
        fig_city = px.bar(
            city_cnt, x="city_name", y="count", color="country_name",
            title="Top 30 Cities (Affiliation)",
        )
        fig_city.update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40)
        st.plotly_chart(fig_city, use_container_width=True)

    # Citations per citing year for the selected seed paper.
    st.subheader("Citation Trend over Time")
    year_trend = (
        seed_events.groupby("citing_year").size()
        .reset_index(name="count")
        .dropna()
    )
    if not year_trend.empty:
        year_trend["citing_year"] = year_trend["citing_year"].astype(int)
        fig_trend = px.line(year_trend, x="citing_year", y="count",
                            title="Citations per Year (selected seed paper)",
                            markers=True)
        fig_trend.update_layout(xaxis_title="Year", yaxis_title="Citations")
        st.plotly_chart(fig_trend, use_container_width=True)
541
+
542
+ # ─────────────────── 6. ANALYTICS ────────────────────────
543
with tab_analytics:
    col_a, col_b = st.columns(2)

    # ── Author ranking
    with col_a:
        st.subheader("Top Authors (by seed paper count)")
        if "author_id" in seed.columns and not seed["author_id"].isna().all():
            # The seed table carries author ids: resolve names via authors_df.
            author_counts = (
                seed.explode("author_id")
                .merge(authors_df, on="author_id", how="left")
                .groupby("author_name").size()
                .reset_index(name="paper_count")
            )
        else:
            # No usable author ids: count the raw `author` column instead.
            # rename_axis/reset_index(name=...) yields the same column names
            # on every pandas version. value_counts().reset_index() alone
            # names them ("index", "author") before pandas 2.0 and
            # ("author", "count") from 2.0 onwards.
            author_counts = (
                seed["author"].value_counts()
                .rename_axis("author_name")
                .reset_index(name="paper_count")
            )
        # Remove blank names BEFORE taking the top 20 so a blank entry
        # cannot use up one of the 20 slots.
        has_author = author_counts["author_name"].fillna("").astype(str).str.strip() != ""
        top_authors = (
            author_counts[has_author]
            .sort_values("paper_count", ascending=False, kind="stable")
            .head(20)
        )
        fig_auth = px.bar(top_authors, x="paper_count", y="author_name",
                          orientation="h", title="Top 20 Authors")
        fig_auth.update_layout(yaxis=dict(autorange="reversed"),
                               xaxis_title="Seed Papers", yaxis_title="")
        st.plotly_chart(fig_auth, use_container_width=True)

    # ── Journal ranking
    with col_b:
        st.subheader("Top Journals (by seed paper count)")
        journal_counts = seed.groupby("journal").size().reset_index(name="count")
        # Same ordering as for authors: filter blanks first, then take 20.
        has_journal = journal_counts["journal"].fillna("").astype(str).str.strip() != ""
        top_journals = (
            journal_counts[has_journal]
            .sort_values("count", ascending=False, kind="stable")
            .head(20)
        )
        fig_jnl = px.bar(top_journals, x="count", y="journal",
                         orientation="h", title="Top 20 Journals")
        fig_jnl.update_layout(yaxis=dict(autorange="reversed"),
                              xaxis_title="Seed Papers", yaxis_title="")
        st.plotly_chart(fig_jnl, use_container_width=True)

    st.markdown("---")
    col_c, col_d = st.columns(2)

    # ── Heatmap of citation intent per field (all seed papers)
    with col_c:
        st.subheader("Field × Intent Heatmap")
        field_intent = (
            seed[["seed_paper_id", "field"]]
            .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
            .groupby(["field", "primary_intent"]).size()
            .reset_index(name="count")
        )
        if not field_intent.empty:
            pivot = field_intent.pivot(index="field", columns="primary_intent", values="count").fillna(0)
            fig_hm = px.imshow(pivot, color_continuous_scale="Blues",
                               title="Citation Intent by Field",
                               aspect="auto")
            fig_hm.update_layout(xaxis_title="Intent", yaxis_title="Field")
            st.plotly_chart(fig_hm, use_container_width=True)

    # ── Share of influential citations (selected seed paper)
    with col_d:
        st.subheader("Influential Citations")
        if "is_influential" in seed_events.columns:
            inf_cnt = seed_events["is_influential"].value_counts().reset_index()
            # Assign names positionally so the pandas-version-dependent
            # column names of value_counts().reset_index() do not matter.
            inf_cnt.columns = ["is_influential", "count"]
            inf_cnt["label"] = inf_cnt["is_influential"].map(
                {True: "Influential", False: "Non-influential"}
            )
            fig_inf = px.pie(inf_cnt, names="label", values="count",
                             title="Influential vs Non-influential (selected paper)")
            st.plotly_chart(fig_inf, use_container_width=True)
        else:
            st.info("is_influential 컬럼이 없습니다.")

    # ── Intent reference data
    st.subheader("Intent Reference Table")
    st.dataframe(intents_df, use_container_width=True, hide_index=True)

    # ── Field reference data
    st.subheader("Field Reference Table")
    st.dataframe(fields_df, use_container_width=True, hide_index=True)