Daniel0315 committed on
Commit
8d47b5e
·
verified ·
1 Parent(s): cd5ca3d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +1086 -0
app.py ADDED
@@ -0,0 +1,1086 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from typing import List
6
+
7
+ import pandas as pd
8
+ import networkx as nx
9
+ import streamlit as st
10
+ import plotly.express as px
11
+ import plotly.graph_objects as go
12
+ from pyvis.network import Network
13
+ import streamlit.components.v1 as components
14
+
15
# Hugging Face dataset settings, read once at import time. When HF_REPO_ID is
# non-empty, the parquet readers in this file download from that dataset repo
# instead of reading a local directory.
HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")

st.set_page_config(page_title="CitationHub", page_icon="📚", layout="wide")

# Citation intents the app keeps; load_data() drops events with any other
# primary_intent value.
ALLOWED_INTENTS = [
    "background", "uses", "similarities", "motivation",
    "differences", "future_work", "extends",
]

# Colour per citation intent (bar charts, citation-network edges, context badges).
INTENT_COLORS = {
    "background": "#94a3b8", "uses": "#22c55e", "similarities": "#3b82f6",
    "motivation": "#f59e0b", "differences": "#ef4444",
    "future_work": "#8b5cf6", "extends": "#06b6d4",
}

# Pastel palette used by the pyvis citation graph and pyvis ontology view.
NODE_COLORS = {
    "seed_paper": "#111827", "citing_paper": "#dbeafe", "citation_event": "#fde68a",
    "journal": "#ede9fe", "author": "#fee2e2", "affiliation": "#fae8ff",
    "city": "#cffafe", "country": "#ffedd5", "field": "#e0e7ff", "intent": "#dcfce7",
}

# Saturated palette used by the Plotly figures and the KG pyvis graph.
NODE_TYPE_COLORS = {
    "seed_paper": "#111827", "citing_paper": "#3b82f6", "citation_event": "#f59e0b",
    "journal": "#8b5cf6", "author": "#ef4444", "affiliation": "#ec4899",
    "city": "#06b6d4", "country": "#f97316", "field": "#6366f1", "intent": "#22c55e",
}

# Default local parquet directory; override with CITATIONHUB_DATA_DIR.
# NOTE(review): the fallback is a developer-machine Windows path and will not
# exist elsewhere -- confirm whether a repo-relative default is wanted.
DEFAULT_DATA_DIR = Path(os.environ.get(
    "CITATIONHUB_DATA_DIR",
    r"C:\Users\user\OneDrive\바탕 화면\Citehub_huggingface\data",
))
44
+
45
+
46
def fmt_num(x):
    """Format *x* as an integer with thousands separators.

    Args:
        x: Any value accepted by ``int()`` (number, numeric string, ...).

    Returns:
        The formatted string, e.g. ``"1,234,567"``, or ``"-"`` when *x*
        cannot be converted to an integer (``None``, NaN, infinity,
        non-numeric text).
    """
    try:
        return f"{int(x):,}"
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: bad text or NaN;
        # OverflowError: +/- infinity.
        return "-"
49
+
50
+
51
def _hf_download(filename: str) -> str:
    """Download ``data/<filename>`` from the configured dataset repo.

    Uses the module-level HF_REPO_ID and HF_TOKEN; an empty token is passed
    as ``None``. Returns whatever ``hf_hub_download`` returns for the file.
    """
    from huggingface_hub import hf_hub_download

    auth_token = HF_TOKEN or None
    remote_name = f"data/{filename}"
    return hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=remote_name,
        token=auth_token,
    )
57
+
58
+
59
def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
    """Read one parquet file into a DataFrame.

    Args:
        filename: Parquet file name, e.g. ``"authors.parquet"``.
        data_dir: Local directory holding the file. Ignored when HF_REPO_ID
            is set, in which case the file is fetched via ``_hf_download``.

    Returns:
        The file contents as a DataFrame.

    Raises:
        ValueError: If no Hugging Face repo is configured and *data_dir* is
            ``None`` (previously an opaque ``NoneType / str`` TypeError).
    """
    if HF_REPO_ID:
        return pd.read_parquet(_hf_download(filename))
    if data_dir is None:
        raise ValueError(
            f"Cannot read {filename!r}: HF_REPO_ID is not set and no data_dir was given."
        )
    # Path() also lets callers pass a plain string directory.
    return pd.read_parquet(Path(data_dir) / filename)
63
+
64
+
65
def _meta_text(value, fallback: str = "-") -> str:
    """Return *value* as display text, or *fallback* for None / NaN / empty."""
    if value is None:
        return fallback
    try:
        if pd.isna(value):
            return fallback
    except (TypeError, ValueError):
        # Array-like metadata has no single truth value; just stringify it.
        pass
    text = str(value)
    return text if text else fallback


def plotly_network_fig(
    nodes_df: pd.DataFrame,
    edges_df: pd.DataFrame,
    title: str = "",
    height: int = 750,
    seed_node_ids: list | None = None,
) -> go.Figure:
    """Build an SVG-based Plotly network graph (stays sharp when zoomed).

    Args:
        nodes_df: One row per node. Reads ``node_id`` and ``node_type``, plus
            the optional ``label``, ``doi``, ``publication_name`` and
            ``group`` columns for hover text.
        edges_df: One row per edge with ``source`` and ``target``; edges with
            an endpoint missing from *nodes_df* are ignored.
        title: Optional figure title.
        height: Figure height in pixels.
        seed_node_ids: Accepted for interface compatibility; not used.

    Returns:
        A figure with one line trace for all edges and one marker trace per
        node type. An empty ``go.Figure`` when *nodes_df* has no rows.
    """
    fallback_color = "#94a3b8"  # same fallback pyvis_from_kg uses for unknown types

    graph = nx.Graph()
    graph.add_nodes_from(str(nid) for nid in nodes_df["node_id"])
    known_ids = set(graph.nodes)

    if "edge_type" in edges_df.columns:
        edge_types = edges_df["edge_type"]
    else:
        edge_types = [""] * len(edges_df)
    for src, tgt, etype in zip(edges_df["source"], edges_df["target"], edge_types):
        src, tgt = str(src), str(tgt)
        if src in known_ids and tgt in known_ids:
            graph.add_edge(src, tgt, edge_type=etype)

    if len(graph.nodes) == 0:
        return go.Figure()

    # Spring constant shrinks with graph size but never drops below 1.5.
    k = max(1.5, 3.0 / (len(graph.nodes) ** 0.4))
    pos = nx.spring_layout(graph, seed=42, k=k, iterations=60)

    # -- edges: one trace, segments separated by None -----------------------
    edge_x, edge_y = [], []
    for src, tgt in graph.edges():
        x0, y0 = pos.get(src, (0, 0))
        x1, y1 = pos.get(tgt, (0, 0))
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    traces: list[go.BaseTraceType] = [
        go.Scatter(
            x=edge_x, y=edge_y, mode="lines",
            line=dict(width=0.8, color="#cbd5e1"),
            hoverinfo="none", showlegend=False,
        )
    ]

    # -- nodes grouped by type ---------------------------------------------
    # Palette types come first, in palette order; any other type found in
    # the data follows with the fallback colour so those nodes are not
    # left as edges without a marker.
    type_col = nodes_df["node_type"].astype(object)
    type_col = type_col.where(type_col.notna(), "unknown")
    extra_types = [t for t in type_col.unique() if t not in NODE_TYPE_COLORS]

    for ntype in list(NODE_TYPE_COLORS) + extra_types:
        subset = nodes_df[type_col == ntype]
        if subset.empty:
            continue
        is_seed = ntype == "seed_paper"
        xs, ys, hovers, texts = [], [], [], []
        for _, row in subset.iterrows():
            nid = str(row["node_id"])
            if nid not in pos:
                continue
            x, y = pos[nid]
            xs.append(x)
            ys.append(y)
            label = _meta_text(row.get("label"), "")[:50]
            # Only seed papers get a permanent text label; others are hover-only.
            texts.append(label if is_seed else "")
            hovers.append(
                f"<b>{label}</b><br>"
                f"Type: {ntype}<br>"
                f"DOI: {_meta_text(row.get('doi'))}<br>"
                f"Pub: {_meta_text(row.get('publication_name'))}<br>"
                f"Group: {_meta_text(row.get('group'))}"
            )

        traces.append(go.Scatter(
            x=xs, y=ys,
            mode="markers+text" if is_seed else "markers",
            text=texts, textposition="top center",
            hovertext=hovers, hoverinfo="text",
            name=ntype,
            marker=dict(
                size=20 if is_seed else 10,
                color=NODE_TYPE_COLORS.get(ntype, fallback_color),
                line=dict(width=1.5 if is_seed else 0.5, color="white"),
                symbol="circle",
            ),
        ))

    fig = go.Figure(data=traces)
    fig.update_layout(
        title=dict(text=title, font=dict(size=14)),
        showlegend=True,
        legend=dict(title="Node type", itemsizing="constant"),
        hovermode="closest",
        height=height,
        margin=dict(l=0, r=0, t=40 if title else 10, b=0),
        paper_bgcolor="white",
        plot_bgcolor="#f8fafc",
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    )
    return fig
158
+
159
+
160
def plotly_ontology_fig(height: int = 750) -> go.Figure:
    """Draw the fixed CitationHub ontology schema as a Plotly (SVG) figure.

    Ten schema classes are laid out with a seeded spring layout; every
    relation is drawn as a grey line with an italic label at its midpoint.
    """
    # (node key, displayed class name, palette key)
    schema_nodes = [
        ("seed", "Top5PctCitedPaper", "seed_paper"),
        ("event", "CitationEvent", "citation_event"),
        ("citing", "CitingPaper", "citing_paper"),
        ("intent", "Intent", "intent"),
        ("journal", "Journal", "journal"),
        ("author", "Author", "author"),
        ("affiliation", "Affiliation", "affiliation"),
        ("city", "City", "city"),
        ("country", "Country", "country"),
        ("field", "Field", "field"),
    ]
    # (source key, target key, relation name)
    schema_edges = [
        ("event", "citing", "hasCitingPaper"),
        ("event", "seed", "hasCitedPaper"),
        ("event", "intent", "hasPrimaryIntent"),
        ("seed", "journal", "publishedInJournal"),
        ("seed", "author", "hasAuthor"),
        ("seed", "affiliation", "hasAffiliation"),
        ("seed", "city", "locatedInCity"),
        ("seed", "country", "locatedInCountry"),
        ("seed", "field", "belongsToField"),
    ]

    graph = nx.DiGraph()
    graph.add_nodes_from(key for key, _name, _kind in schema_nodes)
    graph.add_edges_from((src, dst) for src, dst, _rel in schema_edges)

    layout = nx.spring_layout(graph, seed=7, k=2.5, iterations=80)

    # Edge segments (None-separated) and their midpoint labels.
    line_x, line_y, relation_labels = [], [], []
    for src, dst, rel in schema_edges:
        sx, sy = layout[src]
        dx, dy = layout[dst]
        line_x.extend([sx, dx, None])
        line_y.extend([sy, dy, None])
        relation_labels.append(dict(
            x=(sx + dx) / 2,
            y=(sy + dy) / 2,
            text=f"<i>{rel}</i>",
            showarrow=False,
            font=dict(size=9, color="#64748b"),
            bgcolor="rgba(255,255,255,0.7)",
        ))

    traces: list[go.BaseTraceType] = [
        go.Scatter(
            x=line_x, y=line_y, mode="lines",
            line=dict(width=1.2, color="#94a3b8"),
            hoverinfo="none", showlegend=False,
        )
    ]
    for key, name, kind in schema_nodes:
        px_, py_ = layout[key]
        traces.append(go.Scatter(
            x=[px_], y=[py_], mode="markers+text",
            text=[name], textposition="top center",
            hoverinfo="text", hovertext=f"<b>{name}</b><br>Type: {kind}",
            name=name, showlegend=False,
            marker=dict(
                size=22,
                color=NODE_TYPE_COLORS.get(kind, "#94a3b8"),
                line=dict(width=1.5, color="white"),
            ),
            textfont=dict(size=11),
        ))

    figure = go.Figure(data=traces)
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    figure.update_layout(
        showlegend=False, hovermode="closest", height=height,
        annotations=relation_labels,
        margin=dict(l=0, r=0, t=10, b=0),
        paper_bgcolor="white", plot_bgcolor="#f8fafc",
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
    )
    return figure
228
+
229
+
230
def inject_fullscreen(html: str) -> str:
    """Add a fullscreen button, a usage hint and a HiDPI fix to graph HTML.

    The snippet targets the element with id ``mynetwork`` and, in its script,
    a global named ``network`` when one is defined.

    Args:
        html: A complete HTML document (as produced by ``generate_html()``).

    Returns:
        *html* with the snippet inserted immediately before the LAST
        ``</body>`` tag. Using the last tag keeps a literal ``</body>``
        inside embedded script data from being rewritten. If the document
        has no ``</body>``, the snippet is appended at the end.
    """
    extra = """
<button onclick="var el=document.getElementById('mynetwork');
if(el){if(el.requestFullscreen)el.requestFullscreen();
else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
style="position:fixed;bottom:18px;right:18px;z-index:9999;
padding:8px 18px;background:#1e293b;color:white;border:none;
border-radius:8px;cursor:pointer;font-size:13px;
box-shadow:0 2px 8px rgba(0,0,0,0.35);">⛶ Fullscreen</button>
<div style="position:fixed;bottom:18px;left:18px;z-index:9999;font-size:12px;
color:#64748b;background:rgba(255,255,255,0.85);
padding:5px 10px;border-radius:6px;">
🖱 Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info</div>
<script>
// HiDPI 캔버스 해상도 보정 (Canvas 흐림 최소화)
(function fixDPI() {
var canvas = document.querySelector('#mynetwork canvas');
if (!canvas) { setTimeout(fixDPI, 200); return; }
var dpr = window.devicePixelRatio || 1;
if (dpr <= 1) return;
try {
if (typeof network !== 'undefined') {
network.canvas.pixelRatio = dpr;
network.redraw();
}
} catch(e) {}
})();
</script>
"""
    head, closing_tag, tail = html.rpartition("</body>")
    if not closing_tag:
        # No body end tag: append rather than silently dropping the controls.
        return html + extra
    return f"{head}{extra}{closing_tag}{tail}"
260
+
261
+
262
+ # -- Main data load (11 parquet files) ---------------------------
263
def _column_or_default(df: pd.DataFrame, name: str, default=None) -> pd.Series:
    """Return ``df[name]``, or a Series of *default* aligned to ``df.index``.

    An index-aligned fallback matters: an empty fallback Series would turn
    into NaN on DataFrame construction and defeat a following ``fillna``.
    """
    if name in df.columns:
        return df[name]
    return pd.Series(default, index=df.index, dtype=object)


@st.cache_data(show_spinner=False)
def load_data(data_dir_str: str):
    """Load the 11 core parquet tables and derive the frames the UI uses.

    Args:
        data_dir_str: Local parquet directory. Ignored when HF_REPO_ID is
            set (files then come from the Hugging Face dataset repo).

    Returns:
        A 13-tuple ``(seed, events, citing, filters, overview, authors_df,
        affiliations_df, aff_geo_df, cities_df, countries_df, fields_df,
        intents_df, journals_df)`` where

        * ``seed`` is the normalized seed-paper table with lower-cased
          ``*_lc`` search columns, sorted by ``citedby_count`` (descending)
          then ``title``;
        * ``events`` holds citation events restricted to ALLOWED_INTENTS;
        * ``citing`` is the normalized citing-paper table;
        * ``filters`` holds sidebar choices and the citing-year range;
        * ``overview`` holds headline counts.

    Optional source columns that are missing are filled with defaults
    (empty string, 0, False or None). The id columns read with ``[...]``
    are required.
    """
    d = None if HF_REPO_ID else Path(data_dir_str)

    seed_df = _read("seed_cited_papers_normalized.parquet", d)
    events_df = _read("citation_events_normalized.parquet", d)
    citing_df = _read("citing_papers_normalized.parquet", d)
    authors_df = _read("authors.parquet", d)
    affiliations_df = _read("affiliations.parquet", d)
    aff_geo_df = _read("affiliation_geo.parquet", d)
    cities_df = _read("cities.parquet", d)
    countries_df = _read("countries.parquet", d)
    fields_df = _read("fields.parquet", d)
    intents_df = _read("intents.parquet", d)
    journals_df = _read("journals.parquet", d)

    def text(df, name):
        """Optional text column with missing values as empty strings."""
        return _column_or_default(df, name, "").fillna("")

    def count(df, name):
        """Optional integer column; unparseable or missing values become 0."""
        values = pd.to_numeric(_column_or_default(df, name, 0), errors="coerce")
        return values.fillna(0).astype(int)

    def year(df, name):
        """Optional numeric year column; unparseable or missing stays NaN."""
        return pd.to_numeric(_column_or_default(df, name, float("nan")), errors="coerce")

    seed = pd.DataFrame({
        "seed_paper_id": seed_df["seed_paper_id"],
        "doi": text(seed_df, "doi"),
        "title": text(seed_df, "title"),
        "journal": text(seed_df, "publication_name"),
        "author": text(seed_df, "creator"),
        "affiliation": text(seed_df, "affilname"),
        "city": text(seed_df, "affiliation_city"),
        "country": text(seed_df, "affiliation_country"),
        "field": text(seed_df, "group"),
        "citedby_count": count(seed_df, "citedby_count"),
        "author_id": _column_or_default(seed_df, "author_id"),
        "affiliation_id": _column_or_default(seed_df, "affiliation_id"),
        "country_id": _column_or_default(seed_df, "country_id"),
        "field_id": _column_or_default(seed_df, "field_id"),
        "journal_id": _column_or_default(seed_df, "journal_id"),
    })
    # Lower-cased copies back the case-insensitive sidebar search.
    for col in ["title", "doi", "journal", "field", "country"]:
        seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
    seed = seed.sort_values(
        ["citedby_count", "title"], ascending=[False, True]
    ).reset_index(drop=True)

    events = pd.DataFrame({
        "citation_event_id": events_df["citation_event_id"],
        "seed_paper_id": events_df["cited_seed_paper_id"],
        "citing_paper_id": events_df["citing_paper_id"],
        "citing_title": text(events_df, "citing_title"),
        "citing_doi": text(events_df, "citing_doi"),
        "citing_year": year(events_df, "citing_year"),
        "citing_venue": text(events_df, "citing_venue"),
        "primary_intent": text(events_df, "primary_intent"),
        "contexts": _column_or_default(events_df, "contexts"),
        "context_count": count(events_df, "context_count"),
        "intent_count": count(events_df, "intent_count"),
        "is_influential": _column_or_default(events_df, "is_influential", False).fillna(False),
        "field_id": _column_or_default(events_df, "field_id"),
    })
    events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)

    citing = pd.DataFrame({
        "citing_paper_id": citing_df["citing_paper_id"],
        "doi": text(citing_df, "doi"),
        "title": text(citing_df, "title"),
        "year": year(citing_df, "year"),
        "venue": text(citing_df, "venue"),
        "oa_pdf": text(citing_df, "oa_pdf"),
    })

    def choices(column):
        """Sorted non-empty distinct values of a seed column."""
        return sorted([x for x in seed[column].dropna().astype(str).unique() if x])

    known_years = events["citing_year"].dropna()
    filters = {
        "fields": choices("field"),
        "countries": choices("country"),
        "journals": choices("journal"),
        "intents": ALLOWED_INTENTS,
        # Fallback range when no event carries a year.
        "year_min": int(known_years.min()) if not known_years.empty else 2000,
        "year_max": int(known_years.max()) if not known_years.empty else 2025,
    }

    def distinct(column):
        """Number of distinct non-empty values of a seed column."""
        return int(seed[column].replace("", pd.NA).dropna().nunique())

    overview = {
        "seed_papers": int(len(seed)),
        "citation_events": int(len(events)),
        "citing_papers": int(events["citing_paper_id"].nunique()),
        "authors": int(len(authors_df)),
        "journals": distinct("journal"),
        "countries": distinct("country"),
        "fields": distinct("field"),
        "intents": len(ALLOWED_INTENTS),
    }
    return (seed, events, citing, filters, overview,
            authors_df, affiliations_df, aff_geo_df,
            cities_df, countries_df, fields_df, intents_df, journals_df)
347
+
348
+
349
+ # -- KG data: loaded separately, DuckDB-style ---------------------
350
+ # kg_nodes : loaded in full with pandas (~160MB file, within the memory budget)
351
+ # kg_edges : DuckDB queries only the edges of the node that is needed (no full load)
352
+ # enriched : DuckDB queries only the aggregate statistics (no full load)
353
+
354
@st.cache_data(show_spinner=False)
def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
    """Load the whole ``kg_nodes.parquet`` table (about 3.4M rows, ~160MB file).

    *data_dir_str* is only used when no Hugging Face repo is configured.
    """
    if HF_REPO_ID:
        local_dir = None
    else:
        local_dir = Path(data_dir_str)
    return _read("kg_nodes.parquet", local_dir)
359
+
360
+
361
@st.cache_data(show_spinner=False)
def get_parquet_path(filename: str, data_dir_str: str) -> str:
    """Return a file-system path for *filename*.

    With a Hugging Face repo configured, the file is downloaded via
    ``_hf_download`` and that path is returned. Otherwise the path is
    ``data_dir_str/filename`` with backslashes turned into forward slashes
    (the form the DuckDB queries in this file use).
    """
    if not HF_REPO_ID:
        local_file = Path(data_dir_str) / filename
        return str(local_file).replace("\\", "/")
    return _hf_download(filename)
368
+
369
+
370
@st.cache_data(show_spinner=False)
def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame:
    """Query the edges touching one node straight from the parquet file.

    DuckDB scans the file, so the full edge table is never loaded into pandas.

    Args:
        node_id: Node identifier matched against ``source`` and ``target``.
        kg_edges_path: Path of the edges parquet file.
        max_edges: Maximum number of rows returned. There is no ORDER BY, so
            which rows are kept when more match is up to DuckDB.

    Returns:
        DataFrame with columns ``source``, ``target``, ``edge_type``.
    """
    import duckdb

    # The path goes into a SQL string literal: normalize separators and
    # double any single quote so it cannot terminate the literal.
    path_literal = kg_edges_path.replace("\\", "/").replace("'", "''")
    query = f"""
        SELECT source, target, edge_type
        FROM read_parquet('{path_literal}')
        WHERE source = ? OR target = ?
        LIMIT {int(max_edges)}
    """
    # node_id is bound as a parameter instead of being spliced into the SQL.
    return duckdb.execute(query, [node_id, node_id]).df()
383
+
384
+
385
@st.cache_data(show_spinner=False)
def query_enriched_stats(enriched_path: str):
    """Compute aggregate statistics over the enriched parquet via DuckDB.

    Only the aggregates are materialized; the table is never loaded in full.

    Args:
        enriched_path: Path of the enriched parquet file. The queries read
            its ``has_semantic_evidence`` and ``field_folder`` columns.

    Returns:
        ``(sem_df, field_df)``:

        * ``sem_df`` -- row count per ``has_semantic_evidence`` value
          (columns ``has_semantic_evidence``, ``count``);
        * ``field_df`` -- per field, the mean of ``has_semantic_evidence``
          cast to integer (``sem_ratio``) and the row count
          (``event_count``); top 20 by ``sem_ratio`` descending.
    """
    import duckdb

    # The path goes into a SQL string literal: normalize separators and
    # double any single quote so it cannot terminate the literal.
    path_literal = enriched_path.replace("\\", "/").replace("'", "''")

    sem_df = duckdb.execute(f"""
        SELECT has_semantic_evidence, COUNT(*) AS count
        FROM read_parquet('{path_literal}')
        GROUP BY has_semantic_evidence
    """).df()

    field_df = duckdb.execute(f"""
        SELECT field_folder AS field,
               AVG(CAST(has_semantic_evidence AS INTEGER)) AS sem_ratio,
               COUNT(*) AS event_count
        FROM read_parquet('{path_literal}')
        GROUP BY field_folder
        ORDER BY sem_ratio DESC
        LIMIT 20
    """).df()

    return sem_df, field_df
408
+
409
+
410
@st.cache_data(show_spinner=False)
def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame:
    """Query the edges touching an arbitrary node, for the KG Explorer.

    This is the same query as ``query_kg_edges_for_node`` with a smaller
    default limit, so it delegates instead of repeating the SQL.

    Args:
        node_id: Node identifier matched against ``source`` and ``target``.
        kg_edges_path: Path of the edges parquet file.
        max_edges: Maximum number of rows returned.

    Returns:
        DataFrame with columns ``source``, ``target``, ``edge_type``.
    """
    return query_kg_edges_for_node(node_id, kg_edges_path, max_edges)
423
+
424
+
425
+ # -- Helpers ------------------------------------------------------
426
def filter_seed_papers(seed, q, fields, countries, journals):
    """Filter the seed-paper table by free text and facet selections.

    Args:
        seed: Seed DataFrame with ``title_lc``, ``doi_lc``, ``field``,
            ``country`` and ``journal`` columns.
        q: Search text (may be ``None``). Matched case-insensitively as a
            literal substring of the title or DOI. It is NOT a regular
            expression, so DOIs containing ``(``, ``)``, ``.`` or ``+``
            can be searched as typed.
        fields: Selected field names; empty or ``None`` means no filter.
        countries: Selected countries; empty or ``None`` means no filter.
        journals: Selected journals; empty or ``None`` means no filter.

    Returns:
        A new DataFrame of matching rows with a fresh 0..n-1 index.
    """
    df = seed
    needle = (q or "").strip().lower()
    if needle:
        in_title = df["title_lc"].str.contains(needle, na=False, regex=False)
        in_doi = df["doi_lc"].str.contains(needle, na=False, regex=False)
        df = df[in_title | in_doi]

    # Facets are compared case-insensitively against the selected values.
    for column, wanted in (("field", fields), ("country", countries), ("journal", journals)):
        if wanted:
            lowered = {value.lower() for value in wanted}
            df = df[df[column].str.lower().isin(lowered)]

    return df.reset_index(drop=True)
435
+
436
+
437
def event_subset(events, seed_paper_id, year_min, year_max):
    """Return the events citing one seed paper within a citing-year range.

    Rows are kept when ``year_min <= citing_year <= year_max``. A missing
    year is treated as -99999 for the lower bound and 99999 for the upper
    bound, so such rows drop out for any ordinary range. The result has a
    fresh 0..n-1 index.
    """
    for_seed = events.loc[events["seed_paper_id"] == seed_paper_id].copy()
    years = for_seed["citing_year"]
    not_too_old = years.fillna(-99999) >= year_min
    not_too_new = years.fillna(99999) <= year_max
    return for_seed[not_too_old & not_too_new].reset_index(drop=True)
442
+
443
+
444
def build_intent_summary(df):
    """Count events per intent, one row for every entry of ALLOWED_INTENTS.

    Intents with no events get a count of 0; row order follows ALLOWED_INTENTS.
    """
    tally = df.groupby("primary_intent").size().to_dict()
    per_intent = []
    for name in ALLOWED_INTENTS:
        per_intent.append(int(tally.get(name, 0)))
    return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": per_intent})
448
+
449
+
450
def build_context_rows(df, limit=20):
    """Flatten citation contexts into display rows, best events first.

    Events are ordered by ``context_count``, ``intent_count`` and
    ``citing_year`` (all descending, missing values last). Up to two
    contexts are taken from each event until *limit* rows are collected.

    Args:
        df: Events with ``contexts``, ``primary_intent``, ``citing_title``,
            ``citing_doi``, ``citing_year``, ``context_count`` and
            ``intent_count`` columns. ``contexts`` may hold a list, a tuple
            or an array (anything exposing ``tolist()``); other values are
            skipped.
        limit: Maximum number of rows returned.

    Returns:
        DataFrame with columns ``primary_intent``, ``citing_title``,
        ``citing_doi``, ``citing_year`` and ``context``; a column-less empty
        DataFrame when nothing qualifies.
    """
    ordered = df.sort_values(
        ["context_count", "intent_count", "citing_year"],
        ascending=[False, False, False], na_position="last",
    )
    rows = []
    for _, event in ordered.iterrows():
        if len(rows) >= limit:
            break
        snippets = event["contexts"]
        # List columns read from parquet commonly arrive as NumPy arrays.
        if hasattr(snippets, "tolist"):
            snippets = snippets.tolist()
        if not isinstance(snippets, (list, tuple)) or not snippets:
            continue
        year = None if pd.isna(event["citing_year"]) else int(event["citing_year"])
        for snippet in snippets[:2]:
            rows.append({
                "primary_intent": event["primary_intent"],
                "citing_title": event["citing_title"],
                "citing_doi": event["citing_doi"],
                "citing_year": year,
                "context": snippet,
            })
    return pd.DataFrame(rows[:limit])
465
+
466
+
467
def build_citing_table(df, limit=30):
    """Build the "related citing papers" table: one row per citing paper.

    Events are ordered by ``context_count``, ``intent_count`` and
    ``citing_year`` (all descending, missing values last); the first row per
    ``citing_paper_id`` is kept and at most *limit* rows are returned.

    Args:
        df: Events for one seed paper.
        limit: Maximum number of rows returned.

    Returns:
        DataFrame with columns ``citing_paper_id``, ``citing_title``,
        ``citing_doi``, ``citing_year``, ``primary_intent`` and
        ``context_count``. The empty result carries the same columns, so
        callers see one schema either way.
    """
    columns = [
        "citing_paper_id", "citing_title", "citing_doi",
        "citing_year", "primary_intent", "context_count",
    ]
    if df.empty:
        return pd.DataFrame(columns=columns)
    ranked = df.sort_values(
        ["context_count", "intent_count", "citing_year"],
        ascending=[False, False, False], na_position="last",
    )
    return ranked[columns].drop_duplicates(subset=["citing_paper_id"]).head(limit)
474
+
475
+
476
def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
    """Find other seed papers cited by the papers that cite the selected one.

    Returns the *top_n* co-cited seed papers by number of shared citation
    events, joined (left) with their title, field, journal and cited-by count.
    """
    is_selected = events["seed_paper_id"] == selected_seed_id
    citers = events.loc[is_selected, "citing_paper_id"].unique()

    shared = events[events["citing_paper_id"].isin(citers) & ~is_selected]
    tally = shared.groupby("seed_paper_id").size().reset_index(name="co_citation_count")
    top = tally.sort_values("co_citation_count", ascending=False).head(top_n)

    seed_info = seed[["seed_paper_id", "title", "field", "journal", "citedby_count"]]
    return top.merge(seed_info, on="seed_paper_id", how="left")
487
+
488
+
489
def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
    """Return the 1-hop KG subgraph around the seed paper with this DOI.

    The seed node id is ``"seed:<doi>"``. At most *max_edges* edges touching
    it are kept, together with the node rows for their endpoints. Returns
    ``(nodes, edges)``, or ``(None, None)`` when no edge touches the node.
    """
    anchor = f"seed:{seed_doi}"
    touches_anchor = kg_edges["source"].eq(anchor) | kg_edges["target"].eq(anchor)
    hop_edges = kg_edges[touches_anchor].head(max_edges)
    if hop_edges.empty:
        return None, None

    endpoint_ids = set(hop_edges["source"].tolist()).union(hop_edges["target"].tolist())
    hop_nodes = kg_nodes[kg_nodes["node_id"].isin(endpoint_ids)]
    return hop_nodes, hop_edges
499
+
500
+
501
def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
    """Return the 1-hop KG subgraph around an arbitrary node (KG Explorer).

    At most *max_edges* edges touching *search_node_id* are kept, together
    with the node rows for their endpoints. Returns ``(nodes, edges)``, or
    ``(None, None)`` when no edge touches the node.
    """
    as_source = kg_edges["source"].eq(search_node_id)
    as_target = kg_edges["target"].eq(search_node_id)
    hop_edges = kg_edges[as_source | as_target].head(max_edges)
    if hop_edges.empty:
        return None, None

    endpoint_ids = set(hop_edges["source"].tolist()).union(hop_edges["target"].tolist())
    hop_nodes = kg_nodes[kg_nodes["node_id"].isin(endpoint_ids)]
    return hop_nodes, hop_edges
510
+
511
+
512
+ # -- pyvis builders -----------------------------------------------
513
def pyvis_citation_graph(seed_row, events_df):
    """Build the pyvis citation network for one seed paper as HTML.

    The seed paper is the central node. The 40 events with the highest
    ``context_count`` / ``intent_count`` each add a citing-paper node and an
    edge to the seed, coloured and labelled by primary intent.

    Args:
        seed_row: Row of the seed table (reads ``seed_paper_id``, ``title``).
        events_df: Citation events for that seed paper.

    Returns:
        HTML from ``generate_html()`` with the fullscreen controls injected.
    """
    net = Network(height="780px", width="100%", bgcolor="#ffffff",
                  font_color="#111827", directed=True)
    sid = seed_row["seed_paper_id"]
    net.add_node(sid, label=seed_row["title"][:60], color="#111827", size=34,
                 shape="dot", font={"color": "white"})

    top_events = events_df.sort_values(
        ["context_count", "intent_count"], ascending=False
    ).head(40)
    for _, row in top_events.iterrows():
        cid = row["citing_paper_id"]
        # Label falls back title -> DOI -> id; str() keeps slicing valid
        # when the id is not a string.
        label = str(row["citing_title"] or row["citing_doi"] or cid)[:60]
        net.add_node(cid, label=label, color=NODE_COLORS["citing_paper"],
                     size=18, shape="dot")

        contexts = row["contexts"]
        # List columns read from parquet commonly arrive as NumPy arrays.
        if hasattr(contexts, "tolist"):
            contexts = contexts.tolist()
        first_context = contexts[0] if isinstance(contexts, (list, tuple)) and contexts else ""

        yr = "" if pd.isna(row["citing_year"]) else int(row["citing_year"])
        intent = row["primary_intent"]
        net.add_edge(cid, sid, label=intent,
                     color=INTENT_COLORS.get(intent, "#94a3b8"),
                     title=f"Intent: {intent}<br>Year: {yr}<br>{first_context}")
    net.barnes_hut()
    return inject_fullscreen(net.generate_html())
530
+
531
+
532
def pyvis_ontology():
    """Build the fixed CitationHub ontology schema as pyvis HTML.

    Returns the HTML from ``generate_html()`` with the fullscreen controls
    injected.
    """
    # (node key, displayed class name, palette key)
    schema_nodes = [
        ("seed", "Top5PctCitedPaper", "seed_paper"),
        ("event", "CitationEvent", "citation_event"),
        ("citing", "CitingPaper", "citing_paper"),
        ("intent", "Intent", "intent"),
        ("journal", "Journal", "journal"),
        ("author", "Author", "author"),
        ("affiliation", "Affiliation", "affiliation"),
        ("city", "City", "city"),
        ("country", "Country", "country"),
        ("field", "Field", "field"),
    ]
    # (source key, target key, relation name)
    schema_edges = [
        ("event", "citing", "hasCitingPaper"),
        ("event", "seed", "hasCitedPaper"),
        ("event", "intent", "hasPrimaryIntent"),
        ("seed", "journal", "publishedInJournal"),
        ("seed", "author", "hasAuthor"),
        ("seed", "affiliation", "hasAffiliation"),
        ("seed", "city", "locatedInCity"),
        ("seed", "country", "locatedInCountry"),
        ("seed", "field", "belongsToField"),
    ]

    graph = Network(height="780px", width="100%", bgcolor="#ffffff",
                    font_color="#111827", directed=True)
    for key, name, kind in schema_nodes:
        graph.add_node(key, label=name, color=NODE_COLORS[kind], size=24)
    for src, dst, relation in schema_edges:
        graph.add_edge(src, dst, label=relation)
    graph.barnes_hut()

    page = graph.generate_html()
    return inject_fullscreen(page)
552
+
553
+
554
def pyvis_from_kg(nodes_df, edges_df, height="780px"):
    """Build a pyvis graph from kg_nodes / kg_edges DataFrames.

    Nodes are coloured by ``node_type`` (grey for unknown types); seed
    papers are drawn larger with white label text. Returns the HTML from
    ``generate_html()`` with the fullscreen controls injected.
    """
    graph = Network(height=height, width="100%", bgcolor="#ffffff",
                    font_color="#111827", directed=True)

    for _, node in nodes_df.iterrows():
        kind = node.get("node_type", "")
        is_seed = kind == "seed_paper"
        hover = (
            f"Type: {kind}<br>DOI: {node.get('doi','')}"
            f"<br>Pub: {node.get('publication_name','')}"
        )
        graph.add_node(
            str(node["node_id"]),
            label=str(node.get("label", ""))[:55],
            color=NODE_TYPE_COLORS.get(kind, "#94a3b8"),
            size=30 if is_seed else 16,
            shape="dot",
            title=hover,
            font={"color": "white"} if is_seed else {},
        )

    for _, edge in edges_df.iterrows():
        graph.add_edge(
            str(edge["source"]),
            str(edge["target"]),
            label=edge.get("edge_type", ""),
            color="#94a3b8",
        )

    graph.barnes_hut()
    page = graph.generate_html()
    return inject_fullscreen(page)
571
+
572
+
573
+ # =================================================================
574
+ # Main UI
575
+ # =================================================================
576
# Page header, sidebar controls, seed-paper selection and tab creation.
# NOTE(review): indentation was lost in this capture, so which of the
# statements below sit inside `with st.sidebar:` cannot be confirmed from
# here -- verify against the real file before re-indenting.
+ st.title("CitationHub")
577
+ st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
578
+
579
+ # -- Sidebar ------------------------------------------------------
580
+ with st.sidebar:
581
+ st.subheader("Data source")
# With a Hugging Face repo configured, the directory argument is a placeholder.
582
+ if HF_REPO_ID:
583
+ data_dir_val = "hf"
584
+ st.caption(f"Hugging Face: {HF_REPO_ID}")
585
+ else:
586
+ data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
587
+
# Any load failure is shown as an error and stops this script run.
588
+ try:
589
+ (seed, events, citing, filters, overview,
590
+ authors_df, affiliations_df, aff_geo_df,
591
+ cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
592
+ st.success("Data loaded")
593
+ except Exception as e:
594
+ st.error(str(e)); st.stop()
595
+
596
+ st.subheader("Search seed papers")
597
+ q_input = st.text_input("Title or DOI")
# The submitted query lives in session state and only changes when
# the Search button is pressed.
598
+ if "q_submit" not in st.session_state: st.session_state["q_submit"] = ""
599
+ if st.button("Search", use_container_width=True):
600
+ st.session_state["q_submit"] = q_input
601
+
602
+ fields_sel = st.multiselect("Field", filters["fields"])
603
+ countries_sel = st.multiselect("Country", filters["countries"])
# Only the first 200 journals are offered as choices.
604
+ journals_sel = st.multiselect("Journal", filters["journals"][:200])
# The slider's lower bound never goes below 2000.
605
+ y_min = max(2000, filters["year_min"])
606
+ year_min, year_max = st.slider("Citing year", y_min, filters["year_max"], (y_min, filters["year_max"]))
607
+
608
+ seed_filtered = filter_seed_papers(seed, st.session_state["q_submit"],
609
+ fields_sel, countries_sel, journals_sel)
610
+
611
+ st.subheader("Overview counts")
612
+ c1, c2 = st.columns(2)
613
+ c1.metric("Seed papers", fmt_num(overview["seed_papers"]))
614
+ c2.metric("Citation events", fmt_num(overview["citation_events"]))
615
+ c1.metric("Citing papers", fmt_num(overview["citing_papers"]))
616
+ c2.metric("Authors", fmt_num(overview["authors"]))
617
+ c1.metric("Countries", fmt_num(overview["countries"]))
618
+ c2.metric("Fields", fmt_num(overview["fields"]))
619
+
# Seed-paper picker: stop the run when the filters leave nothing to pick.
620
+ options = seed_filtered["seed_paper_id"].tolist()
621
+ if not options:
622
+ st.warning("No seed papers match the current search."); st.stop()
# Keep the previous selection if it is still among the filtered options.
623
+ current = st.session_state.get("selected_seed_id", options[0])
624
+ default_idx = options.index(current) if current in options else 0
# format_func looks the title up in seed_filtered for each option id.
625
+ selected_seed_id = st.selectbox(
626
+ "Seed paper", options, index=default_idx,
627
+ format_func=lambda sid: seed_filtered.loc[
628
+ seed_filtered["seed_paper_id"]==sid, "title"].iloc[0],
629
+ )
630
+ st.session_state["selected_seed_id"] = selected_seed_id
631
+
# Frames derived from the selected seed paper and the citing-year range.
632
+ selected_seed = seed_filtered[seed_filtered["seed_paper_id"]==selected_seed_id].iloc[0]
633
+ seed_events = event_subset(events, selected_seed_id, year_min, year_max)
634
+ intent_summary = build_intent_summary(seed_events)
635
+ contexts_df = build_context_rows(seed_events)
636
+ citing_table = build_citing_table(seed_events)
637
+
638
+ # -- Tabs ---------------------------------------------------------
639
+ (tab_overview, tab_cnet, tab_ontology,
640
+ tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
641
+ "Overview","Citation Network","Ontology",
642
+ "Knowledge Graph","Geographic Map","Analytics",
643
+ ])
644
+
645
+
646
+ # === 1. OVERVIEW =================================================
647
# Overview tab: seed-paper details and tables, charts, and citation contexts.
# NOTE(review): indentation was lost in this capture, so which sections are
# nested under `with col1:` / `with col2:` (and whether "Citation contexts"
# is inside col2 or below both columns) cannot be confirmed from here.
+ with tab_overview:
648
+ col1, col2 = st.columns(2)
649
+ with col1:
650
+ st.subheader("Seed paper detail")
651
+ dc1, dc2 = st.columns(2)
652
+ dc1.metric("Cited by", fmt_num(selected_seed["citedby_count"]))
653
+ dc2.metric("Citation events", fmt_num(len(seed_events)))
# One markdown line per metadata field; empty values show as "-".
654
+ for label, key in [
655
+ ("Title","title"),("DOI","doi"),("Journal","journal"),
656
+ ("Author","author"),("Affiliation","affiliation"),
657
+ ("City","city"),("Country","country"),("Field","field"),
658
+ ]:
659
+ st.markdown(f"**{label}** \n{selected_seed[key] or '-'}")
660
+
661
+ st.subheader("Related citing papers")
662
+ st.dataframe(citing_table.rename(columns={
663
+ "citing_title":"Title","citing_year":"Year",
664
+ "primary_intent":"Intent","context_count":"Contexts"}),
665
+ use_container_width=True, hide_index=True)
666
+
667
+ st.subheader("Co-cited seed papers")
668
+ st.caption("같은 citing paper에 μ˜ν•΄ ν•¨κ»˜ 인용된 λ‹€λ₯Έ top 5% λ…Όλ¬Έλ“€")
# Co-citations are computed over all events, not the year-filtered subset.
669
+ cocited = get_cocited_papers(selected_seed_id, events, seed)
670
+ if cocited.empty:
671
+ st.info("Co-cited papers not found.")
672
+ else:
673
+ st.dataframe(cocited.rename(columns={
674
+ "co_citation_count":"Co-citations","title":"Title",
675
+ "field":"Field","citedby_count":"Cited by"}),
676
+ use_container_width=True, hide_index=True)
677
+
678
+ with col2:
679
+ st.subheader("Intent distribution (selected paper)")
680
+ fig = px.bar(intent_summary, x="intent", y="count", color="intent",
681
+ color_discrete_map=INTENT_COLORS)
682
+ fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
683
+ st.plotly_chart(fig, use_container_width=True)
684
+
685
+ st.subheader("Citation trend (selected paper)")
# Events without a citing year are left out of the trend line.
686
+ trend = (seed_events.dropna(subset=["citing_year"])
687
+ .assign(citing_year=lambda df: df["citing_year"].astype(int))
688
+ .groupby("citing_year").size().reset_index(name="count"))
689
+ if not trend.empty:
690
+ st.plotly_chart(
691
+ px.line(trend, x="citing_year", y="count", markers=True)
692
+ .update_layout(xaxis_title="Year", yaxis_title="Citations"),
693
+ use_container_width=True)
694
+
695
+ st.subheader("CitationHub Intent Distribution")
# Intent counts over every loaded event, not only the selected paper's.
696
+ all_intents = events.groupby("primary_intent").size().to_dict()
697
+ ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
698
+ "count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]})
699
+ fig2 = px.bar(ai_df, x="intent", y="count", color="intent",
700
+ color_discrete_map=INTENT_COLORS)
701
+ fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
702
+ st.plotly_chart(fig2, use_container_width=True)
703
+
704
+ st.subheader("CitationHub Field Distribution")
# Top 20 fields among the currently filtered seed papers.
705
+ fd = (seed_filtered.groupby("field", dropna=False).size()
706
+ .reset_index(name="count").sort_values("count", ascending=False).head(20))
707
+ fd["field"] = fd["field"].replace("","Unknown")
708
+ st.plotly_chart(
709
+ px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
710
+ use_container_width=True)
711
+
712
+ st.subheader("Citation contexts")
713
+ if contexts_df.empty:
714
+ st.info("No contexts available.")
715
+ else:
# One HTML card per context row, with an intent-coloured badge.
# NOTE(review): row values are interpolated into HTML unescaped while
# unsafe_allow_html=True -- confirm the context text is trusted.
716
+ for _, row in contexts_df.iterrows():
717
+ st.markdown(
718
+ f"""<div style="border:1px solid #e2e8f0;border-radius:14px;padding:12px;
719
+ margin-bottom:10px;background:#f8fafc;">
720
+ <div style="display:inline-block;background:{INTENT_COLORS.get(row['primary_intent'],'#64748b')};
721
+ color:white;border-radius:999px;padding:4px 8px;font-size:12px;margin-bottom:6px;">
722
+ {row['primary_intent']}</div>
723
+ <div style="font-size:12px;color:#64748b;margin-bottom:6px;">
724
+ {row['citing_year'] or '-'} Β· {row['citing_title'] or row['citing_doi']}</div>
725
+ <div>{row['context']}</div></div>""",
726
+ unsafe_allow_html=True)
727
+
728
+
729
# ═══ 2. CITATION NETWORK ════════════════════════════════════════
# Renders the interactive citation graph for the currently selected seed paper.
with tab_cnet:
    st.subheader("Citation Network")
    # Usage hint for the embedded graph (icons repaired from mis-encoded text).
    st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
    if seed_events.empty:
        # No citation events for this seed paper, so there is nothing to draw.
        st.info("No citation network data for this seed paper.")
    else:
        # pyvis_citation_graph is defined elsewhere in this file; its return
        # value is embedded as-is (presumably an HTML document -- confirm).
        graph_markup = pyvis_citation_graph(selected_seed, seed_events)
        components.html(graph_markup, height=820, scrolling=True)
737
+
738
+
739
# ═══ 3. ONTOLOGY ════════════════════════════════════════════════
# Shows the ontology figure produced by plotly_ontology_fig (defined elsewhere
# in this file) at a fixed height of 750.
with tab_ontology:
    st.subheader("CitationHub Ontology")
    ontology_figure = plotly_ontology_fig(height=750)
    st.plotly_chart(ontology_figure, use_container_width=True)
743
+
744
+
745
# ═══ 4. KNOWLEDGE GRAPH (KG Explorer) ═══════════════════════════
def _ranked_kg_edges(edges_path, node_ids, per_type, edge_types=None):
    """Return up to *per_type* edges per edge type that touch *node_ids*.

    Args:
        edges_path: Path of the edges parquet file read via ``read_parquet``.
        node_ids: Node identifiers; an edge is kept when its ``source`` or
            ``target`` is one of them. Must be non-empty.
        per_type: Maximum number of edges kept for each ``edge_type``.
        edge_types: Optional collection of edge types to restrict the result to.

    Returns:
        A DataFrame with the columns ``source``, ``target`` and ``edge_type``.
    """
    import duckdb as _ddb

    # Identifiers are passed as bound parameters instead of being quoted into
    # the SQL text, so an ID containing a single quote cannot break the query.
    ids = [str(node_id) for node_id in node_ids]
    id_marks = ", ".join("?" for _ in ids)
    params = ids + ids  # once for `source IN (...)`, once for `target IN (...)`

    type_clause = ""
    if edge_types:
        wanted_types = [str(edge_type) for edge_type in edge_types]
        type_marks = ", ".join("?" for _ in wanted_types)
        type_clause = f"AND edge_type IN ({type_marks})"
        params += wanted_types

    # The path stays a literal (doubling any single quote keeps it well-formed).
    path_literal = str(edges_path).replace("'", "''")
    query = f"""
        WITH ranked AS (
            SELECT source, target, edge_type,
                   ROW_NUMBER() OVER (
                       PARTITION BY edge_type ORDER BY source, target
                   ) AS rn
            FROM read_parquet('{path_literal}')
            WHERE (source IN ({id_marks}) OR target IN ({id_marks}))
              {type_clause}
        )
        SELECT source, target, edge_type FROM ranked
        WHERE rn <= {int(per_type)}
    """
    return _ddb.execute(query, params).df()


with tab_kg_exp:
    st.subheader("KG Explorer")

    # Any failure in this tab (loading, querying, plotting) is reported in the
    # UI instead of aborting the whole app.
    try:
        with st.spinner("Loading..."):
            kg_nodes_exp = load_kg_nodes(data_dir_val)
            kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)

        # ── Node / edge type distribution statistics
        col_a, col_b = st.columns([1, 2])
        with col_a:
            st.subheader("Node Types")
            nt = kg_nodes_exp["node_type"].value_counts().reset_index()
            nt.columns = ["node_type", "count"]
            st.dataframe(nt, use_container_width=True, hide_index=True)

            st.subheader("Edge Types")
            import duckdb as _ddb
            edges_path_sql = str(kg_edges_path).replace("'", "''")
            et = _ddb.execute(f"""
                SELECT edge_type, COUNT(*) AS count
                FROM read_parquet('{edges_path_sql}')
                GROUP BY edge_type ORDER BY count DESC
            """).df()
            st.dataframe(et, use_container_width=True, hide_index=True)

        with col_b:
            st.subheader("CitationHub KG Node Distribution")
            nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
                            color_discrete_map=NODE_TYPE_COLORS)
            nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
            st.plotly_chart(nt_fig, use_container_width=True)

        # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
        st.markdown("---")
        st.subheader("Multi-Node Knowledge Graph")
        st.caption("All 10 node types and all edge types — 2-hop from top cited seed papers")

        n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
        edges_per_type = st.slider("Edges per type (max)", 3, 20, 8, key="kg_exp_edges_per_type")

        # Starting points: the n_seeds seed papers with the highest citedby_count.
        top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
                     .sort_values("citedby_count", ascending=False)
                     .head(n_seeds))
        seed_ids = top_seeds["node_id"].tolist()

        if not seed_ids:
            st.info("No seed papers available to build the graph.")
        else:
            with st.spinner("Querying graph..."):
                # 1-hop: every edge connected to one of the seed papers.
                hop1 = _ranked_kg_edges(kg_edges_path, seed_ids, edges_per_type)

                # 2-hop: from the citation-event nodes reached in hop 1
                # (IDs prefixed "event:"), follow HAS_CITING_PAPER and
                # HAS_PRIMARY_INTENT edges. Sorting before truncating to 20
                # keeps the sample stable across reruns (set order is not).
                hop1_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
                event_ids = sorted(
                    (x for x in hop1_ids if str(x).startswith("event:")),
                    key=str,
                )[:20]

                if event_ids:
                    hop2 = _ranked_kg_edges(
                        kg_edges_path, event_ids, edges_per_type,
                        edge_types=("HAS_CITING_PAPER", "HAS_PRIMARY_INTENT"),
                    )
                    exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
                        subset=["source", "target", "edge_type"]
                    )
                else:
                    exp_edges = hop1

                all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
                exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]

            c1, c2, c3, c4 = st.columns(4)
            c1.metric("Nodes", fmt_num(len(exp_nodes)))
            c2.metric("Edges", fmt_num(len(exp_edges)))
            c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
            c4.metric("Edge types", fmt_num(exp_edges["edge_type"].nunique()))

            # Coverage check: which node types known to NODE_TYPE_COLORS are
            # absent from the sampled sub-graph.
            present_ntypes = set(exp_nodes["node_type"].unique().tolist())
            missing_nt = [t for t in sorted(NODE_TYPE_COLORS) if t not in present_ntypes]
            if missing_nt:
                st.caption(f"⚠ Node types not yet in graph: {', '.join(missing_nt)} "
                           f"— try increasing 'Edges per type'")
            else:
                st.caption("✅ All 10 node types represented")

            st.plotly_chart(
                plotly_network_fig(exp_nodes, exp_edges, height=800,
                                   seed_node_ids=seed_ids),
                use_container_width=True)

    except Exception as e:
        st.error(str(e))
867
+
868
+
869
# ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
# Country choropleth and top-city bar chart for the filtered seed papers, plus
# the per-year citation trend of the selected paper.
with tab_geo:
    st.subheader("Geographic Distribution of Seed Papers")

    country_cnt = (seed_filtered.groupby("country", dropna=False).size()
                   .reset_index(name="count")
                   .rename(columns={"country": "country_name"}))
    # Drop rows without a usable country. groupby(dropna=False) keeps a NaN
    # group, and a plain `.str.strip() != ""` test would let it through
    # (NaN != "" is True), so missing values are normalised to "" first.
    has_country = country_cnt["country_name"].fillna("").astype(str).str.strip() != ""
    country_cnt = country_cnt[has_country]

    if not country_cnt.empty:
        fig_map = px.choropleth(country_cnt, locations="country_name",
                                locationmode="country names", color="count",
                                hover_name="country_name",
                                color_continuous_scale="Blues",
                                title="Seed Papers by Country")
        fig_map.update_layout(geo=dict(showframe=False), height=500)
        st.plotly_chart(fig_map, use_container_width=True)

    st.subheader("Top Cities (Affiliation)")
    # Affiliation -> (city, country) lookup. Exact duplicate rows are removed
    # so that the left merge cannot count the same seed paper more than once.
    aff_lookup = (aff_geo_df[["affiliation_name", "city_name", "country_name"]]
                  .drop_duplicates())
    city_cnt = (seed_filtered.merge(
                    aff_lookup,
                    left_on="affiliation", right_on="affiliation_name", how="left")
                .groupby(["country_name", "city_name"], dropna=False).size()
                .reset_index(name="count")
                # Papers whose affiliation had no match carry no country.
                .dropna(subset=["country_name"])
                .sort_values("count", ascending=False).head(30))
    if not city_cnt.empty:
        st.plotly_chart(
            px.bar(city_cnt, x="city_name", y="count", color="country_name",
                   title="Top 30 Cities")
            .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
            use_container_width=True)

    st.subheader("Citation Trend over Time (selected paper)")
    # Citation events of the selected paper, counted per citing year
    # (events without a year are ignored).
    trend2 = (seed_events.dropna(subset=["citing_year"])
              .assign(citing_year=lambda df: df["citing_year"].astype(int))
              .groupby("citing_year").size().reset_index(name="count"))
    if not trend2.empty:
        st.plotly_chart(
            px.line(trend2, x="citing_year", y="count", markers=True,
                    title="Citations per Year")
            .update_layout(xaxis_title="Year", yaxis_title="Citations"),
            use_container_width=True)
910
+
911
+
912
# ═══ 7. ANALYTICS ═══════════════════════════════════════════════
def _drop_blank_labels(frame, column):
    """Return the rows of *frame* whose *column* is not an empty/blank string."""
    return frame[frame[column].astype(str).str.strip() != ""]


with tab_analytics:
    col_a, col_b = st.columns(2)

    with col_a:
        st.subheader("Top Authors")
        if "author_id" in seed.columns and not seed["author_id"].isna().all():
            # One row per (paper, author_id), resolved to names via authors_df.
            author_counts = (seed.explode("author_id")
                             .merge(authors_df, on="author_id", how="left")
                             .groupby("author_name").size()
                             .reset_index(name="paper_count"))
        else:
            # Fallback on the plain "author" column. The column names are set
            # explicitly because the names produced by
            # value_counts().reset_index() differ between pandas 1.x and 2.x.
            author_counts = (seed["author"].value_counts()
                             .rename_axis("author_name")
                             .reset_index(name="paper_count"))
        # Blank names are removed BEFORE taking the top 20, so that a blank
        # entry cannot occupy one of the 20 slots.
        top_auth = (_drop_blank_labels(author_counts, "author_name")
                    .sort_values("paper_count", ascending=False).head(20))
        st.plotly_chart(
            px.bar(top_auth, x="paper_count", y="author_name", orientation="h",
                   title="Top 20 Authors")
            .update_layout(yaxis=dict(autorange="reversed"),
                           xaxis_title="Seed Papers", yaxis_title=""),
            use_container_width=True)

    with col_b:
        st.subheader("Top Journals")
        journal_counts = seed.groupby("journal").size().reset_index(name="count")
        # Same ordering as for authors: filter blanks first, then rank.
        top_jnl = (_drop_blank_labels(journal_counts, "journal")
                   .sort_values("count", ascending=False).head(20))
        st.plotly_chart(
            px.bar(top_jnl, x="count", y="journal", orientation="h",
                   title="Top 20 Journals")
            .update_layout(yaxis=dict(autorange="reversed"),
                           xaxis_title="Seed Papers", yaxis_title=""),
            use_container_width=True)

    # Citation events per (field, intent); shared by the heatmap below and by
    # the percentage chart further down, which both use this aggregation.
    fi = (seed[["seed_paper_id", "field"]]
          .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
          .groupby(["field", "primary_intent"]).size().reset_index(name="count"))

    st.markdown("---")
    col_c, col_d = st.columns(2)

    with col_c:
        st.subheader("CitationHub Field × Intent Distribution Heatmap")
        if not fi.empty:
            # Fields as rows, intents as columns; missing combinations become 0.
            pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
            st.plotly_chart(
                px.imshow(pivot, color_continuous_scale="Blues",
                          title="CitationHub Field × Intent Distribution Heatmap",
                          aspect="auto")
                .update_layout(xaxis_title="Intent", yaxis_title="Field"),
                use_container_width=True)

    with col_d:
        st.subheader("Influential Citations (selected paper)")
        # The column is optional, so the chart is skipped when it is absent.
        if "is_influential" in seed_events.columns:
            inf = seed_events["is_influential"].value_counts().reset_index()
            inf.columns = ["is_influential", "count"]
            inf["label"] = inf["is_influential"].map({True: "Influential", False: "Non-influential"})
            st.plotly_chart(
                px.pie(inf, names="label", values="count",
                       title="Influential vs Non-influential"),
                use_container_width=True)

    # ── Intent Evolution over Years ────────────────────────────
    st.markdown("---")
    st.subheader("CitationHub Intent Evolution over Years")
    st.caption("How citation intents have changed across all papers over time")
    # All citation events with a known year, from 2000 onwards, counted per
    # (year, intent).
    intent_trend_raw = (
        events.dropna(subset=["citing_year"])
        .assign(year=lambda df: df["citing_year"].astype(int))
        .query("year >= 2000")
        .groupby(["year", "primary_intent"]).size()
        .reset_index(name="count")
    )
    if not intent_trend_raw.empty:
        st.plotly_chart(
            px.area(
                intent_trend_raw, x="year", y="count", color="primary_intent",
                color_discrete_map=INTENT_COLORS,
                labels={"primary_intent": "Intent", "count": "Citations", "year": "Year"},
            ).update_layout(
                legend_title="Intent",
                xaxis_title="Year", yaxis_title="# Citations",
                hovermode="x unified",
            ),
            use_container_width=True,
        )

    # ── Top Citing Venues ───────────────────────────────────────
    st.markdown("---")
    col_v1, col_v2 = st.columns(2)

    with col_v1:
        st.subheader("Top Citing Venues")
        st.caption("Journals/conferences that cite seed papers most")
        venue_cnt = (
            events[events["citing_venue"].str.strip() != ""]
            .groupby("citing_venue").size()
            .reset_index(name="count")
            .sort_values("count", ascending=False).head(20)
        )
        if not venue_cnt.empty:
            st.plotly_chart(
                px.bar(venue_cnt, x="count", y="citing_venue", orientation="h",
                       labels={"count": "Citations", "citing_venue": ""})
                .update_layout(yaxis=dict(autorange="reversed"),
                               xaxis_title="Citations", yaxis_title="", height=520),
                use_container_width=True,
            )

    with col_v2:
        st.subheader("CitationHub Field × Intent Distribution")
        st.caption("How each field uses citations differently (all fields)")
        # Copy so that adding the percentage column leaves `fi` untouched.
        fi_pct = fi.copy()
        if not fi_pct.empty:
            # Share of each intent within its field, in percent.
            totals = fi_pct.groupby("field")["count"].transform("sum")
            fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1)
            n_fields = fi_pct["field"].nunique()
            # Grow the chart with the number of fields (28 px per field, min 520).
            chart_height = max(520, n_fields * 28)
            st.plotly_chart(
                px.bar(fi_pct, x="pct", y="field", color="primary_intent",
                       orientation="h", color_discrete_map=INTENT_COLORS,
                       labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"})
                .update_layout(
                    barmode="stack",
                    yaxis=dict(autorange="reversed", categoryorder="total ascending"),
                    xaxis_title="% of citations", yaxis_title="",
                    legend_title="Intent", height=chart_height,
                ),
                use_container_width=True,
            )

    # ── Export ─────────────────────────────────────────────────
    st.markdown("---")
    st.subheader("Export Data")
    col_e1, col_e2, col_e3 = st.columns(3)

    with col_e1:
        csv_seed = seed_filtered[
            ["title", "doi", "journal", "author", "country", "field", "citedby_count"]
        ].to_csv(index=False).encode("utf-8")
        st.download_button(
            "⬇ Seed Papers (CSV)",
            csv_seed, "seed_papers.csv", "text/csv",
            use_container_width=True,
        )

    with col_e2:
        # Source column -> exported header. Only columns actually present are
        # exported: "is_influential" is treated as optional elsewhere in this
        # tab, and selecting a missing column would raise KeyError.
        event_export_names = {
            "citing_title": "title", "citing_doi": "doi",
            "citing_year": "year", "citing_venue": "venue",
            "primary_intent": "intent", "context_count": "contexts",
            "is_influential": "influential",
        }
        available_cols = [c for c in event_export_names if c in seed_events.columns]
        cite_export = (seed_events[available_cols]
                       .rename(columns=event_export_names)
                       .to_csv(index=False).encode("utf-8"))
        st.download_button(
            "⬇ Citation Events (CSV)",
            cite_export, "citation_events.csv", "text/csv",
            use_container_width=True,
        )

    with col_e3:
        intent_csv = intent_summary.to_csv(index=False).encode("utf-8")
        st.download_button(
            "⬇ Intent Summary (CSV)",
            intent_csv, "intent_summary.csv", "text/csv",
            use_container_width=True,
        )