Daniel0315 committed on
Commit
00ff4cf
·
verified ·
1 Parent(s): ab1755b

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +505 -0
  2. requirements.txt +7 -3
app.py ADDED
@@ -0,0 +1,505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Dict, List, Tuple
6
+
7
+ import pandas as pd
8
+ import streamlit as st
9
+ import plotly.express as px
10
+ import networkx as nx
11
+ from pyvis.network import Network
12
+ import streamlit.components.v1 as components
13
+
14
# Hugging Face Space deployment: define HF_TOKEN and HF_REPO_ID as Space
# Secrets; they are injected into the process as environment variables.
# HF_REPO_ID example: "username/citationhub-data" (name of the Dataset repo).
HF_REPO_ID = os.getenv("HF_REPO_ID", "")
HF_TOKEN = os.getenv("HF_TOKEN", "")

st.set_page_config(page_title="CitationHub", page_icon="📚", layout="wide")

# Display colour for each citation intent (bars, badges and graph edges).
INTENT_COLORS = {
    "background": "#94a3b8",
    "uses": "#22c55e",
    "similarities": "#3b82f6",
    "motivation": "#f59e0b",
    "differences": "#ef4444",
    "future_work": "#8b5cf6",
    "extends": "#06b6d4",
}

# Intents the app keeps and displays, in the same order as INTENT_COLORS.
ALLOWED_INTENTS = list(INTENT_COLORS)

# Fill colour for each node type drawn in the pyvis graphs.
NODE_COLORS = {
    "seed_paper": "#111827",
    "citing_paper": "#dbeafe",
    "citation_event": "#fde68a",
    "journal": "#ede9fe",
    "author": "#fee2e2",
    "affiliation": "#fae8ff",
    "city": "#cffafe",
    "country": "#ffedd5",
    "field": "#e0e7ff",
    "intent": "#dcfce7",
}

# Local Parquet directory used when CITATIONHUB_DATA_DIR is not set.
_FALLBACK_DATA_DIR = r"C:\Users\user\OneDrive\바탕 화면\citationhub_v1_ontology_ready"
DEFAULT_DATA_DIR = Path(os.getenv("CITATIONHUB_DATA_DIR", _FALLBACK_DATA_DIR))
64
+
65
def fmt_num(x) -> str:
    """Format a value as an integer with thousands separators.

    Args:
        x: Anything accepted by ``int()`` (numbers, numeric strings, ...).

    Returns:
        The integer rendering of *x* such as ``"1,234,567"``, or ``"-"`` when
        *x* cannot be converted to an integer.
    """
    try:
        return f"{int(x):,}"
    except (TypeError, ValueError, OverflowError):
        # None -> TypeError; non-numeric text or NaN -> ValueError;
        # infinity -> OverflowError. Anything else is a real bug and propagates.
        return "-"
70
+
71
def _load_from_hf():
    """Download the three Parquet files from a Hugging Face Dataset repo and load them.

    Used for Space deployments, where ``HF_REPO_ID`` names the Dataset repo
    and ``HF_TOKEN`` (optional) authenticates against a private repo.

    Returns:
        A ``(seed_df, events_df, citing_df)`` tuple of DataFrames.

    Raises:
        ImportError: If ``huggingface_hub`` is not installed.
        ValueError: If ``HF_REPO_ID`` is empty.
    """
    try:
        from huggingface_hub import hf_hub_download
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError("huggingface_hub가 필요합니다. pip install huggingface_hub") from err
    if not HF_REPO_ID:
        raise ValueError("HF_REPO_ID가 설정되지 않았습니다. (예: username/citationhub-data)")

    # None means anonymous access (public repo); a token authenticates for a private repo.
    token = HF_TOKEN or None
    filenames = (
        "seed_cited_papers_normalized.parquet",
        "citation_events_normalized.parquet",
        "citing_papers_normalized.parquet",
    )
    # Fetch every file first, then parse, so a missing file fails before any parsing work.
    local_paths = [
        hf_hub_download(repo_id=HF_REPO_ID, repo_type="dataset", filename=name, token=token)
        for name in filenames
    ]
    seed_df, events_df, citing_df = (pd.read_parquet(path) for path in local_paths)
    return seed_df, events_df, citing_df
84
+
85
+
86
def _text_column(df: pd.DataFrame, name: str) -> pd.Series:
    """Return column *name* with missing values as "", or an all-"" column if it is absent."""
    if name in df.columns:
        return df[name].fillna("")
    return pd.Series("", index=df.index, dtype=object)


def _numeric_column(df: pd.DataFrame, name: str) -> pd.Series:
    """Return column *name* coerced to numbers (bad values -> NaN), or all-NaN if it is absent."""
    if name in df.columns:
        return pd.to_numeric(df[name], errors="coerce")
    return pd.Series(float("nan"), index=df.index, dtype="float64")


@st.cache_data(show_spinner=False)
def load_data(data_dir_str: str):
    """Load the three CitationHub Parquet tables and derive the app's working data.

    Args:
        data_dir_str: Directory holding the Parquet files. Only read when
            ``HF_REPO_ID`` is empty; it is still part of the cache key.

    Returns:
        ``(seed, events, citing, filters, overview)`` where the first three
        are normalized DataFrames, ``filters`` holds the sidebar choices and
        ``overview`` holds headline counts.

    Raises:
        FileNotFoundError: In local mode, if any Parquet file is missing.
        KeyError: If a required id column is absent from a table.
    """
    # Hugging Face mode: when HF_REPO_ID is set, load from the Dataset repo.
    if HF_REPO_ID:
        seed_df, events_df, citing_df = _load_from_hf()
    else:
        data_dir = Path(data_dir_str)
        seed_path = data_dir / "seed_cited_papers_normalized.parquet"
        events_path = data_dir / "citation_events_normalized.parquet"
        citing_path = data_dir / "citing_papers_normalized.parquet"
        missing = [str(p) for p in [seed_path, events_path, citing_path] if not p.exists()]
        if missing:
            raise FileNotFoundError(f"Missing parquet files: {missing}")
        seed_df = pd.read_parquet(seed_path)
        events_df = pd.read_parquet(events_path)
        citing_df = pd.read_parquet(citing_path)

    # Optional columns go through the helpers so that an absent column yields
    # an aligned default Series instead of crashing on a scalar default.
    seed = pd.DataFrame({
        "seed_paper_id": seed_df["seed_paper_id"],
        "doi": _text_column(seed_df, "doi"),
        "title": _text_column(seed_df, "title"),
        "journal": _text_column(seed_df, "publication_name"),
        "author": _text_column(seed_df, "creator"),
        "affiliation": _text_column(seed_df, "affilname"),
        "city": _text_column(seed_df, "affiliation_city"),
        "country": _text_column(seed_df, "affiliation_country"),
        "field": _text_column(seed_df, "group"),
        "citedby_count": _numeric_column(seed_df, "citedby_count").fillna(0).astype(int),
    })
    # Lower-cased copies used for case-insensitive search and filtering.
    for col in ["title", "doi", "journal", "field", "country"]:
        seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
    seed = seed.sort_values(["citedby_count", "title"], ascending=[False, True]).reset_index(drop=True)

    events = pd.DataFrame({
        "citation_event_id": events_df["citation_event_id"],
        "seed_paper_id": events_df["cited_seed_paper_id"],
        "citing_paper_id": events_df["citing_paper_id"],
        "citing_title": _text_column(events_df, "citing_title"),
        "citing_doi": _text_column(events_df, "citing_doi"),
        "citing_year": _numeric_column(events_df, "citing_year"),
        "primary_intent": _text_column(events_df, "primary_intent"),
        "contexts": events_df.get("contexts"),
        "context_count": _numeric_column(events_df, "context_count").fillna(0).astype(int),
        "intent_count": _numeric_column(events_df, "intent_count").fillna(0).astype(int),
    })
    # Keep only events whose primary intent is one the app knows how to display.
    events = events[events["primary_intent"].isin(ALLOWED_INTENTS)].reset_index(drop=True)

    citing = pd.DataFrame({
        "citing_paper_id": citing_df["citing_paper_id"],
        "doi": _text_column(citing_df, "doi"),
        "title": _text_column(citing_df, "title"),
        "year": _numeric_column(citing_df, "year"),
        "venue": _text_column(citing_df, "venue"),
        "oa_pdf": _text_column(citing_df, "oa_pdf"),
    })

    known_years = events["citing_year"].dropna()
    filters = {
        "fields": sorted([x for x in seed["field"].dropna().astype(str).unique().tolist() if x]),
        "countries": sorted([x for x in seed["country"].dropna().astype(str).unique().tolist() if x]),
        "journals": sorted([x for x in seed["journal"].dropna().astype(str).unique().tolist() if x]),
        "intents": ALLOWED_INTENTS,
        # Fall back to a fixed range when no event carries a usable year.
        "year_min": int(known_years.min()) if not known_years.empty else 2000,
        "year_max": int(known_years.max()) if not known_years.empty else 2025,
    }

    overview = {
        "seed_papers": int(len(seed)),
        "citation_events": int(len(events)),
        "citing_papers": int(events["citing_paper_id"].nunique()),
        "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
        "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
        "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
        "intents": len(ALLOWED_INTENTS),
    }

    return seed, events, citing, filters, overview
162
+
163
+
164
def filter_seed_papers(seed: pd.DataFrame, q: str, fields: List[str], countries: List[str], journals: List[str]):
    """Return the seed papers matching a text query and the selected facets.

    Args:
        seed: Seed-paper table with ``title_lc``, ``doi_lc``, ``field``,
            ``country`` and ``journal`` columns.
        q: Free-text query matched case-insensitively as a literal substring
            of the title or DOI. Empty or ``None`` disables the text filter.
        fields: Accepted field values (case-insensitive); empty means no filter.
        countries: Accepted country values; empty means no filter.
        journals: Accepted journal values; empty means no filter.

    Returns:
        A filtered copy of *seed* with a fresh 0..n-1 index.
    """
    df = seed.copy()
    q = (q or "").strip().lower()
    if q:
        # regex=False: DOIs and titles routinely contain "(", ")", "[" or "+",
        # which as a regex either raise re.error or fail to match themselves.
        title_hit = df["title_lc"].str.contains(q, regex=False, na=False)
        doi_hit = df["doi_lc"].str.contains(q, regex=False, na=False)
        df = df[title_hit | doi_hit]

    facets = (("field", fields), ("country", countries), ("journal", journals))
    for column, selected in facets:
        if selected:
            wanted = {x.lower() for x in selected}
            df = df[df[column].str.lower().isin(wanted)]
    return df.reset_index(drop=True)
179
+
180
+
181
def event_subset(events: pd.DataFrame, seed_paper_id: str, year_min: int, year_max: int):
    """Select the citation events of one seed paper within a citing-year range.

    Both bounds are inclusive. The result is a new frame with a fresh
    0..n-1 index.
    """
    years = events["citing_year"]
    for_this_seed = events["seed_paper_id"] == seed_paper_id
    # Missing years are replaced by sentinels before each bound is compared.
    not_too_old = years.fillna(-99999) >= year_min
    not_too_new = years.fillna(99999) <= year_max
    selected = events.loc[for_this_seed & not_too_old & not_too_new]
    return selected.reset_index(drop=True)
186
+
187
+
188
def build_intent_summary(df: pd.DataFrame):
    """Count events per primary intent, one row for every allowed intent.

    Returns a DataFrame with ``intent`` and ``count`` columns, ordered like
    ``ALLOWED_INTENTS``; intents with no events get a count of 0.
    """
    tallies = df.groupby("primary_intent").size().to_dict()
    per_intent = []
    for intent in ALLOWED_INTENTS:
        per_intent.append(int(tallies.get(intent, 0)))
    return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": per_intent})
194
+
195
+
196
def build_context_rows(df: pd.DataFrame, limit: int = 20):
    """Flatten citation events into one row per citation context snippet.

    Events are ranked by context count, then intent count, then citing year
    (all descending). At most two snippets are taken from each event.

    Args:
        df: Citation events with ``contexts``, ``context_count``,
            ``intent_count``, ``citing_year``, ``primary_intent``,
            ``citing_title`` and ``citing_doi`` columns.
        limit: Maximum number of rows to return.

    Returns:
        A DataFrame with ``primary_intent``, ``citing_title``,
        ``citing_doi``, ``citing_year`` and ``context`` columns; it has no
        columns at all when no event carries a context.
    """
    ranked = df.sort_values(
        ["context_count", "intent_count", "citing_year"],
        ascending=[False, False, False],
        na_position="last",
    )
    rows = []
    for _, event in ranked.iterrows():
        contexts = event["contexts"]
        # The pyarrow Parquet reader hands list columns back as numpy arrays,
        # so accept any list-like rather than only `list`. Strings, None and
        # NaN are not list-like and are skipped.
        if isinstance(contexts, dict) or not pd.api.types.is_list_like(contexts):
            continue
        year = event["citing_year"]
        for ctx in list(contexts)[:2]:
            rows.append({
                "primary_intent": event["primary_intent"],
                "citing_title": event["citing_title"],
                "citing_doi": event["citing_doi"],
                "citing_year": None if pd.isna(year) else int(year),
                "context": ctx,
            })
        if len(rows) >= limit:
            break
    # An event may push the total one past the limit, so trim at the end.
    return pd.DataFrame(rows[:limit])
213
+
214
+
215
def build_citing_table(df: pd.DataFrame, limit: int = 30):
    """Build the table of citing papers for a set of citation events.

    Events are ranked by context count, then intent count, then citing year
    (all descending); each citing paper keeps only its top-ranked event.

    Args:
        df: Citation events for one seed paper.
        limit: Maximum number of citing papers to return.

    Returns:
        A DataFrame with ``citing_paper_id``, ``citing_title``,
        ``citing_doi``, ``citing_year``, ``primary_intent`` and
        ``context_count`` columns. The same columns are returned for empty
        input, so the result has one schema in both cases.
    """
    columns = ["citing_paper_id", "citing_title", "citing_doi", "citing_year", "primary_intent", "context_count"]
    if df.empty:
        return pd.DataFrame(columns=columns)
    ranked = df.sort_values(
        ["context_count", "intent_count", "citing_year"],
        ascending=[False, False, False],
        na_position="last",
    )
    return ranked[columns].drop_duplicates(subset=["citing_paper_id"]).head(limit)
225
+
226
+
227
def pyvis_html_from_citation_graph(seed_row: pd.Series, events_df: pd.DataFrame):
    """Render the citing -> cited network of one seed paper as pyvis HTML.

    The seed paper is the central node. The 40 events with the most contexts
    (ties broken by intent count) each add a citing-paper node and an edge
    to the seed, labelled and coloured by primary intent.

    Args:
        seed_row: Seed-paper record providing ``seed_paper_id`` and ``title``.
        events_df: Citation events of that seed paper.

    Returns:
        The HTML document produced by ``Network.generate_html()``.
    """
    net = Network(height="1100px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    seed_id = seed_row["seed_paper_id"]
    # Use the seed-paper node colour. The previous lookup,
    # INTENT_COLORS.get("background", "#111827"), always hit the "background"
    # intent entry, so the intended "#111827" fallback was never used.
    net.add_node(seed_id, label=seed_row["title"][:60], color=NODE_COLORS["seed_paper"], size=34, shape="dot")

    top_events = events_df.sort_values(["context_count", "intent_count"], ascending=[False, False]).head(40)
    for _, row in top_events.iterrows():
        cid = row["citing_paper_id"]
        citing_label = (row["citing_title"] or row["citing_doi"] or cid)[:60]
        net.add_node(cid, label=citing_label, color=NODE_COLORS["citing_paper"], size=18, shape="dot")

        # The pyarrow Parquet reader returns list columns as numpy arrays, so
        # accept any list-like container when picking the first context.
        context = None
        contexts = row["contexts"]
        if pd.api.types.is_list_like(contexts) and not isinstance(contexts, dict):
            snippets = list(contexts)
            if snippets:
                context = snippets[0]

        title = f"Intent: {row['primary_intent']}<br>Year: {'' if pd.isna(row['citing_year']) else int(row['citing_year'])}<br>{context or ''}"
        net.add_edge(cid, seed_id, label=row["primary_intent"], color=INTENT_COLORS.get(row["primary_intent"], "#94a3b8"), title=title)
    net.barnes_hut()
    return net.generate_html()
244
+
245
+
246
def pyvis_html_from_kg(seed_row: pd.Series, events_df: pd.DataFrame):
    """Render a knowledge graph around one seed paper as pyvis HTML.

    The graph links the seed paper to its non-empty metadata values, to one
    cluster node per primary intent, and to the 20 events with the most
    contexts (ties broken by intent count) together with their citing papers.

    Returns:
        The HTML document produced by ``Network.generate_html()``.
    """
    graph = Network(height="1100px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    seed_id = seed_row["seed_paper_id"]
    graph.add_node(
        seed_id,
        label=seed_row["title"][:60],
        color=NODE_COLORS["seed_paper"],
        font={"color": "white"},
        size=34,
        shape="dot",
    )

    # Seed attribute (also the node type / NODE_COLORS key) -> edge label.
    relations = {
        "journal": "PUBLISHED_IN",
        "author": "HAS_AUTHOR",
        "affiliation": "HAS_AFFILIATION",
        "city": "LOCATED_IN_CITY",
        "country": "LOCATED_IN_COUNTRY",
        "field": "BELONGS_TO_FIELD",
    }
    for attr, relation in relations.items():
        value = seed_row.get(attr, "")
        if not value:
            continue
        node_id = f"{attr}:{value}"
        graph.add_node(node_id, label=str(value)[:50], color=NODE_COLORS[attr], size=16)
        graph.add_edge(seed_id, node_id, label=relation)

    strongest = events_df.sort_values(["context_count", "intent_count"], ascending=[False, False]).head(20)

    # One cluster node per intent present among the selected events.
    per_intent = strongest.groupby("primary_intent").size().to_dict()
    for intent, n_events in per_intent.items():
        cluster_id = f"intent:{intent}"
        graph.add_node(cluster_id, label=f"{intent} ({n_events})", color=NODE_COLORS["intent"], size=18)
        graph.add_edge(seed_id, cluster_id, label="HAS_INTENT_CLUSTER")

    # Each event becomes a node tied to the seed, its citing paper and its intent cluster.
    for _, event in strongest.iterrows():
        event_id = event["citation_event_id"]
        paper_id = event["citing_paper_id"]
        paper_label = (event["citing_title"] or event["citing_doi"] or paper_id)[:55]
        graph.add_node(event_id, label=event["primary_intent"], color=NODE_COLORS["citation_event"], size=14)
        graph.add_node(paper_id, label=paper_label, color=NODE_COLORS["citing_paper"], size=14)
        graph.add_edge(event_id, seed_id, label="HAS_CITED_PAPER")
        graph.add_edge(event_id, paper_id, label="HAS_CITING_PAPER")
        graph.add_edge(event_id, f"intent:{event['primary_intent']}", label="HAS_PRIMARY_INTENT")

    graph.barnes_hut()
    return graph.generate_html()
284
+
285
+
286
def pyvis_html_from_ontology():
    """Render the fixed CitationHub ontology (classes and properties) as pyvis HTML.

    Returns:
        The HTML document produced by ``Network.generate_html()``.
    """
    graph = Network(height="1100px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)

    # Node id -> (class label shown on the node, NODE_COLORS key).
    classes = {
        "seed": ("Top5PctCitedPaper", "seed_paper"),
        "event": ("CitationEvent", "citation_event"),
        "citing": ("CitingPaper", "citing_paper"),
        "intent": ("Intent", "intent"),
        "journal": ("Journal", "journal"),
        "author": ("Author", "author"),
        "affiliation": ("Affiliation", "affiliation"),
        "city": ("City", "city"),
        "country": ("Country", "country"),
        "field": ("Field", "field"),
    }
    for node_id, (caption, kind) in classes.items():
        graph.add_node(node_id, label=caption, color=NODE_COLORS[kind], size=24)

    # Source node id -> {target node id: property label}.
    properties = {
        "event": {
            "citing": "hasCitingPaper",
            "seed": "hasCitedPaper",
            "intent": "hasPrimaryIntent",
        },
        "seed": {
            "journal": "publishedInJournal",
            "author": "hasAuthor",
            "affiliation": "hasAffiliation",
            "city": "locatedInCity",
            "country": "locatedInCountry",
            "field": "belongsToField",
        },
    }
    for source, outgoing in properties.items():
        for target, prop in outgoing.items():
            graph.add_edge(source, target, label=prop)

    graph.barnes_hut()
    return graph.generate_html()
317
+
318
+
319
+ # ---------- UI ----------
320
+ st.title("CitationHub")
321
+ st.caption("Explore influential papers, their citation networks, and related research.")
322
+
323
+ with st.sidebar:
324
+ st.subheader("Data source")
325
+ if HF_REPO_ID:
326
+ data_dir = "hf"
327
+ st.caption(f"Loading from Hugging Face: {HF_REPO_ID}")
328
+ else:
329
+ data_dir = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
330
+ try:
331
+ seed, events, citing, filters, overview = load_data(data_dir)
332
+ st.success("Data loaded")
333
+ except Exception as e:
334
+ st.error(str(e))
335
+ st.stop()
336
+
337
+ st.subheader("Search seed papers")
338
+ q_input = st.text_input("Title or DOI")
339
+ if "q_submit" not in st.session_state:
340
+ st.session_state["q_submit"] = ""
341
+ if st.button("Search", use_container_width=True):
342
+ st.session_state["q_submit"] = q_input
343
+
344
+ fields = st.multiselect("Field", filters["fields"])
345
+ countries = st.multiselect("Country", filters["countries"])
346
+ journals = st.multiselect("Journal", filters["journals"][:200])
347
+ display_year_min = max(2000, filters["year_min"])
348
+ year_min, year_max = st.slider(
349
+ "Citing year",
350
+ display_year_min,
351
+ filters["year_max"],
352
+ (display_year_min, filters["year_max"]),
353
+ )
354
+
355
+ seed_filtered = filter_seed_papers(seed, st.session_state["q_submit"], fields, countries, journals)
356
+
357
+ st.subheader("Overview counts")
358
+ c1, c2 = st.columns(2)
359
+ c1.metric("Seed papers", fmt_num(overview["seed_papers"]))
360
+ c2.metric("Events", fmt_num(overview["citation_events"]))
361
+ c1.metric("Citing papers", fmt_num(overview["citing_papers"]))
362
+ c2.metric("Intents", fmt_num(overview["intents"]))
363
+
364
+ options = seed_filtered["seed_paper_id"].tolist()
365
+ if not options:
366
+ st.warning("No seed papers match the current search.")
367
+ st.stop()
368
+
369
+ default_idx = 0
370
+ current = st.session_state.get("selected_seed_id", options[0])
371
+ if current in options:
372
+ default_idx = options.index(current)
373
+ selected_seed_id = st.selectbox(
374
+ "Seed paper records",
375
+ options,
376
+ index=default_idx,
377
+ format_func=lambda sid: seed_filtered.loc[seed_filtered["seed_paper_id"] == sid, "title"].iloc[0],
378
+ )
379
+ st.session_state["selected_seed_id"] = selected_seed_id
380
+
381
+ selected_seed = seed_filtered[seed_filtered["seed_paper_id"] == selected_seed_id].iloc[0]
382
+ seed_events = event_subset(events, selected_seed_id, year_min, year_max)
383
+ intent_summary = build_intent_summary(seed_events)
384
+ contexts_df = build_context_rows(seed_events, limit=20)
385
+ citing_df = build_citing_table(seed_events, limit=30)
386
+
387
+ tab_overview, tab_cnet, tab_ontology, tab_kg = st.tabs(["Overview", "Citation network", "Ontology", "Knowledge graph"])
388
+
389
+ with tab_overview:
390
+ col1, col2 = st.columns([1, 1])
391
+
392
+ with col1:
393
+ st.subheader("Selected seed paper detail")
394
+ detail_cols = st.columns(2)
395
+ detail_cols[0].metric("Cited by count", fmt_num(selected_seed["citedby_count"]))
396
+ detail_cols[1].metric("Related citation events", fmt_num(len(seed_events)))
397
+
398
+ st.markdown(f"**Title** \n{selected_seed['title']}")
399
+ st.markdown(f"**DOI** \n{selected_seed['doi'] or '-'}")
400
+ st.markdown(f"**Journal** \n{selected_seed['journal'] or '-'}")
401
+ st.markdown(f"**Author** \n{selected_seed['author'] or '-'}")
402
+ st.markdown(f"**Affiliation** \n{selected_seed['affiliation'] or '-'}")
403
+ st.markdown(f"**City** \n{selected_seed['city'] or '-'}")
404
+ st.markdown(f"**Country** \n{selected_seed['country'] or '-'}")
405
+ st.markdown(f"**Field** \n{selected_seed['field'] or '-'}")
406
+
407
+ st.subheader("Related citing papers")
408
+ st.dataframe(
409
+ citing_df.rename(columns={
410
+ "citing_title": "Title",
411
+ "citing_year": "Year",
412
+ "primary_intent": "Intent",
413
+ "context_count": "Contexts",
414
+ }),
415
+ use_container_width=True,
416
+ hide_index=True,
417
+ )
418
+
419
+ with col2:
420
+ st.subheader("Selected seed paper intent distribution")
421
+ fig_intent = px.bar(intent_summary, x="intent", y="count", color="intent", color_discrete_map=INTENT_COLORS)
422
+ fig_intent.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
423
+ st.plotly_chart(fig_intent, use_container_width=True)
424
+
425
+ st.subheader("CitationHub field distribution")
426
+ field_dist = seed_filtered.groupby("field", dropna=False).size().reset_index(name="count").sort_values("count", ascending=False).head(20)
427
+ field_dist["field"] = field_dist["field"].replace("", "Unknown")
428
+ fig_field = px.bar(field_dist, x="field", y="count")
429
+ fig_field.update_layout(xaxis_title="", yaxis_title="Count")
430
+ st.plotly_chart(fig_field, use_container_width=True)
431
+
432
+ st.subheader("CitationHub intent distribution")
433
+ all_intent_counts = events.groupby("primary_intent").size().to_dict()
434
+ all_intent_df = pd.DataFrame({"intent": ALLOWED_INTENTS, "count": [int(all_intent_counts.get(i, 0)) for i in ALLOWED_INTENTS]})
435
+ fig_all_intent = px.bar(all_intent_df, x="intent", y="count", color="intent", color_discrete_map=INTENT_COLORS)
436
+ fig_all_intent.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
437
+ st.plotly_chart(fig_all_intent, use_container_width=True)
438
+
439
+ st.subheader("Selected seed paper contexts")
440
+ if contexts_df.empty:
441
+ st.info("No contexts available for this seed paper.")
442
+ else:
443
+ for _, row in contexts_df.iterrows():
444
+ st.markdown(
445
+ f"""
446
+ <div style="border:1px solid #e2e8f0;border-radius:14px;padding:12px;margin-bottom:10px;background:#f8fafc;">
447
+ <div style="display:inline-block;background:{INTENT_COLORS.get(row['primary_intent'], '#64748b')};color:white;border-radius:999px;padding:4px 8px;font-size:12px;margin-bottom:6px;">{row['primary_intent']}</div>
448
+ <div style="font-size:12px;color:#64748b;margin-bottom:6px;">{row['citing_year'] or '-'} · {row['citing_title'] or row['citing_doi']}</div>
449
+ <div>{row['context']}</div>
450
+ </div>
451
+ """,
452
+ unsafe_allow_html=True,
453
+ )
454
+
455
+ with tab_cnet:
456
+ st.subheader("Citing ↔ cited citation network visualization")
457
+
458
+ cnet_expand = st.toggle("Expand citation network view", value=False, key="cnet_expand")
459
+ cnet_height = st.slider(
460
+ "Citation network height",
461
+ min_value=700,
462
+ max_value=1800,
463
+ value=1400 if cnet_expand else 900,
464
+ step=100,
465
+ key="cnet_height",
466
+ )
467
+
468
+ if seed_events.empty:
469
+ st.info("No citation network data for this seed paper.")
470
+ else:
471
+ html = pyvis_html_from_citation_graph(selected_seed, seed_events)
472
+ components.html(html, height=cnet_height, scrolling=True)
473
+
474
+ with tab_ontology:
475
+ st.subheader("CitationHub ontology overview")
476
+
477
+ ontology_expand = st.toggle("Expand ontology view", value=False, key="ontology_expand")
478
+ ontology_height = st.slider(
479
+ "Ontology graph height",
480
+ min_value=700,
481
+ max_value=1800,
482
+ value=1400 if ontology_expand else 900,
483
+ step=100,
484
+ key="ontology_height",
485
+ )
486
+
487
+ components.html(pyvis_html_from_ontology(), height=ontology_height, scrolling=True)
488
+
489
+ with tab_kg:
490
+ st.subheader("Knowledge graph for the selected seed paper")
491
+
492
+ kg_expand = st.toggle("Expand knowledge graph view", value=False, key="kg_expand")
493
+ kg_height = st.slider(
494
+ "Knowledge graph height",
495
+ min_value=700,
496
+ max_value=1800,
497
+ value=1400 if kg_expand else 900,
498
+ step=100,
499
+ key="kg_height",
500
+ )
501
+
502
+ if seed_events.empty:
503
+ st.info("No knowledge graph data for this seed paper.")
504
+ else:
505
+ components.html(pyvis_html_from_kg(selected_seed, seed_events), height=kg_height, scrolling=True)
requirements.txt CHANGED
@@ -1,3 +1,7 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
1
+ streamlit==1.39.0
2
+ pandas==2.2.2
3
+ pyarrow==17.0.0
4
+ plotly==5.24.1
5
+ networkx==3.3
6
+ pyvis==0.3.2
7
+ huggingface_hub>=0.20.0