Daniel0315 committed on
Commit
cd5ca3d
·
verified ·
1 Parent(s): 5e840b3

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +300 -34
src/app.py CHANGED
@@ -5,8 +5,10 @@ from pathlib import Path
5
  from typing import List
6
 
7
  import pandas as pd
 
8
  import streamlit as st
9
  import plotly.express as px
 
10
  from pyvis.network import Network
11
  import streamlit.components.v1 as components
12
 
@@ -60,8 +62,173 @@ def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
60
  return pd.read_parquet(data_dir / filename)
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def inject_fullscreen(html: str) -> str:
64
- btn = """
65
  <button onclick="var el=document.getElementById('mynetwork');
66
  if(el){if(el.requestFullscreen)el.requestFullscreen();
67
  else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
@@ -73,8 +240,23 @@ def inject_fullscreen(html: str) -> str:
73
  color:#64748b;background:rgba(255,255,255,0.85);
74
  padding:5px 10px;border-radius:6px;">
75
  πŸ–± Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  """
77
- return html.replace("</body>", btn + "</body>")
78
 
79
 
80
  # ── 메인 데이터 λ‘œλ“œ (11개) ────────────────────────────────────
@@ -557,8 +739,7 @@ with tab_cnet:
557
  # ═══ 3. ONTOLOGY ════════════════════════════════════════════════
558
  with tab_ontology:
559
  st.subheader("CitationHub Ontology")
560
- st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
561
- components.html(pyvis_ontology(), height=820, scrolling=True)
562
 
563
 
564
  # ═══ 4. KNOWLEDGE GRAPH ══════════════════════════════════════════
@@ -599,8 +780,9 @@ with tab_kg:
599
  .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
600
  use_container_width=True)
601
 
602
- st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
603
- components.html(pyvis_from_kg(nodes_sub, edges_sub), height=820, scrolling=True)
 
604
  except Exception as e:
605
  st.error(str(e))
606
 
@@ -613,7 +795,6 @@ with tab_kg_exp:
613
  with st.spinner("Loading..."):
614
  kg_nodes_exp = load_kg_nodes(data_dir_val)
615
  kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
616
- enriched_path = get_parquet_path("citation_events_enriched.parquet", data_dir_val)
617
 
618
  # ── λ…Έλ“œ/μ—£μ§€ νƒ€μž… 뢄포 톡계
619
  col_a, col_b = st.columns([1, 2])
@@ -674,35 +855,9 @@ with tab_kg_exp:
674
  c2.metric("Edges", fmt_num(len(exp_edges)))
675
  c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
676
 
677
- st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
678
- components.html(pyvis_from_kg(exp_nodes, exp_edges, height="780px"),
679
- height=800, scrolling=True)
680
-
681
- # ── Enriched μΈμ‚¬μ΄νŠΈ
682
- st.markdown("---")
683
- st.subheader("CitationHub Semantic Evidence Distribution")
684
- with st.spinner("Loading..."):
685
- sem_df, field_df = query_enriched_stats(enriched_path)
686
-
687
- if not sem_df.empty:
688
- sem_df["label"] = sem_df["has_semantic_evidence"].map(
689
- {True: "With evidence", False: "Without evidence",
690
- 1: "With evidence", 0: "Without evidence"})
691
- col_s1, col_s2 = st.columns(2)
692
- with col_s1:
693
  st.plotly_chart(
694
- px.pie(sem_df, names="label", values="count",
695
- title="Semantic Evidence Coverage")
696
- .update_layout(legend_title=""),
697
  use_container_width=True)
698
- with col_s2:
699
- if not field_df.empty:
700
- st.plotly_chart(
701
- px.bar(field_df, x="field", y="sem_ratio",
702
- title="Semantic Evidence Rate by Field",
703
- labels={"sem_ratio": "Evidence Rate", "field": "Field"})
704
- .update_layout(xaxis_tickangle=-40),
705
- use_container_width=True)
706
 
707
  except Exception as e:
708
  st.error(str(e))
@@ -820,3 +975,114 @@ with tab_analytics:
820
  st.markdown("---")
821
  st.subheader("Field Reference")
822
  st.dataframe(fields_df, use_container_width=True, hide_index=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from typing import List
6
 
7
  import pandas as pd
8
+ import networkx as nx
9
  import streamlit as st
10
  import plotly.express as px
11
+ import plotly.graph_objects as go
12
  from pyvis.network import Network
13
  import streamlit.components.v1 as components
14
 
 
62
  return pd.read_parquet(data_dir / filename)
63
 
64
 
65
def plotly_network_fig(
    nodes_df: pd.DataFrame,
    edges_df: pd.DataFrame,
    title: str = "",
    height: int = 750,
    seed_node_ids: list | None = None,
) -> go.Figure:
    """Build a Plotly node-link figure from node and edge tables.

    The graph is laid out with ``networkx.spring_layout`` (fixed seed 42) and
    drawn with ``go.Scatter`` traces: one line trace for all edges, then one
    marker trace per node type so each type gets its own legend entry.
    (Original note: SVG-based Plotly network graph that stays sharp when
    zoomed.)

    Args:
        nodes_df: One row per node. ``node_id`` is required. ``node_type``,
            ``label``, ``doi``, ``publication_name`` and ``group`` are used
            when present.
        edges_df: One row per edge with ``source`` and ``target`` columns
            (``edge_type`` optional). Edges whose endpoints are not in
            ``nodes_df`` are ignored.
        title: Figure title; an empty string shrinks the top margin.
        height: Figure height in pixels.
        seed_node_ids: Accepted for interface compatibility; this function
            does not use it.

    Returns:
        The assembled figure, or an empty ``go.Figure`` when there are no
        nodes.
    """
    # Same neutral grey that plotly_ontology_fig uses for unmapped types.
    fallback_color = "#94a3b8"

    def _text(value: object) -> str:
        """Return *value* as display text; None/NaN/NA become ''."""
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value)

    graph = nx.Graph()
    nodes_by_type: dict[str, list[tuple[str, dict]]] = {}
    # to_dict("records") keeps each column's own dtype; iterrows() may upcast
    # a row (e.g. int -> float), which would make str(node_id) disagree with
    # str(source)/str(target) below.
    for record in nodes_df.to_dict("records"):
        nid = str(record["node_id"])
        graph.add_node(nid)
        ntype = _text(record.get("node_type")) or "unknown"
        nodes_by_type.setdefault(ntype, []).append((nid, record))

    for record in edges_df.to_dict("records"):
        src, tgt = str(record["source"]), str(record["target"])
        if src in graph and tgt in graph:
            graph.add_edge(src, tgt, edge_type=record.get("edge_type", ""))

    if graph.number_of_nodes() == 0:
        return go.Figure()

    # Spring constant shrinks with graph size but never drops below 1.5.
    k = max(1.5, 3.0 / (graph.number_of_nodes() ** 0.4))
    layout = nx.spring_layout(graph, seed=42, k=k, iterations=60)

    # ── edges: a single trace, segments separated by None ──────────────
    edge_x: list = []
    edge_y: list = []
    for src, tgt in graph.edges():
        x0, y0 = layout[src]
        x1, y1 = layout[tgt]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    traces: list[go.BaseTraceType] = [
        go.Scatter(
            x=edge_x, y=edge_y, mode="lines",
            line=dict(width=0.8, color="#cbd5e1"),
            hoverinfo="none", showlegend=False,
        )
    ]

    # ── nodes grouped by type ──────────────────────────────────────────
    # Known types keep the palette's order; types missing from the palette
    # follow, drawn in the fallback colour instead of being dropped.
    type_order = [t for t in NODE_TYPE_COLORS if t in nodes_by_type]
    type_order += [t for t in nodes_by_type if t not in NODE_TYPE_COLORS]

    for ntype in type_order:
        color = NODE_TYPE_COLORS.get(ntype, fallback_color)
        is_seed = ntype == "seed_paper"
        xs, ys, hovers, texts = [], [], [], []
        for nid, record in nodes_by_type[ntype]:
            x, y = layout[nid]
            xs.append(x)
            ys.append(y)
            label = _text(record.get("label"))[:50]
            # Only seed papers get an always-visible text label.
            texts.append(label if is_seed else "")
            hovers.append(
                f"<b>{label}</b><br>"
                f"Type: {ntype}<br>"
                f"DOI: {_text(record.get('doi')) or '-'}<br>"
                f"Pub: {_text(record.get('publication_name')) or '-'}<br>"
                f"Group: {_text(record.get('group')) or '-'}"
            )

        traces.append(go.Scatter(
            x=xs, y=ys,
            mode="markers+text" if is_seed else "markers",
            text=texts, textposition="top center",
            hovertext=hovers, hoverinfo="text",
            name=ntype,
            marker=dict(
                size=20 if is_seed else 10,
                color=color,
                line=dict(width=1.5 if is_seed else 0.5, color="white"),
                symbol="circle",
            ),
        ))

    fig = go.Figure(data=traces)
    fig.update_layout(
        title=dict(text=title, font=dict(size=14)),
        showlegend=True,
        legend=dict(title="Node type", itemsizing="constant"),
        hovermode="closest",
        height=height,
        margin=dict(l=0, r=0, t=40 if title else 10, b=0),
        paper_bgcolor="white",
        plot_bgcolor="#f8fafc",
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    )
    return fig
158
+
159
+
160
def plotly_ontology_fig(height: int = 750) -> go.Figure:
    """Draw the fixed CitationHub ontology schema as a Plotly figure.

    The ten concept nodes and nine relations are hard-coded below. Positions
    come from ``networkx.spring_layout`` with a fixed seed. Each relation
    name is shown as an annotation at the midpoint of its edge.

    Args:
        height: Figure height in pixels.

    Returns:
        A ``go.Figure`` with one line trace for all edges and one marker+text
        trace per concept.
    """
    # (node key, display label, node type used for the colour lookup)
    concepts = [
        ("seed", "Top5PctCitedPaper", "seed_paper"),
        ("event", "CitationEvent", "citation_event"),
        ("citing", "CitingPaper", "citing_paper"),
        ("intent", "Intent", "intent"),
        ("journal", "Journal", "journal"),
        ("author", "Author", "author"),
        ("affiliation", "Affiliation", "affiliation"),
        ("city", "City", "city"),
        ("country", "Country", "country"),
        ("field", "Field", "field"),
    ]
    # (source key, target key, relation name)
    relations = [
        ("event", "citing", "hasCitingPaper"),
        ("event", "seed", "hasCitedPaper"),
        ("event", "intent", "hasPrimaryIntent"),
        ("seed", "journal", "publishedInJournal"),
        ("seed", "author", "hasAuthor"),
        ("seed", "affiliation", "hasAffiliation"),
        ("seed", "city", "locatedInCity"),
        ("seed", "country", "locatedInCountry"),
        ("seed", "field", "belongsToField"),
    ]

    schema = nx.DiGraph()
    schema.add_nodes_from(key for key, _label, _ntype in concepts)
    schema.add_edges_from((src, dst) for src, dst, _name in relations)

    coords = nx.spring_layout(schema, seed=7, k=2.5, iterations=80)

    # Edge segments (None-separated) plus one midpoint annotation per relation.
    line_x, line_y, edge_labels = [], [], []
    for src, dst, name in relations:
        sx, sy = coords[src]
        dx, dy = coords[dst]
        line_x.extend([sx, dx, None])
        line_y.extend([sy, dy, None])
        edge_labels.append({
            "x": (sx + dx) / 2,
            "y": (sy + dy) / 2,
            "text": f"<i>{name}</i>",
            "showarrow": False,
            "font": {"size": 9, "color": "#64748b"},
            "bgcolor": "rgba(255,255,255,0.7)",
        })

    edge_trace = go.Scatter(
        x=line_x,
        y=line_y,
        mode="lines",
        line={"width": 1.2, "color": "#94a3b8"},
        hoverinfo="none",
        showlegend=False,
    )

    def _concept_trace(key: str, label: str, ntype: str) -> go.Scatter:
        """Return the single-point marker+text trace for one concept."""
        px_, py_ = coords[key]
        return go.Scatter(
            x=[px_],
            y=[py_],
            mode="markers+text",
            text=[label],
            textposition="top center",
            hoverinfo="text",
            hovertext=f"<b>{label}</b><br>Type: {ntype}",
            name=label,
            showlegend=False,
            marker={
                "size": 22,
                "color": NODE_TYPE_COLORS.get(ntype, "#94a3b8"),
                "line": {"width": 1.5, "color": "white"},
            },
            textfont={"size": 11},
        )

    traces: list[go.BaseTraceType] = [edge_trace]
    traces.extend(_concept_trace(*concept) for concept in concepts)

    hidden_axis = {"showgrid": False, "zeroline": False, "showticklabels": False}
    fig = go.Figure(data=traces)
    fig.update_layout(
        showlegend=False,
        hovermode="closest",
        height=height,
        annotations=edge_labels,
        margin={"l": 0, "r": 0, "t": 10, "b": 0},
        paper_bgcolor="white",
        plot_bgcolor="#f8fafc",
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
    )
    return fig
228
+
229
+
230
  def inject_fullscreen(html: str) -> str:
231
+ extra = """
232
  <button onclick="var el=document.getElementById('mynetwork');
233
  if(el){if(el.requestFullscreen)el.requestFullscreen();
234
  else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"
 
240
  color:#64748b;background:rgba(255,255,255,0.85);
241
  padding:5px 10px;border-radius:6px;">
242
  πŸ–± Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info</div>
243
+ <script>
244
+ // HiDPI μΊ”λ²„μŠ€ 해상도 보정 (Canvas 흐림 μ΅œμ†Œν™”)
245
+ (function fixDPI() {
246
+ var canvas = document.querySelector('#mynetwork canvas');
247
+ if (!canvas) { setTimeout(fixDPI, 200); return; }
248
+ var dpr = window.devicePixelRatio || 1;
249
+ if (dpr <= 1) return;
250
+ try {
251
+ if (typeof network !== 'undefined') {
252
+ network.canvas.pixelRatio = dpr;
253
+ network.redraw();
254
+ }
255
+ } catch(e) {}
256
+ })();
257
+ </script>
258
  """
259
+ return html.replace("</body>", extra + "</body>")
260
 
261
 
262
  # ── 메인 데이터 λ‘œλ“œ (11개) ────────────────────────────────────
 
739
  # ═══ 3. ONTOLOGY ════════════════════════════════════════════════
740
  with tab_ontology:
741
  st.subheader("CitationHub Ontology")
742
+ st.plotly_chart(plotly_ontology_fig(height=750), use_container_width=True)
 
743
 
744
 
745
  # ═══ 4. KNOWLEDGE GRAPH ══════════════════════════════════════════
 
780
  .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
781
  use_container_width=True)
782
 
783
+ st.plotly_chart(
784
+ plotly_network_fig(nodes_sub, edges_sub, height=750),
785
+ use_container_width=True)
786
  except Exception as e:
787
  st.error(str(e))
788
 
 
795
  with st.spinner("Loading..."):
796
  kg_nodes_exp = load_kg_nodes(data_dir_val)
797
  kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
 
798
 
799
  # ── λ…Έλ“œ/μ—£μ§€ νƒ€μž… 뢄포 톡계
800
  col_a, col_b = st.columns([1, 2])
 
855
  c2.metric("Edges", fmt_num(len(exp_edges)))
856
  c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
857
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
858
  st.plotly_chart(
859
+ plotly_network_fig(exp_nodes, exp_edges, height=750),
 
 
860
  use_container_width=True)
 
 
 
 
 
 
 
 
861
 
862
  except Exception as e:
863
  st.error(str(e))
 
975
  st.markdown("---")
976
  st.subheader("Field Reference")
977
  st.dataframe(fields_df, use_container_width=True, hide_index=True)
978
+
979
+ # ── Intent Evolution over Years ────────────────────────────
980
+ st.markdown("---")
981
+ st.subheader("CitationHub Intent Evolution over Years")
982
+ st.caption("How citation intents have changed across all papers over time")
983
+ intent_trend_raw = (
984
+ events.dropna(subset=["citing_year"])
985
+ .assign(year=lambda df: df["citing_year"].astype(int))
986
+ .groupby(["year", "primary_intent"]).size()
987
+ .reset_index(name="count")
988
+ )
989
+ if not intent_trend_raw.empty:
990
+ st.plotly_chart(
991
+ px.area(
992
+ intent_trend_raw, x="year", y="count", color="primary_intent",
993
+ color_discrete_map=INTENT_COLORS,
994
+ labels={"primary_intent": "Intent", "count": "Citations", "year": "Year"},
995
+ ).update_layout(
996
+ legend_title="Intent",
997
+ xaxis_title="Year", yaxis_title="# Citations",
998
+ hovermode="x unified",
999
+ ),
1000
+ use_container_width=True,
1001
+ )
1002
+
1003
+ # ── Top Citing Venues ───────────────────────────────────────
1004
+ st.markdown("---")
1005
+ col_v1, col_v2 = st.columns(2)
1006
+
1007
+ with col_v1:
1008
+ st.subheader("Top Citing Venues")
1009
+ st.caption("Journals/conferences that cite seed papers most")
1010
+ venue_cnt = (
1011
+ events[events["citing_venue"].str.strip() != ""]
1012
+ .groupby("citing_venue").size()
1013
+ .reset_index(name="count")
1014
+ .sort_values("count", ascending=False).head(20)
1015
+ )
1016
+ if not venue_cnt.empty:
1017
+ st.plotly_chart(
1018
+ px.bar(venue_cnt, x="count", y="citing_venue", orientation="h",
1019
+ labels={"count": "Citations", "citing_venue": ""})
1020
+ .update_layout(yaxis=dict(autorange="reversed"),
1021
+ xaxis_title="Citations", yaxis_title="", height=520),
1022
+ use_container_width=True,
1023
+ )
1024
+
1025
+ with col_v2:
1026
+ st.subheader("Intent Mix by Field")
1027
+ st.caption("How each field uses citations differently")
1028
+ fi_pct = (
1029
+ seed[["seed_paper_id", "field"]]
1030
+ .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
1031
+ .groupby(["field", "primary_intent"]).size().reset_index(name="count")
1032
+ )
1033
+ if not fi_pct.empty:
1034
+ totals = fi_pct.groupby("field")["count"].transform("sum")
1035
+ fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1)
1036
+ top_fields = fi_pct.groupby("field")["count"].sum().nlargest(12).index
1037
+ fi_pct_top = fi_pct[fi_pct["field"].isin(top_fields)]
1038
+ st.plotly_chart(
1039
+ px.bar(fi_pct_top, x="pct", y="field", color="primary_intent",
1040
+ orientation="h", color_discrete_map=INTENT_COLORS,
1041
+ labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"})
1042
+ .update_layout(
1043
+ barmode="stack",
1044
+ yaxis=dict(autorange="reversed"),
1045
+ xaxis_title="% of citations", yaxis_title="",
1046
+ legend_title="Intent", height=520,
1047
+ ),
1048
+ use_container_width=True,
1049
+ )
1050
+
1051
+ # ── Export ─────────────────────────────────────────────────
1052
+ st.markdown("---")
1053
+ st.subheader("Export Data")
1054
+ col_e1, col_e2, col_e3 = st.columns(3)
1055
+
1056
+ with col_e1:
1057
+ csv_seed = seed_filtered[
1058
+ ["title", "doi", "journal", "author", "country", "field", "citedby_count"]
1059
+ ].to_csv(index=False).encode("utf-8")
1060
+ st.download_button(
1061
+ "⬇ Seed Papers (CSV)",
1062
+ csv_seed, "seed_papers.csv", "text/csv",
1063
+ use_container_width=True,
1064
+ )
1065
+
1066
+ with col_e2:
1067
+ cite_export = seed_events[
1068
+ ["citing_title", "citing_doi", "citing_year", "citing_venue",
1069
+ "primary_intent", "context_count", "is_influential"]
1070
+ ].rename(columns={
1071
+ "citing_title": "title", "citing_doi": "doi",
1072
+ "citing_year": "year", "citing_venue": "venue",
1073
+ "primary_intent": "intent", "context_count": "contexts",
1074
+ "is_influential": "influential",
1075
+ }).to_csv(index=False).encode("utf-8")
1076
+ st.download_button(
1077
+ "⬇ Citation Events (CSV)",
1078
+ cite_export, "citation_events.csv", "text/csv",
1079
+ use_container_width=True,
1080
+ )
1081
+
1082
+ with col_e3:
1083
+ intent_csv = intent_summary.to_csv(index=False).encode("utf-8")
1084
+ st.download_button(
1085
+ "⬇ Intent Summary (CSV)",
1086
+ intent_csv, "intent_summary.csv", "text/csv",
1087
+ use_container_width=True,
1088
+ )