Daniel0315 committed on
Commit
8e127bd
·
verified ·
1 Parent(s): ca221d8

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +137 -54
src/app.py CHANGED
@@ -164,14 +164,80 @@ def load_data(data_dir_str: str):
164
  cities_df, countries_df, fields_df, intents_df, journals_df)
165
 
166
 
167
- # ── KG + Enriched 데이터 (별도 μ§€μ—° λ‘œλ“œ) ─────────────────────
 
 
 
 
168
  @st.cache_data(show_spinner=False)
169
- def load_kg_data(data_dir_str: str):
 
170
  d = None if HF_REPO_ID else Path(data_dir_str)
171
- kg_nodes = _read("kg_nodes.parquet", d)
172
- kg_edges = _read("kg_edges.parquet", d)
173
- enriched = _read("citation_events_enriched.parquet", d)
174
- return kg_nodes, kg_edges, enriched
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
 
177
  # ── 헬퍼 ───────────────────────────────────────────────────────
@@ -486,39 +552,49 @@ with tab_ontology:
486
  components.html(pyvis_ontology(), height=820, scrolling=True)
487
 
488
 
489
- # ═══ 4. KNOWLEDGE GRAPH (μ‹€μ œ KG 데이터) ═════════════════════════
490
  with tab_kg:
491
  st.subheader("Knowledge Graph β€” Selected Seed Paper")
492
- st.caption("kg_nodes + kg_edges 전체 λ°μ΄ν„°μ—μ„œ μ„ νƒλœ seed paper의 1-hop μ„œλΈŒκ·Έλž˜ν”„")
493
- st.info("μ•„λž˜ λ²„νŠΌμ„ 눌러 KG 데이터λ₯Ό λ‘œλ“œν•˜μ„Έμš” (졜초 1회, 이후 μΊμ‹œλ¨)")
 
 
494
 
495
  if st.button("KG 데이터 λ‘œλ“œ", key="kg_load"):
496
- with st.spinner("kg_nodes / kg_edges / enriched λ‘œλ”© 쀑 ..."):
497
  st.session_state["kg_loaded"] = True
498
 
499
  if st.session_state.get("kg_loaded"):
500
  try:
501
- kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)
 
 
502
  seed_doi = selected_seed["doi"]
503
  if not seed_doi:
504
  st.warning("μ„ νƒλœ seed paper의 DOIκ°€ μ—†μ–΄ KG μ‘°νšŒκ°€ λΆˆκ°€ν•©λ‹ˆλ‹€.")
505
  else:
506
- nodes_sub, edges_sub = get_kg_subgraph(seed_doi, kg_nodes, kg_edges)
507
- if nodes_sub is None:
508
- st.warning(f"KGμ—μ„œ λ…Έλ“œλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. (DOI: {seed_doi})")
 
 
 
 
 
509
  else:
510
- # 톡계
 
 
511
  c1, c2, c3 = st.columns(3)
512
  c1.metric("Nodes", fmt_num(len(nodes_sub)))
513
  c2.metric("Edges", fmt_num(len(edges_sub)))
514
  c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))
515
 
516
  type_counts = nodes_sub["node_type"].value_counts().reset_index()
517
- type_counts.columns = ["node_type","count"]
518
  st.plotly_chart(
519
  px.bar(type_counts, x="node_type", y="count",
520
- color="node_type",
521
- color_discrete_map=NODE_TYPE_COLORS,
522
  title="Node Type Distribution")
523
  .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
524
  use_container_width=True)
@@ -532,18 +608,21 @@ with tab_kg:
532
  # ═══ 5. KG EXPLORER ═════════════════════════════════════════════
533
  with tab_kg_exp:
534
  st.subheader("KG Explorer")
535
- st.caption("kg_nodes 전체λ₯Ό νƒμƒ‰ν•˜κ³  μž„μ˜ λ…Έλ“œμ˜ μ—°κ²° 관계λ₯Ό μ‹œκ°ν™”ν•©λ‹ˆλ‹€.")
536
- st.info("μ•„λž˜ λ²„νŠΌμ„ 눌러 KG 데이터λ₯Ό λ‘œλ“œν•˜μ„Έμš” (졜초 1회, 이후 μΊμ‹œλ¨)")
537
 
538
  if st.button("KG 데이터 λ‘œλ“œ", key="kg_exp_load"):
539
- with st.spinner("λ‘œλ”© 쀑..."):
540
  st.session_state["kg_loaded"] = True
541
 
542
  if st.session_state.get("kg_loaded"):
543
  try:
544
- kg_nodes, kg_edges, enriched = load_kg_data(data_dir_val)
 
 
 
545
 
546
- # ── 전체 λ…Έλ“œ νƒ€μž… 뢄포
547
  col_a, col_b = st.columns([1,2])
548
  with col_a:
549
  st.subheader("Node Type Counts")
@@ -551,9 +630,14 @@ with tab_kg_exp:
551
  nt.columns = ["node_type","count"]
552
  st.dataframe(nt, use_container_width=True, hide_index=True)
553
 
 
554
  st.subheader("Edge Type Counts")
555
- et = kg_edges["edge_type"].value_counts().reset_index()
556
- et.columns = ["edge_type","count"]
 
 
 
 
557
  st.dataframe(et, use_container_width=True, hide_index=True)
558
 
559
  with col_b:
@@ -584,8 +668,7 @@ with tab_kg_exp:
584
  st.warning("검색 κ²°κ³Όκ°€ μ—†μŠ΅λ‹ˆλ‹€.")
585
  else:
586
  sel_node_id = st.selectbox(
587
- "Select node",
588
- node_options,
589
  format_func=lambda nid: sample.loc[sample["node_id"]==nid,"label"].iloc[0][:60],
590
  )
591
  sel_node_info = sample[sample["node_id"]==sel_node_id].iloc[0]
@@ -596,12 +679,14 @@ with tab_kg_exp:
596
  st.markdown(f"**Cited by**: {fmt_num(sel_node_info.get('citedby_count',''))}")
597
 
598
  max_e = st.slider("Max edges shown", 20, 150, 60, key="kg_exp_max")
599
-
600
  if st.button("Show ego network", key="kg_exp_show"):
601
- exp_nodes, exp_edges = get_explorer_subgraph(sel_node_id, kg_nodes, kg_edges, max_e)
602
- if exp_nodes is None:
 
603
  st.warning("μ—°κ²°λœ μ—£μ§€κ°€ μ—†μŠ΅λ‹ˆλ‹€.")
604
  else:
 
 
605
  st.session_state["exp_nodes"] = exp_nodes
606
  st.session_state["exp_edges"] = exp_edges
607
 
@@ -615,34 +700,32 @@ with tab_kg_exp:
615
  else:
616
  st.info("μ™Όμͺ½μ—μ„œ λ…Έλ“œλ₯Ό μ„ νƒν•˜κ³  'Show ego network'λ₯Ό ν΄λ¦­ν•˜μ„Έμš”.")
617
 
618
- # ── Enriched μΈμ‚¬μ΄νŠΈ
619
  st.markdown("---")
620
  st.subheader("Enriched Citation Insights")
621
- st.caption("citation_events_enriched: 의미적 증거(semantic evidence) 뢄석")
622
- if "has_semantic_evidence" in enriched.columns:
623
- sem = enriched["has_semantic_evidence"].value_counts().reset_index()
624
- sem.columns = ["has_semantic_evidence","count"]
625
- sem["label"] = sem["has_semantic_evidence"].map({True:"With evidence", False:"Without evidence"})
626
- st.plotly_chart(
627
- px.pie(sem, names="label", values="count",
628
- title="Semantic Evidence Coverage (all citation events)")
629
- .update_layout(legend_title=""),
630
- use_container_width=True)
631
-
632
- # 뢄야별 semantic evidence λΉ„μœ¨
633
- if "field_folder" in enriched.columns:
634
- field_sem = (enriched.groupby("field_folder")["has_semantic_evidence"]
635
- .mean().reset_index()
636
- .rename(columns={"has_semantic_evidence":"sem_ratio","field_folder":"field"})
637
- .sort_values("sem_ratio", ascending=False).head(20))
638
  st.plotly_chart(
639
- px.bar(field_sem, x="field", y="sem_ratio",
640
- title="Semantic Evidence Rate by Field",
641
- labels={"sem_ratio":"Evidence Rate","field":"Field"})
642
- .update_layout(xaxis_tickangle=-40),
643
  use_container_width=True)
644
- else:
645
- st.info("has_semantic_evidence 컬럼이 μ—†μŠ΅λ‹ˆλ‹€.")
 
 
 
 
 
 
646
 
647
  except Exception as e:
648
  st.error(str(e))
 
164
  cities_df, countries_df, fields_df, intents_df, journals_df)
165
 
166
 
167
+ # ── KG 데이터: DuckDB 방식으로 분리 로드 ─────────────────────
168
+ # kg_nodes : pandas 전체 로드 (~160MB 파일, 메모리 허용 범위)
169
+ # kg_edges : DuckDB로 필요한 노드의 엣지만 쿼리 (전체 로드 안 함)
170
+ # enriched : DuckDB로 집계 통계만 쿼리 (전체 로드 안 함)
171
+
172
  @st.cache_data(show_spinner=False)
173
+ def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
174
+ """kg_nodes 전체 λ‘œλ“œ (3.4M rows, ~160MB 파일)"""
175
  d = None if HF_REPO_ID else Path(data_dir_str)
176
+ return _read("kg_nodes.parquet", d)
177
+
178
+
179
@st.cache_data(show_spinner=False)
def get_parquet_path(filename: str, data_dir_str: str) -> str:
    """Return a path string for ``filename``.

    Args:
        filename: Name of the parquet file.
        data_dir_str: Local data directory, used only when ``HF_REPO_ID`` is
            falsy.

    Returns:
        ``_hf_download(filename)`` when ``HF_REPO_ID`` is truthy (presumably
        the path of a locally cached download -- confirm against that
        helper); otherwise ``data_dir_str / filename`` as a string with every
        backslash replaced by a forward slash for use in DuckDB queries.
    """
    if not HF_REPO_ID:
        local_path = Path(data_dir_str) / filename
        # Forward slashes only, so the path can sit inside a DuckDB SQL literal.
        return str(local_path).replace("\\", "/")
    return _hf_download(filename)
186
+
187
+
188
@st.cache_data(show_spinner=False)
def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame:
    """Query the edges touching one node straight from the edges parquet file.

    DuckDB scans the parquet file directly, so the full edge table is not
    loaded into pandas.

    Args:
        node_id: Node identifier matched against the ``source`` and
            ``target`` columns.
        kg_edges_path: Path of the edges parquet file.
        max_edges: Upper bound on the number of rows returned. There is no
            ORDER BY, so which rows are kept when the limit applies is up to
            DuckDB.

    Returns:
        A DataFrame with the columns ``source``, ``target`` and ``edge_type``.
    """
    import duckdb

    # The path is embedded as a SQL string literal: use forward slashes and
    # double any single quote so an unusual path cannot break the statement.
    path_literal = kg_edges_path.replace("\\", "/").replace("'", "''")
    query = f"""
        SELECT source, target, edge_type
        FROM read_parquet('{path_literal}')
        WHERE source = ? OR target = ?
        LIMIT {int(max_edges)}
    """
    # node_id comes from data (e.g. a DOI), so it is bound as a prepared
    # statement parameter instead of being spliced into the SQL text.
    return duckdb.execute(query, [node_id, node_id]).df()
201
+
202
+
203
@st.cache_data(show_spinner=False)
def query_enriched_stats(enriched_path: str):
    """Compute aggregate statistics over the enriched citation events file.

    Both aggregations run inside DuckDB against the parquet file, so the
    table itself is not loaded into pandas.

    Args:
        enriched_path: Path of the enriched citation events parquet file. The
            queries reference its ``has_semantic_evidence`` and
            ``field_folder`` columns; DuckDB raises if either is missing.

    Returns:
        A ``(sem_df, field_df)`` tuple:
        ``sem_df`` has ``has_semantic_evidence`` and ``count`` (rows per
        value); ``field_df`` has ``field``, ``sem_ratio`` (mean of the flag
        cast to integer) and ``event_count`` for the 20 fields with the
        highest ``sem_ratio``.
    """
    import duckdb

    # The path is embedded as a SQL string literal: use forward slashes and
    # double any single quote so an unusual path cannot break the statements.
    path_literal = enriched_path.replace("\\", "/").replace("'", "''")

    sem_df = duckdb.execute(f"""
        SELECT has_semantic_evidence, COUNT(*) AS count
        FROM read_parquet('{path_literal}')
        GROUP BY has_semantic_evidence
    """).df()

    field_df = duckdb.execute(f"""
        SELECT field_folder AS field,
               AVG(CAST(has_semantic_evidence AS INTEGER)) AS sem_ratio,
               COUNT(*) AS event_count
        FROM read_parquet('{path_literal}')
        GROUP BY field_folder
        ORDER BY sem_ratio DESC
        LIMIT 20
    """).df()

    return sem_df, field_df
226
+
227
+
228
@st.cache_data(show_spinner=False)
def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame:
    """Query the edges of an arbitrary node for the KG Explorer view.

    Runs the same query as ``query_kg_edges_for_node``; only the default
    ``max_edges`` differs.

    Args:
        node_id: Node identifier matched against the ``source`` and
            ``target`` columns.
        kg_edges_path: Path of the edges parquet file.
        max_edges: Upper bound on the number of rows returned. There is no
            ORDER BY, so which rows are kept when the limit applies is up to
            DuckDB.

    Returns:
        A DataFrame with the columns ``source``, ``target`` and ``edge_type``.
    """
    import duckdb

    # The path is embedded as a SQL string literal: use forward slashes and
    # double any single quote so an unusual path cannot break the statement.
    path_literal = kg_edges_path.replace("\\", "/").replace("'", "''")
    query = f"""
        SELECT source, target, edge_type
        FROM read_parquet('{path_literal}')
        WHERE source = ? OR target = ?
        LIMIT {int(max_edges)}
    """
    # The node id is user-selected data, so it is bound as a prepared
    # statement parameter instead of being spliced into the SQL text.
    return duckdb.execute(query, [node_id, node_id]).df()
241
 
242
 
243
  # ── 헬퍼 ───────────────────────────────────────────────────────
 
552
  components.html(pyvis_ontology(), height=820, scrolling=True)
553
 
554
 
555
+ # ═══ 4. KNOWLEDGE GRAPH (μ‹€μ œ KG 데이터, DuckDB) ════════════════
556
  with tab_kg:
557
  st.subheader("Knowledge Graph β€” Selected Seed Paper")
558
+ st.caption("kg_nodes + kg_edgesμ—μ„œ μ„ νƒλœ seed paper의 1-hop μ„œλΈŒκ·Έλž˜ν”„ (DuckDB λΆ€λΆ„ 쿼리)")
559
+ st.info("μ•„λž˜ λ²„νŠΌμ„ 눌러 KG 데이터λ₯Ό λ‘œλ“œν•˜μ„Έμš”. kg_nodes만 전체 λ‘œλ“œ, kg_edgesλŠ” ν•„μš”ν•œ λΆ€λΆ„λ§Œ μΏΌλ¦¬ν•©λ‹ˆλ‹€.")
560
+
561
+ max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
562
 
563
  if st.button("KG 데이터 λ‘œλ“œ", key="kg_load"):
564
+ with st.spinner("kg_nodes λ‘œλ”© + kg_edges 경둜 μ€€λΉ„ 쀑..."):
565
  st.session_state["kg_loaded"] = True
566
 
567
  if st.session_state.get("kg_loaded"):
568
  try:
569
+ with st.spinner("kg_nodes λ‘œλ”© 쀑..."):
570
+ kg_nodes = load_kg_nodes(data_dir_val)
571
+
572
  seed_doi = selected_seed["doi"]
573
  if not seed_doi:
574
  st.warning("μ„ νƒλœ seed paper의 DOIκ°€ μ—†μ–΄ KG μ‘°νšŒκ°€ λΆˆκ°€ν•©λ‹ˆλ‹€.")
575
  else:
576
+ node_id = f"seed:{seed_doi}"
577
+
578
+ with st.spinner("kg_edges 쿼리 쀑 (DuckDB)..."):
579
+ kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
580
+ edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
581
+
582
+ if edges_sub.empty:
583
+ st.warning(f"KGμ—μ„œ ν•΄λ‹Ή λ…Έλ“œμ˜ μ—£μ§€λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. (node_id: {node_id})")
584
  else:
585
+ all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
586
+ nodes_sub = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)]
587
+
588
  c1, c2, c3 = st.columns(3)
589
  c1.metric("Nodes", fmt_num(len(nodes_sub)))
590
  c2.metric("Edges", fmt_num(len(edges_sub)))
591
  c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))
592
 
593
  type_counts = nodes_sub["node_type"].value_counts().reset_index()
594
+ type_counts.columns = ["node_type", "count"]
595
  st.plotly_chart(
596
  px.bar(type_counts, x="node_type", y="count",
597
+ color="node_type", color_discrete_map=NODE_TYPE_COLORS,
 
598
  title="Node Type Distribution")
599
  .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
600
  use_container_width=True)
 
608
  # ═══ 5. KG EXPLORER ═════════════════════════════════════════════
609
  with tab_kg_exp:
610
  st.subheader("KG Explorer")
611
+ st.caption("kg_nodesλ₯Ό νƒμƒ‰ν•˜κ³  μž„μ˜ λ…Έλ“œμ˜ μ—°κ²° 관계λ₯Ό μ‹œκ°ν™”ν•©λ‹ˆλ‹€. kg_edgesλŠ” DuckDB둜 ν•„μš”ν•œ λΆ€λΆ„λ§Œ μΏΌλ¦¬ν•©λ‹ˆλ‹€.")
612
+ st.info("μ•„λž˜ λ²„νŠΌμ„ 눌러 KG 데이터λ₯Ό λ‘œλ“œν•˜μ„Έμš”. kg_nodes만 전체 λ‘œλ“œλ˜κ³ , kg_edges/enrichedλŠ” DuckDB둜 μΏΌλ¦¬ν•©λ‹ˆλ‹€.")
613
 
614
  if st.button("KG 데이터 λ‘œλ“œ", key="kg_exp_load"):
615
+ with st.spinner("kg_nodes λ‘œλ”© 쀑..."):
616
  st.session_state["kg_loaded"] = True
617
 
618
  if st.session_state.get("kg_loaded"):
619
  try:
620
+ with st.spinner("kg_nodes λ‘œλ”© 쀑..."):
621
+ kg_nodes = load_kg_nodes(data_dir_val)
622
+ kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
623
+ enriched_path = get_parquet_path("citation_events_enriched.parquet", data_dir_val)
624
 
625
+ # ── 전체 λ…Έλ“œ νƒ€μž… 뢄포 (kg_nodes만으둜 계산)
626
  col_a, col_b = st.columns([1,2])
627
  with col_a:
628
  st.subheader("Node Type Counts")
 
630
  nt.columns = ["node_type","count"]
631
  st.dataframe(nt, use_container_width=True, hide_index=True)
632
 
633
+ # Edge type 집계: DuckDB둜 λΉ λ₯΄κ²Œ 계산
634
  st.subheader("Edge Type Counts")
635
+ import duckdb
636
+ et = duckdb.execute(f"""
637
+ SELECT edge_type, COUNT(*) AS count
638
+ FROM read_parquet('{kg_edges_path}')
639
+ GROUP BY edge_type ORDER BY count DESC
640
+ """).df()
641
  st.dataframe(et, use_container_width=True, hide_index=True)
642
 
643
  with col_b:
 
668
  st.warning("검색 κ²°κ³Όκ°€ μ—†μŠ΅λ‹ˆλ‹€.")
669
  else:
670
  sel_node_id = st.selectbox(
671
+ "Select node", node_options,
 
672
  format_func=lambda nid: sample.loc[sample["node_id"]==nid,"label"].iloc[0][:60],
673
  )
674
  sel_node_info = sample[sample["node_id"]==sel_node_id].iloc[0]
 
679
  st.markdown(f"**Cited by**: {fmt_num(sel_node_info.get('citedby_count',''))}")
680
 
681
  max_e = st.slider("Max edges shown", 20, 150, 60, key="kg_exp_max")
 
682
  if st.button("Show ego network", key="kg_exp_show"):
683
+ with st.spinner("DuckDB둜 μ—£μ§€ 쿼리 쀑..."):
684
+ exp_edges = query_explorer_edges(sel_node_id, kg_edges_path, max_e)
685
+ if exp_edges.empty:
686
  st.warning("μ—°κ²°λœ μ—£μ§€κ°€ μ—†μŠ΅λ‹ˆλ‹€.")
687
  else:
688
+ all_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
689
+ exp_nodes = kg_nodes[kg_nodes["node_id"].isin(all_ids)]
690
  st.session_state["exp_nodes"] = exp_nodes
691
  st.session_state["exp_edges"] = exp_edges
692
 
 
700
  else:
701
  st.info("μ™Όμͺ½μ—μ„œ λ…Έλ“œλ₯Ό μ„ νƒν•˜κ³  'Show ego network'λ₯Ό ν΄λ¦­ν•˜μ„Έμš”.")
702
 
703
+ # ── Enriched μΈμ‚¬μ΄νŠΈ (DuckDB μ§‘κ³„λ§Œ)
704
  st.markdown("---")
705
  st.subheader("Enriched Citation Insights")
706
+ st.caption("citation_events_enriched: DuckDB둜 집계 ν†΅κ³„λ§Œ 쿼리 (전체 λ‘œλ“œ μ—†μŒ)")
707
+ with st.spinner("Enriched 톡계 쿼리 쀑 (DuckDB)..."):
708
+ sem_df, field_df = query_enriched_stats(enriched_path)
709
+
710
+ if not sem_df.empty:
711
+ sem_df["label"] = sem_df["has_semantic_evidence"].map(
712
+ {True:"With evidence", False:"Without evidence",
713
+ 1:"With evidence", 0:"Without evidence"})
714
+ col_s1, col_s2 = st.columns(2)
715
+ with col_s1:
 
 
 
 
 
 
 
716
  st.plotly_chart(
717
+ px.pie(sem_df, names="label", values="count",
718
+ title="Semantic Evidence Coverage")
719
+ .update_layout(legend_title=""),
 
720
  use_container_width=True)
721
+ with col_s2:
722
+ if not field_df.empty:
723
+ st.plotly_chart(
724
+ px.bar(field_df, x="field", y="sem_ratio",
725
+ title="Semantic Evidence Rate by Field",
726
+ labels={"sem_ratio":"Evidence Rate","field":"Field"})
727
+ .update_layout(xaxis_tickangle=-40),
728
+ use_container_width=True)
729
 
730
  except Exception as e:
731
  st.error(str(e))