Daniel0315 committed on
Commit
5e840b3
·
verified ·
1 Parent(s): 3899ec8

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +60 -79
src/app.py CHANGED
@@ -510,7 +510,7 @@ with tab_overview:
510
  .update_layout(xaxis_title="Year", yaxis_title="Citations"),
511
  use_container_width=True)
512
 
513
- st.subheader("Overall intent distribution")
514
  all_intents = events.groupby("primary_intent").size().to_dict()
515
  ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
516
  "count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]})
@@ -519,7 +519,7 @@ with tab_overview:
519
  fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
520
  st.plotly_chart(fig2, use_container_width=True)
521
 
522
- st.subheader("Field distribution")
523
  fd = (seed_filtered.groupby("field", dropna=False).size()
524
  .reset_index(name="count").sort_values("count", ascending=False).head(20))
525
  fd["field"] = fd["field"].replace("","Unknown")
@@ -546,7 +546,7 @@ with tab_overview:
546
 
547
  # ═══ 2. CITATION NETWORK ════════════════════════════════════════
548
  with tab_cnet:
549
- st.subheader("Citing ↔ Cited Citation Network")
550
  st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
551
  if seed_events.empty:
552
  st.info("No citation network data for this seed paper.")
@@ -561,31 +561,30 @@ with tab_ontology:
561
  components.html(pyvis_ontology(), height=820, scrolling=True)
562
 
563
 
564
- # ═══ 4. KNOWLEDGE GRAPH (실제 KG 데이터, DuckDB) ════════════════
565
  with tab_kg:
566
- st.subheader("Knowledge Graph β€” Selected Seed Paper")
567
- st.caption("kg_nodes + kg_edgesμ—μ„œ μ„ νƒλœ seed paper의 1-hop μ„œλΈŒκ·Έλž˜ν”„ (DuckDB λΆ€λΆ„ 쿼리)")
568
 
569
  max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
570
 
571
  try:
572
- with st.spinner("KG 데이터 λ‘œλ”© 쀑... (졜초 1회 ν›„ μΊμ‹œλ©λ‹ˆλ‹€)"):
573
- kg_nodes = load_kg_nodes(data_dir_val)
574
- kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
575
 
576
  seed_doi = selected_seed["doi"]
577
  if not seed_doi:
578
- st.warning("μ„ νƒλœ seed paper의 DOIκ°€ μ—†μ–΄ KG μ‘°νšŒκ°€ λΆˆκ°€ν•©λ‹ˆλ‹€.")
579
  else:
580
  node_id = f"seed:{seed_doi}"
581
- with st.spinner("kg_edges 쿼리 쀑 (DuckDB)..."):
582
  edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
583
 
584
  if edges_sub.empty:
585
- st.warning(f"KGμ—μ„œ ν•΄λ‹Ή λ…Έλ“œμ˜ μ—£μ§€λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. (node_id: {node_id})")
586
  else:
587
  all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
588
- nodes_sub = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)]
589
 
590
  c1, c2, c3 = st.columns(3)
591
  c1.metric("Nodes", fmt_num(len(nodes_sub)))
@@ -596,8 +595,7 @@ with tab_kg:
596
  type_counts.columns = ["node_type", "count"]
597
  st.plotly_chart(
598
  px.bar(type_counts, x="node_type", y="count",
599
- color="node_type", color_discrete_map=NODE_TYPE_COLORS,
600
- title="Node Type Distribution")
601
  .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
602
  use_container_width=True)
603
 
@@ -607,27 +605,25 @@ with tab_kg:
607
  st.error(str(e))
608
 
609
 
610
- # ═══ 5. KG EXPLORER ═════════════════════════════════════════════
611
  with tab_kg_exp:
612
  st.subheader("KG Explorer")
613
- st.caption("kg_nodesλ₯Ό νƒμƒ‰ν•˜κ³  μž„μ˜ λ…Έλ“œμ˜ μ—°κ²° 관계λ₯Ό μ‹œκ°ν™”ν•©λ‹ˆλ‹€. kg_edgesλŠ” DuckDB둜 ν•„μš”ν•œ λΆ€λΆ„λ§Œ μΏΌλ¦¬ν•©λ‹ˆλ‹€.")
614
 
615
  try:
616
- # spinner는 로딩만, UI는 spinner 밖에
617
- with st.spinner("KG 데이터 λ‘œλ”© 쀑... (졜초 1회 ν›„ μΊμ‹œλ©λ‹ˆλ‹€)"):
618
  kg_nodes_exp = load_kg_nodes(data_dir_val)
619
  kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
620
  enriched_path = get_parquet_path("citation_events_enriched.parquet", data_dir_val)
621
 
622
- # ── 전체 노드/엣지 타입 분포
623
  col_a, col_b = st.columns([1, 2])
624
  with col_a:
625
- st.subheader("Node Type Counts")
626
  nt = kg_nodes_exp["node_type"].value_counts().reset_index()
627
  nt.columns = ["node_type", "count"]
628
  st.dataframe(nt, use_container_width=True, hide_index=True)
629
 
630
- st.subheader("Edge Type Counts")
631
  import duckdb as _ddb
632
  et = _ddb.execute(f"""
633
  SELECT edge_type, COUNT(*) AS count
@@ -637,70 +633,55 @@ with tab_kg_exp:
637
  st.dataframe(et, use_container_width=True, hide_index=True)
638
 
639
  with col_b:
640
- st.subheader("Node Type Distribution")
641
  nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
642
  color_discrete_map=NODE_TYPE_COLORS)
643
  nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
644
  st.plotly_chart(nt_fig, use_container_width=True)
645
 
646
- # ── Node Search & Ego Network
647
  st.markdown("---")
648
- st.subheader("Node Search & Ego Network")
649
- exp_col1, exp_col2 = st.columns([1, 3])
650
- with exp_col1:
651
- type_options = ["(all)"] + sorted(kg_nodes_exp["node_type"].unique().tolist())
652
- sel_type = st.selectbox("Filter by node type", type_options)
653
- filtered_nodes = (kg_nodes_exp if sel_type == "(all)"
654
- else kg_nodes_exp[kg_nodes_exp["node_type"] == sel_type])
655
- search_q = st.text_input("Search node label / DOI")
656
- if search_q:
657
- filtered_nodes = filtered_nodes[
658
- filtered_nodes["label"].str.contains(search_q, case=False, na=False) |
659
- filtered_nodes["doi"].str.contains(search_q, case=False, na=False)
660
- ]
661
-
662
- sample = filtered_nodes.head(100)
663
- node_options = sample["node_id"].tolist()
664
- if not node_options:
665
- st.warning("검색 κ²°κ³Όκ°€ μ—†μŠ΅λ‹ˆλ‹€.")
666
- else:
667
- sel_node_id = st.selectbox(
668
- "Select node", node_options,
669
- format_func=lambda nid: sample.loc[sample["node_id"] == nid, "label"].iloc[0][:60],
670
- )
671
- sel_node_info = sample[sample["node_id"] == sel_node_id].iloc[0]
672
- st.markdown(f"**Type**: {sel_node_info.get('node_type', '')}")
673
- st.markdown(f"**DOI**: {sel_node_info.get('doi', '') or '-'}")
674
- st.markdown(f"**Publication**: {sel_node_info.get('publication_name', '') or '-'}")
675
- st.markdown(f"**Group**: {sel_node_info.get('group', '') or '-'}")
676
- st.markdown(f"**Cited by**: {fmt_num(sel_node_info.get('citedby_count', ''))}")
677
-
678
- max_e = st.slider("Max edges shown", 20, 150, 60, key="kg_exp_max")
679
- if st.button("Show ego network", key="kg_exp_show"):
680
- with st.spinner("DuckDB둜 μ—£μ§€ 쿼리 쀑..."):
681
- exp_edges = query_explorer_edges(sel_node_id, kg_edges_path, max_e)
682
- if exp_edges.empty:
683
- st.warning("μ—°κ²°λœ μ—£μ§€κ°€ μ—†μŠ΅λ‹ˆλ‹€.")
684
- else:
685
- all_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
686
- st.session_state["exp_nodes"] = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_ids)]
687
- st.session_state["exp_edges"] = exp_edges
688
-
689
- with exp_col2:
690
- if "exp_nodes" in st.session_state:
691
- en = st.session_state["exp_nodes"]
692
- ee = st.session_state["exp_edges"]
693
- st.caption(f"Nodes: {len(en)} | Edges: {len(ee)}")
694
  st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
695
- components.html(pyvis_from_kg(en, ee, height="740px"), height=760, scrolling=True)
696
- else:
697
- st.info("μ™Όμͺ½μ—μ„œ λ…Έλ“œλ₯Ό μ„ νƒν•˜κ³  'Show ego network'λ₯Ό ν΄λ¦­ν•˜μ„Έμš”.")
698
 
699
- # ── Enriched 인사이트 (DuckDB 집계만)
700
  st.markdown("---")
701
- st.subheader("Enriched Citation Insights")
702
- st.caption("citation_events_enriched: DuckDB둜 집계 ν†΅κ³„λ§Œ 쿼리 (전체 λ‘œλ“œ μ—†μŒ)")
703
- with st.spinner("Enriched 톡계 쿼리 쀑 (DuckDB)..."):
704
  sem_df, field_df = query_enriched_stats(enriched_path)
705
 
706
  if not sem_df.empty:
@@ -810,7 +791,7 @@ with tab_analytics:
810
  col_c, col_d = st.columns(2)
811
 
812
  with col_c:
813
- st.subheader("Field Γ— Intent Heatmap")
814
  fi = (seed[["seed_paper_id","field"]]
815
  .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
816
  .groupby(["field","primary_intent"]).size().reset_index(name="count"))
@@ -818,7 +799,7 @@ with tab_analytics:
818
  pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
819
  st.plotly_chart(
820
  px.imshow(pivot, color_continuous_scale="Blues",
821
- title="Citation Intent by Field", aspect="auto")
822
  .update_layout(xaxis_title="Intent", yaxis_title="Field"),
823
  use_container_width=True)
824
 
 
510
  .update_layout(xaxis_title="Year", yaxis_title="Citations"),
511
  use_container_width=True)
512
 
513
+ st.subheader("CitationHub Intent Distribution")
514
  all_intents = events.groupby("primary_intent").size().to_dict()
515
  ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
516
  "count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]})
 
519
  fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
520
  st.plotly_chart(fig2, use_container_width=True)
521
 
522
+ st.subheader("CitationHub Field Distribution")
523
  fd = (seed_filtered.groupby("field", dropna=False).size()
524
  .reset_index(name="count").sort_values("count", ascending=False).head(20))
525
  fd["field"] = fd["field"].replace("","Unknown")
 
546
 
547
  # ═══ 2. CITATION NETWORK ════════════════════════════════════════
548
  with tab_cnet:
549
+ st.subheader("Citation Network")
550
  st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
551
  if seed_events.empty:
552
  st.info("No citation network data for this seed paper.")
 
561
  components.html(pyvis_ontology(), height=820, scrolling=True)
562
 
563
 
564
+ # ═══ 4. KNOWLEDGE GRAPH ══════════════════════════════════════════
565
  with tab_kg:
566
+ st.subheader("Knowledge Graph")
 
567
 
568
  max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
569
 
570
  try:
571
+ with st.spinner("Loading..."):
572
+ kg_nodes_kg = load_kg_nodes(data_dir_val)
573
+ kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
574
 
575
  seed_doi = selected_seed["doi"]
576
  if not seed_doi:
577
+ st.warning("Selected seed paper has no DOI.")
578
  else:
579
  node_id = f"seed:{seed_doi}"
580
+ with st.spinner("Querying graph..."):
581
  edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
582
 
583
  if edges_sub.empty:
584
+ st.warning("No edges found for this paper in the knowledge graph.")
585
  else:
586
  all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
587
+ nodes_sub = kg_nodes_kg[kg_nodes_kg["node_id"].isin(all_node_ids)]
588
 
589
  c1, c2, c3 = st.columns(3)
590
  c1.metric("Nodes", fmt_num(len(nodes_sub)))
 
595
  type_counts.columns = ["node_type", "count"]
596
  st.plotly_chart(
597
  px.bar(type_counts, x="node_type", y="count",
598
+ color="node_type", color_discrete_map=NODE_TYPE_COLORS)
 
599
  .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
600
  use_container_width=True)
601
 
 
605
  st.error(str(e))
606
 
607
 
608
+ # ═══ 5. KG EXPLORER ══════════════════════════════════════════════
609
  with tab_kg_exp:
610
  st.subheader("KG Explorer")
 
611
 
612
  try:
613
+ with st.spinner("Loading..."):
 
614
  kg_nodes_exp = load_kg_nodes(data_dir_val)
615
  kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
616
  enriched_path = get_parquet_path("citation_events_enriched.parquet", data_dir_val)
617
 
618
+ # ── 노드/엣지 타입 분포 통계
619
  col_a, col_b = st.columns([1, 2])
620
  with col_a:
621
+ st.subheader("Node Types")
622
  nt = kg_nodes_exp["node_type"].value_counts().reset_index()
623
  nt.columns = ["node_type", "count"]
624
  st.dataframe(nt, use_container_width=True, hide_index=True)
625
 
626
+ st.subheader("Edge Types")
627
  import duckdb as _ddb
628
  et = _ddb.execute(f"""
629
  SELECT edge_type, COUNT(*) AS count
 
633
  st.dataframe(et, use_container_width=True, hide_index=True)
634
 
635
  with col_b:
636
+ st.subheader("CitationHub KG Node Distribution")
637
  nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
638
  color_discrete_map=NODE_TYPE_COLORS)
639
  nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
640
  st.plotly_chart(nt_fig, use_container_width=True)
641
 
642
+ # ── 멀티노드 자동 시각화 (인용수 상위 seed papers + 연결 노드)
643
  st.markdown("---")
644
+ st.subheader("Multi-Node Knowledge Graph")
645
+
646
+ n_seeds = st.slider("Number of seed papers", 3, 20, 8, key="kg_exp_n_seeds")
647
+ max_exp_edges = st.slider("Max edges", 30, 200, 100, key="kg_exp_max_edges")
648
+
649
+ with st.spinner("Querying graph..."):
650
+ # 인용수 상위 seed papers 선택
651
+ top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
652
+ .sort_values("citedby_count", ascending=False)
653
+ .head(n_seeds))
654
+ seed_ids = top_seeds["node_id"].tolist()
655
+
656
+ if seed_ids:
657
+ ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
658
+ # 메타데이터 엣지만 가져옴 (citing/cited/intent 제외 → 저자/저널/분야 등)
659
+ exp_edges = _ddb.execute(f"""
660
+ SELECT source, target, edge_type
661
+ FROM read_parquet('{kg_edges_path}')
662
+ WHERE (source IN ({ids_sql}) OR target IN ({ids_sql}))
663
+ AND edge_type NOT IN (
664
+ 'HAS_CITING_PAPER','HAS_CITED_PAPER','HAS_PRIMARY_INTENT'
665
+ )
666
+ LIMIT {int(max_exp_edges)}
667
+ """).df()
668
+
669
+ all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
670
+ exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
671
+
672
+ c1, c2, c3 = st.columns(3)
673
+ c1.metric("Nodes", fmt_num(len(exp_nodes)))
674
+ c2.metric("Edges", fmt_num(len(exp_edges)))
675
+ c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
676
+
 
 
 
 
 
 
 
 
 
 
 
 
 
677
  st.caption("πŸ–± Scroll: zoom | Drag: pan | Click node: info | β›Ά button: fullscreen")
678
+ components.html(pyvis_from_kg(exp_nodes, exp_edges, height="780px"),
679
+ height=800, scrolling=True)
 
680
 
681
+ # ── Enriched 인사이트
682
  st.markdown("---")
683
+ st.subheader("CitationHub Semantic Evidence Distribution")
684
+ with st.spinner("Loading..."):
 
685
  sem_df, field_df = query_enriched_stats(enriched_path)
686
 
687
  if not sem_df.empty:
 
791
  col_c, col_d = st.columns(2)
792
 
793
  with col_c:
794
+ st.subheader("CitationHub Field Γ— Intent Distribution")
795
  fi = (seed[["seed_paper_id","field"]]
796
  .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
797
  .groupby(["field","primary_intent"]).size().reset_index(name="count"))
 
799
  pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
800
  st.plotly_chart(
801
  px.imshow(pivot, color_continuous_scale="Blues",
802
+ title="CitationHub Field Γ— Intent Distribution", aspect="auto")
803
  .update_layout(xaxis_title="Intent", yaxis_title="Field"),
804
  use_container_width=True)
805