Daniel0315 committed on
Commit
4dd37d3
·
verified ·
1 Parent(s): a2524a3

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +36 -52
src/app.py CHANGED
@@ -682,16 +682,6 @@ with tab_overview:
682
  fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
683
  st.plotly_chart(fig, use_container_width=True)
684
 
685
- st.subheader("Citation trend (selected paper)")
686
- trend = (seed_events.dropna(subset=["citing_year"])
687
- .assign(citing_year=lambda df: df["citing_year"].astype(int))
688
- .groupby("citing_year").size().reset_index(name="count"))
689
- if not trend.empty:
690
- st.plotly_chart(
691
- px.line(trend, x="citing_year", y="count", markers=True)
692
- .update_layout(xaxis_title="Year", yaxis_title="Citations"),
693
- use_container_width=True)
694
-
695
  st.subheader("CitationHub Intent Distribution")
696
  all_intents = events.groupby("primary_intent").size().to_dict()
697
  ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
@@ -752,39 +742,47 @@ with tab_kg_exp:
752
  kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
753
 
754
  # ── λ…Έλ“œ/μ—£μ§€ νƒ€μž… 뢄포 톡계
755
- col_a, col_b = st.columns([1, 2])
 
 
 
 
 
 
 
 
 
 
 
756
  with col_a:
757
  st.subheader("Node Types")
758
- nt = kg_nodes_exp["node_type"].value_counts().reset_index()
759
- nt.columns = ["node_type", "count"]
760
  st.dataframe(nt, use_container_width=True, hide_index=True)
761
-
762
- st.subheader("Edge Types")
763
- import duckdb as _ddb
764
- et = _ddb.execute(f"""
765
- SELECT edge_type, COUNT(*) AS count
766
- FROM read_parquet('{kg_edges_path}')
767
- GROUP BY edge_type ORDER BY count DESC
768
- """).df()
769
- st.dataframe(et, use_container_width=True, hide_index=True)
770
-
771
  with col_b:
772
  st.subheader("CitationHub KG Node Distribution")
773
  nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
774
  color_discrete_map=NODE_TYPE_COLORS)
775
  nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
776
  st.plotly_chart(nt_fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
777
 
778
  # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
779
  st.markdown("---")
780
  st.subheader("Multi-Node Knowledge Graph")
781
- st.caption("All 10 node types and all edge types — 2-hop from top cited seed papers")
782
 
783
  n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
784
- edges_per_type = st.slider("Edges per type (max)", 3, 20, 8, key="kg_exp_edges_per_type")
 
785
 
786
  with st.spinner("Querying graph..."):
787
- # ── 1-hop: 인용수 μƒμœ„ seed papers κΈ°μ€€ λͺ¨λ“  μ—£μ§€
788
  top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
789
  .sort_values("citedby_count", ascending=False)
790
  .head(n_seeds))
@@ -793,8 +791,8 @@ with tab_kg_exp:
793
  if seed_ids:
794
  ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
795
 
796
- # 1-hop: seed paper와 연결된 모든 edge (journal, author, affiliation, city,
797
- # country, field, citation_event)
798
  hop1 = _ddb.execute(f"""
799
  WITH ranked AS (
800
  SELECT source, target, edge_type,
@@ -805,16 +803,15 @@ with tab_kg_exp:
805
  WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
806
  )
807
  SELECT source, target, edge_type FROM ranked
808
- WHERE rn <= {int(edges_per_type)}
809
  """).df()
810
 
811
- # 2-hop: citation_event → HAS_CITING_PAPER → citing_paper
812
- # citation_event → HAS_PRIMARY_INTENT → intent
813
  event_ids = [
814
  x for x in
815
  set(hop1["source"].tolist()) | set(hop1["target"].tolist())
816
  if str(x).startswith("event:")
817
- ][:20]
818
 
819
  if event_ids:
820
  ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
@@ -829,7 +826,7 @@ with tab_kg_exp:
829
  AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
830
  )
831
  SELECT source, target, edge_type FROM ranked
832
- WHERE rn <= {int(edges_per_type)}
833
  """).df()
834
  exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
835
  subset=["source", "target", "edge_type"]
@@ -841,26 +838,13 @@ with tab_kg_exp:
841
  exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
842
 
843
  c1, c2, c3, c4 = st.columns(4)
844
- c1.metric("Nodes", fmt_num(len(exp_nodes)))
845
- c2.metric("Edges", fmt_num(len(exp_edges)))
846
- c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
847
- c4.metric("Edge types", fmt_num(exp_edges["edge_type"].nunique()))
848
-
849
- # 커버리지 확인 표시
850
- present_ntypes = sorted(exp_nodes["node_type"].unique().tolist())
851
- present_etypes = sorted(exp_edges["edge_type"].unique().tolist())
852
- all_10_ntypes = sorted(NODE_TYPE_COLORS.keys())
853
- missing_nt = [t for t in all_10_ntypes if t not in present_ntypes]
854
- if missing_nt:
855
- st.caption(f"⚠ Node types not yet in graph: {', '.join(missing_nt)} "
856
- f"— try increasing 'Edges per type'")
857
- else:
858
- st.caption("✅ All 10 node types represented")
859
 
860
- st.plotly_chart(
861
- plotly_network_fig(exp_nodes, exp_edges, height=800,
862
- seed_node_ids=seed_ids),
863
- use_container_width=True)
864
 
865
  except Exception as e:
866
  st.error(str(e))
 
682
  fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
683
  st.plotly_chart(fig, use_container_width=True)
684
 
 
 
 
 
 
 
 
 
 
 
685
  st.subheader("CitationHub Intent Distribution")
686
  all_intents = events.groupby("primary_intent").size().to_dict()
687
  ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
 
742
  kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
743
 
744
  # ── λ…Έλ“œ/μ—£μ§€ νƒ€μž… 뢄포 톡계
745
+ import duckdb as _ddb
746
+
747
+ nt = kg_nodes_exp["node_type"].value_counts().reset_index()
748
+ nt.columns = ["node_type", "count"]
749
+
750
+ et = _ddb.execute(f"""
751
+ SELECT edge_type, COUNT(*) AS count
752
+ FROM read_parquet('{kg_edges_path}')
753
+ GROUP BY edge_type ORDER BY count DESC
754
+ """).df()
755
+
756
+ col_a, col_b, col_c, col_d = st.columns([1, 2, 1, 2])
757
  with col_a:
758
  st.subheader("Node Types")
 
 
759
  st.dataframe(nt, use_container_width=True, hide_index=True)
 
 
 
 
 
 
 
 
 
 
760
  with col_b:
761
  st.subheader("CitationHub KG Node Distribution")
762
  nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
763
  color_discrete_map=NODE_TYPE_COLORS)
764
  nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
765
  st.plotly_chart(nt_fig, use_container_width=True)
766
+ with col_c:
767
+ st.subheader("Edge Types")
768
+ st.dataframe(et, use_container_width=True, hide_index=True)
769
+ with col_d:
770
+ st.subheader("CitationHub KG Edge Distribution")
771
+ et_fig = px.bar(et, x="edge_type", y="count", color="edge_type")
772
+ et_fig.update_layout(showlegend=False, xaxis_title="",
773
+ yaxis_title="Count", xaxis_tickangle=-35)
774
+ st.plotly_chart(et_fig, use_container_width=True)
775
 
776
  # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
777
  st.markdown("---")
778
  st.subheader("Multi-Node Knowledge Graph")
779
+ st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
780
 
781
  n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
782
+
783
+ EDGES_PER_TYPE = 10 # 각 edge type당 고정 샘플 수 → 10 types × 10 = 최대 100 edges
784
 
785
  with st.spinner("Querying graph..."):
 
786
  top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
787
  .sort_values("citedby_count", ascending=False)
788
  .head(n_seeds))
 
791
  if seed_ids:
792
  ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
793
 
794
+ # 1-hop: seed paper에 연결된 모든 edge type (journal/author/affiliation/
795
+ # city/country/field/citation_event 포함)
796
  hop1 = _ddb.execute(f"""
797
  WITH ranked AS (
798
  SELECT source, target, edge_type,
 
803
  WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
804
  )
805
  SELECT source, target, edge_type FROM ranked
806
+ WHERE rn <= {EDGES_PER_TYPE}
807
  """).df()
808
 
809
+ # 2-hop: citation_event → citing_paper / intent
 
810
  event_ids = [
811
  x for x in
812
  set(hop1["source"].tolist()) | set(hop1["target"].tolist())
813
  if str(x).startswith("event:")
814
+ ][:30]
815
 
816
  if event_ids:
817
  ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
 
826
  AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
827
  )
828
  SELECT source, target, edge_type FROM ranked
829
+ WHERE rn <= {EDGES_PER_TYPE}
830
  """).df()
831
  exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
832
  subset=["source", "target", "edge_type"]
 
838
  exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
839
 
840
  c1, c2, c3, c4 = st.columns(4)
841
+ c1.metric("Nodes", fmt_num(len(exp_nodes)))
842
+ c2.metric("Edges", fmt_num(len(exp_edges)))
843
+ c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
844
+ c4.metric("Edge types", fmt_num(exp_edges["edge_type"].nunique()))
 
 
 
 
 
 
 
 
 
 
 
845
 
846
+ kg_html = pyvis_from_kg(exp_nodes, exp_edges)
847
+ components.html(kg_html, height=860, scrolling=True)
 
 
848
 
849
  except Exception as e:
850
  st.error(str(e))