Daniel0315 committed on
Commit
a2524a3
·
verified ·
1 Parent(s): f7077fb

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +80 -82
src/app.py CHANGED
@@ -636,10 +636,10 @@ contexts_df = build_context_rows(seed_events)
636
  citing_table = build_citing_table(seed_events)
637
 
638
  # ── νƒ­ ─────────────────────────────────────────────────────────
639
- (tab_overview, tab_cnet, tab_ontology, tab_kg,
640
  tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
641
  "Overview","Citation Network","Ontology",
642
- "Knowledge Graph","KG Explorer","Geographic Map","Analytics",
643
  ])
644
 
645
 
@@ -742,52 +742,7 @@ with tab_ontology:
742
  st.plotly_chart(plotly_ontology_fig(height=750), use_container_width=True)
743
 
744
 
745
- # ═══ 4. KNOWLEDGE GRAPH ══════════════════════════════════════════
746
- with tab_kg:
747
- st.subheader("Knowledge Graph")
748
-
749
- max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
750
-
751
- try:
752
- with st.spinner("Loading..."):
753
- kg_nodes_kg = load_kg_nodes(data_dir_val)
754
- kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
755
-
756
- seed_doi = selected_seed["doi"]
757
- if not seed_doi:
758
- st.warning("Selected seed paper has no DOI.")
759
- else:
760
- node_id = f"seed:{seed_doi}"
761
- with st.spinner("Querying graph..."):
762
- edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
763
-
764
- if edges_sub.empty:
765
- st.warning("No edges found for this paper in the knowledge graph.")
766
- else:
767
- all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
768
- nodes_sub = kg_nodes_kg[kg_nodes_kg["node_id"].isin(all_node_ids)]
769
-
770
- c1, c2, c3 = st.columns(3)
771
- c1.metric("Nodes", fmt_num(len(nodes_sub)))
772
- c2.metric("Edges", fmt_num(len(edges_sub)))
773
- c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))
774
-
775
- type_counts = nodes_sub["node_type"].value_counts().reset_index()
776
- type_counts.columns = ["node_type", "count"]
777
- st.plotly_chart(
778
- px.bar(type_counts, x="node_type", y="count",
779
- color="node_type", color_discrete_map=NODE_TYPE_COLORS)
780
- .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
781
- use_container_width=True)
782
-
783
- st.plotly_chart(
784
- plotly_network_fig(nodes_sub, edges_sub, height=750),
785
- use_container_width=True)
786
- except Exception as e:
787
- st.error(str(e))
788
-
789
-
790
- # ═══ 5. KG EXPLORER ══════════════════════════════════════════════
791
  with tab_kg_exp:
792
  st.subheader("KG Explorer")
793
 
@@ -820,15 +775,16 @@ with tab_kg_exp:
820
  nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
821
  st.plotly_chart(nt_fig, use_container_width=True)
822
 
823
- # ── 멀티노드 자동 시각화 (인용수 상위 seed papers + 연결 노드)
824
  st.markdown("---")
825
  st.subheader("Multi-Node Knowledge Graph")
 
826
 
827
- n_seeds = st.slider("Number of seed papers", 3, 20, 8, key="kg_exp_n_seeds")
828
- max_exp_edges = st.slider("Max edges", 30, 200, 100, key="kg_exp_max_edges")
829
 
830
  with st.spinner("Querying graph..."):
831
- # 인용수 상위 seed papers 선택
832
  top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
833
  .sort_values("citedby_count", ascending=False)
834
  .head(n_seeds))
@@ -836,27 +792,74 @@ with tab_kg_exp:
836
 
837
  if seed_ids:
838
  ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
839
- # 메타데이터 엣지만 가져옴 (citing/cited/intent 제외 → 저자/저널/분야 등)
840
- exp_edges = _ddb.execute(f"""
841
- SELECT source, target, edge_type
842
- FROM read_parquet('{kg_edges_path}')
843
- WHERE (source IN ({ids_sql}) OR target IN ({ids_sql}))
844
- AND edge_type NOT IN (
845
- 'HAS_CITING_PAPER','HAS_CITED_PAPER','HAS_PRIMARY_INTENT'
846
- )
847
- LIMIT {int(max_exp_edges)}
 
 
 
 
 
848
  """).df()
849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
850
  all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
851
  exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
852
 
853
- c1, c2, c3 = st.columns(3)
854
- c1.metric("Nodes", fmt_num(len(exp_nodes)))
855
- c2.metric("Edges", fmt_num(len(exp_edges)))
856
- c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
 
 
 
 
 
 
 
 
 
 
 
 
857
 
858
  st.plotly_chart(
859
- plotly_network_fig(exp_nodes, exp_edges, height=750),
 
860
  use_container_width=True)
861
 
862
  except Exception as e:
@@ -946,7 +949,7 @@ with tab_analytics:
946
  col_c, col_d = st.columns(2)
947
 
948
  with col_c:
949
- st.subheader("CitationHub Field × Intent Distribution")
950
  fi = (seed[["seed_paper_id","field"]]
951
  .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
952
  .groupby(["field","primary_intent"]).size().reset_index(name="count"))
@@ -954,7 +957,8 @@ with tab_analytics:
954
  pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
955
  st.plotly_chart(
956
  px.imshow(pivot, color_continuous_scale="Blues",
957
- title="CitationHub Field × Intent Distribution", aspect="auto")
 
958
  .update_layout(xaxis_title="Intent", yaxis_title="Field"),
959
  use_container_width=True)
960
 
@@ -969,13 +973,6 @@ with tab_analytics:
969
  title="Influential vs Non-influential"),
970
  use_container_width=True)
971
 
972
- st.subheader("Intent Reference")
973
- st.dataframe(intents_df, use_container_width=True, hide_index=True)
974
-
975
- st.markdown("---")
976
- st.subheader("Field Reference")
977
- st.dataframe(fields_df, use_container_width=True, hide_index=True)
978
-
979
  # ── Intent Evolution over Years ────────────────────────────
980
  st.markdown("---")
981
  st.subheader("CitationHub Intent Evolution over Years")
@@ -983,6 +980,7 @@ with tab_analytics:
983
  intent_trend_raw = (
984
  events.dropna(subset=["citing_year"])
985
  .assign(year=lambda df: df["citing_year"].astype(int))
 
986
  .groupby(["year", "primary_intent"]).size()
987
  .reset_index(name="count")
988
  )
@@ -1023,8 +1021,8 @@ with tab_analytics:
1023
  )
1024
 
1025
  with col_v2:
1026
- st.subheader("Intent Mix by Field")
1027
- st.caption("How each field uses citations differently")
1028
  fi_pct = (
1029
  seed[["seed_paper_id", "field"]]
1030
  .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
@@ -1033,17 +1031,17 @@ with tab_analytics:
1033
  if not fi_pct.empty:
1034
  totals = fi_pct.groupby("field")["count"].transform("sum")
1035
  fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1)
1036
- top_fields = fi_pct.groupby("field")["count"].sum().nlargest(12).index
1037
- fi_pct_top = fi_pct[fi_pct["field"].isin(top_fields)]
1038
  st.plotly_chart(
1039
- px.bar(fi_pct_top, x="pct", y="field", color="primary_intent",
1040
  orientation="h", color_discrete_map=INTENT_COLORS,
1041
  labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"})
1042
  .update_layout(
1043
  barmode="stack",
1044
- yaxis=dict(autorange="reversed"),
1045
  xaxis_title="% of citations", yaxis_title="",
1046
- legend_title="Intent", height=520,
1047
  ),
1048
  use_container_width=True,
1049
  )
 
636
  citing_table = build_citing_table(seed_events)
637
 
638
  # ── νƒ­ ─────────────────────────────────────────────────────────
639
+ (tab_overview, tab_cnet, tab_ontology,
640
  tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
641
  "Overview","Citation Network","Ontology",
642
+ "Knowledge Graph","Geographic Map","Analytics",
643
  ])
644
 
645
 
 
742
  st.plotly_chart(plotly_ontology_fig(height=750), use_container_width=True)
743
 
744
 
745
+ # ═══ 4. KNOWLEDGE GRAPH (KG Explorer) ═══════════════════════════
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
746
  with tab_kg_exp:
747
  st.subheader("KG Explorer")
748
 
 
775
  nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
776
  st.plotly_chart(nt_fig, use_container_width=True)
777
 
778
+ # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
779
  st.markdown("---")
780
  st.subheader("Multi-Node Knowledge Graph")
781
+ st.caption("All 10 node types and all edge types — 2-hop from top cited seed papers")
782
 
783
+ n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
784
+ edges_per_type = st.slider("Edges per type (max)", 3, 20, 8, key="kg_exp_edges_per_type")
785
 
786
  with st.spinner("Querying graph..."):
787
+ # ── 1-hop: 인용수 상위 seed papers 기준 모든 엣지
788
  top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
789
  .sort_values("citedby_count", ascending=False)
790
  .head(n_seeds))
 
792
 
793
  if seed_ids:
794
  ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
795
+
796
+ # 1-hop: seed paper와 연결된 모든 edge (journal, author, affiliation, city,
797
+ # country, field, citation_event)
798
+ hop1 = _ddb.execute(f"""
799
+ WITH ranked AS (
800
+ SELECT source, target, edge_type,
801
+ ROW_NUMBER() OVER (
802
+ PARTITION BY edge_type ORDER BY source
803
+ ) AS rn
804
+ FROM read_parquet('{kg_edges_path}')
805
+ WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
806
+ )
807
+ SELECT source, target, edge_type FROM ranked
808
+ WHERE rn <= {int(edges_per_type)}
809
  """).df()
810
 
811
+ # 2-hop: citation_event → HAS_CITING_PAPER → citing_paper
812
+ # citation_event → HAS_PRIMARY_INTENT → intent
813
+ event_ids = [
814
+ x for x in
815
+ set(hop1["source"].tolist()) | set(hop1["target"].tolist())
816
+ if str(x).startswith("event:")
817
+ ][:20]
818
+
819
+ if event_ids:
820
+ ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
821
+ hop2 = _ddb.execute(f"""
822
+ WITH ranked AS (
823
+ SELECT source, target, edge_type,
824
+ ROW_NUMBER() OVER (
825
+ PARTITION BY edge_type ORDER BY source
826
+ ) AS rn
827
+ FROM read_parquet('{kg_edges_path}')
828
+ WHERE (source IN ({ev_sql}) OR target IN ({ev_sql}))
829
+ AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
830
+ )
831
+ SELECT source, target, edge_type FROM ranked
832
+ WHERE rn <= {int(edges_per_type)}
833
+ """).df()
834
+ exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
835
+ subset=["source", "target", "edge_type"]
836
+ )
837
+ else:
838
+ exp_edges = hop1
839
+
840
  all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
841
  exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
842
 
843
+ c1, c2, c3, c4 = st.columns(4)
844
+ c1.metric("Nodes", fmt_num(len(exp_nodes)))
845
+ c2.metric("Edges", fmt_num(len(exp_edges)))
846
+ c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
847
+ c4.metric("Edge types", fmt_num(exp_edges["edge_type"].nunique()))
848
+
849
+ # 커버리지 확인 표시
850
+ present_ntypes = sorted(exp_nodes["node_type"].unique().tolist())
851
+ present_etypes = sorted(exp_edges["edge_type"].unique().tolist())
852
+ all_10_ntypes = sorted(NODE_TYPE_COLORS.keys())
853
+ missing_nt = [t for t in all_10_ntypes if t not in present_ntypes]
854
+ if missing_nt:
855
+ st.caption(f"⚠ Node types not yet in graph: {', '.join(missing_nt)} "
856
+ f"— try increasing 'Edges per type'")
857
+ else:
858
+ st.caption("✅ All 10 node types represented")
859
 
860
  st.plotly_chart(
861
+ plotly_network_fig(exp_nodes, exp_edges, height=800,
862
+ seed_node_ids=seed_ids),
863
  use_container_width=True)
864
 
865
  except Exception as e:
 
949
  col_c, col_d = st.columns(2)
950
 
951
  with col_c:
952
+ st.subheader("CitationHub Field × Intent Distribution Heatmap")
953
  fi = (seed[["seed_paper_id","field"]]
954
  .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
955
  .groupby(["field","primary_intent"]).size().reset_index(name="count"))
 
957
  pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
958
  st.plotly_chart(
959
  px.imshow(pivot, color_continuous_scale="Blues",
960
+ title="CitationHub Field × Intent Distribution Heatmap",
961
+ aspect="auto")
962
  .update_layout(xaxis_title="Intent", yaxis_title="Field"),
963
  use_container_width=True)
964
 
 
973
  title="Influential vs Non-influential"),
974
  use_container_width=True)
975
 
 
 
 
 
 
 
 
976
  # ── Intent Evolution over Years ────────────────────────────
977
  st.markdown("---")
978
  st.subheader("CitationHub Intent Evolution over Years")
 
980
  intent_trend_raw = (
981
  events.dropna(subset=["citing_year"])
982
  .assign(year=lambda df: df["citing_year"].astype(int))
983
+ .query("year >= 2000")
984
  .groupby(["year", "primary_intent"]).size()
985
  .reset_index(name="count")
986
  )
 
1021
  )
1022
 
1023
  with col_v2:
1024
+ st.subheader("CitationHub Field × Intent Distribution")
1025
+ st.caption("How each field uses citations differently (all fields)")
1026
  fi_pct = (
1027
  seed[["seed_paper_id", "field"]]
1028
  .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
 
1031
  if not fi_pct.empty:
1032
  totals = fi_pct.groupby("field")["count"].transform("sum")
1033
  fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1)
1034
+ n_fields = fi_pct["field"].nunique()
1035
+ chart_height = max(520, n_fields * 28)
1036
  st.plotly_chart(
1037
+ px.bar(fi_pct, x="pct", y="field", color="primary_intent",
1038
  orientation="h", color_discrete_map=INTENT_COLORS,
1039
  labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"})
1040
  .update_layout(
1041
  barmode="stack",
1042
+ yaxis=dict(autorange="reversed", categoryorder="total ascending"),
1043
  xaxis_title="% of citations", yaxis_title="",
1044
+ legend_title="Intent", height=chart_height,
1045
  ),
1046
  use_container_width=True,
1047
  )