Daniel0315 committed on
Commit
aedd2d3
·
verified ·
1 Parent(s): 4dd37d3

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +82 -24
src/app.py CHANGED
@@ -780,7 +780,8 @@ with tab_kg_exp:
780
 
781
  n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
782
 
783
- EDGES_PER_TYPE = 10 # 각 edge type당 고정 샘플 수 → 10 types × 10 = 최대 100 edges
 
784
 
785
  with st.spinner("Querying graph..."):
786
  top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
@@ -791,8 +792,8 @@ with tab_kg_exp:
791
  if seed_ids:
792
  ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
793
 
794
- # 1-hop: seed paper에 연결된 모든 edge type (journal/author/affiliation/
795
- # city/country/field/citation_event 포함)
796
  hop1 = _ddb.execute(f"""
797
  WITH ranked AS (
798
  SELECT source, target, edge_type,
@@ -806,15 +807,20 @@ with tab_kg_exp:
806
  WHERE rn <= {EDGES_PER_TYPE}
807
  """).df()
808
 
809
- # 2-hop: citation_event → citing_paper / intent
810
- event_ids = [
811
- x for x in
812
- set(hop1["source"].tolist()) | set(hop1["target"].tolist())
813
- if str(x).startswith("event:")
814
- ][:30]
815
-
816
- if event_ids:
817
- ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
 
 
 
 
 
818
  hop2 = _ddb.execute(f"""
819
  WITH ranked AS (
820
  SELECT source, target, edge_type,
@@ -823,7 +829,11 @@ with tab_kg_exp:
823
  ) AS rn
824
  FROM read_parquet('{kg_edges_path}')
825
  WHERE (source IN ({ev_sql}) OR target IN ({ev_sql}))
826
- AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
 
 
 
 
827
  )
828
  SELECT source, target, edge_type FROM ranked
829
  WHERE rn <= {EDGES_PER_TYPE}
@@ -867,7 +877,7 @@ with tab_geo:
867
  fig_map.update_layout(geo=dict(showframe=False), height=500)
868
  st.plotly_chart(fig_map, use_container_width=True)
869
 
870
- st.subheader("Top Cities (Affiliation)")
871
  city_cnt = (seed_filtered.merge(
872
  aff_geo_df[["affiliation_name","city_name","country_name"]],
873
  left_on="affiliation", right_on="affiliation_name", how="left")
@@ -881,16 +891,46 @@ with tab_geo:
881
  .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
882
  use_container_width=True)
883
 
884
- st.subheader("Citation Trend over Time (selected paper)")
885
- trend2 = (seed_events.dropna(subset=["citing_year"])
886
- .assign(citing_year=lambda df: df["citing_year"].astype(int))
887
- .groupby("citing_year").size().reset_index(name="count"))
888
- if not trend2.empty:
889
- st.plotly_chart(
890
- px.line(trend2, x="citing_year", y="count", markers=True,
891
- title="Citations per Year")
892
- .update_layout(xaxis_title="Year", yaxis_title="Citations"),
893
- use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
894
 
895
 
896
  # ═══ 7. ANALYTICS ═══════════════════════════════════════════════
@@ -1030,6 +1070,24 @@ with tab_analytics:
1030
  use_container_width=True,
1031
  )
1032
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1033
  # ── Export ─────────────────────────────────────────────────
1034
  st.markdown("---")
1035
  st.subheader("Export Data")
 
780
 
781
  n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
782
 
783
+ # edge type당 고정 샘플 수 — 10 types × 10 = 최대 100 edges
784
+ EDGES_PER_TYPE = 10
785
 
786
  with st.spinner("Querying graph..."):
787
  top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
 
792
  if seed_ids:
793
  ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
794
 
795
+ # 1-hop: seed paper에 연결된 모든 edge type
796
+ # → journal / author / affiliation / city / country / field / citation_event
797
  hop1 = _ddb.execute(f"""
798
  WITH ranked AS (
799
  SELECT source, target, edge_type,
 
807
  WHERE rn <= {EDGES_PER_TYPE}
808
  """).df()
809
 
810
+ # 2-hop: kg_nodes_exp 타입 기반으로 citation_event 노드 ID 추출
811
+ # (prefix 가정 없이 실제 node_type 컬럼으로 확인)
812
+ hop1_all_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
813
+ event_node_ids = (
814
+ kg_nodes_exp[
815
+ kg_nodes_exp["node_id"].isin(hop1_all_ids) &
816
+ (kg_nodes_exp["node_type"] == "citation_event")
817
+ ]["node_id"].tolist()[:40]
818
+ )
819
+
820
+ if event_node_ids:
821
+ ev_sql = ", ".join(f"'{eid}'" for eid in event_node_ids)
822
+ # citation_event → HAS_CITING_PAPER → citing_paper
823
+ # citation_event → HAS_PRIMARY_INTENT → intent
824
  hop2 = _ddb.execute(f"""
825
  WITH ranked AS (
826
  SELECT source, target, edge_type,
 
829
  ) AS rn
830
  FROM read_parquet('{kg_edges_path}')
831
  WHERE (source IN ({ev_sql}) OR target IN ({ev_sql}))
832
+ AND edge_type NOT IN (
833
+ SELECT DISTINCT edge_type
834
+ FROM read_parquet('{kg_edges_path}')
835
+ WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
836
+ )
837
  )
838
  SELECT source, target, edge_type FROM ranked
839
  WHERE rn <= {EDGES_PER_TYPE}
 
877
  fig_map.update_layout(geo=dict(showframe=False), height=500)
878
  st.plotly_chart(fig_map, use_container_width=True)
879
 
880
+ st.subheader("Top Cities")
881
  city_cnt = (seed_filtered.merge(
882
  aff_geo_df[["affiliation_name","city_name","country_name"]],
883
  left_on="affiliation", right_on="affiliation_name", how="left")
 
891
  .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
892
  use_container_width=True)
893
 
894
+ # ── Affiliation μ‹œκ°ν™” ──────────────────────────────────────
895
+ st.subheader("Top Affiliations")
896
+ geo_col1, geo_col2 = st.columns(2)
897
+
898
+ with geo_col1:
899
+ aff_cnt = (seed_filtered[seed_filtered["affiliation"].str.strip() != ""]
900
+ .groupby("affiliation").size()
901
+ .reset_index(name="count")
902
+ .sort_values("count", ascending=False).head(20))
903
+ if not aff_cnt.empty:
904
+ st.plotly_chart(
905
+ px.bar(aff_cnt, x="count", y="affiliation", orientation="h",
906
+ title="Top 20 Affiliations by Seed Papers",
907
+ labels={"count": "Seed Papers", "affiliation": ""})
908
+ .update_layout(yaxis=dict(autorange="reversed"),
909
+ xaxis_title="Seed Papers", yaxis_title="", height=520),
910
+ use_container_width=True)
911
+
912
+ with geo_col2:
913
+ aff_country = (seed_filtered[
914
+ (seed_filtered["affiliation"].str.strip() != "") &
915
+ (seed_filtered["country"].str.strip() != "")
916
+ ]
917
+ .groupby(["country", "affiliation"]).size()
918
+ .reset_index(name="count")
919
+ .sort_values("count", ascending=False)
920
+ )
921
+ top_affs = aff_country.groupby("affiliation")["count"].sum().nlargest(20).index
922
+ aff_country_top = aff_country[aff_country["affiliation"].isin(top_affs)]
923
+ if not aff_country_top.empty:
924
+ st.plotly_chart(
925
+ px.bar(aff_country_top, x="count", y="affiliation",
926
+ color="country", orientation="h",
927
+ title="Top Affiliations by Country",
928
+ labels={"count": "Seed Papers", "affiliation": "", "country": "Country"})
929
+ .update_layout(yaxis=dict(autorange="reversed"),
930
+ barmode="stack",
931
+ xaxis_title="Seed Papers", yaxis_title="",
932
+ legend_title="Country", height=520),
933
+ use_container_width=True)
934
 
935
 
936
  # ═══ 7. ANALYTICS ═══════════════════════════════════════════════
 
1070
  use_container_width=True,
1071
  )
1072
 
1073
+ # ── Citation Trend over Time ────────────────────────────────
1074
+ st.markdown("---")
1075
+ st.subheader("Citation Trend over Time (selected paper)")
1076
+ st.caption("How citations to the selected seed paper have changed year by year")
1077
+ trend_sel = (seed_events.dropna(subset=["citing_year"])
1078
+ .assign(citing_year=lambda df: df["citing_year"].astype(int))
1079
+ .query("citing_year >= 2000")
1080
+ .groupby("citing_year").size().reset_index(name="count"))
1081
+ if not trend_sel.empty:
1082
+ st.plotly_chart(
1083
+ px.line(trend_sel, x="citing_year", y="count", markers=True,
1084
+ labels={"citing_year": "Year", "count": "Citations"})
1085
+ .update_layout(xaxis_title="Year", yaxis_title="Citations",
1086
+ hovermode="x unified"),
1087
+ use_container_width=True)
1088
+ else:
1089
+ st.info("No citation trend data for the selected paper.")
1090
+
1091
  # ── Export ─────────────────────────────────────────────────
1092
  st.markdown("---")
1093
  st.subheader("Export Data")