Spaces:
Sleeping
Sleeping
Upload app.py
Browse files- src/app.py +82 -24
src/app.py
CHANGED
|
@@ -780,7 +780,8 @@ with tab_kg_exp:
|
|
| 780 |
|
| 781 |
n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
|
| 782 |
|
| 783 |
-
|
|
|
|
| 784 |
|
| 785 |
with st.spinner("Querying graph..."):
|
| 786 |
top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
|
|
@@ -791,8 +792,8 @@ with tab_kg_exp:
|
|
| 791 |
if seed_ids:
|
| 792 |
ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
|
| 793 |
|
| 794 |
-
# 1-hop: all edge types connected to the seed papers
|
| 795 |
-
#
|
| 796 |
hop1 = _ddb.execute(f"""
|
| 797 |
WITH ranked AS (
|
| 798 |
SELECT source, target, edge_type,
|
|
@@ -806,15 +807,20 @@ with tab_kg_exp:
|
|
| 806 |
WHERE rn <= {EDGES_PER_TYPE}
|
| 807 |
""").df()
|
| 808 |
|
| 809 |
-
# 2-hop:
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
hop2 = _ddb.execute(f"""
|
| 819 |
WITH ranked AS (
|
| 820 |
SELECT source, target, edge_type,
|
|
@@ -823,7 +829,11 @@ with tab_kg_exp:
|
|
| 823 |
) AS rn
|
| 824 |
FROM read_parquet('{kg_edges_path}')
|
| 825 |
WHERE (source IN ({ev_sql}) OR target IN ({ev_sql}))
|
| 826 |
-
AND edge_type IN (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 827 |
)
|
| 828 |
SELECT source, target, edge_type FROM ranked
|
| 829 |
WHERE rn <= {EDGES_PER_TYPE}
|
|
@@ -867,7 +877,7 @@ with tab_geo:
|
|
| 867 |
fig_map.update_layout(geo=dict(showframe=False), height=500)
|
| 868 |
st.plotly_chart(fig_map, use_container_width=True)
|
| 869 |
|
| 870 |
-
st.subheader("Top Cities
|
| 871 |
city_cnt = (seed_filtered.merge(
|
| 872 |
aff_geo_df[["affiliation_name","city_name","country_name"]],
|
| 873 |
left_on="affiliation", right_on="affiliation_name", how="left")
|
|
@@ -881,16 +891,46 @@ with tab_geo:
|
|
| 881 |
.update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
|
| 882 |
use_container_width=True)
|
| 883 |
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 894 |
|
| 895 |
|
| 896 |
# ─── 7. ANALYTICS ───────────────────────────────────────────────
|
|
@@ -1030,6 +1070,24 @@ with tab_analytics:
|
|
| 1030 |
use_container_width=True,
|
| 1031 |
)
|
| 1032 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1033 |
# ── Export ─────────────────────────────────────────────────
|
| 1034 |
st.markdown("---")
|
| 1035 |
st.subheader("Export Data")
|
|
|
|
| 780 |
|
| 781 |
n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
|
| 782 |
|
| 783 |
+
# Fixed sample count per edge type → 10 types × 10 = at most 100 edges
|
| 784 |
+
EDGES_PER_TYPE = 10
|
| 785 |
|
| 786 |
with st.spinner("Querying graph..."):
|
| 787 |
top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
|
|
|
|
| 792 |
if seed_ids:
|
| 793 |
ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
|
| 794 |
|
| 795 |
+
# 1-hop: all edge types connected to the seed papers
|
| 796 |
+
# → journal / author / affiliation / city / country / field / citation_event
|
| 797 |
hop1 = _ddb.execute(f"""
|
| 798 |
WITH ranked AS (
|
| 799 |
SELECT source, target, edge_type,
|
|
|
|
| 807 |
WHERE rn <= {EDGES_PER_TYPE}
|
| 808 |
""").df()
|
| 809 |
|
| 810 |
+
# 2-hop: extract citation_event node IDs based on the kg_nodes_exp node type
|
| 811 |
+
# (checked against the actual node_type column, without assuming an ID prefix)
|
| 812 |
+
hop1_all_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
|
| 813 |
+
event_node_ids = (
|
| 814 |
+
kg_nodes_exp[
|
| 815 |
+
kg_nodes_exp["node_id"].isin(hop1_all_ids) &
|
| 816 |
+
(kg_nodes_exp["node_type"] == "citation_event")
|
| 817 |
+
]["node_id"].tolist()[:40]
|
| 818 |
+
)
|
| 819 |
+
|
| 820 |
+
if event_node_ids:
|
| 821 |
+
ev_sql = ", ".join(f"'{eid}'" for eid in event_node_ids)
|
| 822 |
+
# citation_event → HAS_CITING_PAPER → citing_paper
|
| 823 |
+
# citation_event → HAS_PRIMARY_INTENT → intent
|
| 824 |
hop2 = _ddb.execute(f"""
|
| 825 |
WITH ranked AS (
|
| 826 |
SELECT source, target, edge_type,
|
|
|
|
| 829 |
) AS rn
|
| 830 |
FROM read_parquet('{kg_edges_path}')
|
| 831 |
WHERE (source IN ({ev_sql}) OR target IN ({ev_sql}))
|
| 832 |
+
AND edge_type NOT IN (
|
| 833 |
+
SELECT DISTINCT edge_type
|
| 834 |
+
FROM read_parquet('{kg_edges_path}')
|
| 835 |
+
WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
|
| 836 |
+
)
|
| 837 |
)
|
| 838 |
SELECT source, target, edge_type FROM ranked
|
| 839 |
WHERE rn <= {EDGES_PER_TYPE}
|
|
|
|
| 877 |
fig_map.update_layout(geo=dict(showframe=False), height=500)
|
| 878 |
st.plotly_chart(fig_map, use_container_width=True)
|
| 879 |
|
| 880 |
+
st.subheader("Top Cities")
|
| 881 |
city_cnt = (seed_filtered.merge(
|
| 882 |
aff_geo_df[["affiliation_name","city_name","country_name"]],
|
| 883 |
left_on="affiliation", right_on="affiliation_name", how="left")
|
|
|
|
| 891 |
.update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
|
| 892 |
use_container_width=True)
|
| 893 |
|
| 894 |
+
# ── Affiliation visualization ──────────────────────────────────────
|
| 895 |
+
st.subheader("Top Affiliations")
|
| 896 |
+
geo_col1, geo_col2 = st.columns(2)
|
| 897 |
+
|
| 898 |
+
with geo_col1:
|
| 899 |
+
aff_cnt = (seed_filtered[seed_filtered["affiliation"].str.strip() != ""]
|
| 900 |
+
.groupby("affiliation").size()
|
| 901 |
+
.reset_index(name="count")
|
| 902 |
+
.sort_values("count", ascending=False).head(20))
|
| 903 |
+
if not aff_cnt.empty:
|
| 904 |
+
st.plotly_chart(
|
| 905 |
+
px.bar(aff_cnt, x="count", y="affiliation", orientation="h",
|
| 906 |
+
title="Top 20 Affiliations by Seed Papers",
|
| 907 |
+
labels={"count": "Seed Papers", "affiliation": ""})
|
| 908 |
+
.update_layout(yaxis=dict(autorange="reversed"),
|
| 909 |
+
xaxis_title="Seed Papers", yaxis_title="", height=520),
|
| 910 |
+
use_container_width=True)
|
| 911 |
+
|
| 912 |
+
with geo_col2:
|
| 913 |
+
aff_country = (seed_filtered[
|
| 914 |
+
(seed_filtered["affiliation"].str.strip() != "") &
|
| 915 |
+
(seed_filtered["country"].str.strip() != "")
|
| 916 |
+
]
|
| 917 |
+
.groupby(["country", "affiliation"]).size()
|
| 918 |
+
.reset_index(name="count")
|
| 919 |
+
.sort_values("count", ascending=False)
|
| 920 |
+
)
|
| 921 |
+
top_affs = aff_country.groupby("affiliation")["count"].sum().nlargest(20).index
|
| 922 |
+
aff_country_top = aff_country[aff_country["affiliation"].isin(top_affs)]
|
| 923 |
+
if not aff_country_top.empty:
|
| 924 |
+
st.plotly_chart(
|
| 925 |
+
px.bar(aff_country_top, x="count", y="affiliation",
|
| 926 |
+
color="country", orientation="h",
|
| 927 |
+
title="Top Affiliations by Country",
|
| 928 |
+
labels={"count": "Seed Papers", "affiliation": "", "country": "Country"})
|
| 929 |
+
.update_layout(yaxis=dict(autorange="reversed"),
|
| 930 |
+
barmode="stack",
|
| 931 |
+
xaxis_title="Seed Papers", yaxis_title="",
|
| 932 |
+
legend_title="Country", height=520),
|
| 933 |
+
use_container_width=True)
|
| 934 |
|
| 935 |
|
| 936 |
# ─── 7. ANALYTICS ───────────────────────────────────────────────
|
|
|
|
| 1070 |
use_container_width=True,
|
| 1071 |
)
|
| 1072 |
|
| 1073 |
+
# ── Citation Trend over Time ────────────────────────────────
|
| 1074 |
+
st.markdown("---")
|
| 1075 |
+
st.subheader("Citation Trend over Time (selected paper)")
|
| 1076 |
+
st.caption("How citations to the selected seed paper have changed year by year")
|
| 1077 |
+
trend_sel = (seed_events.dropna(subset=["citing_year"])
|
| 1078 |
+
.assign(citing_year=lambda df: df["citing_year"].astype(int))
|
| 1079 |
+
.query("citing_year >= 2000")
|
| 1080 |
+
.groupby("citing_year").size().reset_index(name="count"))
|
| 1081 |
+
if not trend_sel.empty:
|
| 1082 |
+
st.plotly_chart(
|
| 1083 |
+
px.line(trend_sel, x="citing_year", y="count", markers=True,
|
| 1084 |
+
labels={"citing_year": "Year", "count": "Citations"})
|
| 1085 |
+
.update_layout(xaxis_title="Year", yaxis_title="Citations",
|
| 1086 |
+
hovermode="x unified"),
|
| 1087 |
+
use_container_width=True)
|
| 1088 |
+
else:
|
| 1089 |
+
st.info("No citation trend data for the selected paper.")
|
| 1090 |
+
|
| 1091 |
# ── Export ─────────────────────────────────────────────────
|
| 1092 |
st.markdown("---")
|
| 1093 |
st.subheader("Export Data")
|