Spaces:
Sleeping
Sleeping
Upload app.py
Browse files- src/app.py +80 -82
src/app.py
CHANGED
|
@@ -636,10 +636,10 @@ contexts_df = build_context_rows(seed_events)
|
|
| 636 |
citing_table = build_citing_table(seed_events)
|
| 637 |
|
| 638 |
# ── Tabs ─────────────────────────────────────────────────────────
|
| 639 |
-
(tab_overview, tab_cnet, tab_ontology,
|
| 640 |
tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
|
| 641 |
"Overview","Citation Network","Ontology",
|
| 642 |
-
"Knowledge Graph","
|
| 643 |
])
|
| 644 |
|
| 645 |
|
|
@@ -742,52 +742,7 @@ with tab_ontology:
|
|
| 742 |
st.plotly_chart(plotly_ontology_fig(height=750), use_container_width=True)
|
| 743 |
|
| 744 |
|
| 745 |
-
# ─── 4. KNOWLEDGE GRAPH ───────────────────────────
|
| 746 |
-
with tab_kg:
|
| 747 |
-
st.subheader("Knowledge Graph")
|
| 748 |
-
|
| 749 |
-
max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
|
| 750 |
-
|
| 751 |
-
try:
|
| 752 |
-
with st.spinner("Loading..."):
|
| 753 |
-
kg_nodes_kg = load_kg_nodes(data_dir_val)
|
| 754 |
-
kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
|
| 755 |
-
|
| 756 |
-
seed_doi = selected_seed["doi"]
|
| 757 |
-
if not seed_doi:
|
| 758 |
-
st.warning("Selected seed paper has no DOI.")
|
| 759 |
-
else:
|
| 760 |
-
node_id = f"seed:{seed_doi}"
|
| 761 |
-
with st.spinner("Querying graph..."):
|
| 762 |
-
edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
|
| 763 |
-
|
| 764 |
-
if edges_sub.empty:
|
| 765 |
-
st.warning("No edges found for this paper in the knowledge graph.")
|
| 766 |
-
else:
|
| 767 |
-
all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
|
| 768 |
-
nodes_sub = kg_nodes_kg[kg_nodes_kg["node_id"].isin(all_node_ids)]
|
| 769 |
-
|
| 770 |
-
c1, c2, c3 = st.columns(3)
|
| 771 |
-
c1.metric("Nodes", fmt_num(len(nodes_sub)))
|
| 772 |
-
c2.metric("Edges", fmt_num(len(edges_sub)))
|
| 773 |
-
c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))
|
| 774 |
-
|
| 775 |
-
type_counts = nodes_sub["node_type"].value_counts().reset_index()
|
| 776 |
-
type_counts.columns = ["node_type", "count"]
|
| 777 |
-
st.plotly_chart(
|
| 778 |
-
px.bar(type_counts, x="node_type", y="count",
|
| 779 |
-
color="node_type", color_discrete_map=NODE_TYPE_COLORS)
|
| 780 |
-
.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
|
| 781 |
-
use_container_width=True)
|
| 782 |
-
|
| 783 |
-
st.plotly_chart(
|
| 784 |
-
plotly_network_fig(nodes_sub, edges_sub, height=750),
|
| 785 |
-
use_container_width=True)
|
| 786 |
-
except Exception as e:
|
| 787 |
-
st.error(str(e))
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
# ─── 5. KG EXPLORER ──────────────────────────────────────────────
|
| 791 |
with tab_kg_exp:
|
| 792 |
st.subheader("KG Explorer")
|
| 793 |
|
|
@@ -820,15 +775,16 @@ with tab_kg_exp:
|
|
| 820 |
nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 821 |
st.plotly_chart(nt_fig, use_container_width=True)
|
| 822 |
|
| 823 |
-
# ──
|
| 824 |
st.markdown("---")
|
| 825 |
st.subheader("Multi-Node Knowledge Graph")
|
|
|
|
| 826 |
|
| 827 |
-
n_seeds = st.slider("Number of seed papers", 3,
|
| 828 |
-
|
| 829 |
|
| 830 |
with st.spinner("Querying graph..."):
|
| 831 |
-
# seed papers with the highest citation counts
|
| 832 |
top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
|
| 833 |
.sort_values("citedby_count", ascending=False)
|
| 834 |
.head(n_seeds))
|
|
@@ -836,27 +792,74 @@ with tab_kg_exp:
|
|
| 836 |
|
| 837 |
if seed_ids:
|
| 838 |
ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 848 |
""").df()
|
| 849 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 850 |
all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
|
| 851 |
exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
|
| 852 |
|
| 853 |
-
c1, c2, c3 = st.columns(
|
| 854 |
-
c1.metric("Nodes",
|
| 855 |
-
c2.metric("Edges",
|
| 856 |
-
c3.metric("Node types",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 857 |
|
| 858 |
st.plotly_chart(
|
| 859 |
-
plotly_network_fig(exp_nodes, exp_edges, height=
|
|
|
|
| 860 |
use_container_width=True)
|
| 861 |
|
| 862 |
except Exception as e:
|
|
@@ -946,7 +949,7 @@ with tab_analytics:
|
|
| 946 |
col_c, col_d = st.columns(2)
|
| 947 |
|
| 948 |
with col_c:
|
| 949 |
-
st.subheader("CitationHub Field × Intent Distribution")
|
| 950 |
fi = (seed[["seed_paper_id","field"]]
|
| 951 |
.merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
|
| 952 |
.groupby(["field","primary_intent"]).size().reset_index(name="count"))
|
|
@@ -954,7 +957,8 @@ with tab_analytics:
|
|
| 954 |
pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
|
| 955 |
st.plotly_chart(
|
| 956 |
px.imshow(pivot, color_continuous_scale="Blues",
|
| 957 |
-
title="CitationHub Field × Intent Distribution
|
|
|
|
| 958 |
.update_layout(xaxis_title="Intent", yaxis_title="Field"),
|
| 959 |
use_container_width=True)
|
| 960 |
|
|
@@ -969,13 +973,6 @@ with tab_analytics:
|
|
| 969 |
title="Influential vs Non-influential"),
|
| 970 |
use_container_width=True)
|
| 971 |
|
| 972 |
-
st.subheader("Intent Reference")
|
| 973 |
-
st.dataframe(intents_df, use_container_width=True, hide_index=True)
|
| 974 |
-
|
| 975 |
-
st.markdown("---")
|
| 976 |
-
st.subheader("Field Reference")
|
| 977 |
-
st.dataframe(fields_df, use_container_width=True, hide_index=True)
|
| 978 |
-
|
| 979 |
# ── Intent Evolution over Years ────────────────────────────
|
| 980 |
st.markdown("---")
|
| 981 |
st.subheader("CitationHub Intent Evolution over Years")
|
|
@@ -983,6 +980,7 @@ with tab_analytics:
|
|
| 983 |
intent_trend_raw = (
|
| 984 |
events.dropna(subset=["citing_year"])
|
| 985 |
.assign(year=lambda df: df["citing_year"].astype(int))
|
|
|
|
| 986 |
.groupby(["year", "primary_intent"]).size()
|
| 987 |
.reset_index(name="count")
|
| 988 |
)
|
|
@@ -1023,8 +1021,8 @@ with tab_analytics:
|
|
| 1023 |
)
|
| 1024 |
|
| 1025 |
with col_v2:
|
| 1026 |
-
st.subheader("
|
| 1027 |
-
st.caption("How each field uses citations differently")
|
| 1028 |
fi_pct = (
|
| 1029 |
seed[["seed_paper_id", "field"]]
|
| 1030 |
.merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
|
|
@@ -1033,17 +1031,17 @@ with tab_analytics:
|
|
| 1033 |
if not fi_pct.empty:
|
| 1034 |
totals = fi_pct.groupby("field")["count"].transform("sum")
|
| 1035 |
fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1)
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
st.plotly_chart(
|
| 1039 |
-
px.bar(
|
| 1040 |
orientation="h", color_discrete_map=INTENT_COLORS,
|
| 1041 |
labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"})
|
| 1042 |
.update_layout(
|
| 1043 |
barmode="stack",
|
| 1044 |
-
yaxis=dict(autorange="reversed"),
|
| 1045 |
xaxis_title="% of citations", yaxis_title="",
|
| 1046 |
-
legend_title="Intent", height=
|
| 1047 |
),
|
| 1048 |
use_container_width=True,
|
| 1049 |
)
|
|
|
|
| 636 |
citing_table = build_citing_table(seed_events)
|
| 637 |
|
| 638 |
# ── Tabs ─────────────────────────────────────────────────────────
|
| 639 |
+
(tab_overview, tab_cnet, tab_ontology,
|
| 640 |
tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
|
| 641 |
"Overview","Citation Network","Ontology",
|
| 642 |
+
"Knowledge Graph","Geographic Map","Analytics",
|
| 643 |
])
|
| 644 |
|
| 645 |
|
|
|
|
| 742 |
st.plotly_chart(plotly_ontology_fig(height=750), use_container_width=True)
|
| 743 |
|
| 744 |
|
| 745 |
+
# ─── 4. KNOWLEDGE GRAPH (KG Explorer) ───────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 746 |
with tab_kg_exp:
|
| 747 |
st.subheader("KG Explorer")
|
| 748 |
|
|
|
|
| 775 |
nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 776 |
st.plotly_chart(nt_fig, use_container_width=True)
|
| 777 |
|
| 778 |
+
# ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
|
| 779 |
st.markdown("---")
|
| 780 |
st.subheader("Multi-Node Knowledge Graph")
|
| 781 |
+
st.caption("All 10 node types and all edge types — 2-hop from top cited seed papers")
|
| 782 |
|
| 783 |
+
n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
|
| 784 |
+
edges_per_type = st.slider("Edges per type (max)", 3, 20, 8, key="kg_exp_edges_per_type")
|
| 785 |
|
| 786 |
with st.spinner("Querying graph..."):
|
| 787 |
+
# ── 1-hop: all edges around the top-cited seed papers
|
| 788 |
top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
|
| 789 |
.sort_values("citedby_count", ascending=False)
|
| 790 |
.head(n_seeds))
|
|
|
|
| 792 |
|
| 793 |
if seed_ids:
|
| 794 |
ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
|
| 795 |
+
|
| 796 |
+
# 1-hop: all edges connected to a seed paper (journal, author, affiliation, city,
|
| 797 |
+
# country, field, citation_event)
|
| 798 |
+
hop1 = _ddb.execute(f"""
|
| 799 |
+
WITH ranked AS (
|
| 800 |
+
SELECT source, target, edge_type,
|
| 801 |
+
ROW_NUMBER() OVER (
|
| 802 |
+
PARTITION BY edge_type ORDER BY source
|
| 803 |
+
) AS rn
|
| 804 |
+
FROM read_parquet('{kg_edges_path}')
|
| 805 |
+
WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
|
| 806 |
+
)
|
| 807 |
+
SELECT source, target, edge_type FROM ranked
|
| 808 |
+
WHERE rn <= {int(edges_per_type)}
|
| 809 |
""").df()
|
| 810 |
|
| 811 |
+
# 2-hop: citation_event → HAS_CITING_PAPER → citing_paper
|
| 812 |
+
# citation_event → HAS_PRIMARY_INTENT → intent
|
| 813 |
+
event_ids = [
|
| 814 |
+
x for x in
|
| 815 |
+
set(hop1["source"].tolist()) | set(hop1["target"].tolist())
|
| 816 |
+
if str(x).startswith("event:")
|
| 817 |
+
][:20]
|
| 818 |
+
|
| 819 |
+
if event_ids:
|
| 820 |
+
ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
|
| 821 |
+
hop2 = _ddb.execute(f"""
|
| 822 |
+
WITH ranked AS (
|
| 823 |
+
SELECT source, target, edge_type,
|
| 824 |
+
ROW_NUMBER() OVER (
|
| 825 |
+
PARTITION BY edge_type ORDER BY source
|
| 826 |
+
) AS rn
|
| 827 |
+
FROM read_parquet('{kg_edges_path}')
|
| 828 |
+
WHERE (source IN ({ev_sql}) OR target IN ({ev_sql}))
|
| 829 |
+
AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
|
| 830 |
+
)
|
| 831 |
+
SELECT source, target, edge_type FROM ranked
|
| 832 |
+
WHERE rn <= {int(edges_per_type)}
|
| 833 |
+
""").df()
|
| 834 |
+
exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
|
| 835 |
+
subset=["source", "target", "edge_type"]
|
| 836 |
+
)
|
| 837 |
+
else:
|
| 838 |
+
exp_edges = hop1
|
| 839 |
+
|
| 840 |
all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
|
| 841 |
exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
|
| 842 |
|
| 843 |
+
c1, c2, c3, c4 = st.columns(4)
|
| 844 |
+
c1.metric("Nodes", fmt_num(len(exp_nodes)))
|
| 845 |
+
c2.metric("Edges", fmt_num(len(exp_edges)))
|
| 846 |
+
c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
|
| 847 |
+
c4.metric("Edge types", fmt_num(exp_edges["edge_type"].nunique()))
|
| 848 |
+
|
| 849 |
+
# Coverage check indicator
|
| 850 |
+
present_ntypes = sorted(exp_nodes["node_type"].unique().tolist())
|
| 851 |
+
present_etypes = sorted(exp_edges["edge_type"].unique().tolist())
|
| 852 |
+
all_10_ntypes = sorted(NODE_TYPE_COLORS.keys())
|
| 853 |
+
missing_nt = [t for t in all_10_ntypes if t not in present_ntypes]
|
| 854 |
+
if missing_nt:
|
| 855 |
+
st.caption(f"⚠ Node types not yet in graph: {', '.join(missing_nt)} "
|
| 856 |
+
f"— try increasing 'Edges per type'")
|
| 857 |
+
else:
|
| 858 |
+
st.caption("✅ All 10 node types represented")
|
| 859 |
|
| 860 |
st.plotly_chart(
|
| 861 |
+
plotly_network_fig(exp_nodes, exp_edges, height=800,
|
| 862 |
+
seed_node_ids=seed_ids),
|
| 863 |
use_container_width=True)
|
| 864 |
|
| 865 |
except Exception as e:
|
|
|
|
| 949 |
col_c, col_d = st.columns(2)
|
| 950 |
|
| 951 |
with col_c:
|
| 952 |
+
st.subheader("CitationHub Field × Intent Distribution Heatmap")
|
| 953 |
fi = (seed[["seed_paper_id","field"]]
|
| 954 |
.merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
|
| 955 |
.groupby(["field","primary_intent"]).size().reset_index(name="count"))
|
|
|
|
| 957 |
pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
|
| 958 |
st.plotly_chart(
|
| 959 |
px.imshow(pivot, color_continuous_scale="Blues",
|
| 960 |
+
title="CitationHub Field × Intent Distribution Heatmap",
|
| 961 |
+
aspect="auto")
|
| 962 |
.update_layout(xaxis_title="Intent", yaxis_title="Field"),
|
| 963 |
use_container_width=True)
|
| 964 |
|
|
|
|
| 973 |
title="Influential vs Non-influential"),
|
| 974 |
use_container_width=True)
|
| 975 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 976 |
# ── Intent Evolution over Years ────────────────────────────
|
| 977 |
st.markdown("---")
|
| 978 |
st.subheader("CitationHub Intent Evolution over Years")
|
|
|
|
| 980 |
intent_trend_raw = (
|
| 981 |
events.dropna(subset=["citing_year"])
|
| 982 |
.assign(year=lambda df: df["citing_year"].astype(int))
|
| 983 |
+
.query("year >= 2000")
|
| 984 |
.groupby(["year", "primary_intent"]).size()
|
| 985 |
.reset_index(name="count")
|
| 986 |
)
|
|
|
|
| 1021 |
)
|
| 1022 |
|
| 1023 |
with col_v2:
|
| 1024 |
+
st.subheader("CitationHub Field × Intent Distribution")
|
| 1025 |
+
st.caption("How each field uses citations differently (all fields)")
|
| 1026 |
fi_pct = (
|
| 1027 |
seed[["seed_paper_id", "field"]]
|
| 1028 |
.merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
|
|
|
|
| 1031 |
if not fi_pct.empty:
|
| 1032 |
totals = fi_pct.groupby("field")["count"].transform("sum")
|
| 1033 |
fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1)
|
| 1034 |
+
n_fields = fi_pct["field"].nunique()
|
| 1035 |
+
chart_height = max(520, n_fields * 28)
|
| 1036 |
st.plotly_chart(
|
| 1037 |
+
px.bar(fi_pct, x="pct", y="field", color="primary_intent",
|
| 1038 |
orientation="h", color_discrete_map=INTENT_COLORS,
|
| 1039 |
labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"})
|
| 1040 |
.update_layout(
|
| 1041 |
barmode="stack",
|
| 1042 |
+
yaxis=dict(autorange="reversed", categoryorder="total ascending"),
|
| 1043 |
xaxis_title="% of citations", yaxis_title="",
|
| 1044 |
+
legend_title="Intent", height=chart_height,
|
| 1045 |
),
|
| 1046 |
use_container_width=True,
|
| 1047 |
)
|