Spaces:
Sleeping
Sleeping
Upload app.py
Browse files- src/app.py +36 -52
src/app.py
CHANGED
|
@@ -682,16 +682,6 @@ with tab_overview:
|
|
| 682 |
fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 683 |
st.plotly_chart(fig, use_container_width=True)
|
| 684 |
|
| 685 |
-
st.subheader("Citation trend (selected paper)")
|
| 686 |
-
trend = (seed_events.dropna(subset=["citing_year"])
|
| 687 |
-
.assign(citing_year=lambda df: df["citing_year"].astype(int))
|
| 688 |
-
.groupby("citing_year").size().reset_index(name="count"))
|
| 689 |
-
if not trend.empty:
|
| 690 |
-
st.plotly_chart(
|
| 691 |
-
px.line(trend, x="citing_year", y="count", markers=True)
|
| 692 |
-
.update_layout(xaxis_title="Year", yaxis_title="Citations"),
|
| 693 |
-
use_container_width=True)
|
| 694 |
-
|
| 695 |
st.subheader("CitationHub Intent Distribution")
|
| 696 |
all_intents = events.groupby("primary_intent").size().to_dict()
|
| 697 |
ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
|
|
@@ -752,39 +742,47 @@ with tab_kg_exp:
|
|
| 752 |
kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
|
| 753 |
|
| 754 |
# ── Node/edge type distribution statistics
|
| 755 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
with col_a:
|
| 757 |
st.subheader("Node Types")
|
| 758 |
-
nt = kg_nodes_exp["node_type"].value_counts().reset_index()
|
| 759 |
-
nt.columns = ["node_type", "count"]
|
| 760 |
st.dataframe(nt, use_container_width=True, hide_index=True)
|
| 761 |
-
|
| 762 |
-
st.subheader("Edge Types")
|
| 763 |
-
import duckdb as _ddb
|
| 764 |
-
et = _ddb.execute(f"""
|
| 765 |
-
SELECT edge_type, COUNT(*) AS count
|
| 766 |
-
FROM read_parquet('{kg_edges_path}')
|
| 767 |
-
GROUP BY edge_type ORDER BY count DESC
|
| 768 |
-
""").df()
|
| 769 |
-
st.dataframe(et, use_container_width=True, hide_index=True)
|
| 770 |
-
|
| 771 |
with col_b:
|
| 772 |
st.subheader("CitationHub KG Node Distribution")
|
| 773 |
nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
|
| 774 |
color_discrete_map=NODE_TYPE_COLORS)
|
| 775 |
nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 776 |
st.plotly_chart(nt_fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
|
| 778 |
# ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
|
| 779 |
st.markdown("---")
|
| 780 |
st.subheader("Multi-Node Knowledge Graph")
|
| 781 |
-
st.caption("
|
| 782 |
|
| 783 |
n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
|
| 784 |
-
|
|
|
|
| 785 |
|
| 786 |
with st.spinner("Querying graph..."):
|
| 787 |
-
# ── 1-hop: all edges for the top seed papers by citation count
|
| 788 |
top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
|
| 789 |
.sort_values("citedby_count", ascending=False)
|
| 790 |
.head(n_seeds))
|
|
@@ -793,8 +791,8 @@ with tab_kg_exp:
|
|
| 793 |
if seed_ids:
|
| 794 |
ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
|
| 795 |
|
| 796 |
-
# 1-hop: seed paper
|
| 797 |
-
# country
|
| 798 |
hop1 = _ddb.execute(f"""
|
| 799 |
WITH ranked AS (
|
| 800 |
SELECT source, target, edge_type,
|
|
@@ -805,16 +803,15 @@ with tab_kg_exp:
|
|
| 805 |
WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
|
| 806 |
)
|
| 807 |
SELECT source, target, edge_type FROM ranked
|
| 808 |
-
WHERE rn <= {
|
| 809 |
""").df()
|
| 810 |
|
| 811 |
-
# 2-hop: citation_event →
|
| 812 |
-
# citation_event → HAS_PRIMARY_INTENT → intent
|
| 813 |
event_ids = [
|
| 814 |
x for x in
|
| 815 |
set(hop1["source"].tolist()) | set(hop1["target"].tolist())
|
| 816 |
if str(x).startswith("event:")
|
| 817 |
-
][:
|
| 818 |
|
| 819 |
if event_ids:
|
| 820 |
ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
|
|
@@ -829,7 +826,7 @@ with tab_kg_exp:
|
|
| 829 |
AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
|
| 830 |
)
|
| 831 |
SELECT source, target, edge_type FROM ranked
|
| 832 |
-
WHERE rn <= {
|
| 833 |
""").df()
|
| 834 |
exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
|
| 835 |
subset=["source", "target", "edge_type"]
|
|
@@ -841,26 +838,13 @@ with tab_kg_exp:
|
|
| 841 |
exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
|
| 842 |
|
| 843 |
c1, c2, c3, c4 = st.columns(4)
|
| 844 |
-
c1.metric("Nodes",
|
| 845 |
-
c2.metric("Edges",
|
| 846 |
-
c3.metric("Node types",
|
| 847 |
-
c4.metric("Edge types",
|
| 848 |
-
|
| 849 |
-
# Coverage check indicator
|
| 850 |
-
present_ntypes = sorted(exp_nodes["node_type"].unique().tolist())
|
| 851 |
-
present_etypes = sorted(exp_edges["edge_type"].unique().tolist())
|
| 852 |
-
all_10_ntypes = sorted(NODE_TYPE_COLORS.keys())
|
| 853 |
-
missing_nt = [t for t in all_10_ntypes if t not in present_ntypes]
|
| 854 |
-
if missing_nt:
|
| 855 |
-
st.caption(f"β Node types not yet in graph: {', '.join(missing_nt)} "
|
| 856 |
-
f"β try increasing 'Edges per type'")
|
| 857 |
-
else:
|
| 858 |
-
st.caption("✅ All 10 node types represented")
|
| 859 |
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
seed_node_ids=seed_ids),
|
| 863 |
-
use_container_width=True)
|
| 864 |
|
| 865 |
except Exception as e:
|
| 866 |
st.error(str(e))
|
|
|
|
| 682 |
fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 683 |
st.plotly_chart(fig, use_container_width=True)
|
| 684 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 685 |
st.subheader("CitationHub Intent Distribution")
|
| 686 |
all_intents = events.groupby("primary_intent").size().to_dict()
|
| 687 |
ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
|
|
|
|
| 742 |
kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
|
| 743 |
|
| 744 |
# ── Node/edge type distribution statistics
|
| 745 |
+
import duckdb as _ddb
|
| 746 |
+
|
| 747 |
+
nt = kg_nodes_exp["node_type"].value_counts().reset_index()
|
| 748 |
+
nt.columns = ["node_type", "count"]
|
| 749 |
+
|
| 750 |
+
et = _ddb.execute(f"""
|
| 751 |
+
SELECT edge_type, COUNT(*) AS count
|
| 752 |
+
FROM read_parquet('{kg_edges_path}')
|
| 753 |
+
GROUP BY edge_type ORDER BY count DESC
|
| 754 |
+
""").df()
|
| 755 |
+
|
| 756 |
+
col_a, col_b, col_c, col_d = st.columns([1, 2, 1, 2])
|
| 757 |
with col_a:
|
| 758 |
st.subheader("Node Types")
|
|
|
|
|
|
|
| 759 |
st.dataframe(nt, use_container_width=True, hide_index=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
with col_b:
|
| 761 |
st.subheader("CitationHub KG Node Distribution")
|
| 762 |
nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
|
| 763 |
color_discrete_map=NODE_TYPE_COLORS)
|
| 764 |
nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 765 |
st.plotly_chart(nt_fig, use_container_width=True)
|
| 766 |
+
with col_c:
|
| 767 |
+
st.subheader("Edge Types")
|
| 768 |
+
st.dataframe(et, use_container_width=True, hide_index=True)
|
| 769 |
+
with col_d:
|
| 770 |
+
st.subheader("CitationHub KG Edge Distribution")
|
| 771 |
+
et_fig = px.bar(et, x="edge_type", y="count", color="edge_type")
|
| 772 |
+
et_fig.update_layout(showlegend=False, xaxis_title="",
|
| 773 |
+
yaxis_title="Count", xaxis_tickangle=-35)
|
| 774 |
+
st.plotly_chart(et_fig, use_container_width=True)
|
| 775 |
|
| 776 |
# ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
|
| 777 |
st.markdown("---")
|
| 778 |
st.subheader("Multi-Node Knowledge Graph")
|
| 779 |
+
st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
|
| 780 |
|
| 781 |
n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
|
| 782 |
+
|
| 783 |
+
EDGES_PER_TYPE = 10 # fixed sample size per edge type → 10 types × 10 = at most 100 edges
|
| 784 |
|
| 785 |
with st.spinner("Querying graph..."):
|
|
|
|
| 786 |
top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
|
| 787 |
.sort_values("citedby_count", ascending=False)
|
| 788 |
.head(n_seeds))
|
|
|
|
| 791 |
if seed_ids:
|
| 792 |
ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
|
| 793 |
|
| 794 |
+
# 1-hop: every edge type connected to the seed papers (journal/author/affiliation/
|
| 795 |
+
# city/country/field/citation_event included)
|
| 796 |
hop1 = _ddb.execute(f"""
|
| 797 |
WITH ranked AS (
|
| 798 |
SELECT source, target, edge_type,
|
|
|
|
| 803 |
WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
|
| 804 |
)
|
| 805 |
SELECT source, target, edge_type FROM ranked
|
| 806 |
+
WHERE rn <= {EDGES_PER_TYPE}
|
| 807 |
""").df()
|
| 808 |
|
| 809 |
+
# 2-hop: citation_event → citing_paper / intent
|
|
|
|
| 810 |
event_ids = [
|
| 811 |
x for x in
|
| 812 |
set(hop1["source"].tolist()) | set(hop1["target"].tolist())
|
| 813 |
if str(x).startswith("event:")
|
| 814 |
+
][:30]
|
| 815 |
|
| 816 |
if event_ids:
|
| 817 |
ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
|
|
|
|
| 826 |
AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
|
| 827 |
)
|
| 828 |
SELECT source, target, edge_type FROM ranked
|
| 829 |
+
WHERE rn <= {EDGES_PER_TYPE}
|
| 830 |
""").df()
|
| 831 |
exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
|
| 832 |
subset=["source", "target", "edge_type"]
|
|
|
|
| 838 |
exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
|
| 839 |
|
| 840 |
c1, c2, c3, c4 = st.columns(4)
|
| 841 |
+
c1.metric("Nodes", fmt_num(len(exp_nodes)))
|
| 842 |
+
c2.metric("Edges", fmt_num(len(exp_edges)))
|
| 843 |
+
c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
|
| 844 |
+
c4.metric("Edge types", fmt_num(exp_edges["edge_type"].nunique()))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 845 |
|
| 846 |
+
kg_html = pyvis_from_kg(exp_nodes, exp_edges)
|
| 847 |
+
components.html(kg_html, height=860, scrolling=True)
|
|
|
|
|
|
|
| 848 |
|
| 849 |
except Exception as e:
|
| 850 |
st.error(str(e))
|