Spaces:
Sleeping
Sleeping
Upload app.py
Browse files- src/app.py +60 -79
src/app.py
CHANGED
|
@@ -510,7 +510,7 @@ with tab_overview:
|
|
| 510 |
.update_layout(xaxis_title="Year", yaxis_title="Citations"),
|
| 511 |
use_container_width=True)
|
| 512 |
|
| 513 |
-
st.subheader("
|
| 514 |
all_intents = events.groupby("primary_intent").size().to_dict()
|
| 515 |
ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
|
| 516 |
"count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]})
|
|
@@ -519,7 +519,7 @@ with tab_overview:
|
|
| 519 |
fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 520 |
st.plotly_chart(fig2, use_container_width=True)
|
| 521 |
|
| 522 |
-
st.subheader("Field
|
| 523 |
fd = (seed_filtered.groupby("field", dropna=False).size()
|
| 524 |
.reset_index(name="count").sort_values("count", ascending=False).head(20))
|
| 525 |
fd["field"] = fd["field"].replace("","Unknown")
|
|
@@ -546,7 +546,7 @@ with tab_overview:
|
|
| 546 |
|
| 547 |
# ─── 2. CITATION NETWORK ────────────────────────────────────────
|
| 548 |
with tab_cnet:
|
| 549 |
-
st.subheader("
|
| 550 |
st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
|
| 551 |
if seed_events.empty:
|
| 552 |
st.info("No citation network data for this seed paper.")
|
|
@@ -561,31 +561,30 @@ with tab_ontology:
|
|
| 561 |
components.html(pyvis_ontology(), height=820, scrolling=True)
|
| 562 |
|
| 563 |
|
| 564 |
-
# βββ 4. KNOWLEDGE GRAPH
|
| 565 |
with tab_kg:
|
| 566 |
-
st.subheader("Knowledge Graph
|
| 567 |
-
st.caption("kg_nodes + kg_edges에서 선택된 seed paper의 1-hop 서브그래프 (DuckDB 부분 쿼리)")
|
| 568 |
|
| 569 |
max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
|
| 570 |
|
| 571 |
try:
|
| 572 |
-
with st.spinner("
|
| 573 |
-
|
| 574 |
-
kg_edges_path
|
| 575 |
|
| 576 |
seed_doi = selected_seed["doi"]
|
| 577 |
if not seed_doi:
|
| 578 |
-
st.warning("
|
| 579 |
else:
|
| 580 |
node_id = f"seed:{seed_doi}"
|
| 581 |
-
with st.spinner("
|
| 582 |
edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
|
| 583 |
|
| 584 |
if edges_sub.empty:
|
| 585 |
-
st.warning(
|
| 586 |
else:
|
| 587 |
all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
|
| 588 |
-
nodes_sub =
|
| 589 |
|
| 590 |
c1, c2, c3 = st.columns(3)
|
| 591 |
c1.metric("Nodes", fmt_num(len(nodes_sub)))
|
|
@@ -596,8 +595,7 @@ with tab_kg:
|
|
| 596 |
type_counts.columns = ["node_type", "count"]
|
| 597 |
st.plotly_chart(
|
| 598 |
px.bar(type_counts, x="node_type", y="count",
|
| 599 |
-
color="node_type", color_discrete_map=NODE_TYPE_COLORS
|
| 600 |
-
title="Node Type Distribution")
|
| 601 |
.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
|
| 602 |
use_container_width=True)
|
| 603 |
|
|
@@ -607,27 +605,25 @@ with tab_kg:
|
|
| 607 |
st.error(str(e))
|
| 608 |
|
| 609 |
|
| 610 |
-
# ─── 5. KG EXPLORER ─────────────────────────────────────────────
|
| 611 |
with tab_kg_exp:
|
| 612 |
st.subheader("KG Explorer")
|
| 613 |
-
st.caption("kg_nodesλ₯Ό νμνκ³ μμ λ
Έλμ μ°κ²° κ΄κ³λ₯Ό μκ°νν©λλ€. kg_edgesλ DuckDBλ‘ νμν λΆλΆλ§ 쿼리ν©λλ€.")
|
| 614 |
|
| 615 |
try:
|
| 616 |
-
|
| 617 |
-
with st.spinner("KG 데이터 로딩 중... (최초 1회 후 캐시됩니다)"):
|
| 618 |
kg_nodes_exp = load_kg_nodes(data_dir_val)
|
| 619 |
kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
|
| 620 |
enriched_path = get_parquet_path("citation_events_enriched.parquet", data_dir_val)
|
| 621 |
|
| 622 |
-
# ββ
|
| 623 |
col_a, col_b = st.columns([1, 2])
|
| 624 |
with col_a:
|
| 625 |
-
st.subheader("Node
|
| 626 |
nt = kg_nodes_exp["node_type"].value_counts().reset_index()
|
| 627 |
nt.columns = ["node_type", "count"]
|
| 628 |
st.dataframe(nt, use_container_width=True, hide_index=True)
|
| 629 |
|
| 630 |
-
st.subheader("Edge
|
| 631 |
import duckdb as _ddb
|
| 632 |
et = _ddb.execute(f"""
|
| 633 |
SELECT edge_type, COUNT(*) AS count
|
|
@@ -637,70 +633,55 @@ with tab_kg_exp:
|
|
| 637 |
st.dataframe(et, use_container_width=True, hide_index=True)
|
| 638 |
|
| 639 |
with col_b:
|
| 640 |
-
st.subheader("
|
| 641 |
nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
|
| 642 |
color_discrete_map=NODE_TYPE_COLORS)
|
| 643 |
nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 644 |
st.plotly_chart(nt_fig, use_container_width=True)
|
| 645 |
|
| 646 |
-
# ββ
|
| 647 |
st.markdown("---")
|
| 648 |
-
st.subheader("Node
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
exp_edges = query_explorer_edges(sel_node_id, kg_edges_path, max_e)
|
| 682 |
-
if exp_edges.empty:
|
| 683 |
-
st.warning("연결된 엣지가 없습니다.")
|
| 684 |
-
else:
|
| 685 |
-
all_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
|
| 686 |
-
st.session_state["exp_nodes"] = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_ids)]
|
| 687 |
-
st.session_state["exp_edges"] = exp_edges
|
| 688 |
-
|
| 689 |
-
with exp_col2:
|
| 690 |
-
if "exp_nodes" in st.session_state:
|
| 691 |
-
en = st.session_state["exp_nodes"]
|
| 692 |
-
ee = st.session_state["exp_edges"]
|
| 693 |
-
st.caption(f"Nodes: {len(en)} | Edges: {len(ee)}")
|
| 694 |
st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
|
| 695 |
-
components.html(pyvis_from_kg(
|
| 696 |
-
|
| 697 |
-
st.info("왼쪽에서 노드를 선택하고 'Show ego network'를 클릭하세요.")
|
| 698 |
|
| 699 |
-
# ── Enriched 인사이트
|
| 700 |
st.markdown("---")
|
| 701 |
-
st.subheader("
|
| 702 |
-
st.
|
| 703 |
-
with st.spinner("Enriched 통계 쿼리 중 (DuckDB)..."):
|
| 704 |
sem_df, field_df = query_enriched_stats(enriched_path)
|
| 705 |
|
| 706 |
if not sem_df.empty:
|
|
@@ -810,7 +791,7 @@ with tab_analytics:
|
|
| 810 |
col_c, col_d = st.columns(2)
|
| 811 |
|
| 812 |
with col_c:
|
| 813 |
-
st.subheader("Field Γ Intent
|
| 814 |
fi = (seed[["seed_paper_id","field"]]
|
| 815 |
.merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
|
| 816 |
.groupby(["field","primary_intent"]).size().reset_index(name="count"))
|
|
@@ -818,7 +799,7 @@ with tab_analytics:
|
|
| 818 |
pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
|
| 819 |
st.plotly_chart(
|
| 820 |
px.imshow(pivot, color_continuous_scale="Blues",
|
| 821 |
-
title="
|
| 822 |
.update_layout(xaxis_title="Intent", yaxis_title="Field"),
|
| 823 |
use_container_width=True)
|
| 824 |
|
|
|
|
| 510 |
.update_layout(xaxis_title="Year", yaxis_title="Citations"),
|
| 511 |
use_container_width=True)
|
| 512 |
|
| 513 |
+
st.subheader("CitationHub Intent Distribution")
|
| 514 |
all_intents = events.groupby("primary_intent").size().to_dict()
|
| 515 |
ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
|
| 516 |
"count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]})
|
|
|
|
| 519 |
fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 520 |
st.plotly_chart(fig2, use_container_width=True)
|
| 521 |
|
| 522 |
+
st.subheader("CitationHub Field Distribution")
|
| 523 |
fd = (seed_filtered.groupby("field", dropna=False).size()
|
| 524 |
.reset_index(name="count").sort_values("count", ascending=False).head(20))
|
| 525 |
fd["field"] = fd["field"].replace("","Unknown")
|
|
|
|
| 546 |
|
| 547 |
# ─── 2. CITATION NETWORK ────────────────────────────────────────
|
| 548 |
with tab_cnet:
|
| 549 |
+
st.subheader("Citation Network")
|
| 550 |
st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
|
| 551 |
if seed_events.empty:
|
| 552 |
st.info("No citation network data for this seed paper.")
|
|
|
|
| 561 |
components.html(pyvis_ontology(), height=820, scrolling=True)
|
| 562 |
|
| 563 |
|
| 564 |
+
# ─── 4. KNOWLEDGE GRAPH ──────────────────────────────────────────
|
| 565 |
with tab_kg:
|
| 566 |
+
st.subheader("Knowledge Graph")
|
|
|
|
| 567 |
|
| 568 |
max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
|
| 569 |
|
| 570 |
try:
|
| 571 |
+
with st.spinner("Loading..."):
|
| 572 |
+
kg_nodes_kg = load_kg_nodes(data_dir_val)
|
| 573 |
+
kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
|
| 574 |
|
| 575 |
seed_doi = selected_seed["doi"]
|
| 576 |
if not seed_doi:
|
| 577 |
+
st.warning("Selected seed paper has no DOI.")
|
| 578 |
else:
|
| 579 |
node_id = f"seed:{seed_doi}"
|
| 580 |
+
with st.spinner("Querying graph..."):
|
| 581 |
edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
|
| 582 |
|
| 583 |
if edges_sub.empty:
|
| 584 |
+
st.warning("No edges found for this paper in the knowledge graph.")
|
| 585 |
else:
|
| 586 |
all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
|
| 587 |
+
nodes_sub = kg_nodes_kg[kg_nodes_kg["node_id"].isin(all_node_ids)]
|
| 588 |
|
| 589 |
c1, c2, c3 = st.columns(3)
|
| 590 |
c1.metric("Nodes", fmt_num(len(nodes_sub)))
|
|
|
|
| 595 |
type_counts.columns = ["node_type", "count"]
|
| 596 |
st.plotly_chart(
|
| 597 |
px.bar(type_counts, x="node_type", y="count",
|
| 598 |
+
color="node_type", color_discrete_map=NODE_TYPE_COLORS)
|
|
|
|
| 599 |
.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
|
| 600 |
use_container_width=True)
|
| 601 |
|
|
|
|
| 605 |
st.error(str(e))
|
| 606 |
|
| 607 |
|
| 608 |
+
# ─── 5. KG EXPLORER ──────────────────────────────────────────────
|
| 609 |
with tab_kg_exp:
|
| 610 |
st.subheader("KG Explorer")
|
|
|
|
| 611 |
|
| 612 |
try:
|
| 613 |
+
with st.spinner("Loading..."):
|
|
|
|
| 614 |
kg_nodes_exp = load_kg_nodes(data_dir_val)
|
| 615 |
kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
|
| 616 |
enriched_path = get_parquet_path("citation_events_enriched.parquet", data_dir_val)
|
| 617 |
|
| 618 |
+
# ── 노드/엣지 타입 분포 통계
|
| 619 |
col_a, col_b = st.columns([1, 2])
|
| 620 |
with col_a:
|
| 621 |
+
st.subheader("Node Types")
|
| 622 |
nt = kg_nodes_exp["node_type"].value_counts().reset_index()
|
| 623 |
nt.columns = ["node_type", "count"]
|
| 624 |
st.dataframe(nt, use_container_width=True, hide_index=True)
|
| 625 |
|
| 626 |
+
st.subheader("Edge Types")
|
| 627 |
import duckdb as _ddb
|
| 628 |
et = _ddb.execute(f"""
|
| 629 |
SELECT edge_type, COUNT(*) AS count
|
|
|
|
| 633 |
st.dataframe(et, use_container_width=True, hide_index=True)
|
| 634 |
|
| 635 |
with col_b:
|
| 636 |
+
st.subheader("CitationHub KG Node Distribution")
|
| 637 |
nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
|
| 638 |
color_discrete_map=NODE_TYPE_COLORS)
|
| 639 |
nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
|
| 640 |
st.plotly_chart(nt_fig, use_container_width=True)
|
| 641 |
|
| 642 |
+
# ── 멀티노드 자동 시각화 (인용수 상위 seed papers + 연결 노드)
|
| 643 |
st.markdown("---")
|
| 644 |
+
st.subheader("Multi-Node Knowledge Graph")
|
| 645 |
+
|
| 646 |
+
n_seeds = st.slider("Number of seed papers", 3, 20, 8, key="kg_exp_n_seeds")
|
| 647 |
+
max_exp_edges = st.slider("Max edges", 30, 200, 100, key="kg_exp_max_edges")
|
| 648 |
+
|
| 649 |
+
with st.spinner("Querying graph..."):
|
| 650 |
+
# 인용수 상위 seed papers 선택
|
| 651 |
+
top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
|
| 652 |
+
.sort_values("citedby_count", ascending=False)
|
| 653 |
+
.head(n_seeds))
|
| 654 |
+
seed_ids = top_seeds["node_id"].tolist()
|
| 655 |
+
|
| 656 |
+
if seed_ids:
|
| 657 |
+
ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
|
| 658 |
+
# 메타데이터 엣지만 가져옴 (citing/cited/intent 제외 → 저자/저널/분야 등)
|
| 659 |
+
exp_edges = _ddb.execute(f"""
|
| 660 |
+
SELECT source, target, edge_type
|
| 661 |
+
FROM read_parquet('{kg_edges_path}')
|
| 662 |
+
WHERE (source IN ({ids_sql}) OR target IN ({ids_sql}))
|
| 663 |
+
AND edge_type NOT IN (
|
| 664 |
+
'HAS_CITING_PAPER','HAS_CITED_PAPER','HAS_PRIMARY_INTENT'
|
| 665 |
+
)
|
| 666 |
+
LIMIT {int(max_exp_edges)}
|
| 667 |
+
""").df()
|
| 668 |
+
|
| 669 |
+
all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
|
| 670 |
+
exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
|
| 671 |
+
|
| 672 |
+
c1, c2, c3 = st.columns(3)
|
| 673 |
+
c1.metric("Nodes", fmt_num(len(exp_nodes)))
|
| 674 |
+
c2.metric("Edges", fmt_num(len(exp_edges)))
|
| 675 |
+
c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
|
| 676 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
|
| 678 |
+
components.html(pyvis_from_kg(exp_nodes, exp_edges, height="780px"),
|
| 679 |
+
height=800, scrolling=True)
|
|
|
|
| 680 |
|
| 681 |
+
# ── Enriched 인사이트
|
| 682 |
st.markdown("---")
|
| 683 |
+
st.subheader("CitationHub Semantic Evidence Distribution")
|
| 684 |
+
with st.spinner("Loading..."):
|
|
|
|
| 685 |
sem_df, field_df = query_enriched_stats(enriched_path)
|
| 686 |
|
| 687 |
if not sem_df.empty:
|
|
|
|
| 791 |
col_c, col_d = st.columns(2)
|
| 792 |
|
| 793 |
with col_c:
|
| 794 |
+
st.subheader("CitationHub Field × Intent Distribution")
|
| 795 |
fi = (seed[["seed_paper_id","field"]]
|
| 796 |
.merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
|
| 797 |
.groupby(["field","primary_intent"]).size().reset_index(name="count"))
|
|
|
|
| 799 |
pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
|
| 800 |
st.plotly_chart(
|
| 801 |
px.imshow(pivot, color_continuous_scale="Blues",
|
| 802 |
+
title="CitationHub Field × Intent Distribution", aspect="auto")
|
| 803 |
.update_layout(xaxis_title="Intent", yaxis_title="Field"),
|
| 804 |
use_container_width=True)
|
| 805 |
|