Spaces:

Daniel0315
/

cithub_website

Sleeping

App Files Community

Daniel0315 committed on Mar 19

Commit

a2524a3

verified ·

1 Parent(s): f7077fb

Upload app.py

Browse files

Files changed (1) hide show

src/app.py +80 -82

src/app.py CHANGED Viewed

@@ -636,10 +636,10 @@ contexts_df    = build_context_rows(seed_events)
 citing_table   = build_citing_table(seed_events)
 # ── 탭 ─────────────────────────────────────────────────────────
-(tab_overview, tab_cnet, tab_ontology, tab_kg,
  tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
     "Overview","Citation Network","Ontology",
-    "Knowledge Graph","KG Explorer","Geographic Map","Analytics",
 ])
@@ -742,52 +742,7 @@ with tab_ontology:
     st.plotly_chart(plotly_ontology_fig(height=750), use_container_width=True)
-# ═══ 4. KNOWLEDGE GRAPH ══════════════════════════════════════════
-with tab_kg:
-    st.subheader("Knowledge Graph")
-    max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
-    try:
-        with st.spinner("Loading..."):
-            kg_nodes_kg   = load_kg_nodes(data_dir_val)
-            kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
-        seed_doi = selected_seed["doi"]
-        if not seed_doi:
-            st.warning("Selected seed paper has no DOI.")
-        else:
-            node_id = f"seed:{seed_doi}"
-            with st.spinner("Querying graph..."):
-                edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
-            if edges_sub.empty:
-                st.warning("No edges found for this paper in the knowledge graph.")
-            else:
-                all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
-                nodes_sub = kg_nodes_kg[kg_nodes_kg["node_id"].isin(all_node_ids)]
-                c1, c2, c3 = st.columns(3)
-                c1.metric("Nodes",      fmt_num(len(nodes_sub)))
-                c2.metric("Edges",      fmt_num(len(edges_sub)))
-                c3.metric("Node types", fmt_num(nodes_sub["node_type"].nunique()))
-                type_counts = nodes_sub["node_type"].value_counts().reset_index()
-                type_counts.columns = ["node_type", "count"]
-                st.plotly_chart(
-                    px.bar(type_counts, x="node_type", y="count",
-                           color="node_type", color_discrete_map=NODE_TYPE_COLORS)
-                    .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
-                    use_container_width=True)
-                st.plotly_chart(
-                    plotly_network_fig(nodes_sub, edges_sub, height=750),
-                    use_container_width=True)
-    except Exception as e:
-        st.error(str(e))
-# ═══ 5. KG EXPLORER ══════════════════════════════════════════════
 with tab_kg_exp:
     st.subheader("KG Explorer")
@@ -820,15 +775,16 @@ with tab_kg_exp:
             nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
             st.plotly_chart(nt_fig, use_container_width=True)
-        # ── 멀티노드 자동 시각화 (인용수 상위 seed papers + 연결 노드)
         st.markdown("---")
         st.subheader("Multi-Node Knowledge Graph")
-        n_seeds = st.slider("Number of seed papers", 3, 20, 8, key="kg_exp_n_seeds")
-        max_exp_edges = st.slider("Max edges", 30, 200, 100, key="kg_exp_max_edges")
         with st.spinner("Querying graph..."):
-            # 인용수 상위 seed papers 선택
             top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
                          .sort_values("citedby_count", ascending=False)
                          .head(n_seeds))
@@ -836,27 +792,74 @@ with tab_kg_exp:
             if seed_ids:
                 ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
-                # 메타데이터 엣지만 가져옴 (citing/cited/intent 제외 → 저자/저널/분야 등)
-                exp_edges = _ddb.execute(f"""
-                    SELECT source, target, edge_type
-                    FROM read_parquet('{kg_edges_path}')
-                    WHERE (source IN ({ids_sql}) OR target IN ({ids_sql}))
-                      AND edge_type NOT IN (
-                          'HAS_CITING_PAPER','HAS_CITED_PAPER','HAS_PRIMARY_INTENT'
-                      )
-                    LIMIT {int(max_exp_edges)}
                 """).df()
                 all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
                 exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
-                c1, c2, c3 = st.columns(3)
-                c1.metric("Nodes", fmt_num(len(exp_nodes)))
-                c2.metric("Edges", fmt_num(len(exp_edges)))
-                c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
                 st.plotly_chart(
-                    plotly_network_fig(exp_nodes, exp_edges, height=750),
                     use_container_width=True)
     except Exception as e:
@@ -946,7 +949,7 @@ with tab_analytics:
     col_c, col_d = st.columns(2)
     with col_c:
-        st.subheader("CitationHub Field × Intent Distribution")
         fi = (seed[["seed_paper_id","field"]]
               .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
               .groupby(["field","primary_intent"]).size().reset_index(name="count"))
@@ -954,7 +957,8 @@ with tab_analytics:
             pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
             st.plotly_chart(
                 px.imshow(pivot, color_continuous_scale="Blues",
-                          title="CitationHub Field × Intent Distribution", aspect="auto")
                 .update_layout(xaxis_title="Intent", yaxis_title="Field"),
                 use_container_width=True)
@@ -969,13 +973,6 @@ with tab_analytics:
                        title="Influential vs Non-influential"),
                 use_container_width=True)
-        st.subheader("Intent Reference")
-        st.dataframe(intents_df, use_container_width=True, hide_index=True)
-    st.markdown("---")
-    st.subheader("Field Reference")
-    st.dataframe(fields_df, use_container_width=True, hide_index=True)
     # ── Intent Evolution over Years ────────────────────────────
     st.markdown("---")
     st.subheader("CitationHub Intent Evolution over Years")
@@ -983,6 +980,7 @@ with tab_analytics:
     intent_trend_raw = (
         events.dropna(subset=["citing_year"])
         .assign(year=lambda df: df["citing_year"].astype(int))
         .groupby(["year", "primary_intent"]).size()
         .reset_index(name="count")
     )
@@ -1023,8 +1021,8 @@ with tab_analytics:
             )
     with col_v2:
-        st.subheader("Intent Mix by Field")
-        st.caption("How each field uses citations differently")
         fi_pct = (
             seed[["seed_paper_id", "field"]]
             .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
@@ -1033,17 +1031,17 @@ with tab_analytics:
         if not fi_pct.empty:
             totals = fi_pct.groupby("field")["count"].transform("sum")
             fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1)
-            top_fields = fi_pct.groupby("field")["count"].sum().nlargest(12).index
-            fi_pct_top = fi_pct[fi_pct["field"].isin(top_fields)]
             st.plotly_chart(
-                px.bar(fi_pct_top, x="pct", y="field", color="primary_intent",
                        orientation="h", color_discrete_map=INTENT_COLORS,
                        labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"})
                 .update_layout(
                     barmode="stack",
-                    yaxis=dict(autorange="reversed"),
                     xaxis_title="% of citations", yaxis_title="",
-                    legend_title="Intent", height=520,
                 ),
                 use_container_width=True,
             )

 citing_table   = build_citing_table(seed_events)
 # ── 탭 ─────────────────────────────────────────────────────────
+(tab_overview, tab_cnet, tab_ontology,
  tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
     "Overview","Citation Network","Ontology",
+    "Knowledge Graph","Geographic Map","Analytics",
 ])
     st.plotly_chart(plotly_ontology_fig(height=750), use_container_width=True)
+# ═══ 4. KNOWLEDGE GRAPH (KG Explorer) ═══════════════════════════
 with tab_kg_exp:
     st.subheader("KG Explorer")
             nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
             st.plotly_chart(nt_fig, use_container_width=True)
+        # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
         st.markdown("---")
         st.subheader("Multi-Node Knowledge Graph")
+        st.caption("All 10 node types and all edge types — 2-hop from top cited seed papers")
+        n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
+        edges_per_type = st.slider("Edges per type (max)", 3, 20, 8, key="kg_exp_edges_per_type")
         with st.spinner("Querying graph..."):
+            # ── 1-hop: 인용수 상위 seed papers 기준 모든 엣지
             top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
                          .sort_values("citedby_count", ascending=False)
                          .head(n_seeds))
             if seed_ids:
                 ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
+                # 1-hop: seed paper와 연결된 모든 edge (journal, author, affiliation, city,
+                #         country, field, citation_event)
+                hop1 = _ddb.execute(f"""
+                    WITH ranked AS (
+                        SELECT source, target, edge_type,
+                               ROW_NUMBER() OVER (
+                                   PARTITION BY edge_type ORDER BY source
+                               ) AS rn
+                        FROM read_parquet('{kg_edges_path}')
+                        WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
+                    )
+                    SELECT source, target, edge_type FROM ranked
+                    WHERE rn <= {int(edges_per_type)}
                 """).df()
+                # 2-hop: citation_event → HAS_CITING_PAPER → citing_paper
+                #         citation_event → HAS_PRIMARY_INTENT → intent
+                event_ids = [
+                    x for x in
+                    set(hop1["source"].tolist()) | set(hop1["target"].tolist())
+                    if str(x).startswith("event:")
+                ][:20]
+                if event_ids:
+                    ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
+                    hop2 = _ddb.execute(f"""
+                        WITH ranked AS (
+                            SELECT source, target, edge_type,
+                                   ROW_NUMBER() OVER (
+                                       PARTITION BY edge_type ORDER BY source
+                                   ) AS rn
+                            FROM read_parquet('{kg_edges_path}')
+                            WHERE (source IN ({ev_sql}) OR target IN ({ev_sql}))
+                              AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
+                        )
+                        SELECT source, target, edge_type FROM ranked
+                        WHERE rn <= {int(edges_per_type)}
+                    """).df()
+                    exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
+                        subset=["source", "target", "edge_type"]
+                    )
+                else:
+                    exp_edges = hop1
                 all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
                 exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
+                c1, c2, c3, c4 = st.columns(4)
+                c1.metric("Nodes",       fmt_num(len(exp_nodes)))
+                c2.metric("Edges",       fmt_num(len(exp_edges)))
+                c3.metric("Node types",  fmt_num(exp_nodes["node_type"].nunique()))
+                c4.metric("Edge types",  fmt_num(exp_edges["edge_type"].nunique()))
+                # 커버리지 확인 표시
+                present_ntypes = sorted(exp_nodes["node_type"].unique().tolist())
+                present_etypes = sorted(exp_edges["edge_type"].unique().tolist())
+                all_10_ntypes  = sorted(NODE_TYPE_COLORS.keys())
+                missing_nt = [t for t in all_10_ntypes if t not in present_ntypes]
+                if missing_nt:
+                    st.caption(f"⚠ Node types not yet in graph: {', '.join(missing_nt)} "
+                               f"— try increasing 'Edges per type'")
+                else:
+                    st.caption("✅ All 10 node types represented")
                 st.plotly_chart(
+                    plotly_network_fig(exp_nodes, exp_edges, height=800,
+                                       seed_node_ids=seed_ids),
                     use_container_width=True)
     except Exception as e:
     col_c, col_d = st.columns(2)
     with col_c:
+        st.subheader("CitationHub Field × Intent Distribution Heatmap")
         fi = (seed[["seed_paper_id","field"]]
               .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
               .groupby(["field","primary_intent"]).size().reset_index(name="count"))
             pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
             st.plotly_chart(
                 px.imshow(pivot, color_continuous_scale="Blues",
+                          title="CitationHub Field × Intent Distribution Heatmap",
+                          aspect="auto")
                 .update_layout(xaxis_title="Intent", yaxis_title="Field"),
                 use_container_width=True)
                        title="Influential vs Non-influential"),
                 use_container_width=True)
     # ── Intent Evolution over Years ────────────────────────────
     st.markdown("---")
     st.subheader("CitationHub Intent Evolution over Years")
     intent_trend_raw = (
         events.dropna(subset=["citing_year"])
         .assign(year=lambda df: df["citing_year"].astype(int))
+        .query("year >= 2000")
         .groupby(["year", "primary_intent"]).size()
         .reset_index(name="count")
     )
             )
     with col_v2:
+        st.subheader("CitationHub Field × Intent Distribution")
+        st.caption("How each field uses citations differently (all fields)")
         fi_pct = (
             seed[["seed_paper_id", "field"]]
             .merge(events[["seed_paper_id", "primary_intent"]], on="seed_paper_id", how="inner")
         if not fi_pct.empty:
             totals = fi_pct.groupby("field")["count"].transform("sum")
             fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1)
+            n_fields = fi_pct["field"].nunique()
+            chart_height = max(520, n_fields * 28)
             st.plotly_chart(
+                px.bar(fi_pct, x="pct", y="field", color="primary_intent",
                        orientation="h", color_discrete_map=INTENT_COLORS,
                        labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"})
                 .update_layout(
                     barmode="stack",
+                    yaxis=dict(autorange="reversed", categoryorder="total ascending"),
                     xaxis_title="% of citations", yaxis_title="",
+                    legend_title="Intent", height=chart_height,
                 ),
                 use_container_width=True,
             )