Spaces:

Daniel0315
/

cithub_website

Sleeping

App Files Files Community

Daniel0315 committed on Mar 19

Commit

4dd37d3

verified ·

1 Parent(s): a2524a3

Upload app.py

Browse files

Files changed (1) hide show

src/app.py +36 -52

src/app.py CHANGED Viewed

@@ -682,16 +682,6 @@ with tab_overview:
         fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
         st.plotly_chart(fig, use_container_width=True)
-        st.subheader("Citation trend (selected paper)")
-        trend = (seed_events.dropna(subset=["citing_year"])
-                 .assign(citing_year=lambda df: df["citing_year"].astype(int))
-                 .groupby("citing_year").size().reset_index(name="count"))
-        if not trend.empty:
-            st.plotly_chart(
-                px.line(trend, x="citing_year", y="count", markers=True)
-                .update_layout(xaxis_title="Year", yaxis_title="Citations"),
-                use_container_width=True)
         st.subheader("CitationHub Intent Distribution")
         all_intents = events.groupby("primary_intent").size().to_dict()
         ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
@@ -752,39 +742,47 @@ with tab_kg_exp:
             kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
         # ── 노드/엣지 타입 분포 통계
-        col_a, col_b = st.columns([1, 2])
         with col_a:
             st.subheader("Node Types")
-            nt = kg_nodes_exp["node_type"].value_counts().reset_index()
-            nt.columns = ["node_type", "count"]
             st.dataframe(nt, use_container_width=True, hide_index=True)
-            st.subheader("Edge Types")
-            import duckdb as _ddb
-            et = _ddb.execute(f"""
-                SELECT edge_type, COUNT(*) AS count
-                FROM read_parquet('{kg_edges_path}')
-                GROUP BY edge_type ORDER BY count DESC
-            """).df()
-            st.dataframe(et, use_container_width=True, hide_index=True)
         with col_b:
             st.subheader("CitationHub KG Node Distribution")
             nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
                             color_discrete_map=NODE_TYPE_COLORS)
             nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
             st.plotly_chart(nt_fig, use_container_width=True)
         # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
         st.markdown("---")
         st.subheader("Multi-Node Knowledge Graph")
-        st.caption("All 10 node types and all edge types — 2-hop from top cited seed papers")
         n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
-        edges_per_type = st.slider("Edges per type (max)", 3, 20, 8, key="kg_exp_edges_per_type")
         with st.spinner("Querying graph..."):
-            # ── 1-hop: 인용수 상위 seed papers 기준 모든 엣지
             top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
                          .sort_values("citedby_count", ascending=False)
                          .head(n_seeds))
@@ -793,8 +791,8 @@ with tab_kg_exp:
             if seed_ids:
                 ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
-                # 1-hop: seed paper와 연결된 모든 edge (journal, author, affiliation, city,
-                #         country, field, citation_event)
                 hop1 = _ddb.execute(f"""
                     WITH ranked AS (
                         SELECT source, target, edge_type,
@@ -805,16 +803,15 @@ with tab_kg_exp:
                         WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
                     )
                     SELECT source, target, edge_type FROM ranked
-                    WHERE rn <= {int(edges_per_type)}
                 """).df()
-                # 2-hop: citation_event → HAS_CITING_PAPER → citing_paper
-                #         citation_event → HAS_PRIMARY_INTENT → intent
                 event_ids = [
                     x for x in
                     set(hop1["source"].tolist()) | set(hop1["target"].tolist())
                     if str(x).startswith("event:")
-                ][:20]
                 if event_ids:
                     ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
@@ -829,7 +826,7 @@ with tab_kg_exp:
                               AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
                         )
                         SELECT source, target, edge_type FROM ranked
-                        WHERE rn <= {int(edges_per_type)}
                     """).df()
                     exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
                         subset=["source", "target", "edge_type"]
@@ -841,26 +838,13 @@ with tab_kg_exp:
                 exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
                 c1, c2, c3, c4 = st.columns(4)
-                c1.metric("Nodes",       fmt_num(len(exp_nodes)))
-                c2.metric("Edges",       fmt_num(len(exp_edges)))
-                c3.metric("Node types",  fmt_num(exp_nodes["node_type"].nunique()))
-                c4.metric("Edge types",  fmt_num(exp_edges["edge_type"].nunique()))
-                # 커버리지 확인 표시
-                present_ntypes = sorted(exp_nodes["node_type"].unique().tolist())
-                present_etypes = sorted(exp_edges["edge_type"].unique().tolist())
-                all_10_ntypes  = sorted(NODE_TYPE_COLORS.keys())
-                missing_nt = [t for t in all_10_ntypes if t not in present_ntypes]
-                if missing_nt:
-                    st.caption(f"⚠ Node types not yet in graph: {', '.join(missing_nt)} "
-                               f"— try increasing 'Edges per type'")
-                else:
-                    st.caption("✅ All 10 node types represented")
-                st.plotly_chart(
-                    plotly_network_fig(exp_nodes, exp_edges, height=800,
-                                       seed_node_ids=seed_ids),
-                    use_container_width=True)
     except Exception as e:
         st.error(str(e))

         fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
         st.plotly_chart(fig, use_container_width=True)
         st.subheader("CitationHub Intent Distribution")
         all_intents = events.groupby("primary_intent").size().to_dict()
         ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
             kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
         # ── 노드/엣지 타입 분포 통계
+        import duckdb as _ddb
+        nt = kg_nodes_exp["node_type"].value_counts().reset_index()
+        nt.columns = ["node_type", "count"]
+        et = _ddb.execute(f"""
+            SELECT edge_type, COUNT(*) AS count
+            FROM read_parquet('{kg_edges_path}')
+            GROUP BY edge_type ORDER BY count DESC
+        """).df()
+        col_a, col_b, col_c, col_d = st.columns([1, 2, 1, 2])
         with col_a:
             st.subheader("Node Types")
             st.dataframe(nt, use_container_width=True, hide_index=True)
         with col_b:
             st.subheader("CitationHub KG Node Distribution")
             nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
                             color_discrete_map=NODE_TYPE_COLORS)
             nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
             st.plotly_chart(nt_fig, use_container_width=True)
+        with col_c:
+            st.subheader("Edge Types")
+            st.dataframe(et, use_container_width=True, hide_index=True)
+        with col_d:
+            st.subheader("CitationHub KG Edge Distribution")
+            et_fig = px.bar(et, x="edge_type", y="count", color="edge_type")
+            et_fig.update_layout(showlegend=False, xaxis_title="",
+                                 yaxis_title="Count", xaxis_tickangle=-35)
+            st.plotly_chart(et_fig, use_container_width=True)
         # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
         st.markdown("---")
         st.subheader("Multi-Node Knowledge Graph")
+        st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
         n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
+        EDGES_PER_TYPE = 10   # 각 edge type당 고정 샘플 수 → 10 types × 10 = 최대 100 edges
         with st.spinner("Querying graph..."):
             top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
                          .sort_values("citedby_count", ascending=False)
                          .head(n_seeds))
             if seed_ids:
                 ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
+                # 1-hop: seed paper에 연결된 모든 edge type (journal/author/affiliation/
+                #         city/country/field/citation_event 포함)
                 hop1 = _ddb.execute(f"""
                     WITH ranked AS (
                         SELECT source, target, edge_type,
                         WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
                     )
                     SELECT source, target, edge_type FROM ranked
+                    WHERE rn <= {EDGES_PER_TYPE}
                 """).df()
+                # 2-hop: citation_event → citing_paper / intent
                 event_ids = [
                     x for x in
                     set(hop1["source"].tolist()) | set(hop1["target"].tolist())
                     if str(x).startswith("event:")
+                ][:30]
                 if event_ids:
                     ev_sql = ", ".join(f"'{eid}'" for eid in event_ids)
                               AND edge_type IN ('HAS_CITING_PAPER','HAS_PRIMARY_INTENT')
                         )
                         SELECT source, target, edge_type FROM ranked
+                        WHERE rn <= {EDGES_PER_TYPE}
                     """).df()
                     exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
                         subset=["source", "target", "edge_type"]
                 exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
                 c1, c2, c3, c4 = st.columns(4)
+                c1.metric("Nodes",      fmt_num(len(exp_nodes)))
+                c2.metric("Edges",      fmt_num(len(exp_edges)))
+                c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
+                c4.metric("Edge types", fmt_num(exp_edges["edge_type"].nunique()))
+                kg_html = pyvis_from_kg(exp_nodes, exp_edges)
+                components.html(kg_html, height=860, scrolling=True)
     except Exception as e:
         st.error(str(e))