Spaces:

Daniel0315
/

cithub_website

Sleeping

App Files Files Community

Daniel0315 committed on Mar 19

Commit

5e840b3

verified ·

1 Parent(s): 3899ec8

Upload app.py

Browse files

Files changed (1) hide show

src/app.py +60 -79

src/app.py CHANGED Viewed

@@ -510,7 +510,7 @@ with tab_overview:
                 .update_layout(xaxis_title="Year", yaxis_title="Citations"),
                 use_container_width=True)
-        st.subheader("Overall intent distribution")
         all_intents = events.groupby("primary_intent").size().to_dict()
         ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
                                "count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]})
@@ -519,7 +519,7 @@ with tab_overview:
         fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
         st.plotly_chart(fig2, use_container_width=True)
-        st.subheader("Field distribution")
         fd = (seed_filtered.groupby("field", dropna=False).size()
               .reset_index(name="count").sort_values("count", ascending=False).head(20))
         fd["field"] = fd["field"].replace("","Unknown")
@@ -546,7 +546,7 @@ with tab_overview:
 # ═══ 2. CITATION NETWORK ════════════════════════════════════════
 with tab_cnet:
-    st.subheader("Citing ↔ Cited Citation Network")
     st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
     if seed_events.empty:
         st.info("No citation network data for this seed paper.")
@@ -561,31 +561,30 @@ with tab_ontology:
     components.html(pyvis_ontology(), height=820, scrolling=True)
-# ═══ 4. KNOWLEDGE GRAPH (실제 KG 데이터, DuckDB) ════════════════
 with tab_kg:
-    st.subheader("Knowledge Graph — Selected Seed Paper")
-    st.caption("kg_nodes + kg_edges에서 선택된 seed paper의 1-hop 서브그래프 (DuckDB 부분 쿼리)")
     max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
     try:
-        with st.spinner("KG 데이터 로딩 중... (최초 1회 후 캐시됩니다)"):
-            kg_nodes       = load_kg_nodes(data_dir_val)
-            kg_edges_path  = get_parquet_path("kg_edges.parquet", data_dir_val)
         seed_doi = selected_seed["doi"]
         if not seed_doi:
-            st.warning("선택된 seed paper의 DOI가 없어 KG 조회가 불가합니다.")
         else:
             node_id = f"seed:{seed_doi}"
-            with st.spinner("kg_edges 쿼리 중 (DuckDB)..."):
                 edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
             if edges_sub.empty:
-                st.warning(f"KG에서 해당 노드의 엣지를 찾을 수 없습니다. (node_id: {node_id})")
             else:
                 all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
-                nodes_sub = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)]
                 c1, c2, c3 = st.columns(3)
                 c1.metric("Nodes",      fmt_num(len(nodes_sub)))
@@ -596,8 +595,7 @@ with tab_kg:
                 type_counts.columns = ["node_type", "count"]
                 st.plotly_chart(
                     px.bar(type_counts, x="node_type", y="count",
-                           color="node_type", color_discrete_map=NODE_TYPE_COLORS,
-                           title="Node Type Distribution")
                     .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
                     use_container_width=True)
@@ -607,27 +605,25 @@ with tab_kg:
         st.error(str(e))
-# ═══ 5. KG EXPLORER ═════════════════════════════════════════════
 with tab_kg_exp:
     st.subheader("KG Explorer")
-    st.caption("kg_nodes를 탐색하고 임의 노드의 연결 관계를 시각화합니다. kg_edges는 DuckDB로 필요한 부분만 쿼리합니다.")
     try:
-        # spinner는 로딩만, UI는 spinner 밖에
-        with st.spinner("KG 데이터 로딩 중... (최초 1회 후 캐시됩니다)"):
             kg_nodes_exp  = load_kg_nodes(data_dir_val)
             kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
             enriched_path = get_parquet_path("citation_events_enriched.parquet", data_dir_val)
-        # ── 전체 노드/엣지 타입 분포
         col_a, col_b = st.columns([1, 2])
         with col_a:
-            st.subheader("Node Type Counts")
             nt = kg_nodes_exp["node_type"].value_counts().reset_index()
             nt.columns = ["node_type", "count"]
             st.dataframe(nt, use_container_width=True, hide_index=True)
-            st.subheader("Edge Type Counts")
             import duckdb as _ddb
             et = _ddb.execute(f"""
                 SELECT edge_type, COUNT(*) AS count
@@ -637,70 +633,55 @@ with tab_kg_exp:
             st.dataframe(et, use_container_width=True, hide_index=True)
         with col_b:
-            st.subheader("Node Type Distribution")
             nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
                             color_discrete_map=NODE_TYPE_COLORS)
             nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
             st.plotly_chart(nt_fig, use_container_width=True)
-        # ── Node Search & Ego Network
         st.markdown("---")
-        st.subheader("Node Search & Ego Network")
-        exp_col1, exp_col2 = st.columns([1, 3])
-        with exp_col1:
-            type_options = ["(all)"] + sorted(kg_nodes_exp["node_type"].unique().tolist())
-            sel_type = st.selectbox("Filter by node type", type_options)
-            filtered_nodes = (kg_nodes_exp if sel_type == "(all)"
-                              else kg_nodes_exp[kg_nodes_exp["node_type"] == sel_type])
-            search_q = st.text_input("Search node label / DOI")
-            if search_q:
-                filtered_nodes = filtered_nodes[
-                    filtered_nodes["label"].str.contains(search_q, case=False, na=False) |
-                    filtered_nodes["doi"].str.contains(search_q, case=False, na=False)
-                ]
-            sample = filtered_nodes.head(100)
-            node_options = sample["node_id"].tolist()
-            if not node_options:
-                st.warning("검색 결과가 없습니다.")
-            else:
-                sel_node_id = st.selectbox(
-                    "Select node", node_options,
-                    format_func=lambda nid: sample.loc[sample["node_id"] == nid, "label"].iloc[0][:60],
-                )
-                sel_node_info = sample[sample["node_id"] == sel_node_id].iloc[0]
-                st.markdown(f"**Type**: {sel_node_info.get('node_type', '')}")
-                st.markdown(f"**DOI**: {sel_node_info.get('doi', '') or '-'}")
-                st.markdown(f"**Publication**: {sel_node_info.get('publication_name', '') or '-'}")
-                st.markdown(f"**Group**: {sel_node_info.get('group', '') or '-'}")
-                st.markdown(f"**Cited by**: {fmt_num(sel_node_info.get('citedby_count', ''))}")
-                max_e = st.slider("Max edges shown", 20, 150, 60, key="kg_exp_max")
-                if st.button("Show ego network", key="kg_exp_show"):
-                    with st.spinner("DuckDB로 엣지 쿼리 중..."):
-                        exp_edges = query_explorer_edges(sel_node_id, kg_edges_path, max_e)
-                    if exp_edges.empty:
-                        st.warning("연결된 엣지가 없습니다.")
-                    else:
-                        all_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
-                        st.session_state["exp_nodes"] = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_ids)]
-                        st.session_state["exp_edges"] = exp_edges
-        with exp_col2:
-            if "exp_nodes" in st.session_state:
-                en = st.session_state["exp_nodes"]
-                ee = st.session_state["exp_edges"]
-                st.caption(f"Nodes: {len(en)}  |  Edges: {len(ee)}")
                 st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
-                components.html(pyvis_from_kg(en, ee, height="740px"), height=760, scrolling=True)
-            else:
-                st.info("왼쪽에서 노드를 선택하고 'Show ego network'를 클릭하세요.")
-        # ── Enriched 인사이트 (DuckDB 집계만)
         st.markdown("---")
-        st.subheader("Enriched Citation Insights")
-        st.caption("citation_events_enriched: DuckDB로 집계 통계만 쿼리 (전체 로드 없음)")
-        with st.spinner("Enriched 통계 쿼리 중 (DuckDB)..."):
             sem_df, field_df = query_enriched_stats(enriched_path)
         if not sem_df.empty:
@@ -810,7 +791,7 @@ with tab_analytics:
     col_c, col_d = st.columns(2)
     with col_c:
-        st.subheader("Field × Intent Heatmap")
         fi = (seed[["seed_paper_id","field"]]
               .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
               .groupby(["field","primary_intent"]).size().reset_index(name="count"))
@@ -818,7 +799,7 @@ with tab_analytics:
             pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
             st.plotly_chart(
                 px.imshow(pivot, color_continuous_scale="Blues",
-                          title="Citation Intent by Field", aspect="auto")
                 .update_layout(xaxis_title="Intent", yaxis_title="Field"),
                 use_container_width=True)

                 .update_layout(xaxis_title="Year", yaxis_title="Citations"),
                 use_container_width=True)
+        st.subheader("CitationHub Intent Distribution")
         all_intents = events.groupby("primary_intent").size().to_dict()
         ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
                                "count": [int(all_intents.get(i, 0)) for i in ALLOWED_INTENTS]})
         fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
         st.plotly_chart(fig2, use_container_width=True)
+        st.subheader("CitationHub Field Distribution")
         fd = (seed_filtered.groupby("field", dropna=False).size()
               .reset_index(name="count").sort_values("count", ascending=False).head(20))
         fd["field"] = fd["field"].replace("","Unknown")
 # ═══ 2. CITATION NETWORK ════════════════════════════════════════
 with tab_cnet:
+    st.subheader("Citation Network")
     st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
     if seed_events.empty:
         st.info("No citation network data for this seed paper.")
     components.html(pyvis_ontology(), height=820, scrolling=True)
+# ═══ 4. KNOWLEDGE GRAPH ══════════════════════════════════════════
 with tab_kg:
+    st.subheader("Knowledge Graph")
     max_edges_kg = st.slider("Max edges", 20, 150, 80, key="kg_max_edges")
     try:
+        with st.spinner("Loading..."):
+            kg_nodes_kg   = load_kg_nodes(data_dir_val)
+            kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
         seed_doi = selected_seed["doi"]
         if not seed_doi:
+            st.warning("Selected seed paper has no DOI.")
         else:
             node_id = f"seed:{seed_doi}"
+            with st.spinner("Querying graph..."):
                 edges_sub = query_kg_edges_for_node(node_id, kg_edges_path, max_edges_kg)
             if edges_sub.empty:
+                st.warning("No edges found for this paper in the knowledge graph.")
             else:
                 all_node_ids = set(edges_sub["source"].tolist()) | set(edges_sub["target"].tolist())
+                nodes_sub = kg_nodes_kg[kg_nodes_kg["node_id"].isin(all_node_ids)]
                 c1, c2, c3 = st.columns(3)
                 c1.metric("Nodes",      fmt_num(len(nodes_sub)))
                 type_counts.columns = ["node_type", "count"]
                 st.plotly_chart(
                     px.bar(type_counts, x="node_type", y="count",
+                           color="node_type", color_discrete_map=NODE_TYPE_COLORS)
                     .update_layout(showlegend=False, xaxis_title="", yaxis_title="Count"),
                     use_container_width=True)
         st.error(str(e))
+# ═══ 5. KG EXPLORER ══════════════════════════════════════════════
 with tab_kg_exp:
     st.subheader("KG Explorer")
     try:
+        with st.spinner("Loading..."):
             kg_nodes_exp  = load_kg_nodes(data_dir_val)
             kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
             enriched_path = get_parquet_path("citation_events_enriched.parquet", data_dir_val)
+        # ── Node/edge type distribution statistics
         col_a, col_b = st.columns([1, 2])
         with col_a:
+            st.subheader("Node Types")
             nt = kg_nodes_exp["node_type"].value_counts().reset_index()
             nt.columns = ["node_type", "count"]
             st.dataframe(nt, use_container_width=True, hide_index=True)
+            st.subheader("Edge Types")
             import duckdb as _ddb
             et = _ddb.execute(f"""
                 SELECT edge_type, COUNT(*) AS count
             st.dataframe(et, use_container_width=True, hide_index=True)
         with col_b:
+            st.subheader("CitationHub KG Node Distribution")
             nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
                             color_discrete_map=NODE_TYPE_COLORS)
             nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
             st.plotly_chart(nt_fig, use_container_width=True)
+        # ── Automatic multi-node visualization (most-cited seed papers + their connected nodes)
         st.markdown("---")
+        st.subheader("Multi-Node Knowledge Graph")
+        n_seeds = st.slider("Number of seed papers", 3, 20, 8, key="kg_exp_n_seeds")
+        max_exp_edges = st.slider("Max edges", 30, 200, 100, key="kg_exp_max_edges")
+        with st.spinner("Querying graph..."):
+            # Select the seed papers with the highest citation counts
+            top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
+                         .sort_values("citedby_count", ascending=False)
+                         .head(n_seeds))
+            seed_ids = top_seeds["node_id"].tolist()
+            if seed_ids:
+                ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
+                # Fetch metadata edges only (citing/cited/intent excluded → author/journal/field, etc.)
+                exp_edges = _ddb.execute(f"""
+                    SELECT source, target, edge_type
+                    FROM read_parquet('{kg_edges_path}')
+                    WHERE (source IN ({ids_sql}) OR target IN ({ids_sql}))
+                      AND edge_type NOT IN (
+                          'HAS_CITING_PAPER','HAS_CITED_PAPER','HAS_PRIMARY_INTENT'
+                      )
+                    LIMIT {int(max_exp_edges)}
+                """).df()
+                all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
+                exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]
+                c1, c2, c3 = st.columns(3)
+                c1.metric("Nodes", fmt_num(len(exp_nodes)))
+                c2.metric("Edges", fmt_num(len(exp_edges)))
+                c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
                 st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
+                components.html(pyvis_from_kg(exp_nodes, exp_edges, height="780px"),
+                                height=800, scrolling=True)
+        # ── Enriched insights
         st.markdown("---")
+        st.subheader("CitationHub Semantic Evidence Distribution")
+        with st.spinner("Loading..."):
             sem_df, field_df = query_enriched_stats(enriched_path)
         if not sem_df.empty:
     col_c, col_d = st.columns(2)
     with col_c:
+        st.subheader("CitationHub Field × Intent Distribution")
         fi = (seed[["seed_paper_id","field"]]
               .merge(events[["seed_paper_id","primary_intent"]], on="seed_paper_id", how="inner")
               .groupby(["field","primary_intent"]).size().reset_index(name="count"))
             pivot = fi.pivot(index="field", columns="primary_intent", values="count").fillna(0)
             st.plotly_chart(
                 px.imshow(pivot, color_continuous_scale="Blues",
+                          title="CitationHub Field × Intent Distribution", aspect="auto")
                 .update_layout(xaxis_title="Intent", yaxis_title="Field"),
                 use_container_width=True)