Spaces:

Daniel0315
/

cithub_website

Sleeping

App Files Files Community

Daniel0315 committed on Apr 29

Commit

8586795

verified ·

1 Parent(s): 8e3b7ef

Upload app.py

Browse files

Files changed (1) hide show

src/app.py +27 -23

src/app.py CHANGED Viewed

@@ -298,22 +298,14 @@ def inject_fullscreen(html: str) -> str:
     return html.replace("</body>", extra + "</body>")
-# ── 메인 데이터 로드 (11개) ────────────────────────────────────
 @st.cache_data(show_spinner=False)
 def load_data(data_dir_str: str):
     d = None if HF_REPO_ID else Path(data_dir_str)
-    seed_df    = _read("seed_cited_papers_normalized.parquet", d)
-    events_df  = _read("citation_events_normalized.parquet", d)
-    citing_df  = _read("citing_papers_normalized.parquet", d)
-    authors_df      = _read("authors.parquet", d)
-    affiliations_df = _read("affiliations.parquet", d)
-    aff_geo_df      = _read("affiliation_geo.parquet", d)
-    cities_df       = _read("cities.parquet", d)
-    countries_df    = _read("countries.parquet", d)
-    fields_df       = _read("fields.parquet", d)
-    intents_df      = _read("intents.parquet", d)
-    journals_df     = _read("journals.parquet", d)
     seed = pd.DataFrame({
         "seed_paper_id":  seed_df["seed_paper_id"],
@@ -375,22 +367,31 @@ def load_data(data_dir_str: str):
         "seed_papers":     int(len(seed)),
         "citation_events": int(len(events)),
         "citing_papers":   int(events["citing_paper_id"].nunique()),
-        "authors":         int(len(authors_df)),
         "journals":        int(seed["journal"].replace("", pd.NA).dropna().nunique()),
         "countries":       int(seed["country"].replace("", pd.NA).dropna().nunique()),
         "fields":          int(seed["field"].replace("", pd.NA).dropna().nunique()),
         "intents":         len(ALLOWED_INTENTS),
     }
-    return (seed, events, citing, filters, overview,
-            authors_df, affiliations_df, aff_geo_df,
-            cities_df, countries_df, fields_df, intents_df, journals_df)
-# ── KG 데이터: DuckDB 방식으로 분리 로드 ─────────────────────
-#   kg_nodes  : pandas 전체 로드 (~160MB 파일, 메모리 허용 범위)
-#   kg_edges  : DuckDB로 필요한 노드의 엣지만 쿼리 (전체 로드 안 함)
-#   enriched  : DuckDB로 집계 통계만 쿼리 (전체 로드 안 함)
 @st.cache_data(show_spinner=False)
 def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
     """kg_nodes 전체 로드 (3.4M rows, ~160MB 파일)"""
@@ -626,9 +627,7 @@ with st.sidebar:
         data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
     try:
-        (seed, events, citing, filters, overview,
-         authors_df, affiliations_df, aff_geo_df,
-         cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
         st.success("Data loaded")
     except Exception as e:
         st.error(str(e)); st.stop()
@@ -905,6 +904,8 @@ with tab_kg_exp:
 # ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
 with tab_geo:
     st.subheader("Geographic Distribution of Seed Papers")
     country_cnt = (seed_filtered.groupby("country", dropna=False).size()
                    .reset_index(name="count").rename(columns={"country":"country_name"}))
@@ -977,6 +978,9 @@ with tab_geo:
 # ═══ 7. ANALYTICS ═══════════════════════════════════════════════
 with tab_analytics:
     col_a, col_b = st.columns(2)
     with col_a:

     return html.replace("</body>", extra + "</body>")
+# ── 메인 데이터 로드 (핵심 3개 — 빠른 초기 기동) ──────────────
 @st.cache_data(show_spinner=False)
 def load_data(data_dir_str: str):
     d = None if HF_REPO_ID else Path(data_dir_str)
+    seed_df   = _read("seed_cited_papers_normalized.parquet", d)
+    events_df = _read("citation_events_normalized.parquet", d)
+    citing_df = _read("citing_papers_normalized.parquet", d)
     seed = pd.DataFrame({
         "seed_paper_id":  seed_df["seed_paper_id"],
         "seed_papers":     int(len(seed)),
         "citation_events": int(len(events)),
         "citing_papers":   int(events["citing_paper_id"].nunique()),
+        "authors":         int(seed["author"].replace("", pd.NA).dropna().nunique()),
         "journals":        int(seed["journal"].replace("", pd.NA).dropna().nunique()),
         "countries":       int(seed["country"].replace("", pd.NA).dropna().nunique()),
         "fields":          int(seed["field"].replace("", pd.NA).dropna().nunique()),
         "intents":         len(ALLOWED_INTENTS),
     }
+    return seed, events, citing, filters, overview
+# ── 보조 데이터: 해당 탭 접근 시에만 로드 (lazy) ───────────────
+@st.cache_data(show_spinner=False)
+def load_authors_data(data_dir_str: str) -> pd.DataFrame:
+    """Read the authors table ("authors.parquet") on demand.
+
+    Split out of the main data load so it is only requested from the
+    Analytics tab section instead of at startup. The result is cached by
+    ``st.cache_data`` with the built-in spinner disabled.
+
+    Args:
+        data_dir_str: Directory holding the parquet files, as a string.
+            It is ignored (``None`` is passed to ``_read``) when
+            ``HF_REPO_ID`` is truthy.
+
+    Returns:
+        Whatever ``_read`` returns for "authors.parquet"; annotated as a
+        ``pd.DataFrame``.
+
+    NOTE(review): the intent is "load on tab entry", but Streamlit appears
+    to execute the body of every ``st.tabs`` container on each rerun, so
+    this may still run during the first render — confirm.
+    """
+    d = None if HF_REPO_ID else Path(data_dir_str)
+    return _read("authors.parquet", d)
+@st.cache_data(show_spinner=False)
+def load_geo_data(data_dir_str: str) -> pd.DataFrame:
+    """Read the affiliation geography table ("affiliation_geo.parquet") on demand.
+
+    Split out of the main data load so it is only requested from the
+    Geographic Map tab section instead of at startup. The result is cached
+    by ``st.cache_data`` with the built-in spinner disabled.
+
+    Args:
+        data_dir_str: Directory holding the parquet files, as a string.
+            It is ignored (``None`` is passed to ``_read``) when
+            ``HF_REPO_ID`` is truthy.
+
+    Returns:
+        Whatever ``_read`` returns for "affiliation_geo.parquet"; annotated
+        as a ``pd.DataFrame``.
+
+    NOTE(review): the intent is "load on tab entry", but Streamlit appears
+    to execute the body of every ``st.tabs`` container on each rerun, so
+    this may still run during the first render — confirm.
+    """
+    d = None if HF_REPO_ID else Path(data_dir_str)
+    return _read("affiliation_geo.parquet", d)
+# ── KG 데이터: DuckDB 방식으로 분리 로드 ─────────────────────
 @st.cache_data(show_spinner=False)
 def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
     """kg_nodes 전체 로드 (3.4M rows, ~160MB 파일)"""
         data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
     try:
+        seed, events, citing, filters, overview = load_data(data_dir_val)
         st.success("Data loaded")
     except Exception as e:
         st.error(str(e)); st.stop()
 # ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
 with tab_geo:
     st.subheader("Geographic Distribution of Seed Papers")
+    with st.spinner("Loading geographic data..."):
+        aff_geo_df = load_geo_data(data_dir_val)
     country_cnt = (seed_filtered.groupby("country", dropna=False).size()
                    .reset_index(name="count").rename(columns={"country":"country_name"}))
 # ═══ 7. ANALYTICS ═══════════════════════════════════════════════
 with tab_analytics:
+    with st.spinner("Loading analytics data..."):
+        authors_df = load_authors_data(data_dir_val)
     col_a, col_b = st.columns(2)
     with col_a: