Daniel0315 committed on
Commit
8586795
·
verified ·
1 Parent(s): 8e3b7ef

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +27 -23
src/app.py CHANGED
@@ -298,22 +298,14 @@ def inject_fullscreen(html: str) -> str:
298
  return html.replace("</body>", extra + "</body>")
299
 
300
 
301
- # ── 메인 데이터 로드 (11개) ────────────────────────────────────
302
  @st.cache_data(show_spinner=False)
303
  def load_data(data_dir_str: str):
304
  d = None if HF_REPO_ID else Path(data_dir_str)
305
 
306
- seed_df = _read("seed_cited_papers_normalized.parquet", d)
307
- events_df = _read("citation_events_normalized.parquet", d)
308
- citing_df = _read("citing_papers_normalized.parquet", d)
309
- authors_df = _read("authors.parquet", d)
310
- affiliations_df = _read("affiliations.parquet", d)
311
- aff_geo_df = _read("affiliation_geo.parquet", d)
312
- cities_df = _read("cities.parquet", d)
313
- countries_df = _read("countries.parquet", d)
314
- fields_df = _read("fields.parquet", d)
315
- intents_df = _read("intents.parquet", d)
316
- journals_df = _read("journals.parquet", d)
317
 
318
  seed = pd.DataFrame({
319
  "seed_paper_id": seed_df["seed_paper_id"],
@@ -375,22 +367,31 @@ def load_data(data_dir_str: str):
375
  "seed_papers": int(len(seed)),
376
  "citation_events": int(len(events)),
377
  "citing_papers": int(events["citing_paper_id"].nunique()),
378
- "authors": int(len(authors_df)),
379
  "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
380
  "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
381
  "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
382
  "intents": len(ALLOWED_INTENTS),
383
  }
384
- return (seed, events, citing, filters, overview,
385
- authors_df, affiliations_df, aff_geo_df,
386
- cities_df, countries_df, fields_df, intents_df, journals_df)
387
 
388
 
389
- # ── KG 데이터: DuckDB 방식으로 분리 로드 ─────────────────────
390
- # kg_nodes : pandas 전체 로드 (~160MB 파일, 메모리 허용 범위)
391
- # kg_edges : DuckDB로 필요한 노드의 엣지만 쿼리 (전체 로드 안 함)
392
- # enriched : DuckDB로 집계 통계만 쿼리 (전체 로드 안 함)
 
 
 
 
 
 
 
 
 
 
393
 
 
394
  @st.cache_data(show_spinner=False)
395
  def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
396
  """kg_nodes 전체 로드 (3.4M rows, ~160MB 파일)"""
@@ -626,9 +627,7 @@ with st.sidebar:
626
  data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
627
 
628
  try:
629
- (seed, events, citing, filters, overview,
630
- authors_df, affiliations_df, aff_geo_df,
631
- cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
632
  st.success("Data loaded")
633
  except Exception as e:
634
  st.error(str(e)); st.stop()
@@ -905,6 +904,8 @@ with tab_kg_exp:
905
  # ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
906
  with tab_geo:
907
  st.subheader("Geographic Distribution of Seed Papers")
 
 
908
 
909
  country_cnt = (seed_filtered.groupby("country", dropna=False).size()
910
  .reset_index(name="count").rename(columns={"country":"country_name"}))
@@ -977,6 +978,9 @@ with tab_geo:
977
 
978
  # ═══ 7. ANALYTICS ═══════════════════════════════════════════════
979
  with tab_analytics:
 
 
 
980
  col_a, col_b = st.columns(2)
981
 
982
  with col_a:
 
298
  return html.replace("</body>", extra + "</body>")
299
 
300
 
301
+ # ── 메인 데이터 로드 (핵심 3 — 빠른 초기 기동) ──────────────
302
  @st.cache_data(show_spinner=False)
303
  def load_data(data_dir_str: str):
304
  d = None if HF_REPO_ID else Path(data_dir_str)
305
 
306
+ seed_df = _read("seed_cited_papers_normalized.parquet", d)
307
+ events_df = _read("citation_events_normalized.parquet", d)
308
+ citing_df = _read("citing_papers_normalized.parquet", d)
 
 
 
 
 
 
 
 
309
 
310
  seed = pd.DataFrame({
311
  "seed_paper_id": seed_df["seed_paper_id"],
 
367
  "seed_papers": int(len(seed)),
368
  "citation_events": int(len(events)),
369
  "citing_papers": int(events["citing_paper_id"].nunique()),
370
+ "authors": int(seed["author"].replace("", pd.NA).dropna().nunique()),
371
  "journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
372
  "countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
373
  "fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
374
  "intents": len(ALLOWED_INTENTS),
375
  }
376
+ return seed, events, citing, filters, overview
 
 
377
 
378
 
379
+ # ── 보조 데이터: 해당 접근 시에만 로드 (lazy) ───────────────
380
+ @st.cache_data(show_spinner=False)
381
+ def load_authors_data(data_dir_str: str) -> pd.DataFrame:
382
+ """Analytics 탭에서만 사용 — 탭 진입 시 로드"""
383
+ d = None if HF_REPO_ID else Path(data_dir_str)
384
+ return _read("authors.parquet", d)
385
+
386
+
387
+ @st.cache_data(show_spinner=False)
388
+ def load_geo_data(data_dir_str: str) -> pd.DataFrame:
389
+ """Geographic Map 탭에서만 사용 — 탭 진입 시 로드"""
390
+ d = None if HF_REPO_ID else Path(data_dir_str)
391
+ return _read("affiliation_geo.parquet", d)
392
+
393
 
394
+ # ── KG 데이터: DuckDB 방식으로 분리 로드 ─────────────────────
395
  @st.cache_data(show_spinner=False)
396
  def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
397
  """kg_nodes 전체 로드 (3.4M rows, ~160MB 파일)"""
 
627
  data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
628
 
629
  try:
630
+ seed, events, citing, filters, overview = load_data(data_dir_val)
 
 
631
  st.success("Data loaded")
632
  except Exception as e:
633
  st.error(str(e)); st.stop()
 
904
  # ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
905
  with tab_geo:
906
  st.subheader("Geographic Distribution of Seed Papers")
907
+ with st.spinner("Loading geographic data..."):
908
+ aff_geo_df = load_geo_data(data_dir_val)
909
 
910
  country_cnt = (seed_filtered.groupby("country", dropna=False).size()
911
  .reset_index(name="count").rename(columns={"country":"country_name"}))
 
978
 
979
  # ═══ 7. ANALYTICS ═══════════════════════════════════════════════
980
  with tab_analytics:
981
+ with st.spinner("Loading analytics data..."):
982
+ authors_df = load_authors_data(data_dir_val)
983
+
984
  col_a, col_b = st.columns(2)
985
 
986
  with col_a: