Spaces:
Sleeping
Sleeping
Upload app.py
Browse files- src/app.py +27 -23
src/app.py
CHANGED
|
@@ -298,22 +298,14 @@ def inject_fullscreen(html: str) -> str:
|
|
| 298 |
return html.replace("</body>", extra + "</body>")
|
| 299 |
|
| 300 |
|
| 301 |
-
# ── 메인 데이터 로드 (
|
| 302 |
@st.cache_data(show_spinner=False)
|
| 303 |
def load_data(data_dir_str: str):
|
| 304 |
d = None if HF_REPO_ID else Path(data_dir_str)
|
| 305 |
|
| 306 |
-
seed_df
|
| 307 |
-
events_df
|
| 308 |
-
citing_df
|
| 309 |
-
authors_df = _read("authors.parquet", d)
|
| 310 |
-
affiliations_df = _read("affiliations.parquet", d)
|
| 311 |
-
aff_geo_df = _read("affiliation_geo.parquet", d)
|
| 312 |
-
cities_df = _read("cities.parquet", d)
|
| 313 |
-
countries_df = _read("countries.parquet", d)
|
| 314 |
-
fields_df = _read("fields.parquet", d)
|
| 315 |
-
intents_df = _read("intents.parquet", d)
|
| 316 |
-
journals_df = _read("journals.parquet", d)
|
| 317 |
|
| 318 |
seed = pd.DataFrame({
|
| 319 |
"seed_paper_id": seed_df["seed_paper_id"],
|
|
@@ -375,22 +367,31 @@ def load_data(data_dir_str: str):
|
|
| 375 |
"seed_papers": int(len(seed)),
|
| 376 |
"citation_events": int(len(events)),
|
| 377 |
"citing_papers": int(events["citing_paper_id"].nunique()),
|
| 378 |
-
"authors": int(
|
| 379 |
"journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
|
| 380 |
"countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
|
| 381 |
"fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
|
| 382 |
"intents": len(ALLOWED_INTENTS),
|
| 383 |
}
|
| 384 |
-
return
|
| 385 |
-
authors_df, affiliations_df, aff_geo_df,
|
| 386 |
-
cities_df, countries_df, fields_df, intents_df, journals_df)
|
| 387 |
|
| 388 |
|
| 389 |
-
# ──
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
|
|
|
| 394 |
@st.cache_data(show_spinner=False)
|
| 395 |
def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
|
| 396 |
"""kg_nodes 전체 로드 (3.4M rows, ~160MB 파일)"""
|
|
@@ -626,9 +627,7 @@ with st.sidebar:
|
|
| 626 |
data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
|
| 627 |
|
| 628 |
try:
|
| 629 |
-
|
| 630 |
-
authors_df, affiliations_df, aff_geo_df,
|
| 631 |
-
cities_df, countries_df, fields_df, intents_df, journals_df) = load_data(data_dir_val)
|
| 632 |
st.success("Data loaded")
|
| 633 |
except Exception as e:
|
| 634 |
st.error(str(e)); st.stop()
|
|
@@ -905,6 +904,8 @@ with tab_kg_exp:
|
|
| 905 |
# ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
|
| 906 |
with tab_geo:
|
| 907 |
st.subheader("Geographic Distribution of Seed Papers")
|
|
|
|
|
|
|
| 908 |
|
| 909 |
country_cnt = (seed_filtered.groupby("country", dropna=False).size()
|
| 910 |
.reset_index(name="count").rename(columns={"country":"country_name"}))
|
|
@@ -977,6 +978,9 @@ with tab_geo:
|
|
| 977 |
|
| 978 |
# ═══ 7. ANALYTICS ═══════════════════════════════════════════════
|
| 979 |
with tab_analytics:
|
|
|
|
|
|
|
|
|
|
| 980 |
col_a, col_b = st.columns(2)
|
| 981 |
|
| 982 |
with col_a:
|
|
|
|
| 298 |
return html.replace("</body>", extra + "</body>")
|
| 299 |
|
| 300 |
|
| 301 |
+
# ── 메인 데이터 로드 (핵심 3개 — 빠른 초기 기동) ──────────────
|
| 302 |
@st.cache_data(show_spinner=False)
|
| 303 |
def load_data(data_dir_str: str):
|
| 304 |
d = None if HF_REPO_ID else Path(data_dir_str)
|
| 305 |
|
| 306 |
+
seed_df = _read("seed_cited_papers_normalized.parquet", d)
|
| 307 |
+
events_df = _read("citation_events_normalized.parquet", d)
|
| 308 |
+
citing_df = _read("citing_papers_normalized.parquet", d)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
seed = pd.DataFrame({
|
| 311 |
"seed_paper_id": seed_df["seed_paper_id"],
|
|
|
|
| 367 |
"seed_papers": int(len(seed)),
|
| 368 |
"citation_events": int(len(events)),
|
| 369 |
"citing_papers": int(events["citing_paper_id"].nunique()),
|
| 370 |
+
"authors": int(seed["author"].replace("", pd.NA).dropna().nunique()),
|
| 371 |
"journals": int(seed["journal"].replace("", pd.NA).dropna().nunique()),
|
| 372 |
"countries": int(seed["country"].replace("", pd.NA).dropna().nunique()),
|
| 373 |
"fields": int(seed["field"].replace("", pd.NA).dropna().nunique()),
|
| 374 |
"intents": len(ALLOWED_INTENTS),
|
| 375 |
}
|
| 376 |
+
return seed, events, citing, filters, overview
|
|
|
|
|
|
|
| 377 |
|
| 378 |
|
| 379 |
+
# ── 보조 데이터: 해당 탭 접근 시에만 로드 (lazy) ───────────────
|
| 380 |
+
@st.cache_data(show_spinner=False)
|
| 381 |
+
def load_authors_data(data_dir_str: str) -> pd.DataFrame:
|
| 382 |
+
"""Analytics 탭에서만 사용 — 탭 진입 시 로드"""
|
| 383 |
+
d = None if HF_REPO_ID else Path(data_dir_str)
|
| 384 |
+
return _read("authors.parquet", d)
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
@st.cache_data(show_spinner=False)
|
| 388 |
+
def load_geo_data(data_dir_str: str) -> pd.DataFrame:
|
| 389 |
+
"""Geographic Map 탭에서만 사용 — 탭 진입 시 로드"""
|
| 390 |
+
d = None if HF_REPO_ID else Path(data_dir_str)
|
| 391 |
+
return _read("affiliation_geo.parquet", d)
|
| 392 |
+
|
| 393 |
|
| 394 |
+
# ── KG 데이터: DuckDB 방식으로 분리 로드 ─────────────────────
|
| 395 |
@st.cache_data(show_spinner=False)
|
| 396 |
def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
|
| 397 |
"""kg_nodes 전체 로드 (3.4M rows, ~160MB 파일)"""
|
|
|
|
| 627 |
data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))
|
| 628 |
|
| 629 |
try:
|
| 630 |
+
seed, events, citing, filters, overview = load_data(data_dir_val)
|
|
|
|
|
|
|
| 631 |
st.success("Data loaded")
|
| 632 |
except Exception as e:
|
| 633 |
st.error(str(e)); st.stop()
|
|
|
|
| 904 |
# ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
|
| 905 |
with tab_geo:
|
| 906 |
st.subheader("Geographic Distribution of Seed Papers")
|
| 907 |
+
with st.spinner("Loading geographic data..."):
|
| 908 |
+
aff_geo_df = load_geo_data(data_dir_val)
|
| 909 |
|
| 910 |
country_cnt = (seed_filtered.groupby("country", dropna=False).size()
|
| 911 |
.reset_index(name="count").rename(columns={"country":"country_name"}))
|
|
|
|
| 978 |
|
| 979 |
# ═══ 7. ANALYTICS ═══════════════════════════════════════════════
|
| 980 |
with tab_analytics:
|
| 981 |
+
with st.spinner("Loading analytics data..."):
|
| 982 |
+
authors_df = load_authors_data(data_dir_val)
|
| 983 |
+
|
| 984 |
col_a, col_b = st.columns(2)
|
| 985 |
|
| 986 |
with col_a:
|