Spaces:
Sleeping
Sleeping
ykjung committed on
Commit ·
dd5d980
1
Parent(s): 1d2414a
feat: implement chunked Parquet uploads to Hugging Face for memory efficiency and improved data handling
Browse files- README.md +7 -7
- __pycache__/app.cpython-313.pyc +0 -0
- app.py +106 -61
README.md
CHANGED
|
@@ -85,16 +85,15 @@ UI ์ฐ๊ฒฐ:
|
|
| 85 |
- 마감 확정 전(today) 데이터는 제외
|
| 86 |
- 실패 시 최대 3회 재시도
|
| 87 |
- 10티커마다 0.5초 대기(호출 차단 완화)
|
| 88 |
-
- 설정한 간격(`checkpoint_batch_size`)마다
|
|
|
|
| 89 |
4. **최종 병합 반영**
|
| 90 |
-
- 마지막 미반영 버퍼를
|
| 91 |
-
- `Ticker + Date` 기준 중복 제거(최신값 유지)
|
| 92 |
5. **최근 30일 데이터셋 갱신**
|
| 93 |
-
-
|
| 94 |
6. **Hugging Face 업로드**
|
| 95 |
-
- `
|
| 96 |
-
-
|
| 97 |
-
- 내부적으로 `Dataset.from_pandas(...).push_to_hub(...)` 사용
|
| 98 |
7. **완료 로그 반환**
|
| 99 |
- 시작/종료 시각, 성공/실패 개수, 업로드 결과를 텍스트로 반환
|
| 100 |
|
|
@@ -170,6 +169,7 @@ UI ์ฐ๊ฒฐ:
|
|
| 170 |
|
| 171 |
- 이미 데이터셋에 존재하는 티커는 자동으로 스킵합니다.
|
| 172 |
- 이전 실행에서 중간에 실패해도, 다음 실행 시 남은 티커 위주로 이어서 수집합니다.
|
|
|
|
| 173 |
|
| 174 |
---
|
| 175 |
|
|
|
|
| 85 |
- 마감 확정 전(today) 데이터는 제외
|
| 86 |
- 실패 시 최대 3회 재시도
|
| 87 |
- 10티커마다 0.5초 대기(호출 차단 완화)
|
| 88 |
+
- 설정한 간격(`checkpoint_batch_size`)마다 수집 버퍼를 Parquet 청크로 업로드
|
| 89 |
+
- 청크 업로드 후 메모리 버퍼를 즉시 비워 메모리 사용량을 제한
|
| 90 |
4. **최종 병합 반영**
|
| 91 |
+
- 마지막 미반영 버퍼를 청크 업로드로 반영
|
|
|
|
| 92 |
5. **최근 30일 데이터셋 갱신**
|
| 93 |
+
- 티커별 최근 30일 데이터 청크를 함께 업로드
|
| 94 |
6. **Hugging Face 업로드**
|
| 95 |
+
- 내부적으로 `HfApi.upload_file(..., path_in_repo="data/chunks/...parquet")` 방식 사용
|
| 96 |
+
- 대용량 전체 데이터프레임 재업로드를 피해서 메모리 피크를 낮춤
|
|
|
|
| 97 |
7. **완료 로그 반환**
|
| 98 |
- 시작/종료 시각, 성공/실패 개수, 업로드 결과를 텍스트로 반환
|
| 99 |
|
|
|
|
| 169 |
|
| 170 |
- 이미 데이터셋에 존재하는 티커는 자동으로 스킵합니다.
|
| 171 |
- 이전 실행에서 중간에 실패해도, 다음 실행 시 남은 티커 위주로 이어서 수집합니다.
|
| 172 |
+
- 체크포인트 간격(예: 100) 기준으로 저장되므로, 실패 시 최근 체크포인트 이후 구간만 재수집됩니다.
|
| 173 |
|
| 174 |
---
|
| 175 |
|
__pycache__/app.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
|
|
|
app.py
CHANGED
|
@@ -17,6 +17,8 @@ import logging
|
|
| 17 |
import json
|
| 18 |
import traceback
|
| 19 |
import gc
|
|
|
|
|
|
|
| 20 |
from urllib.request import Request, urlopen
|
| 21 |
|
| 22 |
# ๋ก๊น
์ค์
|
|
@@ -500,6 +502,83 @@ def upload_dataset_to_hf(df, repo_name, hf_token, max_retries=3, retry_wait_sec=
|
|
| 500 |
}
|
| 501 |
|
| 502 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
def run_pipeline(
|
| 504 |
hf_token,
|
| 505 |
all_dataset_name,
|
|
@@ -521,16 +600,6 @@ def run_pipeline(
|
|
| 521 |
|
| 522 |
logs = []
|
| 523 |
try:
|
| 524 |
-
def _merge_rows(base_df, incoming_df):
|
| 525 |
-
if incoming_df is None or incoming_df.empty:
|
| 526 |
-
return base_df
|
| 527 |
-
if base_df is None or base_df.empty:
|
| 528 |
-
return incoming_df.reset_index(drop=True)
|
| 529 |
-
|
| 530 |
-
merged = pd.concat([base_df, incoming_df], ignore_index=True)
|
| 531 |
-
merged = merged.drop_duplicates(subset=["Ticker", "Date"], keep="last")
|
| 532 |
-
return merged.reset_index(drop=True)
|
| 533 |
-
|
| 534 |
def _df_stats(df, label):
|
| 535 |
if df is None or df.empty:
|
| 536 |
return f"{label}: 0ํ"
|
|
@@ -553,6 +622,7 @@ def run_pipeline(
|
|
| 553 |
logs.append("๐ ์ฃผ์ ๋ฐ์ดํฐ ์์ง ํ์ดํ๋ผ์ธ ์์")
|
| 554 |
logs.append(f"โฐ ์์ ์๊ฐ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
| 555 |
logs.append("=" * 60)
|
|
|
|
| 556 |
|
| 557 |
# ========== 1๋จ๊ณ: ํฐ์ปค ๋ชฉ๋ก ์์ง ==========
|
| 558 |
progress(0, desc="๋์ค๋ฅ & ๋ด์ ํฐ์ปค ๋ชฉ๋ก ์์ง ์ค...")
|
|
@@ -611,24 +681,16 @@ def run_pipeline(
|
|
| 611 |
progress(0.08, desc="๊ธฐ์กด ๋ฐ์ดํฐ์
๋ก๋ ์ค...")
|
| 612 |
logs.append("\n๐ [2๋จ๊ณ] ๊ธฐ์กด ๋ฐ์ดํฐ์
๋ก๋ ๋ฐ ์ฌ๊ฐ ๋์ ๊ณ์ฐ...")
|
| 613 |
|
| 614 |
-
|
| 615 |
-
base_recent_df = pd.DataFrame(columns=["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"])
|
| 616 |
-
|
| 617 |
try:
|
| 618 |
-
|
| 619 |
-
logs.append(f" - ๊ธฐ์กด
|
| 620 |
except Exception as e:
|
| 621 |
-
logs.append(f" - ๊ธฐ์กด
|
| 622 |
|
| 623 |
-
|
| 624 |
-
base_recent_df = load_hf_dataset_as_df(recent_dataset_name, hf_token)
|
| 625 |
-
logs.append(f" - ๊ธฐ์กด 30d ๋ฐ์ดํฐ: {len(base_recent_df)}ํ")
|
| 626 |
-
except Exception as e:
|
| 627 |
-
logs.append(f" - ๊ธฐ์กด 30d ๋ฐ์ดํฐ ๋ก๋ ์คํจ(์ ๊ท ์์ฑ์ผ๋ก ์งํ): {e}")
|
| 628 |
-
|
| 629 |
-
existing_tickers = set(base_all_df["Ticker"].dropna().astype(str).str.upper().tolist())
|
| 630 |
if not existing_tickers:
|
| 631 |
-
|
| 632 |
|
| 633 |
pending_tickers = [ticker for ticker in all_tickers if ticker not in existing_tickers]
|
| 634 |
|
|
@@ -655,7 +717,7 @@ def run_pipeline(
|
|
| 655 |
total = len(pending_tickers)
|
| 656 |
|
| 657 |
def _upload_checkpoint(end_index):
|
| 658 |
-
nonlocal last_checkpoint_success_index
|
| 659 |
|
| 660 |
if success_count <= last_checkpoint_success_index:
|
| 661 |
return
|
|
@@ -671,15 +733,21 @@ def run_pipeline(
|
|
| 671 |
checkpoint_all_df = pd.concat(all_data_frames, ignore_index=True)
|
| 672 |
checkpoint_recent_df = pd.concat(recent_30d_frames, ignore_index=True)
|
| 673 |
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
base_recent_df = filter_last_30_days(base_recent_df)
|
| 677 |
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
|
| 684 |
_append_upload_result("all ์ฒดํฌํฌ์ธํธ", result_all_ckpt)
|
| 685 |
_append_upload_result("30d ์ฒดํฌํฌ์ธํธ", result_30d_ckpt)
|
|
@@ -729,37 +797,14 @@ def run_pipeline(
|
|
| 729 |
progress(0.9, desc="๋ง์ง๋ง ์ฒดํฌํฌ์ธํธ ๋ฐ์ ์ค...")
|
| 730 |
logs.append("\n๐ง [4๋จ๊ณ] ๋ง์ง๋ง ๋ฏธ๋ฐ์ ๋ฐ์ดํฐ ๋ฐ์ ์ค...")
|
| 731 |
|
| 732 |
-
if
|
| 733 |
-
logs.append("\n๐พ [์ฒดํฌํฌ์ธํธ] ๋ง์ง๋ง ๋ฏธ๋ฐ์ ๊ตฌ๊ฐ ์
๋ก๋")
|
| 734 |
-
_upload_checkpoint(total)
|
| 735 |
-
|
| 736 |
-
if checkpoint_batch_size == 0 and all_data_frames:
|
| 737 |
logs.append("\n๐พ [์ต์ข
๋ฐ์] ์ค๊ฐ ์
๋ก๋ ์์ด ๋์ ๋ ๋ฐ์ดํฐ ๋ฐ์")
|
| 738 |
_upload_checkpoint(total)
|
| 739 |
|
| 740 |
-
|
| 741 |
-
logs.append(
|
| 742 |
-
logs.append(
|
| 743 |
-
logs.append(
|
| 744 |
-
|
| 745 |
-
# ========== 5๋จ๊ณ: ํ๊น
ํ์ด์ค ๋ฐ์ดํฐ์
์
๋ก๋ ==========
|
| 746 |
-
progress(0.97, desc="all ๋ฐ์ดํฐ์
์
๋ก๋ ์ค...")
|
| 747 |
-
logs.append("\n๐ [5๋จ๊ณ] ํ๊น
ํ์ด์ค ๋ฐ์ดํฐ์
์
๋ก๋ ์ค...")
|
| 748 |
-
|
| 749 |
-
# all ๋ฐ์ดํฐ์
์
๋ก๋
|
| 750 |
-
logs.append(f" - {_df_stats(base_all_df, 'all ์ต์ข
์
๋ก๋ ๋์')}")
|
| 751 |
-
result_all = upload_dataset_to_hf(base_all_df, all_dataset_name, hf_token)
|
| 752 |
-
_append_upload_result("all ์ต์ข
", result_all)
|
| 753 |
-
if not result_all["ok"]:
|
| 754 |
-
raise RuntimeError("all ์ต์ข
์
๋ก๋ ์คํจ")
|
| 755 |
-
|
| 756 |
-
# 30์ผ ๋ฐ์ดํฐ์
์
๋ก๋
|
| 757 |
-
progress(0.99, desc="30d ๋ฐ์ดํฐ์
์
๋ก๋ ์ค...")
|
| 758 |
-
logs.append(f" - {_df_stats(base_recent_df, '30d ์ต์ข
์
๋ก๋ ๋์')}")
|
| 759 |
-
result_30d = upload_dataset_to_hf(base_recent_df, recent_dataset_name, hf_token)
|
| 760 |
-
_append_upload_result("30d ์ต์ข
", result_30d)
|
| 761 |
-
if not result_30d["ok"]:
|
| 762 |
-
raise RuntimeError("30d ์ต์ข
์
๋ก๋ ์คํจ")
|
| 763 |
|
| 764 |
# ========== ์๋ฃ ==========
|
| 765 |
progress(1.0, desc="์๋ฃ!")
|
|
|
|
| 17 |
import json
|
| 18 |
import traceback
|
| 19 |
import gc
|
| 20 |
+
import tempfile
|
| 21 |
+
import uuid
|
| 22 |
from urllib.request import Request, urlopen
|
| 23 |
|
| 24 |
# ๋ก๊น
์ค์
|
|
|
|
| 502 |
}
|
| 503 |
|
| 504 |
|
| 505 |
+
def append_parquet_chunk_to_hf(df, repo_name, hf_token, subdir="data/chunks", max_retries=3, retry_wait_sec=2):
    """Append a DataFrame to a Hugging Face dataset repo as a new Parquet chunk.

    The frame is written to a local temporary ``.parquet`` file and uploaded
    under ``{subdir}/chunk-<timestamp>-<random>.parquet``. Each call adds a
    new, uniquely named file rather than re-uploading earlier data.

    Args:
        df: DataFrame to upload. ``None`` or an empty frame is rejected
            without contacting the Hub.
        repo_name: Target dataset repository id.
        hf_token: Hugging Face access token passed to every Hub call.
        subdir: Directory inside the repo that receives the chunk file.
        max_retries: Maximum number of upload attempts.
        retry_wait_sec: Base wait between attempts; the actual wait grows
            linearly with the attempt number.

    Returns:
        A result dict with the keys ``ok``, ``repo``, ``rows``, ``attempts``,
        ``elapsed_sec``, ``error`` and ``traceback``. Failures are reported
        through this dict instead of being raised.
    """
    if df is None or df.empty:
        return {
            "ok": False,
            "repo": repo_name,
            "rows": 0,
            "attempts": 0,
            "elapsed_sec": 0.0,
            "error": "업로드할 데이터가 없습니다.",
            "traceback": "",
        }

    api = HfApi()
    last_error = ""
    last_traceback = ""
    attempts_made = 0
    start_ts = time.time()

    for attempt in range(1, max_retries + 1):
        attempts_made = attempt
        temp_path = None
        try:
            # exist_ok=True makes this a no-op when the repo already exists.
            api.create_repo(
                repo_id=repo_name,
                repo_type="dataset",
                token=hf_token,
                private=False,
                exist_ok=True,
            )

            # Timestamp plus a random suffix keeps chunk names unique even
            # when two chunks are produced within the same second.
            chunk_name = f"chunk-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{uuid.uuid4().hex[:8]}.parquet"

            # Reserve a path only; the handle is closed before pandas writes
            # to it, and the file is removed manually in ``finally``.
            with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
                temp_path = tmp.name

            df.to_parquet(temp_path, index=False)

            path_in_repo = f"{subdir}/{chunk_name}"
            api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=path_in_repo,
                repo_id=repo_name,
                repo_type="dataset",
                token=hf_token,
            )

            return {
                "ok": True,
                "repo": repo_name,
                "rows": len(df),
                "attempts": attempt,
                "elapsed_sec": time.time() - start_ts,
                "error": "",
                "traceback": "",
            }
        except Exception as e:
            # Best-effort retry boundary: any failure is recorded and the
            # next attempt (if any) starts after a linearly growing wait.
            last_error = str(e)
            last_traceback = traceback.format_exc()
            logger.warning(
                "[%s] 청크 업로드 실패 (시도 %s/%s): %s",
                repo_name,
                attempt,
                max_retries,
                last_error,
            )
            if attempt < max_retries:
                time.sleep(retry_wait_sec * attempt)
        finally:
            # Runs on success and failure alike so temp files never pile up.
            if temp_path and os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except OSError as cleanup_error:
                    # Cleanup stays best-effort, but is no longer silent.
                    logger.debug("Could not remove temp file %s: %s", temp_path, cleanup_error)

    return {
        "ok": False,
        "repo": repo_name,
        "rows": len(df),
        "attempts": attempts_made,
        "elapsed_sec": time.time() - start_ts,
        "error": last_error,
        "traceback": last_traceback,
    }
|
| 580 |
+
|
| 581 |
+
|
| 582 |
def run_pipeline(
|
| 583 |
hf_token,
|
| 584 |
all_dataset_name,
|
|
|
|
| 600 |
|
| 601 |
logs = []
|
| 602 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 603 |
def _df_stats(df, label):
|
| 604 |
if df is None or df.empty:
|
| 605 |
return f"{label}: 0ํ"
|
|
|
|
| 622 |
logs.append("๐ ์ฃผ์ ๋ฐ์ดํฐ ์์ง ํ์ดํ๋ผ์ธ ์์")
|
| 623 |
logs.append(f"โฐ ์์ ์๊ฐ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
| 624 |
logs.append("=" * 60)
|
| 625 |
+
logs.append("โน๏ธ ๋ฉ๋ชจ๋ฆฌ ์ ์ฝ ๋ชจ๋: 100๊ฐ ๋จ์ ๋ฑ ์ฒญํฌ ์
๋ก๋ ํ ๋ฒํผ๋ฅผ ์ฆ์ ๋น์๋๋ค.")
|
| 626 |
|
| 627 |
# ========== 1๋จ๊ณ: ํฐ์ปค ๋ชฉ๋ก ์์ง ==========
|
| 628 |
progress(0, desc="๋์ค๋ฅ & ๋ด์ ํฐ์ปค ๋ชฉ๋ก ์์ง ์ค...")
|
|
|
|
| 681 |
progress(0.08, desc="๊ธฐ์กด ๋ฐ์ดํฐ์
๋ก๋ ์ค...")
|
| 682 |
logs.append("\n๐ [2๋จ๊ณ] ๊ธฐ์กด ๋ฐ์ดํฐ์
๋ก๋ ๋ฐ ์ฌ๊ฐ ๋์ ๊ณ์ฐ...")
|
| 683 |
|
| 684 |
+
recent_for_resume = pd.DataFrame(columns=["Ticker"])
|
|
|
|
|
|
|
| 685 |
try:
|
| 686 |
+
recent_for_resume = load_hf_dataset_as_df(recent_dataset_name, hf_token)
|
| 687 |
+
logs.append(f" - ๊ธฐ์กด 30d ๋ฐ์ดํฐ: {len(recent_for_resume)}ํ")
|
| 688 |
except Exception as e:
|
| 689 |
+
logs.append(f" - ๊ธฐ์กด 30d ๋ฐ์ดํฐ ๋ก๋ ์คํจ(์ ๊ท ์์ง ๊ธฐ์ค์ผ๋ก ์งํ): {e}")
|
| 690 |
|
| 691 |
+
existing_tickers = set(recent_for_resume["Ticker"].dropna().astype(str).str.upper().tolist())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 692 |
if not existing_tickers:
|
| 693 |
+
logs.append(" - ๊ธฐ์กด ํฐ์ปค ์ ๋ณด๊ฐ ๋น์ด ์์ด ์ ์ฒด ๋์ ๊ธฐ์ค์ผ๋ก ์งํํฉ๋๋ค.")
|
| 694 |
|
| 695 |
pending_tickers = [ticker for ticker in all_tickers if ticker not in existing_tickers]
|
| 696 |
|
|
|
|
| 717 |
total = len(pending_tickers)
|
| 718 |
|
| 719 |
def _upload_checkpoint(end_index):
|
| 720 |
+
nonlocal last_checkpoint_success_index
|
| 721 |
|
| 722 |
if success_count <= last_checkpoint_success_index:
|
| 723 |
return
|
|
|
|
| 733 |
checkpoint_all_df = pd.concat(all_data_frames, ignore_index=True)
|
| 734 |
checkpoint_recent_df = pd.concat(recent_30d_frames, ignore_index=True)
|
| 735 |
|
| 736 |
+
logs.append(f" - {_df_stats(checkpoint_all_df, 'all ์ฒญํฌ')}")
|
| 737 |
+
logs.append(f" - {_df_stats(checkpoint_recent_df, '30d ์ฒญํฌ')}")
|
|
|
|
| 738 |
|
| 739 |
+
result_all_ckpt = append_parquet_chunk_to_hf(
|
| 740 |
+
checkpoint_all_df,
|
| 741 |
+
all_dataset_name,
|
| 742 |
+
hf_token,
|
| 743 |
+
subdir="data/chunks/all"
|
| 744 |
+
)
|
| 745 |
+
result_30d_ckpt = append_parquet_chunk_to_hf(
|
| 746 |
+
checkpoint_recent_df,
|
| 747 |
+
recent_dataset_name,
|
| 748 |
+
hf_token,
|
| 749 |
+
subdir="data/chunks/30d"
|
| 750 |
+
)
|
| 751 |
|
| 752 |
_append_upload_result("all ์ฒดํฌํฌ์ธํธ", result_all_ckpt)
|
| 753 |
_append_upload_result("30d ์ฒดํฌํฌ์ธํธ", result_30d_ckpt)
|
|
|
|
| 797 |
progress(0.9, desc="๋ง์ง๋ง ์ฒดํฌํฌ์ธํธ ๋ฐ์ ์ค...")
|
| 798 |
logs.append("\n๐ง [4๋จ๊ณ] ๋ง์ง๋ง ๋ฏธ๋ฐ์ ๋ฐ์ดํฐ ๋ฐ์ ์ค...")
|
| 799 |
|
| 800 |
+
if success_count > last_checkpoint_success_index:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 801 |
logs.append("\n๐พ [์ต์ข
๋ฐ์] ์ค๊ฐ ์
๋ก๋ ์์ด ๋์ ๋ ๋ฐ์ดํฐ ๋ฐ์")
|
| 802 |
_upload_checkpoint(total)
|
| 803 |
|
| 804 |
+
progress(0.97, desc="์ฒญํฌ ์
๋ก๋ ์ํ ๋ง๋ฌด๋ฆฌ ์ค...")
|
| 805 |
+
logs.append("\n๐ [5๋จ๊ณ] ์ฒญํฌ ์
๋ก๋ ๋ชจ๋ ์๋ฃ")
|
| 806 |
+
logs.append(" - all/30d ๋ชจ๋ ์ฒญํฌ ํ์ผ ๊ธฐ์ค์ผ๋ก ์ ์ฅ๋์์ต๋๋ค.")
|
| 807 |
+
logs.append(" - ๋ค์ ์คํ ์ 30d ํฐ์ปค ๋ชฉ๋ก ๊ธฐ์ค์ผ๋ก ์๋ ์คํต/์ฌ๊ฐ๋ฉ๋๋ค.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
|
| 809 |
# ========== ์๋ฃ ==========
|
| 810 |
progress(1.0, desc="์๋ฃ!")
|