ykjung committed on
Commit dd5d980 · 1 Parent(s): 1d2414a

feat: implement chunked Parquet uploads to Hugging Face for memory efficiency and improved data handling

Files changed (3)
  1. README.md +7 -7
  2. __pycache__/app.cpython-313.pyc +0 -0
  3. app.py +106 -61
README.md CHANGED
@@ -85,16 +85,15 @@ UI wiring:
    - Excludes data for the not-yet-finalized current day (today)
    - Retries up to 3 times on failure
    - Waits 0.5 s every 10 tickers (eases rate-limit blocking)
-   - At each configured interval (`checkpoint_batch_size`), merges with the existing dataset and performs an intermediate (checkpoint) upload
+   - At each configured interval (`checkpoint_batch_size`), uploads the collection buffer as a Parquet chunk
+   - Clears the in-memory buffer right after each chunk upload to cap memory usage
 4. **Final merge**
-   - Merges the last unflushed buffer into the existing dataset
-   - Deduplicates on `Ticker + Date` (keeps the latest values)
+   - Flushes the last unflushed buffer as a chunk upload
 5. **Recent 30-day dataset refresh**
-   - Keeps a 30-day window per ticker, based on the merged data
+   - Uploads the per-ticker recent 30-day data chunks alongside
 6. **Hugging Face upload**
-   - `upload_dataset_to_hf(all_df, all_dataset_name, hf_token)`
-   - `upload_dataset_to_hf(recent_30d_df, recent_dataset_name, hf_token)`
-   - Internally uses `Dataset.from_pandas(...).push_to_hub(...)`
+   - Internally uses `HfApi.upload_file(..., path_in_repo="data/chunks/...parquet")`
+   - Avoids re-uploading the full, large DataFrame, which lowers the peak memory footprint
 7. **Return completion log**
    - Returns start/end times, success/failure counts, and upload results as text
 
@@ -170,6 +169,7 @@ UI wiring:
 
 - Tickers already present in the dataset are skipped automatically.
 - Even if a previous run fails midway, the next run resumes collection with the remaining tickers.
+- Because progress is saved at the checkpoint interval (e.g., 100), only the span after the most recent checkpoint is re-collected after a failure.
 
 ---
 
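Since the chunk layout is append-only, rows for the same `Ticker + Date` can recur across chunks written by different runs, so deduplication moves to read time. A minimal consumer-side sketch, assuming the `data/chunks/all` layout above; `load_all_chunks` is a hypothetical helper (not part of this commit), while `HfApi.list_repo_files` and `hf_hub_download` are existing `huggingface_hub` APIs:

```python
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download


def load_all_chunks(repo_id, token=None, subdir="data/chunks/all"):
    """Hypothetical reader: concatenate every Parquet chunk in the repo and
    re-apply the Ticker+Date dedup rule the removed _merge_rows helper used."""
    api = HfApi()
    # Chunk names embed a timestamp, so lexicographic order approximates write order.
    files = sorted(
        f
        for f in api.list_repo_files(repo_id, repo_type="dataset", token=token)
        if f.startswith(subdir) and f.endswith(".parquet")
    )
    frames = [
        pd.read_parquet(hf_hub_download(repo_id, f, repo_type="dataset", token=token))
        for f in files
    ]
    if not frames:
        return pd.DataFrame()
    merged = pd.concat(frames, ignore_index=True)
    # Later chunks win: keep the most recently written row per (Ticker, Date).
    return merged.drop_duplicates(subset=["Ticker", "Date"], keep="last").reset_index(drop=True)
```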
__pycache__/app.cpython-313.pyc CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
 
app.py CHANGED
@@ -17,6 +17,8 @@ import logging
 import json
 import traceback
 import gc
+import tempfile
+import uuid
 from urllib.request import Request, urlopen
 
 # Logging setup
@@ -500,6 +502,83 @@ def upload_dataset_to_hf(df, repo_name, hf_token, max_retries=3, retry_wait_sec=
     }
 
 
+def append_parquet_chunk_to_hf(df, repo_name, hf_token, subdir="data/chunks", max_retries=3, retry_wait_sec=2):
+    """Append-upload a DataFrame to a Hugging Face dataset repo as a Parquet chunk file."""
+    if df is None or df.empty:
+        return {
+            "ok": False,
+            "repo": repo_name,
+            "rows": 0,
+            "attempts": 0,
+            "elapsed_sec": 0.0,
+            "error": "No data to upload.",
+            "traceback": "",
+        }
+
+    api = HfApi()
+    last_error = ""
+    last_traceback = ""
+    start_ts = time.time()
+
+    for attempt in range(1, max_retries + 1):
+        temp_path = None
+        try:
+            api.create_repo(
+                repo_id=repo_name,
+                repo_type="dataset",
+                token=hf_token,
+                private=False,
+                exist_ok=True,
+            )
+
+            chunk_name = f"chunk-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{uuid.uuid4().hex[:8]}.parquet"
+            with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+                temp_path = tmp.name
+
+            df.to_parquet(temp_path, index=False)
+
+            path_in_repo = f"{subdir}/{chunk_name}"
+            api.upload_file(
+                path_or_fileobj=temp_path,
+                path_in_repo=path_in_repo,
+                repo_id=repo_name,
+                repo_type="dataset",
+                token=hf_token,
+            )
+
+            return {
+                "ok": True,
+                "repo": repo_name,
+                "rows": len(df),
+                "attempts": attempt,
+                "elapsed_sec": time.time() - start_ts,
+                "error": "",
+                "traceback": "",
+            }
+        except Exception as e:
+            last_error = str(e)
+            last_traceback = traceback.format_exc()
+            logger.warning(f"[{repo_name}] Chunk upload failed (attempt {attempt}/{max_retries}): {last_error}")
+            if attempt < max_retries:
+                time.sleep(retry_wait_sec * attempt)
+        finally:
+            if temp_path and os.path.exists(temp_path):
+                try:
+                    os.remove(temp_path)
+                except Exception:
+                    pass
+
+    return {
+        "ok": False,
+        "repo": repo_name,
+        "rows": len(df),
+        "attempts": max_retries,
+        "elapsed_sec": time.time() - start_ts,
+        "error": last_error,
+        "traceback": last_traceback,
+    }
+
+
 def run_pipeline(
     hf_token,
     all_dataset_name,
@@ -521,16 +600,6 @@ def run_pipeline(
 
     logs = []
     try:
-        def _merge_rows(base_df, incoming_df):
-            if incoming_df is None or incoming_df.empty:
-                return base_df
-            if base_df is None or base_df.empty:
-                return incoming_df.reset_index(drop=True)
-
-            merged = pd.concat([base_df, incoming_df], ignore_index=True)
-            merged = merged.drop_duplicates(subset=["Ticker", "Date"], keep="last")
-            return merged.reset_index(drop=True)
-
         def _df_stats(df, label):
             if df is None or df.empty:
                 return f"{label}: 0 rows"
@@ -553,6 +622,7 @@ def run_pipeline(
         logs.append("📊 Stock data collection pipeline started")
        logs.append(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
         logs.append("=" * 60)
+        logs.append("ℹ️ Memory-saving mode: the buffer is cleared right after each chunk upload (e.g., every 100 tickers).")
 
         # ========== Step 1: Collect ticker lists ==========
         progress(0, desc="Collecting NASDAQ & NYSE ticker lists...")
@@ -611,24 +681,16 @@ def run_pipeline(
         progress(0.08, desc="Loading existing datasets...")
         logs.append("\n📂 [Step 2] Loading existing datasets and computing resume targets...")
 
-        base_all_df = pd.DataFrame(columns=["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"])
-        base_recent_df = pd.DataFrame(columns=["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"])
-
+        recent_for_resume = pd.DataFrame(columns=["Ticker"])
         try:
-            base_all_df = load_hf_dataset_as_df(all_dataset_name, hf_token)
-            logs.append(f" - Existing all data: {len(base_all_df)} rows")
+            recent_for_resume = load_hf_dataset_as_df(recent_dataset_name, hf_token)
+            logs.append(f" - Existing 30d data: {len(recent_for_resume)} rows")
         except Exception as e:
-            logs.append(f" - Failed to load existing all data (continuing as a new dataset): {e}")
+            logs.append(f" - Failed to load existing 30d data (continuing as a fresh collection): {e}")
 
-        try:
-            base_recent_df = load_hf_dataset_as_df(recent_dataset_name, hf_token)
-            logs.append(f" - Existing 30d data: {len(base_recent_df)} rows")
-        except Exception as e:
-            logs.append(f" - Failed to load existing 30d data (continuing as a new dataset): {e}")
-
-        existing_tickers = set(base_all_df["Ticker"].dropna().astype(str).str.upper().tolist())
+        existing_tickers = set(recent_for_resume["Ticker"].dropna().astype(str).str.upper().tolist())
         if not existing_tickers:
-            existing_tickers = set(base_recent_df["Ticker"].dropna().astype(str).str.upper().tolist())
+            logs.append(" - No existing ticker info; proceeding with the full target list.")
 
         pending_tickers = [ticker for ticker in all_tickers if ticker not in existing_tickers]
 
@@ -655,7 +717,7 @@ def run_pipeline(
         total = len(pending_tickers)
 
         def _upload_checkpoint(end_index):
-            nonlocal last_checkpoint_success_index, base_all_df, base_recent_df
+            nonlocal last_checkpoint_success_index
 
             if success_count <= last_checkpoint_success_index:
                 return
@@ -671,15 +733,21 @@ def run_pipeline(
             checkpoint_all_df = pd.concat(all_data_frames, ignore_index=True)
             checkpoint_recent_df = pd.concat(recent_30d_frames, ignore_index=True)
 
-            base_all_df = _merge_rows(base_all_df, checkpoint_all_df)
-            base_recent_df = _merge_rows(base_recent_df, checkpoint_recent_df)
-            base_recent_df = filter_last_30_days(base_recent_df)
+            logs.append(f" - {_df_stats(checkpoint_all_df, 'all chunk')}")
+            logs.append(f" - {_df_stats(checkpoint_recent_df, '30d chunk')}")
 
-            logs.append(f" - {_df_stats(base_all_df, 'all merge result')}")
-            logs.append(f" - {_df_stats(base_recent_df, '30d merge result')}")
-
-            result_all_ckpt = upload_dataset_to_hf(base_all_df, all_dataset_name, hf_token)
-            result_30d_ckpt = upload_dataset_to_hf(base_recent_df, recent_dataset_name, hf_token)
+            result_all_ckpt = append_parquet_chunk_to_hf(
+                checkpoint_all_df,
+                all_dataset_name,
+                hf_token,
+                subdir="data/chunks/all"
+            )
+            result_30d_ckpt = append_parquet_chunk_to_hf(
+                checkpoint_recent_df,
+                recent_dataset_name,
+                hf_token,
+                subdir="data/chunks/30d"
+            )
 
             _append_upload_result("all checkpoint", result_all_ckpt)
             _append_upload_result("30d checkpoint", result_30d_ckpt)
@@ -729,37 +797,14 @@ def run_pipeline(
         progress(0.9, desc="Applying final checkpoint...")
         logs.append("\n🔧 [Step 4] Applying the last unflushed data...")
 
-        if checkpoint_batch_size > 0 and success_count > last_checkpoint_success_index:
-            logs.append("\n💾 [Checkpoint] Uploading the final unflushed span")
-            _upload_checkpoint(total)
-
-        if checkpoint_batch_size == 0 and all_data_frames:
+        if success_count > last_checkpoint_success_index:
             logs.append("\n💾 [Final flush] Applying data accumulated without intermediate uploads")
             _upload_checkpoint(total)
 
-        logs.append(f" - Full data: {len(base_all_df)} rows x {len(base_all_df.columns)} cols")
-        logs.append(f" - Unique tickers: {base_all_df['Ticker'].nunique()}")
-        logs.append(f" - 30-day data: {len(base_recent_df)} rows x {len(base_recent_df.columns)} cols")
-        logs.append(f" - 30-day unique tickers: {base_recent_df['Ticker'].nunique()}")
-
-        # ========== Step 5: Upload datasets to Hugging Face ==========
-        progress(0.97, desc="Uploading the all dataset...")
-        logs.append("\n🚀 [Step 5] Uploading datasets to Hugging Face...")
-
-        # Upload the all dataset
-        logs.append(f" - {_df_stats(base_all_df, 'all final upload target')}")
-        result_all = upload_dataset_to_hf(base_all_df, all_dataset_name, hf_token)
-        _append_upload_result("all final", result_all)
-        if not result_all["ok"]:
-            raise RuntimeError("all final upload failed")
-
-        # Upload the 30-day dataset
-        progress(0.99, desc="Uploading the 30d dataset...")
-        logs.append(f" - {_df_stats(base_recent_df, '30d final upload target')}")
-        result_30d = upload_dataset_to_hf(base_recent_df, recent_dataset_name, hf_token)
-        _append_upload_result("30d final", result_30d)
-        if not result_30d["ok"]:
-            raise RuntimeError("30d final upload failed")
+        progress(0.97, desc="Finalizing chunk uploads...")
+        logs.append("\n🚀 [Step 5] Chunk-upload mode complete")
+        logs.append(" - Both all and 30d were saved as chunk files.")
+        logs.append(" - The next run will auto-skip/resume based on the 30d ticker list.")
 
         # ========== Done ==========
         progress(1.0, desc="Done!")
 
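For reference, a minimal call sketch for the new helper; the repo id below is a placeholder and `HF_TOKEN` is assumed to hold a write-scoped token. On failure the helper backs off linearly (`retry_wait_sec * attempt`) and always removes its temp file in the `finally` block:

```python
import os
import pandas as pd

# Illustrative one-row OHLCV frame matching the column schema used in app.py.
df = pd.DataFrame({
    "Ticker": ["AAPL"],
    "Date": ["2024-01-02"],
    "Open": [187.15],
    "High": [188.44],
    "Low": [183.89],
    "Close": [185.64],
    "Volume": [82_488_700],
})

result = append_parquet_chunk_to_hf(
    df,
    repo_name="your-username/stocks-all",  # placeholder repo id
    hf_token=os.environ["HF_TOKEN"],       # assumed write-scoped token
    subdir="data/chunks/all",
)
print(result["ok"], result["rows"], result["attempts"], f"{result['elapsed_sec']:.2f}s")
```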