ykjung commited on
Commit
9b2ea10
Β·
1 Parent(s): 25e73ac

feat: enhance run_pipeline with error handling and improve logging for data collection process

Browse files
Files changed (1) hide show
  1. app.py +112 -92
app.py CHANGED
@@ -15,6 +15,7 @@ import os
15
  import time
16
  import logging
17
  import json
 
18
  from urllib.request import Request, urlopen
19
 
20
  # λ‘œκΉ… μ„€μ •
@@ -422,7 +423,15 @@ def filter_last_30_days(df):
422
  return df
423
 
424
  df_copy = df.copy()
425
- df_copy["_date_parsed"] = pd.to_datetime(df_copy["Date"])
 
 
 
 
 
 
 
 
426
 
427
  max_date_by_ticker = df_copy.groupby("Ticker")["_date_parsed"].transform("max")
428
  cutoff_by_ticker = max_date_by_ticker - pd.Timedelta(days=30)
@@ -472,99 +481,110 @@ def run_pipeline(
472
  return "❌ ν—ˆκΉ…νŽ˜μ΄μŠ€ 토큰이 ν•„μš”ν•©λ‹ˆλ‹€. HF_TOKEN ν™˜κ²½λ³€μˆ˜ λ˜λŠ” μž…λ ₯창에 토큰을 λ„£μ–΄μ£Όμ„Έμš”."
473
 
474
  logs = []
475
- logs.append("=" * 60)
476
- logs.append("πŸ“Š 주식 데이터 μˆ˜μ§‘ νŒŒμ΄ν”„λΌμΈ μ‹œμž‘")
477
- logs.append(f"⏰ μ‹œμž‘ μ‹œκ°„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
478
- logs.append("=" * 60)
479
-
480
- # ========== 1단계: 티컀 λͺ©λ‘ μˆ˜μ§‘ ==========
481
- progress(0, desc="λ‚˜μŠ€λ‹₯ & λ‰΄μš• 티컀 λͺ©λ‘ μˆ˜μ§‘ 쀑...")
482
- logs.append("\nπŸ” [1단계] λ‚˜μŠ€λ‹₯ & λ‰΄μš•μ¦κΆŒκ±°λž˜μ†Œ 티컀 λͺ©λ‘ μˆ˜μ§‘ 쀑...")
483
-
484
- nasdaq_tickers, nyse_tickers, all_tickers = get_all_us_tickers()
485
-
486
- logs.append(f" - λ‚˜μŠ€λ‹₯: {len(nasdaq_tickers)}개")
487
- logs.append(f" - λ‰΄μš•μ¦κΆŒκ±°λž˜μ†Œ: {len(nyse_tickers)}개")
488
- logs.append(f" - 전체: {len(all_tickers)}개")
489
-
490
- if not all_tickers:
491
- return "\n".join(logs) + "\n\n❌ 티컀 λͺ©λ‘μ„ κ°€μ Έμ˜¬ 수 μ—†μŠ΅λ‹ˆλ‹€."
492
-
493
- # ========== 2단계: μ•Όν›„ νŒŒμ΄λ‚ΈμŠ€ 데이터 μˆ˜μ§‘ ==========
494
- logs.append(f"\nπŸ“₯ [2단계] μ•Όν›„ νŒŒμ΄λ‚ΈμŠ€ 데이터 μˆ˜μ§‘ μ‹œμž‘ (총 {len(all_tickers)}개 티컀)")
495
- logs.append(f" - 배치 크기: {batch_size}")
496
- logs.append(f" - 쑰회 κΈ°κ°„(period): {period}")
497
- logs.append(f" ⚠️ 반볡문이라 였래 κ±Έλ¦½λ‹ˆλ‹€. 전체 티컀 μˆ˜μ— 따라 수 μ‹œκ°„ μ†Œμš”λ  수 μžˆμŠ΅λ‹ˆλ‹€.")
498
-
499
- all_data_frames = []
500
- success_count = 0
501
- fail_count = 0
502
- total = len(all_tickers)
503
-
504
- for i, ticker in enumerate(all_tickers):
505
- # μ§„ν–‰λ₯  μ—…λ°μ΄νŠΈ
506
- progress_pct = (i + 1) / total
507
- progress(progress_pct, desc=f"μˆ˜μ§‘ 쀑: {ticker} ({i + 1}/{total})")
508
-
509
- ticker_df = fetch_ticker_data(ticker, period=period)
510
-
511
- if ticker_df is not None and not ticker_df.empty:
512
- all_data_frames.append(ticker_df)
513
- success_count += 1
514
- else:
515
- fail_count += 1
516
-
517
- # 배치 λ‹¨μœ„λ‘œ 둜그 좜λ ₯
518
- if (i + 1) % batch_size == 0 or (i + 1) == total:
519
- logs.append(f" μ§„ν–‰: {i + 1}/{total} (성곡: {success_count}, μ‹€νŒ¨: {fail_count})")
520
-
521
- # API 호좜 κ°„ 짧은 λŒ€κΈ° (μ•Όν›„ 차단 λ°©μ§€)
522
- if (i + 1) % 10 == 0:
523
- time.sleep(0.5)
524
-
525
- logs.append(f"\nπŸ“Š μˆ˜μ§‘ μ™„λ£Œ: 성곡 {success_count}개 / μ‹€νŒ¨ {fail_count}개")
526
-
527
- if not all_data_frames:
528
- return "\n".join(logs) + "\n\n❌ μˆ˜μ§‘λœ 데이터가 μ—†μŠ΅λ‹ˆλ‹€."
529
-
530
- # ========== 3단계: 데이터 ν•©μΉ˜κΈ° ==========
531
- progress(0.9, desc="데이터 병합 쀑...")
532
- logs.append("\nπŸ”§ [3단계] 티컀별 λ²”μœ„ 데이터 병합 쀑...")
533
-
534
- all_df = pd.concat(all_data_frames, ignore_index=True)
535
- logs.append(f" - 전체 데이터: {len(all_df)}ν–‰ x {len(all_df.columns)}μ—΄")
536
- logs.append(f" - 고유 티컀 수: {all_df['Ticker'].nunique()}")
537
-
538
- # ========== 4단계: 30일 필터링 ==========
539
- progress(0.93, desc="졜근 30일 데이터 필터링 쀑...")
540
- logs.append("\nπŸ—“οΈ [4단계] 티컀별 졜근 30일 데이터 필터링 쀑...")
541
-
542
- recent_30d_df = filter_last_30_days(all_df)
543
- progress(0.96, desc="졜근 30일 필터링 μ™„λ£Œ")
544
- logs.append(f" - 30일 데이터: {len(recent_30d_df)}ν–‰ x {len(recent_30d_df.columns)}μ—΄")
545
- logs.append(f" - 고유 티컀 수: {recent_30d_df['Ticker'].nunique()}")
546
-
547
- # ========== 5단계: ν—ˆκΉ…νŽ˜μ΄μŠ€ 데이터셋 μ—…λ‘œλ“œ ==========
548
- progress(0.97, desc="all 데이터셋 μ—…λ‘œλ“œ 쀑...")
549
- logs.append("\nπŸš€ [5단계] ν—ˆκΉ…νŽ˜μ΄μŠ€ 데이터셋 μ—…λ‘œλ“œ 쀑...")
550
-
551
- # all 데이터셋 μ—…λ‘œλ“œ
552
- result_all = upload_dataset_to_hf(all_df, all_dataset_name, hf_token)
553
- logs.append(f" {result_all}")
554
-
555
- # 30일 데이터셋 μ—…λ‘œλ“œ
556
- progress(0.99, desc="30d 데이터셋 μ—…λ‘œλ“œ 쀑...")
557
- result_30d = upload_dataset_to_hf(recent_30d_df, recent_dataset_name, hf_token)
558
- logs.append(f" {result_30d}")
 
 
 
 
 
 
 
 
559
 
560
- # ========== μ™„λ£Œ ==========
561
- progress(1.0, desc="μ™„λ£Œ!")
562
- logs.append("\n" + "=" * 60)
563
- logs.append(f"βœ… νŒŒμ΄ν”„λΌμΈ μ™„λ£Œ!")
564
- logs.append(f"⏰ μ’…λ£Œ μ‹œκ°„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
565
- logs.append("=" * 60)
566
 
567
- return "\n".join(logs)
 
 
 
 
 
 
 
 
568
 
569
 
570
  def preview_tickers():
 
15
  import time
16
  import logging
17
  import json
18
+ import traceback
19
  from urllib.request import Request, urlopen
20
 
21
  # λ‘œκΉ… μ„€μ •
 
423
  return df
424
 
425
  df_copy = df.copy()
426
+ df_copy["_date_parsed"] = pd.to_datetime(df_copy["Date"], errors="coerce")
427
+
428
+ invalid_date_count = int(df_copy["_date_parsed"].isna().sum())
429
+ if invalid_date_count > 0:
430
+ logger.warning(f"Date νŒŒμ‹± μ‹€νŒ¨ ν–‰ {invalid_date_count}κ°œλŠ” 30일 ν•„ν„°μ—μ„œ μ œμ™Έλ©λ‹ˆλ‹€.")
431
+
432
+ df_copy = df_copy[df_copy["_date_parsed"].notna()].copy()
433
+ if df_copy.empty:
434
+ return pd.DataFrame(columns=df.columns)
435
 
436
  max_date_by_ticker = df_copy.groupby("Ticker")["_date_parsed"].transform("max")
437
  cutoff_by_ticker = max_date_by_ticker - pd.Timedelta(days=30)
 
481
  return "❌ ν—ˆκΉ…νŽ˜μ΄μŠ€ 토큰이 ν•„μš”ν•©λ‹ˆλ‹€. HF_TOKEN ν™˜κ²½λ³€μˆ˜ λ˜λŠ” μž…λ ₯창에 토큰을 λ„£μ–΄μ£Όμ„Έμš”."
482
 
483
  logs = []
484
+ try:
485
+ logs.append("=" * 60)
486
+ logs.append("πŸ“Š 주식 데이터 μˆ˜μ§‘ νŒŒμ΄ν”„λΌμΈ μ‹œμž‘")
487
+ logs.append(f"⏰ μ‹œμž‘ μ‹œκ°„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
488
+ logs.append("=" * 60)
489
+
490
+ # ========== 1단계: 티컀 λͺ©λ‘ μˆ˜μ§‘ ==========
491
+ progress(0, desc="λ‚˜μŠ€λ‹₯ & λ‰΄μš• 티컀 λͺ©λ‘ μˆ˜μ§‘ 쀑...")
492
+ logs.append("\nπŸ” [1단계] λ‚˜μŠ€λ‹₯ & λ‰΄μš•μ¦κΆŒκ±°λž˜μ†Œ 티컀 λͺ©λ‘ μˆ˜μ§‘ 쀑...")
493
+
494
+ nasdaq_tickers, nyse_tickers, all_tickers = get_all_us_tickers()
495
+
496
+ logs.append(f" - λ‚˜μŠ€λ‹₯: {len(nasdaq_tickers)}개")
497
+ logs.append(f" - λ‰΄μš•μ¦κΆŒκ±°λž˜μ†Œ: {len(nyse_tickers)}개")
498
+ logs.append(f" - 전체: {len(all_tickers)}개")
499
+
500
+ if not all_tickers:
501
+ return "\n".join(logs) + "\n\n❌ 티컀 λͺ©λ‘μ„ κ°€μ Έμ˜¬ 수 μ—†μŠ΅λ‹ˆλ‹€."
502
+
503
+ # ========== 2단계: μ•Όν›„ νŒŒμ΄λ‚ΈμŠ€ 데이터 μˆ˜μ§‘ ==========
504
+ logs.append(f"\nπŸ“₯ [2단계] μ•Όν›„ νŒŒμ΄λ‚ΈμŠ€ 데이터 μˆ˜μ§‘ μ‹œμž‘ (총 {len(all_tickers)}개 티컀)")
505
+ logs.append(f" - 배치 크기: {batch_size}")
506
+ logs.append(f" - 쑰회 κΈ°κ°„(period): {period}")
507
+ logs.append(f" ⚠️ 반볡문이라 였래 κ±Έλ¦½λ‹ˆλ‹€. 전체 티컀 μˆ˜μ— 따라 수 μ‹œκ°„ μ†Œμš”λ  수 μžˆμŠ΅λ‹ˆλ‹€.")
508
+
509
+ all_data_frames = []
510
+ success_count = 0
511
+ fail_count = 0
512
+ total = len(all_tickers)
513
+
514
+ for i, ticker in enumerate(all_tickers):
515
+ # μ§„ν–‰λ₯  μ—…λ°μ΄νŠΈ
516
+ progress_pct = (i + 1) / total
517
+ progress(progress_pct, desc=f"μˆ˜μ§‘ 쀑: {ticker} ({i + 1}/{total})")
518
+
519
+ ticker_df = fetch_ticker_data(ticker, period=period)
520
+
521
+ if ticker_df is not None and not ticker_df.empty:
522
+ all_data_frames.append(ticker_df)
523
+ success_count += 1
524
+ else:
525
+ fail_count += 1
526
+
527
+ # 배치 λ‹¨μœ„λ‘œ 둜그 좜λ ₯
528
+ if (i + 1) % batch_size == 0 or (i + 1) == total:
529
+ logs.append(f" μ§„ν–‰: {i + 1}/{total} (성곡: {success_count}, μ‹€νŒ¨: {fail_count})")
530
+
531
+ # API 호좜 κ°„ 짧은 λŒ€κΈ° (μ•Όν›„ 차단 λ°©μ§€)
532
+ if (i + 1) % 10 == 0:
533
+ time.sleep(0.5)
534
+
535
+ logs.append(f"\nπŸ“Š μˆ˜μ§‘ μ™„λ£Œ: 성곡 {success_count}개 / μ‹€νŒ¨ {fail_count}개")
536
+
537
+ if not all_data_frames:
538
+ return "\n".join(logs) + "\n\n❌ μˆ˜μ§‘λœ 데이터가 μ—†μŠ΅λ‹ˆλ‹€."
539
+
540
+ # ========== 3단계: 데이터 ν•©μΉ˜κΈ° ==========
541
+ progress(0.9, desc="데이터 병합 쀑...")
542
+ logs.append("\nπŸ”§ [3단계] 티컀별 λ²”μœ„ 데이터 병합 쀑...")
543
+
544
+ all_df = pd.concat(all_data_frames, ignore_index=True)
545
+ logs.append(f" - 전체 데이터: {len(all_df)}ν–‰ x {len(all_df.columns)}μ—΄")
546
+ logs.append(f" - 고유 티컀 수: {all_df['Ticker'].nunique()}")
547
+
548
+ # ========== 4단계: 30일 필터링 ==========
549
+ progress(0.93, desc="졜근 30일 데이터 필터링 쀑...")
550
+ logs.append("\nπŸ—“οΈ [4단계] 티컀별 졜근 30일 데이터 필터링 쀑...")
551
+
552
+ recent_30d_df = filter_last_30_days(all_df)
553
+ progress(0.96, desc="졜근 30일 필터링 μ™„λ£Œ")
554
+ logs.append(f" - 30일 데이터: {len(recent_30d_df)}ν–‰ x {len(recent_30d_df.columns)}μ—΄")
555
+ logs.append(f" - 고유 티컀 수: {recent_30d_df['Ticker'].nunique()}")
556
+
557
+ # ========== 5단계: ν—ˆκΉ…νŽ˜μ΄μŠ€ 데이터셋 μ—…λ‘œλ“œ ==========
558
+ progress(0.97, desc="all 데이터셋 μ—…λ‘œλ“œ 쀑...")
559
+ logs.append("\nπŸš€ [5단계] ν—ˆκΉ…νŽ˜μ΄μŠ€ 데이터셋 μ—…λ‘œλ“œ 쀑...")
560
+
561
+ # all 데이터셋 μ—…λ‘œλ“œ
562
+ result_all = upload_dataset_to_hf(all_df, all_dataset_name, hf_token)
563
+ logs.append(f" {result_all}")
564
+
565
+ # 30일 데이터셋 μ—…λ‘œλ“œ
566
+ progress(0.99, desc="30d 데이터셋 μ—…λ‘œλ“œ 쀑...")
567
+ result_30d = upload_dataset_to_hf(recent_30d_df, recent_dataset_name, hf_token)
568
+ logs.append(f" {result_30d}")
569
+
570
+ # ========== μ™„λ£Œ ==========
571
+ progress(1.0, desc="μ™„λ£Œ!")
572
+ logs.append("\n" + "=" * 60)
573
+ logs.append(f"βœ… νŒŒμ΄ν”„λΌμΈ μ™„λ£Œ!")
574
+ logs.append(f"⏰ μ’…λ£Œ μ‹œκ°„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
575
+ logs.append("=" * 60)
576
 
577
+ return "\n".join(logs)
 
 
 
 
 
578
 
579
+ except Exception as e:
580
+ logger.exception("run_pipeline μ‹€ν–‰ 쀑 μ˜ˆμ™Έ λ°œμƒ")
581
+ logs.append("\n" + "=" * 60)
582
+ logs.append("❌ νŒŒμ΄ν”„λΌμΈ μ‹€ν–‰ 쀑 μ˜ˆμ™Έκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€.")
583
+ logs.append(f"였λ₯˜ λ©”μ‹œμ§€: {e}")
584
+ logs.append("\n[Traceback]")
585
+ logs.append(traceback.format_exc())
586
+ logs.append("=" * 60)
587
+ return "\n".join(logs)
588
 
589
 
590
  def preview_tickers():