Spaces:
Running
Running
ykjung committed on
Commit ·
9b2ea10
1
Parent(s): 25e73ac
feat: enhance run_pipeline with error handling and improve logging for data collection process
Browse files
app.py
CHANGED
|
@@ -15,6 +15,7 @@ import os
|
|
| 15 |
import time
|
| 16 |
import logging
|
| 17 |
import json
|
|
|
|
| 18 |
from urllib.request import Request, urlopen
|
| 19 |
|
| 20 |
# 로깅 설정
|
|
@@ -422,7 +423,15 @@ def filter_last_30_days(df):
|
|
| 422 |
return df
|
| 423 |
|
| 424 |
df_copy = df.copy()
|
| 425 |
-
df_copy["_date_parsed"] = pd.to_datetime(df_copy["Date"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
|
| 427 |
max_date_by_ticker = df_copy.groupby("Ticker")["_date_parsed"].transform("max")
|
| 428 |
cutoff_by_ticker = max_date_by_ticker - pd.Timedelta(days=30)
|
|
@@ -472,99 +481,110 @@ def run_pipeline(
|
|
| 472 |
return "β νκΉ
νμ΄μ€ ν ν°μ΄ νμν©λλ€. HF_TOKEN νκ²½λ³μ λλ μ
λ ₯μ°½μ ν ν°μ λ£μ΄μ£ΌμΈμ."
|
| 473 |
|
| 474 |
logs = []
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
|
| 560 |
-
|
| 561 |
-
progress(1.0, desc="μλ£!")
|
| 562 |
-
logs.append("\n" + "=" * 60)
|
| 563 |
-
logs.append(f"β
νμ΄νλΌμΈ μλ£!")
|
| 564 |
-
logs.append(f"β° μ’
λ£ μκ°: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
| 565 |
-
logs.append("=" * 60)
|
| 566 |
|
| 567 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
|
| 569 |
|
| 570 |
def preview_tickers():
|
|
|
|
| 15 |
import time
|
| 16 |
import logging
|
| 17 |
import json
|
| 18 |
+
import traceback
|
| 19 |
from urllib.request import Request, urlopen
|
| 20 |
|
| 21 |
# 로깅 설정
|
|
|
|
| 423 |
return df
|
| 424 |
|
| 425 |
df_copy = df.copy()
|
| 426 |
+
df_copy["_date_parsed"] = pd.to_datetime(df_copy["Date"], errors="coerce")
|
| 427 |
+
|
| 428 |
+
invalid_date_count = int(df_copy["_date_parsed"].isna().sum())
|
| 429 |
+
if invalid_date_count > 0:
|
| 430 |
+
logger.warning(f"Date νμ± μ€ν¨ ν {invalid_date_count}κ°λ 30μΌ νν°μμ μ μΈλ©λλ€.")
|
| 431 |
+
|
| 432 |
+
df_copy = df_copy[df_copy["_date_parsed"].notna()].copy()
|
| 433 |
+
if df_copy.empty:
|
| 434 |
+
return pd.DataFrame(columns=df.columns)
|
| 435 |
|
| 436 |
max_date_by_ticker = df_copy.groupby("Ticker")["_date_parsed"].transform("max")
|
| 437 |
cutoff_by_ticker = max_date_by_ticker - pd.Timedelta(days=30)
|
|
|
|
| 481 |
return "β νκΉ
νμ΄μ€ ν ν°μ΄ νμν©λλ€. HF_TOKEN νκ²½λ³μ λλ μ
λ ₯μ°½μ ν ν°μ λ£μ΄μ£ΌμΈμ."
|
| 482 |
|
| 483 |
logs = []
|
| 484 |
+
try:
|
| 485 |
+
logs.append("=" * 60)
|
| 486 |
+
logs.append("π μ£Όμ λ°μ΄ν° μμ§ νμ΄νλΌμΈ μμ")
|
| 487 |
+
logs.append(f"β° μμ μκ°: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
| 488 |
+
logs.append("=" * 60)
|
| 489 |
+
|
| 490 |
+
# ========== 1단계: 티커 목록 수집 ==========
|
| 491 |
+
progress(0, desc="λμ€λ₯ & λ΄μ ν°μ»€ λͺ©λ‘ μμ§ μ€...")
|
| 492 |
+
logs.append("\nπ [1λ¨κ³] λμ€λ₯ & λ΄μμ¦κΆκ±°λμ ν°μ»€ λͺ©λ‘ μμ§ μ€...")
|
| 493 |
+
|
| 494 |
+
nasdaq_tickers, nyse_tickers, all_tickers = get_all_us_tickers()
|
| 495 |
+
|
| 496 |
+
logs.append(f" - λμ€λ₯: {len(nasdaq_tickers)}κ°")
|
| 497 |
+
logs.append(f" - λ΄μμ¦κΆκ±°λμ: {len(nyse_tickers)}κ°")
|
| 498 |
+
logs.append(f" - μ 체: {len(all_tickers)}κ°")
|
| 499 |
+
|
| 500 |
+
if not all_tickers:
|
| 501 |
+
return "\n".join(logs) + "\n\nβ ν°μ»€ λͺ©λ‘μ κ°μ Έμ¬ μ μμ΅λλ€."
|
| 502 |
+
|
| 503 |
+
# ========== 2단계: 야후 파이낸스 데이터 수집 ==========
|
| 504 |
+
logs.append(f"\nπ₯ [2λ¨κ³] μΌν νμ΄λΈμ€ λ°μ΄ν° μμ§ μμ (μ΄ {len(all_tickers)}κ° ν°μ»€)")
|
| 505 |
+
logs.append(f" - λ°°μΉ ν¬κΈ°: {batch_size}")
|
| 506 |
+
logs.append(f" - μ‘°ν κΈ°κ°(period): {period}")
|
| 507 |
+
logs.append(f" β οΈ λ°λ³΅λ¬Έμ΄λΌ μ€λ 걸립λλ€. μ 체 ν°μ»€ μμ λ°λΌ μ μκ° μμλ μ μμ΅λλ€.")
|
| 508 |
+
|
| 509 |
+
all_data_frames = []
|
| 510 |
+
success_count = 0
|
| 511 |
+
fail_count = 0
|
| 512 |
+
total = len(all_tickers)
|
| 513 |
+
|
| 514 |
+
for i, ticker in enumerate(all_tickers):
|
| 515 |
+
# 진행률 업데이트
|
| 516 |
+
progress_pct = (i + 1) / total
|
| 517 |
+
progress(progress_pct, desc=f"μμ§ μ€: {ticker} ({i + 1}/{total})")
|
| 518 |
+
|
| 519 |
+
ticker_df = fetch_ticker_data(ticker, period=period)
|
| 520 |
+
|
| 521 |
+
if ticker_df is not None and not ticker_df.empty:
|
| 522 |
+
all_data_frames.append(ticker_df)
|
| 523 |
+
success_count += 1
|
| 524 |
+
else:
|
| 525 |
+
fail_count += 1
|
| 526 |
+
|
| 527 |
+
# 배치 단위로 로그 출력
|
| 528 |
+
if (i + 1) % batch_size == 0 or (i + 1) == total:
|
| 529 |
+
logs.append(f" μ§ν: {i + 1}/{total} (μ±κ³΅: {success_count}, μ€ν¨: {fail_count})")
|
| 530 |
+
|
| 531 |
+
# API 호출 간 짧은 대기 (야후 차단 방지)
|
| 532 |
+
if (i + 1) % 10 == 0:
|
| 533 |
+
time.sleep(0.5)
|
| 534 |
+
|
| 535 |
+
logs.append(f"\nπ μμ§ μλ£: μ±κ³΅ {success_count}κ° / μ€ν¨ {fail_count}κ°")
|
| 536 |
+
|
| 537 |
+
if not all_data_frames:
|
| 538 |
+
return "\n".join(logs) + "\n\nβ μμ§λ λ°μ΄ν°κ° μμ΅λλ€."
|
| 539 |
+
|
| 540 |
+
# ========== 3단계: 데이터 합치기 ==========
|
| 541 |
+
progress(0.9, desc="λ°μ΄ν° λ³ν© μ€...")
|
| 542 |
+
logs.append("\nπ§ [3λ¨κ³] ν°μ»€λ³ λ²μ λ°μ΄ν° λ³ν© μ€...")
|
| 543 |
+
|
| 544 |
+
all_df = pd.concat(all_data_frames, ignore_index=True)
|
| 545 |
+
logs.append(f" - μ 체 λ°μ΄ν°: {len(all_df)}ν x {len(all_df.columns)}μ΄")
|
| 546 |
+
logs.append(f" - κ³ μ ν°μ»€ μ: {all_df['Ticker'].nunique()}")
|
| 547 |
+
|
| 548 |
+
# ========== 4단계: 30일 필터링 ==========
|
| 549 |
+
progress(0.93, desc="μ΅κ·Ό 30μΌ λ°μ΄ν° νν°λ§ μ€...")
|
| 550 |
+
logs.append("\nποΈ [4λ¨κ³] ν°μ»€λ³ μ΅κ·Ό 30μΌ λ°μ΄ν° νν°λ§ μ€...")
|
| 551 |
+
|
| 552 |
+
recent_30d_df = filter_last_30_days(all_df)
|
| 553 |
+
progress(0.96, desc="μ΅κ·Ό 30μΌ νν°λ§ μλ£")
|
| 554 |
+
logs.append(f" - 30μΌ λ°μ΄ν°: {len(recent_30d_df)}ν x {len(recent_30d_df.columns)}μ΄")
|
| 555 |
+
logs.append(f" - κ³ μ ν°μ»€ μ: {recent_30d_df['Ticker'].nunique()}")
|
| 556 |
+
|
| 557 |
+
# ========== 5단계: 허깅페이스 데이터셋 업로드 ==========
|
| 558 |
+
progress(0.97, desc="all λ°μ΄ν°μ
μ
λ‘λ μ€...")
|
| 559 |
+
logs.append("\nπ [5λ¨κ³] νκΉ
νμ΄μ€ λ°μ΄ν°μ
μ
λ‘λ μ€...")
|
| 560 |
+
|
| 561 |
+
# all 데이터셋 업로드
|
| 562 |
+
result_all = upload_dataset_to_hf(all_df, all_dataset_name, hf_token)
|
| 563 |
+
logs.append(f" {result_all}")
|
| 564 |
+
|
| 565 |
+
# 30일 데이터셋 업로드
|
| 566 |
+
progress(0.99, desc="30d λ°μ΄ν°μ
μ
λ‘λ μ€...")
|
| 567 |
+
result_30d = upload_dataset_to_hf(recent_30d_df, recent_dataset_name, hf_token)
|
| 568 |
+
logs.append(f" {result_30d}")
|
| 569 |
+
|
| 570 |
+
# ========== 완료 ==========
|
| 571 |
+
progress(1.0, desc="μλ£!")
|
| 572 |
+
logs.append("\n" + "=" * 60)
|
| 573 |
+
logs.append(f"β
νμ΄νλΌμΈ μλ£!")
|
| 574 |
+
logs.append(f"β° μ’
λ£ μκ°: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
| 575 |
+
logs.append("=" * 60)
|
| 576 |
|
| 577 |
+
return "\n".join(logs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
|
| 579 |
+
except Exception as e:
|
| 580 |
+
logger.exception("run_pipeline μ€ν μ€ μμΈ λ°μ")
|
| 581 |
+
logs.append("\n" + "=" * 60)
|
| 582 |
+
logs.append("β νμ΄νλΌμΈ μ€ν μ€ μμΈκ° λ°μνμ΅λλ€.")
|
| 583 |
+
logs.append(f"μ€λ₯ λ©μμ§: {e}")
|
| 584 |
+
logs.append("\n[Traceback]")
|
| 585 |
+
logs.append(traceback.format_exc())
|
| 586 |
+
logs.append("=" * 60)
|
| 587 |
+
return "\n".join(logs)
|
| 588 |
|
| 589 |
|
| 590 |
def preview_tickers():
|