# createDataSet / app.py
# Author: ykjung
# feat: implement chunked Parquet uploads to Hugging Face for memory efficiency and improved data handling
# commit: dd5d980
"""
๋‚˜์Šค๋‹ฅ & ๋‰ด์š•์ฆ๊ถŒ๊ฑฐ๋ž˜์†Œ ์ฃผ์‹ ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘ ๋ฐ ํ—ˆ๊น…ํŽ˜์ด์Šค ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ
- ์ˆ˜์ง‘: ๋‚˜์Šค๋‹ฅ/๋‰ด์š• ์ „์ฒด ํ‹ฐ์ปค๋ฅผ ์•ผํ›„ ํŒŒ์ด๋‚ธ์Šค๋กœ ์ผ๋ณ„ ๋ฐ์ดํ„ฐ ์กฐํšŒ (์ „์ฒด๊ธฐ๊ฐ„)
- ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ: all ๋ฐ์ดํ„ฐ์…‹ + ์ตœ๊ทผ 30์ผ ๋ฐ์ดํ„ฐ์…‹ ์ž๋™ ์ƒ์„ฑ
"""
import gradio as gr
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi
import os
import time
import logging
import json
import traceback
import gc
import tempfile
import uuid
from urllib.request import Request, urlopen
# Logging setup: INFO for this app, while silencing chatty third-party libraries.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger used by every function below
logging.getLogger("yfinance").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
# Hugging Face token (injected via Spaces secrets); empty string when unset.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
def get_ny_today_str():
    """Return the current date in New York local time, formatted 'YYYY-MM-DD'."""
    ny_now = datetime.now(ZoneInfo("America/New_York"))
    return ny_now.strftime("%Y-%m-%d")
def is_us_market_open_now():
    """
    Report whether the US regular session (09:30-16:00 New York time) is open.

    Returns:
        (is_open, now_ny): is_open is True only Monday-Friday during regular
        hours; now_ny is the timezone-aware New York datetime used for the check.
    """
    now_ny = datetime.now(ZoneInfo("America/New_York"))
    if now_ny.weekday() >= 5:  # Sat=5 / Sun=6: never a regular session
        return False, now_ny
    minute_of_day = now_ny.hour * 60 + now_ny.minute
    session_open = 9 * 60 + 30   # 09:30
    session_close = 16 * 60      # 16:00
    return session_open <= minute_of_day < session_close, now_ny
def fetch_tradingview_realtime(tickers, batch_size=400):
    """
    Query today's OHLCV per ticker from the TradingView Screener API.

    Each ticker is tried on NASDAQ, NYSE and AMEX at once; the first exchange
    that returns a quote wins. Returns a list of row dicts with keys
    Ticker/Date/Open/High/Low/Close/Volume, where Date is today's New York date.
    """
    if not tickers:
        return []
    collected = {}
    today_str = get_ny_today_str()
    for start in range(0, len(tickers), batch_size):
        chunk = tickers[start:start + batch_size]
        # Request every ticker of this chunk on all three US exchanges.
        symbol_candidates = []
        for exchange in ("NASDAQ", "NYSE", "AMEX"):
            symbol_candidates.extend(f"{exchange}:{t}" for t in chunk)
        payload = {
            "symbols": {"tickers": symbol_candidates},
            "columns": ["close", "open", "high", "low", "volume", "exchange"],
            "options": {"lang": "en"},
            "markets": ["america"],
            "range": [0, max(50, len(chunk) * 3)]
        }
        request = Request(
            "https://scanner.tradingview.com/america/scan",
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Content-Type": "application/json",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            },
            method="POST"
        )
        with urlopen(request, timeout=30) as response:
            parsed = json.loads(response.read().decode("utf-8"))
        for item in parsed.get("data", []):
            full_symbol = item.get("s", "")
            if ":" not in full_symbol:
                continue
            ticker = full_symbol.split(":", 1)[1]
            if ticker in collected:
                continue  # first exchange hit wins
            values = item.get("d", [])
            if len(values) < 5:
                continue
            close_val, open_val, high_val, low_val, volume_val = values[:5]
            if close_val is None:
                continue  # a quote without a close price is unusable
            collected[ticker] = {
                "Ticker": ticker,
                "Date": today_str,
                "Open": round(float(open_val), 4) if open_val is not None else None,
                "High": round(float(high_val), 4) if high_val is not None else None,
                "Low": round(float(low_val), 4) if low_val is not None else None,
                "Close": round(float(close_val), 4),
                "Volume": int(volume_val) if volume_val is not None else None
            }
        time.sleep(0.2)  # brief pause between batches
    return list(collected.values())
def load_hf_dataset_as_df(repo_name, hf_token):
    """
    Load the 'train' split of an HF Hub dataset as a normalized pandas DataFrame.

    Guarantees the canonical columns Ticker/Date/Open/High/Low/Close/Volume
    exist (missing ones are filled with None), in that exact order, with
    Ticker upper-cased and Date stringified.
    """
    frame = load_dataset(repo_name, split="train", token=hf_token).to_pandas()
    canonical_cols = ["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"]
    for col in canonical_cols:
        if col not in frame.columns:
            frame[col] = None
    frame = frame[canonical_cols]
    frame["Ticker"] = frame["Ticker"].astype(str).str.upper()
    frame["Date"] = frame["Date"].astype(str)
    return frame
def run_realtime_update(
    hf_token,
    all_dataset_name,
    recent_dataset_name,
    progress=gr.Progress()
):
    """
    Intraday (real-time) dataset update.

    Flow:
    - Report whether the US regular session is open (DST handled via ZoneInfo).
    - Bulk-fetch today's OHLCV for the datasets' tickers from the TradingView Screener.
    - "all" dataset: append today's rows only (append-only; existing rows untouched).
    - "30d" dataset: drop today's duplicates for fetched tickers, merge in the
      fresh quotes, then re-apply the rolling 30-day window.

    Returns:
        A multi-line log string shown in the Gradio output textbox.
    """
    if not hf_token:
        return "โŒ ํ—ˆ๊น…ํŽ˜์ด์Šค ํ† ํฐ์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค. HF_TOKEN ํ™˜๊ฒฝ๋ณ€์ˆ˜ ๋˜๋Š” ์ž…๋ ฅ์ฐฝ์— ํ† ํฐ์„ ๋„ฃ์–ด์ฃผ์„ธ์š”."
    logs = []
    logs.append("=" * 60)
    logs.append("โšก ์‹ค์‹œ๊ฐ„ ๋ฐ์ดํ„ฐ ์—…๋ฐ์ดํŠธ ์‹œ์ž‘")
    logs.append(f"โฐ ์‹œ์ž‘ ์‹œ๊ฐ„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logs.append("=" * 60)
    # 0) Market-open status (DST handled automatically by ZoneInfo)
    progress(0.05, desc="์žฅ์ค‘ ์—ฌ๋ถ€ ํ™•์ธ ์ค‘...")
    is_open, now_ny = is_us_market_open_now()
    if is_open:
        logs.append(f"๐ŸŸข ์žฅ์ค‘์ž…๋‹ˆ๋‹ค. (๋‰ด์š•์‹œ๊ฐ„ {now_ny.strftime('%Y-%m-%d %H:%M:%S')})")
    else:
        logs.append(f"๐ŸŸก ์žฅ์ค‘์ด ์•„๋‹™๋‹ˆ๋‹ค. (๋‰ด์š•์‹œ๊ฐ„ {now_ny.strftime('%Y-%m-%d %H:%M:%S')})")
    today_str = get_ny_today_str()
    # 1) Load both existing datasets from the Hub
    progress(0.15, desc="๊ธฐ์กด ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์ค‘...")
    logs.append("\n๐Ÿ“ฅ [1๋‹จ๊ณ„] ๊ธฐ์กด ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์ค‘...")
    all_df = load_hf_dataset_as_df(all_dataset_name, hf_token)
    recent_df = load_hf_dataset_as_df(recent_dataset_name, hf_token)
    # Query tickers come from the datasets themselves: 30d first, "all" as fallback
    tickers = sorted(recent_df["Ticker"].dropna().astype(str).str.upper().unique().tolist())
    if not tickers:
        tickers = sorted(all_df["Ticker"].dropna().astype(str).str.upper().unique().tolist())
    if not tickers:
        return "\n".join(logs) + "\n\nโŒ ๋ฐ์ดํ„ฐ์…‹์— ํ‹ฐ์ปค๊ฐ€ ์—†์–ด ์‹ค์‹œ๊ฐ„ ์กฐํšŒ๋ฅผ ์ง„ํ–‰ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
    logs.append(f" - ์กฐํšŒ ๋Œ€์ƒ ํ‹ฐ์ปค ์ˆ˜: {len(tickers)}")
    # 2) Skip the remote fetch entirely if every ticker already has a row for today
    progress(0.25, desc="์˜ค๋Š˜ ๋ฐ์ดํ„ฐ ์กด์žฌ ์—ฌ๋ถ€ ํ™•์ธ ์ค‘...")
    today_rows = all_df[all_df["Date"] == today_str]
    today_tickers = set(today_rows["Ticker"].astype(str).str.upper().tolist())
    if set(tickers).issubset(today_tickers):
        logs.append("\nโœ… ์ด๋ฏธ ์ˆ˜์ง‘ํ–ˆ์Šต๋‹ˆ๋‹ค")
        return "\n".join(logs)
    # 3) Real-time quotes from the TradingView Screener
    progress(0.45, desc="TradingView ์‹ค์‹œ๊ฐ„ ์กฐํšŒ ์ค‘...")
    logs.append("\n๐Ÿ“ก [2๋‹จ๊ณ„] TradingView Screener ์กฐํšŒ ์ค‘...")
    realtime_rows = fetch_tradingview_realtime(tickers)
    if not realtime_rows:
        return "\n".join(logs) + "\n\nโŒ TradingView์—์„œ ์‹ค์‹œ๊ฐ„ ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ€์ ธ์˜ค์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."
    realtime_df = pd.DataFrame(realtime_rows)
    realtime_df["Ticker"] = realtime_df["Ticker"].astype(str).str.upper()
    realtime_df["Date"] = realtime_df["Date"].astype(str)
    logs.append(f" - ์ˆ˜์‹  ์„ฑ๊ณต ํ‹ฐ์ปค ์ˆ˜: {realtime_df['Ticker'].nunique()}")
    # 4) "all" dataset: append-only (never overwrite an existing row for today)
    progress(0.65, desc="all ๋ฐ์ดํ„ฐ์…‹ ์—…๋ฐ์ดํŠธ ์ค‘...")
    logs.append("\n๐Ÿงฉ [3๋‹จ๊ณ„] all ๋ฐ์ดํ„ฐ์…‹ ์—…๋ฐ์ดํŠธ(์ถ”๊ฐ€๋งŒ)...")
    existing_today = set(all_df.loc[all_df["Date"] == today_str, "Ticker"].astype(str).str.upper().tolist())
    add_all_df = realtime_df[~realtime_df["Ticker"].isin(existing_today)].copy()
    if not add_all_df.empty:
        all_updated_df = pd.concat([all_df, add_all_df], ignore_index=True)
    else:
        all_updated_df = all_df
    logs.append(f" - all ์ถ”๊ฐ€ ๊ฑด์ˆ˜: {len(add_all_df)}")
    # 5) "30d" dataset: replace today's rows for the fetched tickers, then re-window
    progress(0.78, desc="30d ๋ฐ์ดํ„ฐ์…‹ ์—…๋ฐ์ดํŠธ ์ค‘...")
    logs.append("\n๐Ÿ—“๏ธ [4๋‹จ๊ณ„] 30d ๋ฐ์ดํ„ฐ์…‹ ์—…๋ฐ์ดํŠธ(์˜ค๋ž˜๋œ ๋ฐ์ดํ„ฐ ์ œ๊ฑฐ + ์ถ”๊ฐ€)...")
    # Remove existing (ticker, today) rows so the fresh quote replaces them
    update_tickers = set(realtime_df["Ticker"].tolist())
    recent_df_wo_today = recent_df[
        ~((recent_df["Date"] == today_str) & (recent_df["Ticker"].isin(update_tickers)))
    ].copy()
    recent_merged = pd.concat([recent_df_wo_today, realtime_df], ignore_index=True)
    recent_updated_df = filter_last_30_days(recent_merged)
    # 6) Upload both updated datasets back to the Hub
    progress(0.9, desc="ํ—ˆ๊น…ํŽ˜์ด์Šค ์—…๋กœ๋“œ ์ค‘...")
    logs.append("\n๐Ÿš€ [5๋‹จ๊ณ„] ํ—ˆ๊น…ํŽ˜์ด์Šค ์—…๋กœ๋“œ ์ค‘...")
    result_all = upload_dataset_to_hf(all_updated_df, all_dataset_name, hf_token)
    result_30d = upload_dataset_to_hf(recent_updated_df, recent_dataset_name, hf_token)
    logs.append(f" {result_all}")
    logs.append(f" {result_30d}")
    progress(1.0, desc="์™„๋ฃŒ!")
    logs.append("\n" + "=" * 60)
    logs.append("โœ… ์‹ค์‹œ๊ฐ„ ๋ฐ์ดํ„ฐ ์—…๋ฐ์ดํŠธ ์™„๋ฃŒ")
    logs.append(f"๐Ÿ“… ๋ฐ˜์˜ ๋‚ ์งœ(๋‰ด์š• ๊ธฐ์ค€): {today_str}")
    logs.append("=" * 60)
    return "\n".join(logs)
def get_all_us_tickers():
    """
    Fetch NASDAQ + NYSE ticker lists via the Yahoo Finance screener (yf.screen).

    - NASDAQ spans 3 markets: NMS (Global Select), NGM (Global Market), NCM (Capital Market).
    - NYSE: NYQ.
    - Built into yfinance, so no extra data source is needed; works on HF Spaces too.

    Returns:
        (nasdaq_tickers, nyse_tickers, all_tickers) — each sorted and de-duplicated;
        ([], [], []) if the screener fails for any reason.
    """
    def _fetch_exchange_tickers(exchange_code):
        # Page through the Yahoo screener for every ticker on one exchange.
        query = yf.EquityQuery("eq", ["exchange", exchange_code])
        symbols = []
        offset = 0
        while True:
            result = yf.screen(query, size=250, offset=offset)
            quotes = result.get("quotes", [])
            if not quotes:
                break
            for quote in quotes:
                sym = quote.get("symbol", "")
                if sym:
                    # [Filters]
                    # 1. Drop tickers containing '-', '.', '$' (preferred shares, units, etc.)
                    # 2. Drop 5-letter tickers ending in W (Warrant), R (Right), U (Unit)
                    is_special = any(c in sym for c in ["-", ".", "$"])
                    is_derivative = len(sym) == 5 and sym[-1] in ["W", "R", "U"]
                    if not (is_special or is_derivative):
                        symbols.append(sym)
            # Advance by the page we actually received; stop once past the reported total
            offset += len(quotes)
            total = result.get("total", 0)
            if offset >= total:
                break
        return sorted(list(set(symbols)))
    try:
        # NASDAQ: union of its 3 market tiers
        # NMS = NASDAQ Global Select Market
        # NGM = NASDAQ Global Market
        # NCM = NASDAQ Capital Market
        nasdaq_tickers = []
        for market_code in ["NMS", "NGM", "NCM"]:
            tickers = _fetch_exchange_tickers(market_code)
            logger.info(f" ๋‚˜์Šค๋‹ฅ {market_code}: {len(tickers)}๊ฐœ ๋กœ๋“œ")
            nasdaq_tickers.extend(tickers)
        nasdaq_tickers = sorted(list(set(nasdaq_tickers)))
        # NYSE: exchange code NYQ
        nyse_tickers = _fetch_exchange_tickers("NYQ")
        logger.info(f" ๋‰ด์š• NYQ: {len(nyse_tickers)}๊ฐœ ๋กœ๋“œ")
        all_tickers = sorted(list(set(nasdaq_tickers + nyse_tickers)))
        logger.info(f"๋‚˜์Šค๋‹ฅ: {len(nasdaq_tickers)}๊ฐœ, ๋‰ด์š•: {len(nyse_tickers)}๊ฐœ, ์ „์ฒด: {len(all_tickers)}๊ฐœ ๋กœ๋“œ ์™„๋ฃŒ")
        return nasdaq_tickers, nyse_tickers, all_tickers
    except Exception as e:
        # Any screener failure degrades to empty lists; callers handle the fallback.
        logger.error(f"์•ผํ›„ ์Šคํฌ๋ฆฌ๋„ˆ ํ‹ฐ์ปค ๋กœ๋“œ ์‹คํŒจ: {e}")
        return [], [], []
def fetch_ticker_data(ticker, period="max", max_retries=3):
    """
    Fetch one ticker's daily OHLCV history from Yahoo Finance.

    Args:
        ticker: Symbol to query (e.g. "AAPL").
        period: yfinance `history` period (e.g. max, 10y, 5y, 1y, 6mo, 3mo).
            If Yahoo rejects the period, a supported one is parsed from the
            error message and the call is retried with it.
        max_retries: Number of attempts before giving up.

    Returns:
        DataFrame with columns Ticker/Date/Open/High/Low/Close/Volume (Date as
        "YYYY-MM-DD" strings), or None on failure / empty result. Today's bar
        is dropped while the US session is still open, since the close is not
        final yet.
    """
    def _parse_valid_periods(error_message):
        # Extract the list of supported periods embedded in a yfinance error message.
        marker = "must be one of:"
        if marker not in error_message:
            return []
        raw = error_message.split(marker, 1)[1]
        return [p.strip().strip("'").strip('"') for p in raw.split(",") if p.strip()]
    def _choose_fallback_period(requested_period, valid_periods):
        # Prefer the longest supported history; fall back to whatever is valid.
        if not valid_periods:
            return None
        preferred_order = ["max", "10y", "5y", "2y", "1y", "6mo", "3mo", "1mo", "5d", "1d"]
        for candidate in preferred_order:
            if candidate in valid_periods:
                return candidate
        if requested_period in valid_periods:
            return requested_period
        return valid_periods[0]
    effective_period = period
    for attempt in range(max_retries):
        try:
            stock = yf.Ticker(ticker)
            # Daily bars for the requested period
            hist = stock.history(period=effective_period, interval="1d")
            if hist.empty:
                logger.warning(f"[{ticker}] ๋ฐ์ดํ„ฐ ์—†์Œ (๋นˆ ๊ฒฐ๊ณผ)")
                return None
            # Move the date index into a regular column
            hist = hist.reset_index()
            # Tag each row with its ticker for later per-ticker grouping
            hist["Ticker"] = ticker
            # Normalize the date column to "YYYY-MM-DD" strings (dataset compatibility)
            if "Date" in hist.columns:
                hist["Date"] = hist["Date"].dt.strftime("%Y-%m-%d")
            elif "Datetime" in hist.columns:
                hist.rename(columns={"Datetime": "Date"}, inplace=True)
                hist["Date"] = pd.to_datetime(hist["Date"]).dt.strftime("%Y-%m-%d")
            # Keep only the canonical columns (those present)
            columns_to_keep = ["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"]
            available_cols = [c for c in columns_to_keep if c in hist.columns]
            hist = hist[available_cols]
            # Round price columns to 4 decimal places
            numeric_cols = ["Open", "High", "Low", "Close"]
            for col in numeric_cols:
                if col in hist.columns:
                    hist[col] = hist[col].round(4)
            # --- Drop today's bar while its close is still provisional ---
            # zoneinfo is stdlib, so no extra install; it handles EDT/EST automatically.
            ny_tz = ZoneInfo("America/New_York")
            now_ny = datetime.now(ny_tz)
            today_ny = now_ny.strftime("%Y-%m-%d")
            # Regular session closes at 16:00 New York time regardless of DST;
            # treat the close as final only after 16:30 for a settling margin.
            market_closed = now_ny.hour >= 17 or (now_ny.hour == 16 and now_ny.minute >= 30)
            if not hist.empty and hist.iloc[-1]["Date"] == today_ny:
                if not market_closed:
                    # Session still open (or just closed): drop today's unconfirmed bar
                    logger.info(f"[{ticker}] ์žฅ์ค‘ ๋ฐ์ดํ„ฐ({today_ny}) ์ œ์™ธ (ํ˜„์žฌ ๋‰ด์š•์‹œ๊ฐ„: {now_ny.strftime('%H:%M')})")
                    hist = hist.iloc[:-1]
                else:
                    # Session closed: today's close is final, keep it
                    logger.info(f"[{ticker}] ์žฅ ๋งˆ๊ฐ ํ›„ ๋ฐ์ดํ„ฐ({today_ny}) ํฌํ•จ")
            # -----------------------------------------------
            return hist
        except Exception as e:
            error_message = str(e)
            if "must be one of:" in error_message:
                # Yahoo told us which periods are valid -> switch and retry immediately
                valid_periods = _parse_valid_periods(error_message)
                fallback_period = _choose_fallback_period(effective_period, valid_periods)
                if fallback_period and fallback_period != effective_period:
                    logger.info(
                        f"[{ticker}] period '{effective_period}' ๋ฏธ์ง€์›, '{fallback_period}'๋กœ ์ž๋™ ์ „ํ™˜ ํ›„ ์žฌ์‹œ๋„"
                    )
                    effective_period = fallback_period
                    continue
            logger.warning(f"[{ticker}] ์กฐํšŒ ์‹คํŒจ (์‹œ๋„ {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                time.sleep(1)  # back off before retrying
                continue
            return None
def filter_last_30_days(df):
    """
    Keep only each ticker's most recent 30 days of rows.

    The cutoff is computed per ticker from that ticker's own latest date.
    Rows whose Date cannot be parsed are dropped (with a warning). Returns a
    new DataFrame with the original columns and a reset index; an empty input
    is returned unchanged.
    """
    if df.empty:
        return df
    work = df.copy()
    work["_date_parsed"] = pd.to_datetime(work["Date"], errors="coerce")
    bad_rows = int(work["_date_parsed"].isna().sum())
    if bad_rows > 0:
        logger.warning(f"Date ํŒŒ์‹ฑ ์‹คํŒจ ํ–‰ {bad_rows}๊ฐœ๋Š” 30์ผ ํ•„ํ„ฐ์—์„œ ์ œ์™ธ๋ฉ๋‹ˆ๋‹ค.")
        work = work[work["_date_parsed"].notna()].copy()
        if work.empty:
            return pd.DataFrame(columns=df.columns)
    # Per-ticker cutoff: latest date for that ticker minus 30 days.
    per_ticker_latest = work.groupby("Ticker")["_date_parsed"].transform("max")
    window_start = per_ticker_latest - pd.Timedelta(days=30)
    kept = work[work["_date_parsed"] >= window_start].copy()
    if kept.empty:
        return pd.DataFrame(columns=df.columns)
    return kept.reset_index(drop=True).drop(columns=["_date_parsed"])
def upload_dataset_to_hf(df, repo_name, hf_token, max_retries=3, retry_wait_sec=2):
    """
    Push a DataFrame to the HF Hub as a public dataset, with retries.

    Returns a diagnostic dict: ok/repo/rows/attempts/elapsed_sec/error/traceback.
    An empty or None DataFrame short-circuits to a failed result without any
    network call.
    """
    def _result(ok, attempts, elapsed, error="", tb=""):
        # Uniform diagnostic payload for every exit path.
        return {
            "ok": ok,
            "repo": repo_name,
            "rows": 0 if df is None else len(df),
            "attempts": attempts,
            "elapsed_sec": elapsed,
            "error": error,
            "traceback": tb,
        }
    if df is None or df.empty:
        return _result(False, 0, 0.0, error="์—…๋กœ๋“œํ•  ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
    started = time.time()
    last_error = ""
    last_tb = ""
    for attempt in range(1, max_retries + 1):
        try:
            Dataset.from_pandas(df, preserve_index=False).push_to_hub(
                repo_name,
                token=hf_token,
                private=False  # public dataset
            )
            return _result(True, attempt, time.time() - started)
        except Exception as exc:
            last_error = str(exc)
            last_tb = traceback.format_exc()
            logger.warning(f"[{repo_name}] ์—…๋กœ๋“œ ์‹คํŒจ (์‹œ๋„ {attempt}/{max_retries}): {last_error}")
            if attempt < max_retries:
                time.sleep(retry_wait_sec * attempt)  # linear backoff
    return _result(False, max_retries, time.time() - started, error=last_error, tb=last_tb)
def append_parquet_chunk_to_hf(df, repo_name, hf_token, subdir="data/chunks", max_retries=3, retry_wait_sec=2):
    """
    Append a DataFrame to an HF dataset repo as a standalone Parquet chunk file.

    The chunk is written under `subdir` with a timestamp + random suffix so
    successive uploads never collide. Returns a diagnostic dict with keys
    ok/repo/rows/attempts/elapsed_sec/error/traceback. An empty or None
    DataFrame short-circuits to a failed result without any network call.
    """
    if df is None or df.empty:
        return {
            "ok": False,
            "repo": repo_name,
            "rows": 0,
            "attempts": 0,
            "elapsed_sec": 0.0,
            "error": "์—…๋กœ๋“œํ•  ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.",
            "traceback": "",
        }
    hub = HfApi()
    started = time.time()
    last_error = ""
    last_tb = ""
    for attempt in range(1, max_retries + 1):
        tmp_path = None
        try:
            # Make sure the target dataset repo exists (no-op when it already does).
            hub.create_repo(
                repo_id=repo_name,
                repo_type="dataset",
                token=hf_token,
                private=False,
                exist_ok=True,
            )
            # Unique chunk name: timestamp + short random suffix.
            stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
            chunk_name = f"chunk-{stamp}-{uuid.uuid4().hex[:8]}.parquet"
            with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as handle:
                tmp_path = handle.name
                df.to_parquet(tmp_path, index=False)
            hub.upload_file(
                path_or_fileobj=tmp_path,
                path_in_repo=f"{subdir}/{chunk_name}",
                repo_id=repo_name,
                repo_type="dataset",
                token=hf_token,
            )
            return {
                "ok": True,
                "repo": repo_name,
                "rows": len(df),
                "attempts": attempt,
                "elapsed_sec": time.time() - started,
                "error": "",
                "traceback": "",
            }
        except Exception as exc:
            last_error = str(exc)
            last_tb = traceback.format_exc()
            logger.warning(f"[{repo_name}] ์ฒญํฌ ์—…๋กœ๋“œ ์‹คํŒจ (์‹œ๋„ {attempt}/{max_retries}): {last_error}")
            if attempt < max_retries:
                time.sleep(retry_wait_sec * attempt)  # linear backoff
        finally:
            # Always clean up the temp file, even on failure.
            if tmp_path and os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except Exception:
                    pass
    return {
        "ok": False,
        "repo": repo_name,
        "rows": len(df),
        "attempts": max_retries,
        "elapsed_sec": time.time() - started,
        "error": last_error,
        "traceback": last_tb,
    }
def run_pipeline(
    hf_token,
    all_dataset_name,
    recent_dataset_name,
    batch_size,
    period,
    checkpoint_batch_size,
    progress=gr.Progress()
):
    """
    Full collection pipeline.

    1. Fetch the NASDAQ & NYSE ticker lists (Yahoo screener, with an
       HF-dataset-based fallback when the screener fails).
    2. Collect per-ticker daily history from Yahoo Finance, resuming previous
       runs by skipping tickers already present in the 30d dataset.
    3. Upload the full-history ("all") data as Parquet chunks.
    4. Upload the per-ticker 30-day-window data as Parquet chunks.

    Buffers are flushed to the Hub every `checkpoint_batch_size` tickers to
    keep memory bounded; a failed checkpoint upload aborts the whole run.

    Returns:
        A multi-line log string for the Gradio output textbox (including the
        traceback if anything raised).
    """
    if not hf_token:
        return "โŒ ํ—ˆ๊น…ํŽ˜์ด์Šค ํ† ํฐ์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค. HF_TOKEN ํ™˜๊ฒฝ๋ณ€์ˆ˜ ๋˜๋Š” ์ž…๋ ฅ์ฐฝ์— ํ† ํฐ์„ ๋„ฃ์–ด์ฃผ์„ธ์š”."
    logs = []
    try:
        def _df_stats(df, label):
            # One-line size/memory summary used in checkpoint logs.
            if df is None or df.empty:
                return f"{label}: 0ํ–‰"
            mem_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
            return f"{label}: {len(df)}ํ–‰ x {len(df.columns)}์—ด, ๋ฉ”๋ชจ๋ฆฌ ์•ฝ {mem_mb:.2f}MB"
        def _append_upload_result(log_prefix, result):
            # Render an upload-result dict (from append_parquet_chunk_to_hf) into the log.
            status_icon = "โœ…" if result["ok"] else "โŒ"
            logs.append(
                f" - {log_prefix}: {status_icon} {result['repo']} | "
                f"rows={result['rows']} | attempts={result['attempts']} | elapsed={result['elapsed_sec']:.1f}s"
            )
            if not result["ok"]:
                logs.append(f" ์˜ค๋ฅ˜: {result['error']}")
                if result["traceback"]:
                    logs.append(" Traceback:")
                    logs.append(result["traceback"])
        logs.append("=" * 60)
        logs.append("๐Ÿ“Š ์ฃผ์‹ ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘ ํŒŒ์ดํ”„๋ผ์ธ ์‹œ์ž‘")
        logs.append(f"โฐ ์‹œ์ž‘ ์‹œ๊ฐ„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        logs.append("=" * 60)
        logs.append("โ„น๏ธ ๋ฉ”๋ชจ๋ฆฌ ์ ˆ์•ฝ ๋ชจ๋“œ: 100๊ฐœ ๋‹จ์œ„ ๋“ฑ ์ฒญํฌ ์—…๋กœ๋“œ ํ›„ ๋ฒ„ํผ๋ฅผ ์ฆ‰์‹œ ๋น„์›๋‹ˆ๋‹ค.")
        # ========== Stage 1: collect the ticker lists ==========
        progress(0, desc="๋‚˜์Šค๋‹ฅ & ๋‰ด์š• ํ‹ฐ์ปค ๋ชฉ๋ก ์ˆ˜์ง‘ ์ค‘...")
        logs.append("\n๐Ÿ” [1๋‹จ๊ณ„] ๋‚˜์Šค๋‹ฅ & ๋‰ด์š•์ฆ๊ถŒ๊ฑฐ๋ž˜์†Œ ํ‹ฐ์ปค ๋ชฉ๋ก ์ˆ˜์ง‘ ์ค‘...")
        nasdaq_tickers, nyse_tickers, all_tickers = get_all_us_tickers()
        logs.append(f" - ๋‚˜์Šค๋‹ฅ: {len(nasdaq_tickers)}๊ฐœ")
        logs.append(f" - ๋‰ด์š•์ฆ๊ถŒ๊ฑฐ๋ž˜์†Œ: {len(nyse_tickers)}๊ฐœ")
        logs.append(f" - ์ „์ฒด: {len(all_tickers)}๊ฐœ")
        if not all_tickers:
            # Screener failed entirely -> fall back to tickers already stored on the Hub.
            logs.append("\nโš ๏ธ Yahoo Screener์—์„œ ํ‹ฐ์ปค๋ฅผ ๊ฐ€์ ธ์˜ค์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. HF ๋ฐ์ดํ„ฐ์…‹ ๊ธฐ๋ฐ˜ ํด๋ฐฑ์„ ์‹œ๋„ํ•ฉ๋‹ˆ๋‹ค.")
            fallback_tickers = []
            fallback_errors = []
            try:
                # Prefer the 30d dataset since it reflects the most recent run.
                recent_df = load_hf_dataset_as_df(recent_dataset_name, hf_token)
                fallback_tickers = sorted(
                    recent_df["Ticker"].dropna().astype(str).str.upper().unique().tolist()
                )
                if fallback_tickers:
                    logs.append(f" - ํด๋ฐฑ ์†Œ์Šค: recent ๋ฐ์ดํ„ฐ์…‹ ({recent_dataset_name})")
            except Exception as e:
                fallback_errors.append(f"recent ๋กœ๋“œ ์‹คํŒจ: {e}")
            if not fallback_tickers:
                try:
                    # Second choice: the full-history dataset.
                    all_existing_df = load_hf_dataset_as_df(all_dataset_name, hf_token)
                    fallback_tickers = sorted(
                        all_existing_df["Ticker"].dropna().astype(str).str.upper().unique().tolist()
                    )
                    if fallback_tickers:
                        logs.append(f" - ํด๋ฐฑ ์†Œ์Šค: all ๋ฐ์ดํ„ฐ์…‹ ({all_dataset_name})")
                except Exception as e:
                    fallback_errors.append(f"all ๋กœ๋“œ ์‹คํŒจ: {e}")
            if fallback_tickers:
                all_tickers = fallback_tickers
                logs.append(f" - ํด๋ฐฑ ํ‹ฐ์ปค ์ˆ˜: {len(all_tickers)}๊ฐœ")
            else:
                if fallback_errors:
                    logs.append(" - ํด๋ฐฑ ์‹คํŒจ ์ƒ์„ธ:")
                    for err in fallback_errors:
                        logs.append(f" * {err}")
                logs.append("\n๊ฐ€๋Šฅํ•œ ์›์ธ:")
                logs.append(" 1) Yahoo Screener ์ผ์‹œ ์žฅ์• /์ฐจ๋‹จ")
                logs.append(" 2) ๋„คํŠธ์›Œํฌ/์ง€์—ญ ์ œํ•œ")
                logs.append(" 3) yfinance API ๋ณ€๊ฒฝ")
                return "\n".join(logs) + "\n\nโŒ ํ‹ฐ์ปค ๋ชฉ๋ก์„ ๊ฐ€์ ธ์˜ฌ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        # ========== Stage 2: load existing dataset + compute the resume set ==========
        progress(0.08, desc="๊ธฐ์กด ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์ค‘...")
        logs.append("\n๐Ÿ“‚ [2๋‹จ๊ณ„] ๊ธฐ์กด ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ๋ฐ ์žฌ๊ฐœ ๋Œ€์ƒ ๊ณ„์‚ฐ...")
        recent_for_resume = pd.DataFrame(columns=["Ticker"])
        try:
            recent_for_resume = load_hf_dataset_as_df(recent_dataset_name, hf_token)
            logs.append(f" - ๊ธฐ์กด 30d ๋ฐ์ดํ„ฐ: {len(recent_for_resume)}ํ–‰")
        except Exception as e:
            logs.append(f" - ๊ธฐ์กด 30d ๋ฐ์ดํ„ฐ ๋กœ๋“œ ์‹คํŒจ(์‹ ๊ทœ ์ˆ˜์ง‘ ๊ธฐ์ค€์œผ๋กœ ์ง„ํ–‰): {e}")
        existing_tickers = set(recent_for_resume["Ticker"].dropna().astype(str).str.upper().tolist())
        if not existing_tickers:
            logs.append(" - ๊ธฐ์กด ํ‹ฐ์ปค ์ •๋ณด๊ฐ€ ๋น„์–ด ์žˆ์–ด ์ „์ฒด ๋Œ€์ƒ ๊ธฐ์ค€์œผ๋กœ ์ง„ํ–‰ํ•ฉ๋‹ˆ๋‹ค.")
        # Resume support: only collect tickers not already present in the 30d dataset.
        pending_tickers = [ticker for ticker in all_tickers if ticker not in existing_tickers]
        logs.append(f" - ๊ธฐ์กด ์ˆ˜์ง‘ ํ‹ฐ์ปค: {len(existing_tickers)}๊ฐœ")
        logs.append(f" - ์ด๋ฒˆ ์‹คํ–‰ ๋Œ€์ƒ ํ‹ฐ์ปค: {len(pending_tickers)}๊ฐœ")
        if not pending_tickers:
            progress(1.0, desc="์™„๋ฃŒ!")
            logs.append("\nโœ… ์ด๋ฏธ ์ˆ˜์ง‘๋œ ํ‹ฐ์ปค์ž…๋‹ˆ๋‹ค. ์ถ”๊ฐ€ ์ˆ˜์ง‘ํ•  ๋Œ€์ƒ์ด ์—†์Šต๋‹ˆ๋‹ค.")
            return "\n".join(logs)
        # ========== Stage 3: collect Yahoo Finance data ==========
        logs.append(f"\n๐Ÿ“ฅ [3๋‹จ๊ณ„] ์•ผํ›„ ํŒŒ์ด๋‚ธ์Šค ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘ ์‹œ์ž‘ (์ด {len(pending_tickers)}๊ฐœ ํ‹ฐ์ปค)")
        logs.append(f" - ๋ฐฐ์น˜ ํฌ๊ธฐ: {batch_size}")
        logs.append(f" - ์กฐํšŒ ๊ธฐ๊ฐ„(period): {period}")
        logs.append(f" - ์ฒดํฌํฌ์ธํŠธ ์—…๋กœ๋“œ ๊ฐ„๊ฒฉ: {checkpoint_batch_size}๊ฐœ ํ‹ฐ์ปค")
        logs.append(f" โš ๏ธ ๋ฐ˜๋ณต๋ฌธ์ด๋ผ ์˜ค๋ž˜ ๊ฑธ๋ฆฝ๋‹ˆ๋‹ค. ์ „์ฒด ํ‹ฐ์ปค ์ˆ˜์— ๋”ฐ๋ผ ์ˆ˜ ์‹œ๊ฐ„ ์†Œ์š”๋  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
        # In-memory buffers, flushed (and cleared) at every checkpoint upload.
        all_data_frames = []
        recent_30d_frames = []
        success_count = 0
        fail_count = 0
        last_checkpoint_success_index = 0
        total = len(pending_tickers)
        def _upload_checkpoint(end_index):
            # Flush the buffered frames to the Hub as Parquet chunks; abort the run on failure.
            nonlocal last_checkpoint_success_index
            if success_count <= last_checkpoint_success_index:
                return  # nothing new since the last flush
            logs.append(
                f"\n๐Ÿ’พ [์ฒดํฌํฌ์ธํŠธ] {end_index}/{total} ์ฒ˜๋ฆฌ ์‹œ์  ์ค‘๊ฐ„ ์—…๋กœ๋“œ ์‹œ์ž‘ "
                f"(๋ˆ„์  ์„ฑ๊ณต {success_count}๊ฐœ)"
            )
            if not all_data_frames:
                return
            checkpoint_all_df = pd.concat(all_data_frames, ignore_index=True)
            checkpoint_recent_df = pd.concat(recent_30d_frames, ignore_index=True)
            logs.append(f" - {_df_stats(checkpoint_all_df, 'all ์ฒญํฌ')}")
            logs.append(f" - {_df_stats(checkpoint_recent_df, '30d ์ฒญํฌ')}")
            result_all_ckpt = append_parquet_chunk_to_hf(
                checkpoint_all_df,
                all_dataset_name,
                hf_token,
                subdir="data/chunks/all"
            )
            result_30d_ckpt = append_parquet_chunk_to_hf(
                checkpoint_recent_df,
                recent_dataset_name,
                hf_token,
                subdir="data/chunks/30d"
            )
            _append_upload_result("all ์ฒดํฌํฌ์ธํŠธ", result_all_ckpt)
            _append_upload_result("30d ์ฒดํฌํฌ์ธํŠธ", result_30d_ckpt)
            if not result_all_ckpt["ok"] or not result_30d_ckpt["ok"]:
                raise RuntimeError("์ฒดํฌํฌ์ธํŠธ ์—…๋กœ๋“œ ์‹คํŒจ๋กœ ํŒŒ์ดํ”„๋ผ์ธ์„ ์ค‘๋‹จํ•ฉ๋‹ˆ๋‹ค.")
            # Free the buffers immediately to keep peak memory bounded.
            all_data_frames.clear()
            recent_30d_frames.clear()
            gc.collect()
            last_checkpoint_success_index = success_count
        for i, ticker in enumerate(pending_tickers):
            # Progress-bar update
            progress_pct = 0.1 + ((i + 1) / total) * 0.75
            progress(progress_pct, desc=f"์ˆ˜์ง‘ ์ค‘: {ticker} ({i + 1}/{total})")
            ticker_df = fetch_ticker_data(ticker, period=period)
            if ticker_df is not None and not ticker_df.empty:
                all_data_frames.append(ticker_df)
                recent_30d_frames.append(filter_last_30_days(ticker_df))
                success_count += 1
            else:
                fail_count += 1
            # Emit a progress line every `batch_size` tickers (and at the end)
            if (i + 1) % batch_size == 0 or (i + 1) == total:
                logs.append(f" ์ง„ํ–‰: {i + 1}/{total} (์„ฑ๊ณต: {success_count}, ์‹คํŒจ: {fail_count})")
            if checkpoint_batch_size > 0 and ((i + 1) % checkpoint_batch_size == 0):
                checkpoint_progress = min(0.89, max(progress_pct, 0.82))
                progress(checkpoint_progress, desc=f"์ค‘๊ฐ„ ์—…๋กœ๋“œ ์ค‘... ({i + 1}/{total})")
                _upload_checkpoint(i + 1)
            # Short pause between API calls (avoid Yahoo rate limiting)
            if (i + 1) % 10 == 0:
                time.sleep(0.5)
        logs.append(f"\n๐Ÿ“Š ์ˆ˜์ง‘ ์™„๋ฃŒ: ์„ฑ๊ณต {success_count}๊ฐœ / ์‹คํŒจ {fail_count}๊ฐœ")
        if success_count == 0:
            return "\n".join(logs) + "\n\nโŒ ์ˆ˜์ง‘๋œ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."
        # ========== Stage 4: flush any data not yet checkpointed ==========
        progress(0.9, desc="๋งˆ์ง€๋ง‰ ์ฒดํฌํฌ์ธํŠธ ๋ฐ˜์˜ ์ค‘...")
        logs.append("\n๐Ÿ”ง [4๋‹จ๊ณ„] ๋งˆ์ง€๋ง‰ ๋ฏธ๋ฐ˜์˜ ๋ฐ์ดํ„ฐ ๋ฐ˜์˜ ์ค‘...")
        if success_count > last_checkpoint_success_index:
            logs.append("\n๐Ÿ’พ [์ตœ์ข…๋ฐ˜์˜] ์ค‘๊ฐ„ ์—…๋กœ๋“œ ์—†์ด ๋ˆ„์ ๋œ ๋ฐ์ดํ„ฐ ๋ฐ˜์˜")
            _upload_checkpoint(total)
        progress(0.97, desc="์ฒญํฌ ์—…๋กœ๋“œ ์ƒํƒœ ๋งˆ๋ฌด๋ฆฌ ์ค‘...")
        logs.append("\n๐Ÿš€ [5๋‹จ๊ณ„] ์ฒญํฌ ์—…๋กœ๋“œ ๋ชจ๋“œ ์™„๋ฃŒ")
        logs.append(" - all/30d ๋ชจ๋‘ ์ฒญํฌ ํŒŒ์ผ ๊ธฐ์ค€์œผ๋กœ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
        logs.append(" - ๋‹ค์Œ ์‹คํ–‰ ์‹œ 30d ํ‹ฐ์ปค ๋ชฉ๋ก ๊ธฐ์ค€์œผ๋กœ ์ž๋™ ์Šคํ‚ต/์žฌ๊ฐœ๋ฉ๋‹ˆ๋‹ค.")
        # ========== Done ==========
        progress(1.0, desc="์™„๋ฃŒ!")
        logs.append("\n" + "=" * 60)
        logs.append(f"โœ… ํŒŒ์ดํ”„๋ผ์ธ ์™„๋ฃŒ!")
        logs.append(f"โฐ ์ข…๋ฃŒ ์‹œ๊ฐ„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        logs.append("=" * 60)
        return "\n".join(logs)
    except Exception as e:
        # Surface the full traceback in the UI log instead of failing silently.
        logger.exception("run_pipeline ์‹คํ–‰ ์ค‘ ์˜ˆ์™ธ ๋ฐœ์ƒ")
        logs.append("\n" + "=" * 60)
        logs.append("โŒ ํŒŒ์ดํ”„๋ผ์ธ ์‹คํ–‰ ์ค‘ ์˜ˆ์™ธ๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค.")
        logs.append(f"์˜ค๋ฅ˜ ๋ฉ”์‹œ์ง€: {e}")
        logs.append("\n[Traceback]")
        logs.append(traceback.format_exc())
        logs.append("=" * 60)
        return "\n".join(logs)
def preview_tickers():
    """Preview the exchange ticker lists before kicking off a full collection run."""
    nasdaq_list, nyse_list, merged = get_all_us_tickers()
    if not merged:
        return """โŒ ํ‹ฐ์ปค ๋ชฉ๋ก์„ ๊ฐ€์ ธ์˜ค์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.
๊ฐ€๋Šฅํ•œ ์›์ธ:
1) Yahoo Screener ์ผ์‹œ ์žฅ์• /์ฐจ๋‹จ
2) ๋„คํŠธ์›Œํฌ/์ง€์—ญ ์ œํ•œ
3) yfinance API ๋ณ€๊ฒฝ
์ž ์‹œ ํ›„ ๋‹ค์‹œ ์‹œ๋„ํ•˜๊ฑฐ๋‚˜, ํŒŒ์ดํ”„๋ผ์ธ ์‹คํ–‰ ์‹œ HF ๋ฐ์ดํ„ฐ์…‹ ํด๋ฐฑ์ด ๋™์ž‘ํ•˜๋Š”์ง€ ํ™•์ธํ•ด ์ฃผ์„ธ์š”.
"""
    return f"""๐Ÿ“Š ํ‹ฐ์ปค ๋ชฉ๋ก ๋ฏธ๋ฆฌ๋ณด๊ธฐ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
๋‚˜์Šค๋‹ฅ: {len(nasdaq_list)}๊ฐœ
๋‰ด์š•์ฆ๊ถŒ๊ฑฐ๋ž˜์†Œ: {len(nyse_list)}๊ฐœ
์ „์ฒด: {len(merged)}๊ฐœ
๋‚˜์Šค๋‹ฅ ์•ž 20๊ฐœ: {', '.join(nasdaq_list[:20])}...
๋‰ด์š• ์•ž 20๊ฐœ: {', '.join(nyse_list[:20])}...
"""
# ========== Gradio UI layout ==========
with gr.Blocks(
    title="์ฃผ์‹ ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ๊ธฐ"
) as demo:
    # Header: what the app does and the pipeline overview
    gr.Markdown("""
# ๐Ÿ“ˆ ๋‚˜์Šค๋‹ฅ & ๋‰ด์š•์ฆ๊ถŒ๊ฑฐ๋ž˜์†Œ ์ฃผ์‹ ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ๊ธฐ
**์•ผํ›„ ํŒŒ์ด๋‚ธ์Šค์—์„œ ์ „์ฒด ํ‹ฐ์ปค์˜ ์ผ๋ณ„ ๋ฐ์ดํ„ฐ๋ฅผ ์ˆ˜์ง‘ํ•˜์—ฌ ํ—ˆ๊น…ํŽ˜์ด์Šค ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ์ž๋™ ์—…๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.**
### ํŒŒ์ดํ”„๋ผ์ธ ํ๋ฆ„
1. ๐Ÿ” ๋‚˜์Šค๋‹ฅ & ๋‰ด์š•์ฆ๊ถŒ๊ฑฐ๋ž˜์†Œ ์ „์ฒด ํ‹ฐ์ปค ๋ชฉ๋ก ์ˆ˜์ง‘
2. ๐Ÿ“ฅ ํ‹ฐ์ปค๋ณ„ ์•ผํ›„ ํŒŒ์ด๋‚ธ์Šค ์ผ๋ณ„ ๋ฐ์ดํ„ฐ ์กฐํšŒ (`period` ์„ค์ • ๊ฐ€๋Šฅ)
3. ๐Ÿ“ฆ **all ๋ฐ์ดํ„ฐ์…‹** ์ƒ์„ฑ (์ „์ฒด๊ธฐ๊ฐ„ ๋ฐ์ดํ„ฐ)
4. ๐Ÿ—“๏ธ ํ‹ฐ์ปค๋ณ„ ์ตœ๊ทผ 30์ผ ํ•„ํ„ฐ๋ง โ†’ **30์ผ ๋ฐ์ดํ„ฐ์…‹** ์ƒ์„ฑ
5. ๐Ÿš€ ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์— ์—…๋กœ๋“œ
> โš ๏ธ ์ „์ฒด ํ‹ฐ์ปค๋ฅผ ๋ฐ˜๋ณต ์กฐํšŒํ•˜๋ฏ€๋กœ **์ˆ˜ ์‹œ๊ฐ„์ด ์†Œ์š”**๋  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
""")
    with gr.Row():
        # Left column: run configuration inputs
        with gr.Column(scale=1):
            gr.Markdown("### โš™๏ธ ์„ค์ •")
            hf_token_input = gr.Textbox(
                label="ํ—ˆ๊น…ํŽ˜์ด์Šค ํ† ํฐ (HF_TOKEN)",
                value=HF_TOKEN,
                type="password",
                placeholder="hf_xxxxx...",
                info="Spaces ์‹œํฌ๋ฆฟ์— HF_TOKEN์ด ์„ค์ •๋˜์–ด ์žˆ์œผ๋ฉด ์ž๋™์œผ๋กœ ๋ถˆ๋Ÿฌ์˜ต๋‹ˆ๋‹ค."
            )
            all_dataset_input = gr.Textbox(
                label="์ „์ฒด๊ธฐ๊ฐ„ ๋ฐ์ดํ„ฐ์…‹ ์ด๋ฆ„ (all)",
                value="younginpiniti/us-stocks-daily-all",
                placeholder="username/dataset-name",
                info="์ „์ฒด๊ธฐ๊ฐ„ ์ผ๋ณ„ ๋ฐ์ดํ„ฐ๊ฐ€ ์ €์žฅ๋  ํ—ˆ๊น…ํŽ˜์ด์Šค ๋ฐ์ดํ„ฐ์…‹"
            )
            recent_dataset_input = gr.Textbox(
                label="์ตœ๊ทผ 30์ผ ๋ฐ์ดํ„ฐ์…‹ ์ด๋ฆ„ (30d)",
                value="younginpiniti/us-stocks-daily-30d",
                placeholder="username/dataset-name",
                info="์ตœ๊ทผ 30์ผ ์ผ๋ณ„ ๋ฐ์ดํ„ฐ๊ฐ€ ์ €์žฅ๋  ํ—ˆ๊น…ํŽ˜์ด์Šค ๋ฐ์ดํ„ฐ์…‹"
            )
            batch_size_input = gr.Slider(
                label="๋กœ๊ทธ ์ถœ๋ ฅ ๋ฐฐ์น˜ ํฌ๊ธฐ",
                minimum=10,
                maximum=500,
                value=100,
                step=10,
                info="๋ช‡ ๊ฐœ ํ‹ฐ์ปค๋งˆ๋‹ค ๋กœ๊ทธ๋ฅผ ์ถœ๋ ฅํ• ์ง€ ์„ค์ •"
            )
            period_input = gr.Dropdown(
                label="์กฐํšŒ ๊ธฐ๊ฐ„ (Yahoo period)",
                choices=["max", "10y", "5y", "2y", "1y", "6mo", "3mo", "1mo"],
                value="max",
                info="์ „์ฒด ์ˆ˜์ง‘ ์‹œ๊ฐ„์ด ๊ธธ๋ฉด 10y/5y ๋“ฑ์œผ๋กœ ์ค„์—ฌ ์‹คํ–‰ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค."
            )
            checkpoint_batch_input = gr.Dropdown(
                label="์ค‘๊ฐ„ ์—…๋กœ๋“œ ๊ฐ„๊ฒฉ (ํ‹ฐ์ปค ์ˆ˜)",
                choices=[0, 50, 100, 200, 500],
                value=100,
                info="0์ด๋ฉด ์ค‘๊ฐ„ ์—…๋กœ๋“œ ์—†์ด ๋งˆ์ง€๋ง‰์—๋งŒ ์—…๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค."
            )
            with gr.Row():
                preview_btn = gr.Button("๐Ÿ‘€ ํ‹ฐ์ปค ๋ชฉ๋ก ๋ฏธ๋ฆฌ๋ณด๊ธฐ", variant="secondary")
                start_btn = gr.Button("๐Ÿš€ ํŒŒ์ดํ”„๋ผ์ธ ์‹œ์ž‘", variant="primary")
                realtime_btn = gr.Button("โšก ์‹ค์‹œ๊ฐ„ ์‹œ์ž‘", variant="secondary")
        # Right column: run log output
        with gr.Column(scale=2):
            gr.Markdown("### ๐Ÿ“‹ ์‹คํ–‰ ๋กœ๊ทธ")
            output_log = gr.Textbox(
                label="๋กœ๊ทธ ์ถœ๋ ฅ",
                lines=30,
                max_lines=50,
                interactive=False
            )
    # Wire buttons to their handlers
    preview_btn.click(
        fn=preview_tickers,
        inputs=[],
        outputs=[output_log]
    )
    start_btn.click(
        fn=run_pipeline,
        inputs=[
            hf_token_input,
            all_dataset_input,
            recent_dataset_input,
            batch_size_input,
            period_input,
            checkpoint_batch_input
        ],
        outputs=[output_log]
    )
    realtime_btn.click(
        fn=run_realtime_update,
        inputs=[
            hf_token_input,
            all_dataset_input,
            recent_dataset_input
        ],
        outputs=[output_log]
    )
    # Footer: dataset schema reference for consumers
    gr.Markdown("""
---
### ๐Ÿ“Œ ๋ฐ์ดํ„ฐ์…‹ ๊ตฌ์กฐ
| ์ปฌ๋Ÿผ | ์„ค๋ช… | ์˜ˆ์‹œ |
|------|------|------|
| `Ticker` | ์ข…๋ชฉ ํ‹ฐ์ปค ์‹ฌ๋ณผ | AAPL, MSFT, TSLA |
| `Date` | ๊ฑฐ๋ž˜์ผ (YYYY-MM-DD) | 2024-01-15 |
| `Open` | ์‹œ๊ฐ€ | 185.3200 |
| `High` | ๊ณ ๊ฐ€ | 187.0400 |
| `Low` | ์ €๊ฐ€ | 184.2100 |
| `Close` | ์ข…๊ฐ€ | 186.0000 |
| `Volume` | ๊ฑฐ๋ž˜๋Ÿ‰ | 45123456 |
> ๐Ÿ’ก **ํŒ**: ํ‹ฐ์ปค๋ณ„๋กœ ์ „์ฒ˜๋ฆฌํ•  ๋•Œ๋Š” `Ticker` ์ปฌ๋Ÿผ์œผ๋กœ ๊ทธ๋ฃนํ•‘ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.
> ```python
> from datasets import load_dataset
> ds = load_dataset("younginpiniti/us-stocks-daily-all")
> df = ds["train"].to_pandas()
> aapl = df[df["Ticker"] == "AAPL"]
> ```
""")
if __name__ == "__main__":
    # Bug fix: `Blocks.launch()` has no `theme` parameter, so passing
    # `theme=gr.themes.Soft()` here raises TypeError at startup. Themes are a
    # `gr.Blocks(theme=...)` constructor argument (set where `demo` is built),
    # so launch without it.
    demo.launch()