"""
Collects NASDAQ & NYSE stock data and builds Hugging Face datasets.
- Collection: daily data for every NASDAQ/NYSE ticker via Yahoo Finance (full history)
- Dataset creation: automatically builds the "all" dataset plus a recent 30-day dataset
"""
import gradio as gr
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi
import os
import time
import logging
import json
import traceback
import gc
import tempfile
import uuid
from urllib.request import Request, urlopen
# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("yfinance").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

# Hugging Face token (read from Spaces secrets)
HF_TOKEN = os.environ.get("HF_TOKEN", "")
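# Note (illustrative, not part of the pipeline): in a Space this is typically set
# under Settings -> Variables and secrets; for a local run, e.g.
# `export HF_TOKEN=hf_...` before launching the app.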


def get_ny_today_str():
    """Return today's date (YYYY-MM-DD) in New York local time."""
    ny_tz = ZoneInfo("America/New_York")
    return datetime.now(ny_tz).strftime("%Y-%m-%d")


def is_us_market_open_now():
    """Return whether the US regular session (09:30-16:00 New York time) is open."""
    ny_tz = ZoneInfo("America/New_York")
    now_ny = datetime.now(ny_tz)
    # Regular sessions run Monday (0) through Friday (4) only
    if now_ny.weekday() >= 5:
        return False, now_ny
    minutes = now_ny.hour * 60 + now_ny.minute
    market_open = 9 * 60 + 30
    market_close = 16 * 60
    return market_open <= minutes < market_close, now_ny
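
# Illustrative boundary behavior: at 09:29 ET this returns (False, now), at 09:30
# (True, now), and at exactly 16:00 (False, now), since the close bound is exclusive.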


def fetch_tradingview_realtime(tickers, batch_size=400):
    """Fetch today's OHLCV per ticker via the TradingView Screener API."""
    if not tickers:
        return []
    results = {}
    today_str = get_ny_today_str()
    for i in range(0, len(tickers), batch_size):
        batch = tickers[i:i + batch_size]
        payload = {
            "symbols": {
                "tickers": [
                    *[f"NASDAQ:{t}" for t in batch],
                    *[f"NYSE:{t}" for t in batch],
                    *[f"AMEX:{t}" for t in batch],
                ]
            },
            "columns": ["close", "open", "high", "low", "volume", "exchange"],
            "options": {"lang": "en"},
            "markets": ["america"],
            "range": [0, max(50, len(batch) * 3)]
        }
        req = Request(
            "https://scanner.tradingview.com/america/scan",
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Content-Type": "application/json",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            },
            method="POST"
        )
        with urlopen(req, timeout=30) as resp:
            body = resp.read().decode("utf-8")
        parsed = json.loads(body)
        for item in parsed.get("data", []):
            symbol = item.get("s", "")
            if ":" not in symbol:
                continue
            _, ticker = symbol.split(":", 1)
            if ticker in results:
                continue
            data = item.get("d", [])
            if len(data) < 5:
                continue
            close_val, open_val, high_val, low_val, volume_val = data[:5]
            if close_val is None:
                continue
            results[ticker] = {
                "Ticker": ticker,
                "Date": today_str,
                "Open": round(float(open_val), 4) if open_val is not None else None,
                "High": round(float(high_val), 4) if high_val is not None else None,
                "Low": round(float(low_val), 4) if low_val is not None else None,
                "Close": round(float(close_val), 4) if close_val is not None else None,
                "Volume": int(volume_val) if volume_val is not None else None
            }
        time.sleep(0.2)
    return list(results.values())
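
# Illustrative call (values are made up; assumes network access to scanner.tradingview.com):
#   rows = fetch_tradingview_realtime(["AAPL", "MSFT"])
#   # -> [{"Ticker": "AAPL", "Date": "2025-06-02", "Open": 191.9, ...}, ...]
# A ticker listed under several exchange prefixes is kept only once (first hit wins).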


def load_hf_dataset_as_df(repo_name, hf_token):
    """Load an HF Hub dataset as a pandas DataFrame."""
    ds = load_dataset(repo_name, split="train", token=hf_token)
    df = ds.to_pandas()
    # Standardize columns
    required_cols = ["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"]
    for col in required_cols:
        if col not in df.columns:
            df[col] = None
    df = df[required_cols]
    df["Ticker"] = df["Ticker"].astype(str).str.upper()
    df["Date"] = df["Date"].astype(str)
    return df
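
# Illustrative call (the dataset name is an example, not a guaranteed repo):
#   df = load_hf_dataset_as_df("username/us-stocks-daily-30d", HF_TOKEN)
#   list(df.columns)  # -> ["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"]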


def run_realtime_update(
    hf_token,
    all_dataset_name,
    recent_dataset_name,
    progress=gr.Progress()
):
    """
    Real-time (intraday) data update.
    - Logs whether the market is open (DST handled automatically)
    - Bulk-queries the dataset tickers via the TradingView Screener
    - all: appends today's data (append-only)
    - 30d: drops stale rows, then applies today's data
    """
    if not hf_token:
        return "❌ A Hugging Face token is required. Set the HF_TOKEN environment variable or enter one in the input box."
    logs = []
    logs.append("=" * 60)
    logs.append("⚡ Starting real-time data update")
    logs.append(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logs.append("=" * 60)
    # 0) Check market state (DST handled automatically)
    progress(0.05, desc="Checking market status...")
    is_open, now_ny = is_us_market_open_now()
    if is_open:
        logs.append(f"🟢 Market is open. (New York time {now_ny.strftime('%Y-%m-%d %H:%M:%S')})")
    else:
        logs.append(f"🟡 Market is closed. (New York time {now_ny.strftime('%Y-%m-%d %H:%M:%S')})")
    today_str = get_ny_today_str()
    # 1) Load datasets
    progress(0.15, desc="Loading existing datasets...")
    logs.append("\n📥 [Step 1] Loading existing datasets...")
    all_df = load_hf_dataset_as_df(all_dataset_name, hf_token)
    recent_df = load_hf_dataset_as_df(recent_dataset_name, hf_token)
    # Query tickers are taken from those already present in the datasets
    tickers = sorted(recent_df["Ticker"].dropna().astype(str).str.upper().unique().tolist())
    if not tickers:
        tickers = sorted(all_df["Ticker"].dropna().astype(str).str.upper().unique().tolist())
    if not tickers:
        return "\n".join(logs) + "\n\n❌ The datasets contain no tickers, so the real-time query cannot run."
    logs.append(f"  - Tickers to query: {len(tickers)}")
    # 2) Check whether today's data already exists
    progress(0.25, desc="Checking for today's data...")
    today_rows = all_df[all_df["Date"] == today_str]
    today_tickers = set(today_rows["Ticker"].astype(str).str.upper().tolist())
    if set(tickers).issubset(today_tickers):
        logs.append("\n✅ Today's data has already been collected")
        return "\n".join(logs)
    # 3) Real-time TradingView query
    progress(0.45, desc="Querying TradingView...")
    logs.append("\n📡 [Step 2] Querying the TradingView Screener...")
    realtime_rows = fetch_tradingview_realtime(tickers)
    if not realtime_rows:
        return "\n".join(logs) + "\n\n❌ Failed to fetch real-time data from TradingView."
    realtime_df = pd.DataFrame(realtime_rows)
    realtime_df["Ticker"] = realtime_df["Ticker"].astype(str).str.upper()
    realtime_df["Date"] = realtime_df["Date"].astype(str)
    logs.append(f"  - Tickers received: {realtime_df['Ticker'].nunique()}")
    # 4) Update the all dataset (append-only)
    progress(0.65, desc="Updating the all dataset...")
    logs.append("\n🧩 [Step 3] Updating the all dataset (append-only)...")
    existing_today = set(all_df.loc[all_df["Date"] == today_str, "Ticker"].astype(str).str.upper().tolist())
    add_all_df = realtime_df[~realtime_df["Ticker"].isin(existing_today)].copy()
    if not add_all_df.empty:
        all_updated_df = pd.concat([all_df, add_all_df], ignore_index=True)
    else:
        all_updated_df = all_df
    logs.append(f"  - Rows appended to all: {len(add_all_df)}")
    # 5) Update the 30d dataset (drop stale dates + append/refresh)
    progress(0.78, desc="Updating the 30d dataset...")
    logs.append("\n🗂️ [Step 4] Updating the 30d dataset (dropping stale rows + appending)...")
    # Remove any existing rows for the same ticker/today's date so they get replaced
    update_tickers = set(realtime_df["Ticker"].tolist())
    recent_df_wo_today = recent_df[
        ~((recent_df["Date"] == today_str) & (recent_df["Ticker"].isin(update_tickers)))
    ].copy()
    recent_merged = pd.concat([recent_df_wo_today, realtime_df], ignore_index=True)
    recent_updated_df = filter_last_30_days(recent_merged)
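    # Illustrative replace semantics: if recent_df already held (AAPL, today) from an
    # earlier intraday run, that row is dropped above and the fresh TradingView row is
    # appended, so each ticker keeps at most one row per date.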
    # 6) Upload
    progress(0.9, desc="Uploading to Hugging Face...")
    logs.append("\n🚀 [Step 5] Uploading to Hugging Face...")
    result_all = upload_dataset_to_hf(all_updated_df, all_dataset_name, hf_token)
    result_30d = upload_dataset_to_hf(recent_updated_df, recent_dataset_name, hf_token)
    logs.append(f"  {result_all}")
    logs.append(f"  {result_30d}")
    progress(1.0, desc="Done!")
    logs.append("\n" + "=" * 60)
    logs.append("✅ Real-time data update complete")
    logs.append(f"📅 Date applied (New York): {today_str}")
    logs.append("=" * 60)
    return "\n".join(logs)


def get_all_us_tickers():
    """
    Fetch NASDAQ + NYSE ticker lists using the Yahoo Finance screener (yf.screen).
    - NASDAQ spans three tiers: NMS (Global Select), NGM (Global Market), NCM (Capital Market)
    - NYSE: NYQ
    - Built into yfinance, so no extra data source is needed; works on HF Spaces.
    Returns: (nasdaq_tickers, nyse_tickers, all_tickers)
    """
    def _fetch_exchange_tickers(exchange_code):
        """Page through the Yahoo screener to fetch every ticker on one exchange."""
        query = yf.EquityQuery("eq", ["exchange", exchange_code])
        symbols = []
        offset = 0
        while True:
            result = yf.screen(query, size=250, offset=offset)
            quotes = result.get("quotes", [])
            if not quotes:
                break
            for quote in quotes:
                sym = quote.get("symbol", "")
                if sym:
                    # [Filters]
                    # 1. Exclude tickers containing '-', '.', or '$' (preferred shares, units, etc.)
                    # 2. Exclude 5-letter derivative tickers ending in W (Warrant), R (Right), or U (Unit)
                    is_special = any(c in sym for c in ["-", ".", "$"])
                    is_derivative = len(sym) == 5 and sym[-1] in ["W", "R", "U"]
                    if not (is_special or is_derivative):
                        symbols.append(sym)
            offset += len(quotes)
            total = result.get("total", 0)
            if offset >= total:
                break
        return sorted(list(set(symbols)))
    try:
        # NASDAQ: merge the three tiers
        # NMS = NASDAQ Global Select Market
        # NGM = NASDAQ Global Market
        # NCM = NASDAQ Capital Market
        nasdaq_tickers = []
        for market_code in ["NMS", "NGM", "NCM"]:
            tickers = _fetch_exchange_tickers(market_code)
            logger.info(f"  NASDAQ {market_code}: {len(tickers)} tickers loaded")
            nasdaq_tickers.extend(tickers)
        nasdaq_tickers = sorted(list(set(nasdaq_tickers)))
        # New York Stock Exchange (NYSE): NYQ
        nyse_tickers = _fetch_exchange_tickers("NYQ")
        logger.info(f"  NYSE NYQ: {len(nyse_tickers)} tickers loaded")
        all_tickers = sorted(list(set(nasdaq_tickers + nyse_tickers)))
        logger.info(f"Loaded NASDAQ: {len(nasdaq_tickers)}, NYSE: {len(nyse_tickers)}, total: {len(all_tickers)} tickers")
        return nasdaq_tickers, nyse_tickers, all_tickers
    except Exception as e:
        logger.error(f"Failed to load tickers from the Yahoo screener: {e}")
        return [], [], []
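
# Illustrative pagination (numbers are made up): with size=250, a tier reporting
# total=3100 takes 13 calls (offsets 0, 250, ..., 3000); the loop stops once
# offset >= total or a page comes back empty.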


def fetch_ticker_data(ticker, period="max", max_retries=3):
    """
    Fetch one ticker's daily data from Yahoo Finance for the given period.
    - period: yfinance history period parameter (e.g. max, 10y, 5y, 1y, 6mo, 3mo)
    - interval="1d": daily bars
    """
    def _parse_valid_periods(error_message):
        marker = "must be one of:"
        if marker not in error_message:
            return []
        raw = error_message.split(marker, 1)[1]
        return [p.strip().strip("'").strip('"') for p in raw.split(",") if p.strip()]

    def _choose_fallback_period(requested_period, valid_periods):
        if not valid_periods:
            return None
        preferred_order = ["max", "10y", "5y", "2y", "1y", "6mo", "3mo", "1mo", "5d", "1d"]
        for candidate in preferred_order:
            if candidate in valid_periods:
                return candidate
        if requested_period in valid_periods:
            return requested_period
        return valid_periods[0]
    effective_period = period
    for attempt in range(max_retries):
        try:
            stock = yf.Ticker(ticker)
            # Fetch daily bars for the requested period
            hist = stock.history(period=effective_period, interval="1d")
            if hist.empty:
                logger.warning(f"[{ticker}] no data (empty result)")
                return None
            # Turn the (date) index into a column
            hist = hist.reset_index()
            # Add a Ticker column (for per-ticker grouping later)
            hist["Ticker"] = ticker
            # Convert the date column to string (dataset compatibility)
            if "Date" in hist.columns:
                hist["Date"] = hist["Date"].dt.strftime("%Y-%m-%d")
            elif "Datetime" in hist.columns:
                hist.rename(columns={"Datetime": "Date"}, inplace=True)
                hist["Date"] = pd.to_datetime(hist["Date"]).dt.strftime("%Y-%m-%d")
            # Keep only the needed columns
            columns_to_keep = ["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"]
            available_cols = [c for c in columns_to_keep if c in hist.columns]
            hist = hist[available_cols]
            # Round the numeric columns
            numeric_cols = ["Open", "High", "Low", "Close"]
            for col in numeric_cols:
                if col in hist.columns:
                    hist[col] = hist[col].round(4)
            # --- Exclude intraday data (unconfirmed close) ---
            # zoneinfo ships with Python, so no extra install; EDT/EST handled automatically
            ny_tz = ZoneInfo("America/New_York")
            now_ny = datetime.now(ny_tz)
            today_ny = now_ny.strftime("%Y-%m-%d")
            # Regular session close: 16:00 New York local time (DST or not)
            # With some margin, treat the close as confirmed from 16:30 onward
            market_closed = now_ny.hour >= 17 or (now_ny.hour == 16 and now_ny.minute >= 30)
            if not hist.empty and hist.iloc[-1]["Date"] == today_ny:
                if not market_closed:
                    # Still intraday or right after the close -> drop today's row (close unconfirmed)
                    logger.info(f"[{ticker}] excluding intraday data ({today_ny}) (current NY time: {now_ny.strftime('%H:%M')})")
                    hist = hist.iloc[:-1]
                else:
                    # After the close -> today's close is confirmed, keep it
                    logger.info(f"[{ticker}] including post-close data ({today_ny})")
            # -----------------------------------------------
            return hist
        except Exception as e:
            error_message = str(e)
            if "must be one of:" in error_message:
                valid_periods = _parse_valid_periods(error_message)
                fallback_period = _choose_fallback_period(effective_period, valid_periods)
                if fallback_period and fallback_period != effective_period:
                    logger.info(
                        f"[{ticker}] period '{effective_period}' unsupported; retrying with '{fallback_period}'"
                    )
                    effective_period = fallback_period
                    continue
            logger.warning(f"[{ticker}] fetch failed (attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                time.sleep(1)  # wait before retrying
                continue
    return None
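
# Illustrative fallback: if Yahoo rejects period="max" with an error listing
# "... must be one of: 1d,5d,1mo,3mo,6mo,1y,2y", _choose_fallback_period picks "2y"
# (the first entry of preferred_order present in the list) and the loop retries.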


def filter_last_30_days(df):
    """Keep only each ticker's most recent 30 days of data."""
    if df.empty:
        return df
    df_copy = df.copy()
    df_copy["_date_parsed"] = pd.to_datetime(df_copy["Date"], errors="coerce")
    invalid_date_count = int(df_copy["_date_parsed"].isna().sum())
    if invalid_date_count > 0:
        logger.warning(f"{invalid_date_count} rows with unparseable Date are excluded from the 30-day filter.")
    df_copy = df_copy[df_copy["_date_parsed"].notna()].copy()
    if df_copy.empty:
        return pd.DataFrame(columns=df.columns)
    max_date_by_ticker = df_copy.groupby("Ticker")["_date_parsed"].transform("max")
    cutoff_by_ticker = max_date_by_ticker - pd.Timedelta(days=30)
    result = df_copy[df_copy["_date_parsed"] >= cutoff_by_ticker].copy()
    if result.empty:
        return pd.DataFrame(columns=df.columns)
    result = result.reset_index(drop=True)
    result.drop(columns=["_date_parsed"], inplace=True)
    return result
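
# Illustrative cutoff: the 30-day window is computed per ticker, so a ticker whose
# latest row is dated 2024-03-15 keeps rows from 2024-02-14 onward, regardless of
# how fresh the other tickers are.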


def upload_dataset_to_hf(df, repo_name, hf_token, max_retries=3, retry_wait_sec=2):
    """Upload a DataFrame to a Hugging Face dataset (with retries and diagnostics)."""
    if df is None or df.empty:
        return {
            "ok": False,
            "repo": repo_name,
            "rows": 0,
            "attempts": 0,
            "elapsed_sec": 0.0,
            "error": "No data to upload.",
            "traceback": "",
        }
    last_error = ""
    last_traceback = ""
    start_ts = time.time()
    for attempt in range(1, max_retries + 1):
        try:
            dataset = Dataset.from_pandas(df, preserve_index=False)
            dataset.push_to_hub(
                repo_name,
                token=hf_token,
                private=False  # public dataset
            )
            return {
                "ok": True,
                "repo": repo_name,
                "rows": len(df),
                "attempts": attempt,
                "elapsed_sec": time.time() - start_ts,
                "error": "",
                "traceback": "",
            }
        except Exception as e:
            last_error = str(e)
            last_traceback = traceback.format_exc()
            logger.warning(f"[{repo_name}] upload failed (attempt {attempt}/{max_retries}): {last_error}")
            if attempt < max_retries:
                time.sleep(retry_wait_sec * attempt)
    return {
        "ok": False,
        "repo": repo_name,
        "rows": len(df),
        "attempts": max_retries,
        "elapsed_sec": time.time() - start_ts,
        "error": last_error,
        "traceback": last_traceback,
    }
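
# Illustrative result handling (repo name is an example):
#   result = upload_dataset_to_hf(df, "username/us-stocks-daily-all", HF_TOKEN)
#   if not result["ok"]:
#       print(result["error"])  # retries back off linearly: 2s, then 4s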


def append_parquet_chunk_to_hf(df, repo_name, hf_token, subdir="data/chunks", max_retries=3, retry_wait_sec=2):
    """Append a DataFrame to a Hugging Face dataset repo as a Parquet chunk file."""
    if df is None or df.empty:
        return {
            "ok": False,
            "repo": repo_name,
            "rows": 0,
            "attempts": 0,
            "elapsed_sec": 0.0,
            "error": "No data to upload.",
            "traceback": "",
        }
    api = HfApi()
    last_error = ""
    last_traceback = ""
    start_ts = time.time()
    for attempt in range(1, max_retries + 1):
        temp_path = None
        try:
            api.create_repo(
                repo_id=repo_name,
                repo_type="dataset",
                token=hf_token,
                private=False,
                exist_ok=True,
            )
            chunk_name = f"chunk-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{uuid.uuid4().hex[:8]}.parquet"
            with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
                temp_path = tmp.name
            df.to_parquet(temp_path, index=False)
            path_in_repo = f"{subdir}/{chunk_name}"
            api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo=path_in_repo,
                repo_id=repo_name,
                repo_type="dataset",
                token=hf_token,
            )
            return {
                "ok": True,
                "repo": repo_name,
                "rows": len(df),
                "attempts": attempt,
                "elapsed_sec": time.time() - start_ts,
                "error": "",
                "traceback": "",
            }
        except Exception as e:
            last_error = str(e)
            last_traceback = traceback.format_exc()
            logger.warning(f"[{repo_name}] chunk upload failed (attempt {attempt}/{max_retries}): {last_error}")
            if attempt < max_retries:
                time.sleep(retry_wait_sec * attempt)
        finally:
            if temp_path and os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except Exception:
                    pass
    return {
        "ok": False,
        "repo": repo_name,
        "rows": len(df),
        "attempts": max_retries,
        "elapsed_sec": time.time() - start_ts,
        "error": last_error,
        "traceback": last_traceback,
    }
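
# Note (assumption about repo layout): chunks land at paths like
# data/chunks/all/chunk-20250602-120000-ab12cd34.parquet. Whether load_dataset()
# picks them all up as one "train" split depends on how the repo's data files are
# configured; verify the split mapping for this layout before relying on it.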


def run_pipeline(
    hf_token,
    all_dataset_name,
    recent_dataset_name,
    batch_size,
    period,
    checkpoint_batch_size,
    progress=gr.Progress()
):
    """
    Run the full pipeline.
    1. Fetch the NASDAQ & NYSE ticker lists
    2. Collect daily Yahoo Finance data per ticker
    3. Build the full-history dataset (all)
    4. Build the recent 30-day dataset
    """
    if not hf_token:
        return "❌ A Hugging Face token is required. Set the HF_TOKEN environment variable or enter one in the input box."
    logs = []
    try:
        def _df_stats(df, label):
            if df is None or df.empty:
                return f"{label}: 0 rows"
            mem_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
            return f"{label}: {len(df)} rows x {len(df.columns)} cols, ~{mem_mb:.2f} MB in memory"

        def _append_upload_result(log_prefix, result):
            status_icon = "✅" if result["ok"] else "❌"
            logs.append(
                f"  - {log_prefix}: {status_icon} {result['repo']} | "
                f"rows={result['rows']} | attempts={result['attempts']} | elapsed={result['elapsed_sec']:.1f}s"
            )
            if not result["ok"]:
                logs.append(f"    Error: {result['error']}")
                if result["traceback"]:
                    logs.append("    Traceback:")
                    logs.append(result["traceback"])
        logs.append("=" * 60)
        logs.append("🚀 Starting the stock data collection pipeline")
        logs.append(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        logs.append("=" * 60)
        logs.append("ℹ️ Memory-saving mode: buffers are flushed right after each chunk upload (e.g. every 100 tickers).")
        # ========== Step 1: Collect ticker lists ==========
        progress(0, desc="Collecting NASDAQ & NYSE ticker lists...")
        logs.append("\n🔍 [Step 1] Collecting NASDAQ & NYSE ticker lists...")
        nasdaq_tickers, nyse_tickers, all_tickers = get_all_us_tickers()
        logs.append(f"  - NASDAQ: {len(nasdaq_tickers)}")
        logs.append(f"  - NYSE: {len(nyse_tickers)}")
        logs.append(f"  - Total: {len(all_tickers)}")
        if not all_tickers:
            logs.append("\n⚠️ Could not fetch tickers from the Yahoo screener. Trying the HF dataset fallback.")
            fallback_tickers = []
            fallback_errors = []
            try:
                recent_df = load_hf_dataset_as_df(recent_dataset_name, hf_token)
                fallback_tickers = sorted(
                    recent_df["Ticker"].dropna().astype(str).str.upper().unique().tolist()
                )
                if fallback_tickers:
                    logs.append(f"  - Fallback source: recent dataset ({recent_dataset_name})")
            except Exception as e:
                fallback_errors.append(f"recent load failed: {e}")
            if not fallback_tickers:
                try:
                    all_existing_df = load_hf_dataset_as_df(all_dataset_name, hf_token)
                    fallback_tickers = sorted(
                        all_existing_df["Ticker"].dropna().astype(str).str.upper().unique().tolist()
                    )
                    if fallback_tickers:
                        logs.append(f"  - Fallback source: all dataset ({all_dataset_name})")
                except Exception as e:
                    fallback_errors.append(f"all load failed: {e}")
            if fallback_tickers:
                all_tickers = fallback_tickers
                logs.append(f"  - Fallback tickers: {len(all_tickers)}")
            else:
                if fallback_errors:
                    logs.append("  - Fallback failure details:")
                    for err in fallback_errors:
                        logs.append(f"    * {err}")
                logs.append("\nPossible causes:")
                logs.append("  1) Temporary Yahoo screener outage/blocking")
                logs.append("  2) Network/region restrictions")
                logs.append("  3) yfinance API changes")
                return "\n".join(logs) + "\n\n❌ Could not fetch the ticker list."
        # ========== Step 2: Load existing datasets + compute resume targets ==========
        progress(0.08, desc="Loading existing datasets...")
        logs.append("\n📚 [Step 2] Loading existing datasets and computing resume targets...")
        recent_for_resume = pd.DataFrame(columns=["Ticker"])
        try:
            recent_for_resume = load_hf_dataset_as_df(recent_dataset_name, hf_token)
            logs.append(f"  - Existing 30d data: {len(recent_for_resume)} rows")
        except Exception as e:
            logs.append(f"  - Failed to load existing 30d data (proceeding as a fresh collection): {e}")
        existing_tickers = set(recent_for_resume["Ticker"].dropna().astype(str).str.upper().tolist())
        if not existing_tickers:
            logs.append("  - No existing ticker info; proceeding with the full target list.")
        pending_tickers = [ticker for ticker in all_tickers if ticker not in existing_tickers]
        logs.append(f"  - Already collected tickers: {len(existing_tickers)}")
        logs.append(f"  - Tickers targeted this run: {len(pending_tickers)}")
        if not pending_tickers:
            progress(1.0, desc="Done!")
            logs.append("\n✅ All tickers already collected; nothing left to fetch.")
            return "\n".join(logs)
| # ========== 3๋จ๊ณ: ์ผํ ํ์ด๋ธ์ค ๋ฐ์ดํฐ ์์ง ========== | |
| logs.append(f"\n๐ฅ [3๋จ๊ณ] ์ผํ ํ์ด๋ธ์ค ๋ฐ์ดํฐ ์์ง ์์ (์ด {len(pending_tickers)}๊ฐ ํฐ์ปค)") | |
| logs.append(f" - ๋ฐฐ์น ํฌ๊ธฐ: {batch_size}") | |
| logs.append(f" - ์กฐํ ๊ธฐ๊ฐ(period): {period}") | |
| logs.append(f" - ์ฒดํฌํฌ์ธํธ ์ ๋ก๋ ๊ฐ๊ฒฉ: {checkpoint_batch_size}๊ฐ ํฐ์ปค") | |
| logs.append(f" โ ๏ธ ๋ฐ๋ณต๋ฌธ์ด๋ผ ์ค๋ ๊ฑธ๋ฆฝ๋๋ค. ์ ์ฒด ํฐ์ปค ์์ ๋ฐ๋ผ ์ ์๊ฐ ์์๋ ์ ์์ต๋๋ค.") | |
| all_data_frames = [] | |
| recent_30d_frames = [] | |
| success_count = 0 | |
| fail_count = 0 | |
| last_checkpoint_success_index = 0 | |
| total = len(pending_tickers) | |
| def _upload_checkpoint(end_index): | |
| nonlocal last_checkpoint_success_index | |
| if success_count <= last_checkpoint_success_index: | |
| return | |
| logs.append( | |
| f"\n๐พ [์ฒดํฌํฌ์ธํธ] {end_index}/{total} ์ฒ๋ฆฌ ์์ ์ค๊ฐ ์ ๋ก๋ ์์ " | |
| f"(๋์ ์ฑ๊ณต {success_count}๊ฐ)" | |
| ) | |
| if not all_data_frames: | |
| return | |
| checkpoint_all_df = pd.concat(all_data_frames, ignore_index=True) | |
| checkpoint_recent_df = pd.concat(recent_30d_frames, ignore_index=True) | |
| logs.append(f" - {_df_stats(checkpoint_all_df, 'all ์ฒญํฌ')}") | |
| logs.append(f" - {_df_stats(checkpoint_recent_df, '30d ์ฒญํฌ')}") | |
| result_all_ckpt = append_parquet_chunk_to_hf( | |
| checkpoint_all_df, | |
| all_dataset_name, | |
| hf_token, | |
| subdir="data/chunks/all" | |
| ) | |
| result_30d_ckpt = append_parquet_chunk_to_hf( | |
| checkpoint_recent_df, | |
| recent_dataset_name, | |
| hf_token, | |
| subdir="data/chunks/30d" | |
| ) | |
| _append_upload_result("all ์ฒดํฌํฌ์ธํธ", result_all_ckpt) | |
| _append_upload_result("30d ์ฒดํฌํฌ์ธํธ", result_30d_ckpt) | |
| if not result_all_ckpt["ok"] or not result_30d_ckpt["ok"]: | |
| raise RuntimeError("์ฒดํฌํฌ์ธํธ ์ ๋ก๋ ์คํจ๋ก ํ์ดํ๋ผ์ธ์ ์ค๋จํฉ๋๋ค.") | |
| all_data_frames.clear() | |
| recent_30d_frames.clear() | |
| gc.collect() | |
| last_checkpoint_success_index = success_count | |
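        # Illustrative checkpoint flow: with checkpoint_batch_size=100, the successful
        # fetches among tickers 1-100 are concatenated, uploaded as one Parquet chunk
        # per dataset, and the in-memory buffers are cleared (plus gc.collect())
        # before ticker 101 is fetched.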
        for i, ticker in enumerate(pending_tickers):
            # Update progress
            progress_pct = 0.1 + ((i + 1) / total) * 0.75
            progress(progress_pct, desc=f"Collecting: {ticker} ({i + 1}/{total})")
            ticker_df = fetch_ticker_data(ticker, period=period)
            if ticker_df is not None and not ticker_df.empty:
                all_data_frames.append(ticker_df)
                recent_30d_frames.append(filter_last_30_days(ticker_df))
                success_count += 1
            else:
                fail_count += 1
            # Log per batch
            if (i + 1) % batch_size == 0 or (i + 1) == total:
                logs.append(f"  Progress: {i + 1}/{total} (success: {success_count}, failed: {fail_count})")
            if checkpoint_batch_size > 0 and ((i + 1) % checkpoint_batch_size == 0):
                checkpoint_progress = min(0.89, max(progress_pct, 0.82))
                progress(checkpoint_progress, desc=f"Interim upload... ({i + 1}/{total})")
                _upload_checkpoint(i + 1)
            # Brief pause between API calls (avoid Yahoo throttling)
            if (i + 1) % 10 == 0:
                time.sleep(0.5)
        logs.append(f"\n📊 Collection done: {success_count} succeeded / {fail_count} failed")
        if success_count == 0:
            return "\n".join(logs) + "\n\n❌ No data was collected."
        # ========== Step 4: Flush the last unsynced checkpoint ==========
        progress(0.9, desc="Flushing the final checkpoint...")
        logs.append("\n🔧 [Step 4] Flushing remaining unsynced data...")
        if success_count > last_checkpoint_success_index:
            logs.append("\n💾 [Final flush] Uploading data accumulated since the last checkpoint")
            _upload_checkpoint(total)
        progress(0.97, desc="Finalizing chunk uploads...")
        logs.append("\n🚀 [Step 5] All chunk uploads complete")
        logs.append("  - Both the all and 30d datasets were saved as chunk files.")
        logs.append("  - On the next run, tickers are skipped/resumed based on the 30d ticker list.")
        # ========== Done ==========
        progress(1.0, desc="Done!")
        logs.append("\n" + "=" * 60)
        logs.append("✅ Pipeline complete!")
        logs.append(f"⏰ End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        logs.append("=" * 60)
        return "\n".join(logs)
    except Exception as e:
        logger.exception("Unhandled exception while running run_pipeline")
        logs.append("\n" + "=" * 60)
        logs.append("❌ An exception occurred while running the pipeline.")
        logs.append(f"Error message: {e}")
        logs.append("\n[Traceback]")
        logs.append(traceback.format_exc())
        logs.append("=" * 60)
        return "\n".join(logs)


def preview_tickers():
    """Preview the ticker lists (sanity check before collecting)."""
    nasdaq, nyse, combined = get_all_us_tickers()
    if not combined:
        return """❌ Could not fetch the ticker lists.
Possible causes:
1) Temporary Yahoo screener outage/blocking
2) Network/region restrictions
3) yfinance API changes
Retry in a moment, or check whether the HF dataset fallback kicks in when running the pipeline.
"""
    info = f"""📊 Ticker list preview
─────────────────────
NASDAQ: {len(nasdaq)}
NYSE: {len(nyse)}
Total: {len(combined)}
First 20 NASDAQ: {', '.join(nasdaq[:20])}...
First 20 NYSE: {', '.join(nyse[:20])}...
"""
    return info


# ========== Gradio UI ==========
with gr.Blocks(
    title="Stock Dataset Builder",
    theme=gr.themes.Soft()
) as demo:
    gr.Markdown("""
# 📈 NASDAQ & NYSE Stock Dataset Builder
**Collects daily data for every ticker from Yahoo Finance and automatically uploads it as Hugging Face datasets.**
### Pipeline flow
1. 🔍 Collect the full NASDAQ & NYSE ticker lists
2. 📥 Fetch daily Yahoo Finance data per ticker (`period` configurable)
3. 📦 Build the **all dataset** (full history)
4. 🗂️ Filter each ticker to its recent 30 days → build the **30-day dataset**
5. 🚀 Upload to the Hugging Face Hub
> ⚠️ Iterating over every ticker means this **can take several hours**.
""")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Settings")
            hf_token_input = gr.Textbox(
                label="Hugging Face token (HF_TOKEN)",
                value=HF_TOKEN,
                type="password",
                placeholder="hf_xxxxx...",
                info="Loaded automatically if HF_TOKEN is set as a Spaces secret."
            )
            all_dataset_input = gr.Textbox(
                label="Full-history dataset name (all)",
                value="younginpiniti/us-stocks-daily-all",
                placeholder="username/dataset-name",
                info="Hugging Face dataset that stores the full-history daily data"
            )
            recent_dataset_input = gr.Textbox(
                label="Recent 30-day dataset name (30d)",
                value="younginpiniti/us-stocks-daily-30d",
                placeholder="username/dataset-name",
                info="Hugging Face dataset that stores the recent 30-day daily data"
            )
            batch_size_input = gr.Slider(
                label="Log batch size",
                minimum=10,
                maximum=500,
                value=100,
                step=10,
                info="How many tickers between progress log lines"
            )
            period_input = gr.Dropdown(
                label="Fetch period (Yahoo period)",
                choices=["max", "10y", "5y", "2y", "1y", "6mo", "3mo", "1mo"],
                value="max",
                info="If the full collection takes too long, shorten it to 10y/5y, etc."
            )
            checkpoint_batch_input = gr.Dropdown(
                label="Interim upload interval (tickers)",
                choices=[0, 50, 100, 200, 500],
                value=100,
                info="0 disables interim uploads; everything is uploaded only at the end."
            )
            with gr.Row():
                preview_btn = gr.Button("🔍 Preview ticker lists", variant="secondary")
                start_btn = gr.Button("🚀 Start pipeline", variant="primary")
                realtime_btn = gr.Button("⚡ Real-time update", variant="secondary")
        with gr.Column(scale=2):
            gr.Markdown("### 📋 Run log")
            output_log = gr.Textbox(
                label="Log output",
                lines=30,
                max_lines=50,
                interactive=False
            )
    # Event wiring
    preview_btn.click(
        fn=preview_tickers,
        inputs=[],
        outputs=[output_log]
    )
    start_btn.click(
        fn=run_pipeline,
        inputs=[
            hf_token_input,
            all_dataset_input,
            recent_dataset_input,
            batch_size_input,
            period_input,
            checkpoint_batch_input
        ],
        outputs=[output_log]
    )
    realtime_btn.click(
        fn=run_realtime_update,
        inputs=[
            hf_token_input,
            all_dataset_input,
            recent_dataset_input
        ],
        outputs=[output_log]
    )
    gr.Markdown("""
---
### 📊 Dataset schema
| Column | Description | Example |
|--------|-------------|---------|
| `Ticker` | Ticker symbol | AAPL, MSFT, TSLA |
| `Date` | Trading day (YYYY-MM-DD) | 2024-01-15 |
| `Open` | Opening price | 185.3200 |
| `High` | Daily high | 187.0400 |
| `Low` | Daily low | 184.2100 |
| `Close` | Closing price | 186.0000 |
| `Volume` | Trading volume | 45123456 |
> 💡 **Tip**: to preprocess per ticker, group by the `Ticker` column.
> ```python
> from datasets import load_dataset
> ds = load_dataset("younginpiniti/us-stocks-daily-all")
> df = ds["train"].to_pandas()
> aapl = df[df["Ticker"] == "AAPL"]
> ```
""")


if __name__ == "__main__":
    demo.launch()