# ostock-backend/model/src/data/data_integration.py
# johnaness's picture
# Deploy OStock FastAPI backend to HF Space (Docker SDK, port 7860)
# 4be2d4d
"""
๋ฐ์ดํ„ฐ ํ†ตํ•ฉ ๋ฐ ์ „์ฒ˜๋ฆฌ ๋ชจ๋“ˆ
"""
import pandas as pd
import yfinance as yf
from .technical_indicators import add_technical_indicators
from .optimization import run_technical_optimization
from .financial_data import process_financial_data
from .economic_data import get_economic_data
from .hierarchical_embedding import get_industry_data, add_industry_encoding, combine_stocks_for_embedding
from src.config import FRED_API_KEY
def process_stock_data(tickers, start_date, end_date, fred_api_key=None,
                       max_financial_features=20):
    """Run the full stock-data processing pipeline for a list of tickers.

    Steps, in order:

    1. Optimise technical-indicator parameters over all tickers.
    2. Download auto-adjusted price history per ticker with yfinance and add
       technical indicators using the optimised parameters.
    3. Collect per-ticker financial-statement frames and track the feature
       columns (everything except ``'Close'``) shared by every ticker that
       produced financial data.
    4. Fetch economic-indicator data.
    5. Left-join the selected financial columns and the economic data onto
       each ticker's frame, linearly interpolate gaps in every column except
       ``'Close'``, and drop rows that still contain NaN.
    6-8. Fetch industry data, combine all tickers into one frame and add the
       industry encoding.

    Args:
        tickers: Iterable of ticker symbols; iterated several times, so it
            must be re-iterable and support ``len()``.
        start_date: Start of the download window, passed through to
            ``yf.download`` and the helper functions.
        end_date: End of the download window, passed through likewise.
        fred_api_key: API key forwarded to ``get_economic_data``. When
            ``None``, ``FRED_API_KEY`` from ``src.config`` is used.
        max_financial_features: Maximum number of common financial columns
            to keep (default 20, the previous hard-coded value). ``None``
            keeps all of them.

    Returns:
        A tuple ``(final_data, all_data, industry_encoders)`` where
        ``final_data`` and ``industry_encoders`` are whatever
        ``add_industry_encoding`` returns, and ``all_data`` maps each ticker
        to its processed DataFrame.
    """
    # Fall back to the configured key when the caller does not supply one.
    if fred_api_key is None:
        fred_api_key = FRED_API_KEY

    print(f"์ฃผ์‹ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์‹œ์ž‘: {len(tickers)}๊ฐœ ์ข…๋ชฉ")

    # 1. Optimise the technical-indicator parameters.
    optimal_params = run_technical_optimization(tickers, start_date, end_date)

    # 2. Build the per-ticker dataset with the optimised parameters.
    all_data = {}
    for ticker in tickers:
        df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)
        # yfinance may return MultiIndex columns; drop the second level so
        # the columns are flat.
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.droplevel(1)

        # Add technical indicators using the optimised parameters.
        df_with_indicators = add_technical_indicators(
            df.copy(),
            ema_params=optimal_params['ema'],
            macd_params=optimal_params['macd'],
            cmf_period=optimal_params['cmf'],
            rsi_params=optimal_params['rsi'],
        )

        # Drop rows with missing values.
        df_with_indicators = df_with_indicators.dropna()
        all_data[ticker] = df_with_indicators
        print(f"{ticker} ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {len(df_with_indicators)}ํ–‰")

    # 3. Process the financial-statement data.
    print("\n===== ์žฌ๋ฌด์ œํ‘œ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์‹œ์ž‘ =====\n")
    financial_data_all = {}
    common_features = None

    # Collect financial data for every ticker that has price data.
    for ticker in tickers:
        if ticker not in all_data:
            continue

        financial_data = process_financial_data(ticker, all_data, end_date)
        if financial_data is None or financial_data.empty:
            continue

        financial_features = {col for col in financial_data.columns if col != 'Close'}
        if not financial_features:
            continue

        financial_data_all[ticker] = financial_data

        # Track the features shared by all tickers seen so far.
        if common_features is None:
            common_features = financial_features
        else:
            common_features = common_features & financial_features

    # 4. Fetch the economic-indicator data.
    print("\n===== ๊ฒฝ์ œ ์ง€ํ‘œ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์‹œ์ž‘ =====\n")
    econ_df = get_economic_data(start_date, end_date, fred_api_key)

    # 5. Final integration (technical indicators + financials + economics).
    print("\n===== ์ตœ์ข… ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ =====\n")

    # Select the common financial features. The set is sorted before it is
    # truncated: set iteration order depends on string hashing, which is
    # randomised per interpreter run, so slicing the raw set would pick a
    # different subset (and column order) on every run.
    if common_features:
        selected_common_features = sorted(common_features, key=str)[:max_financial_features]
        print(f"๋ชจ๋“  ์ข…๋ชฉ์— ๊ณตํ†ต์ธ ์žฌ๋ฌด ํŠน์„ฑ ์ค‘ {len(selected_common_features)}๊ฐœ ์„ ํƒ")
    else:
        selected_common_features = []
        print("๊ณตํ†ต ์žฌ๋ฌด ํŠน์„ฑ์ด ์—†์Šต๋‹ˆ๋‹ค.")

    # Integrate the data ticker by ticker.
    for ticker in tickers:
        if ticker not in all_data:
            continue
        try:
            # Base data (includes the technical indicators).
            stock_data = all_data[ticker].copy()

            # Add the financial data (selected common features only).
            if ticker in financial_data_all and selected_common_features:
                fin_data = financial_data_all[ticker][selected_common_features]
                stock_data = stock_data.join(fin_data, how='left')

            # Add the economic-indicator data.
            stock_data = stock_data.join(econ_df, how='left')

            # Fill gaps in every column except 'Close' by linear
            # interpolation, then drop rows that are still incomplete.
            for col in stock_data.columns:
                if col != 'Close' and stock_data[col].isna().any():
                    stock_data[col] = stock_data[col].interpolate(method='linear')
            stock_data = stock_data.dropna()

            # Store the final frame.
            all_data[ticker] = stock_data
            print(f"{ticker} ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ ์™„๋ฃŒ: {stock_data.shape[1]}๊ฐœ ํŠน์„ฑ")
        except Exception as e:
            # Best effort: on failure the ticker keeps its technical-only
            # frame from step 2 and processing continues with the next one.
            print(f"{ticker} ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ ์‹คํŒจ: {e}")

    # 6. Fetch the industry information.
    industry_df = get_industry_data(tickers)

    # 7. Combine all tickers into one frame for embedding.
    combined_data = combine_stocks_for_embedding(all_data, tickers)

    # 8. Add the industry encoding.
    final_data, industry_encoders = add_industry_encoding(combined_data, industry_df)

    return final_data, all_data, industry_encoders