ostock-backend / model /src /data /processors.py
johnaness's picture
Deploy OStock FastAPI backend to HF Space (Docker SDK, port 7860)
4be2d4d
"""
๋ฐ์ดํ„ฐ ๋กœ๋”ฉ ๋ฐ ์ „์ฒ˜๋ฆฌ ๋ชจ๋“ˆ
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from .time_utils import hermite_cubic_spline, calculate_time_derivative
from .normalize import tanh_scale
def process_data(data, use_spline=False, n_interpolation_points=5):
    """Encode tickers, forward-fill gaps per ticker, and optionally interpolate.

    Args:
        data: DataFrame with a ``'ticker'`` column. A ``'ticker_id'`` column
            is added to this object in place before the filled copy is made.
            When ``use_spline`` is true the index is converted to integer
            epoch values, so it is presumably a ``DatetimeIndex`` -- confirm
            against callers.
        use_spline: When true (and ``n_interpolation_points > 0``), run
            ``hermite_cubic_spline`` over each ticker's numeric columns.
        n_interpolation_points: Forwarded to ``hermite_cubic_spline``.

    Returns:
        A tuple ``(data, ticker_encoder, ticker_data)`` where ``data`` is the
        forward-filled frame, ``ticker_encoder`` is the fitted
        ``LabelEncoder`` and ``ticker_data`` maps each ticker to its own
        sub-frame, plus ``"<ticker>_spline"`` to a dict with keys ``'data'``,
        ``'times'`` and ``'columns'`` when interpolation ran.
    """
    # Map each ticker symbol to an integer ID.
    ticker_encoder = LabelEncoder()
    all_tickers = data['ticker'].unique()
    ticker_encoder.fit(all_tickers)
    data['ticker_id'] = ticker_encoder.transform(data['ticker'])

    # Forward-fill missing values within each ticker only. A frame-wide
    # forward fill would copy the previous row's values even when that row
    # belongs to a different ticker. Leading gaps of a ticker stay NaN.
    filled = data.groupby('ticker', sort=False).ffill()
    data = data.copy()
    for col in filled.columns:
        # Positional assignment: the filled frame keeps the original row
        # order, and the index may hold duplicate labels across tickers.
        data[col] = filled[col].to_numpy()

    # Per-ticker frames (and, optionally, their interpolated counterparts).
    ticker_data = {}
    do_spline = use_spline and n_interpolation_points > 0
    for ticker in all_tickers:
        ticker_df = data[data['ticker'] == ticker].copy()
        ticker_data[ticker] = ticker_df
        if not do_spline:
            continue

        # Index as integers, floor-divided down to seconds
        # (assumes a nanosecond-resolution DatetimeIndex -- TODO confirm).
        time_points = ticker_df.index.astype(np.int64) // 10**9

        # Only numeric columns are interpolated.
        numeric_cols = ticker_df.select_dtypes(include=[np.number]).columns
        interpolated_data, interp_times = hermite_cubic_spline(
            ticker_df[numeric_cols].values,
            n_interpolation_points=n_interpolation_points,
            time_points=time_points,
        )

        ticker_data[f"{ticker}_spline"] = {
            'data': interpolated_data,
            'times': interp_times,
            'columns': numeric_cols,
        }
        # NOTE(review): the message text appears mis-encoded in this copy of
        # the file; it is preserved verbatim.
        print(f"{ticker}: ์›๋ณธ ๋ฐ์ดํ„ฐ {len(ticker_df)}๊ฐœ โ†’ ๋ณด๊ฐ„ ํ›„ {len(interpolated_data)}๊ฐœ ํฌ์ธํŠธ")

    return data, ticker_encoder, ticker_data
def load_stock_data(ticker_string, data_dir="./data"):
    """Load the combined stock CSV identified by an underscore-joined ticker string.

    Args:
        ticker_string: Tickers joined by ``'_'`` (e.g. ``"AAA_BBB"``); a
            trailing ``"_data"`` suffix is accepted and stripped.
        data_dir: Directory holding ``<tickers>_data.csv``. Defaults to
            ``"./data"``, which is resolved against the current working
            directory.

    Returns:
        A tuple ``(data, training_tickers)``: the CSV contents indexed by the
        parsed ``'Date'`` column and sorted by that index, and the list of
        ticker symbols obtained by splitting ``ticker_string`` on ``'_'``.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when the file
            does not exist.
    """
    import os

    suffix = '_data'
    if ticker_string.endswith(suffix):
        ticker_string = ticker_string[:-len(suffix)]
    training_tickers = ticker_string.split('_')

    filename = os.path.join(data_dir, f"{ticker_string}_data.csv")
    data = pd.read_csv(filename, parse_dates=['Date'])
    data = data.set_index('Date').sort_index()
    return data, training_tickers
def _make_windows(features, labels, ids, time_diffs, window_size):
    """Cut one ticker's row-aligned arrays into sliding windows.

    For every start offset ``i`` the sample is ``features[i:i + window_size]``;
    its label and ticker id are read at row ``i + window_size`` and its time
    gaps are ``time_diffs[i + 1:i + window_size + 1]``.

    Returns:
        ``(seq_x, seq_y, seq_id, seq_dt)`` stacked along axis 0, or ``None``
        when ``features`` has no more than ``window_size`` rows.
    """
    starts = range(len(features) - window_size)
    if len(starts) == 0:
        return None
    seq_x = np.stack([features[i:i + window_size] for i in starts])
    seq_y = np.stack([labels[i + window_size] for i in starts])
    seq_id = np.array([ids[i + window_size] for i in starts])
    seq_dt = np.stack([time_diffs[i + 1:i + window_size + 1] for i in starts])
    return seq_x, seq_y, seq_id, seq_dt


def _concat_or_empty(chunks, empty_shape, dtype):
    """Concatenate ``chunks`` on axis 0, or return an empty array of ``empty_shape``/``dtype``."""
    if chunks:
        return np.concatenate(chunks, axis=0)
    return np.empty(empty_shape, dtype=dtype)


def prepare_data(data, window_size=60, n_interpolation_points=None):
    """Turn a multi-ticker frame into windowed train/val/test arrays.

    Per ticker, the rows are sorted by index, the non-label columns are
    passed through ``tanh_scale``, and sliding windows of ``window_size`` rows
    are built. The windows of each ticker are split in order: the last
    ``int(n * 0.2)`` go to test, the ``int(n * 0.1)`` before them to
    validation and the rest to training. The splits of all tickers are then
    concatenated.

    Args:
        data: DataFrame with ``'ticker'`` and ``'Close'`` columns. The index
            must support ``.diff().dt.days`` (presumably a ``DatetimeIndex``
            -- confirm against callers). ``'ticker_id'`` and ``'log_return'``
            columns are added to this object in place.
        window_size: Number of consecutive rows per input sample.
        n_interpolation_points: Not used by this function; kept so existing
            call sites remain valid.

    Returns:
        A tuple ``(result_dict, ticker_encoder, data)``. ``result_dict`` holds
        ``x_*``, ``y_*``, ``ticker_*``, ``y_*_dt`` and ``time_diffs_*`` for
        ``train``/``val``/``test`` (``y_val_dt``/``y_test_dt`` are ``None``
        when that split is empty), plus ``'start_date'``, ``'end_date'``,
        ``'scalers'`` (per ticker: the scaler and its feature columns) and
        ``'data'``.

    Raises:
        ValueError: If no ticker has more than ``window_size`` rows, so no
            training window could be built.
    """
    # Map each ticker symbol to an integer ID.
    ticker_encoder = LabelEncoder()
    all_tickers = data['ticker'].unique()
    ticker_encoder.fit(all_tickers)
    data['ticker_id'] = ticker_encoder.transform(data['ticker'])

    # Per-ticker log returns; the first row of each ticker has no
    # predecessor and is set to 0.
    # NOTE(review): the diff follows the row order of ``data`` as passed in,
    # while the per-ticker frames below are re-sorted by index -- confirm
    # callers always pass index-sorted data.
    data['log_return'] = (
        data.groupby('ticker')['Close']
        .transform(lambda close: np.log(close).diff())
        .fillna(0)
    )

    keys = ('x', 'y', 'ticker', 'dt')
    parts = {name: {key: [] for key in keys} for name in ('train', 'val', 'test')}
    scalers = {}
    # Columns that are labels/bookkeeping rather than model inputs.
    excluded = {'ticker', 'Close', 'days_diff', 'ticker_id', 'log_return'}

    for ticker in all_tickers:
        # Explicit copy so the column added below never targets a slice of
        # ``data`` (avoids chained-assignment warnings/ambiguity).
        ticker_df = data[data['ticker'] == ticker].sort_index().copy()
        # Calendar-day gap to the previous row; the first row gets 1.0.
        ticker_df['days_diff'] = ticker_df.index.to_series().diff().dt.days.fillna(1.0)

        feature_cols = [col for col in ticker_df.columns if col not in excluded]
        if feature_cols:
            # NOTE(review): scaling sees every row of the ticker, including
            # those that later land in val/test -- confirm this is intended.
            features, scaler = tanh_scale(
                ticker_df[feature_cols].values,
                verbose=False
            )
            scalers[ticker] = {
                'scaler': scaler,
                'feature_cols': feature_cols
            }
        else:
            # No feature columns: yields zero windows, so the ticker is skipped.
            features = np.array([])

        windows = _make_windows(
            features,
            ticker_df[['log_return']].values,
            ticker_df['ticker_id'].values,
            ticker_df['days_diff'].values,
            window_size,
        )
        if windows is None:
            continue

        # Split the windows in order: train first, then val, then test.
        n = len(windows[0])
        test_size = int(n * 0.2)
        val_size = int(n * 0.1)
        train_end = n - test_size - val_size
        bounds = {
            'train': (0, train_end),
            'val': (train_end, train_end + val_size),
            'test': (n - test_size, n),
        }
        for name, (start, stop) in bounds.items():
            if stop <= start:
                continue
            for key, arr in zip(keys, windows):
                parts[name][key].append(arr[start:stop])

    # NOTE(review): the message text appears mis-encoded in this copy of the
    # file; it is preserved verbatim.
    if not parts['train']['x']:
        raise ValueError("๋ฐ์ดํ„ฐ ์ค€๋น„ ์ค‘ ์˜ค๋ฅ˜: ํ•™์Šต ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")

    x_train = np.concatenate(parts['train']['x'], axis=0)
    y_train = np.concatenate(parts['train']['y'], axis=0)
    ticker_train = np.concatenate(parts['train']['ticker'], axis=0)
    time_diffs_train = np.concatenate(parts['train']['dt'], axis=0)

    # Empty val/test splits become zero-length arrays with the right trailing
    # shape and the same dtype as the matching training array.
    empty_shapes = {
        'x': (0, window_size, x_train.shape[2]),
        'y': (0, 1),
        'ticker': (0,),
        'dt': (0, window_size),
    }
    train_dtypes = {
        'x': x_train.dtype,
        'y': y_train.dtype,
        'ticker': ticker_train.dtype,
        'dt': time_diffs_train.dtype,
    }
    x_val, y_val, ticker_val, time_diffs_val = (
        _concat_or_empty(parts['val'][key], empty_shapes[key], train_dtypes[key])
        for key in keys
    )
    x_test, y_test, ticker_test, time_diffs_test = (
        _concat_or_empty(parts['test'][key], empty_shapes[key], train_dtypes[key])
        for key in keys
    )

    # Discrete derivative of the labels; skipped for empty splits.
    y_train_dt = calculate_time_derivative(y_train)
    y_val_dt = calculate_time_derivative(y_val) if len(y_val) > 0 else None
    y_test_dt = calculate_time_derivative(y_test) if len(y_test) > 0 else None

    # Index range of the whole input frame.
    date_min = data.index.min()
    date_max = data.index.max()

    print(f"์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ: ํŠน์„ฑ ์ˆ˜={x_train.shape[2]}, ํ•™์Šต ์ƒ˜ํ”Œ ์ˆ˜={x_train.shape[0]}")

    result_dict = {
        'x_train': x_train,
        'y_train': y_train,
        'ticker_train': ticker_train,
        'y_train_dt': y_train_dt,
        'time_diffs_train': time_diffs_train,
        'x_val': x_val,
        'y_val': y_val,
        'ticker_val': ticker_val,
        'y_val_dt': y_val_dt,
        'time_diffs_val': time_diffs_val,
        'x_test': x_test,
        'y_test': y_test,
        'ticker_test': ticker_test,
        'y_test_dt': y_test_dt,
        'time_diffs_test': time_diffs_test,
        'start_date': date_min,
        'end_date': date_max,
        'scalers': scalers,
        'data': data
    }
    return result_dict, ticker_encoder, data