Spaces:
Sleeping
Sleeping
| """ | |
| Data loading and preprocessing module | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import LabelEncoder | |
| from .time_utils import hermite_cubic_spline, calculate_time_derivative | |
| from .normalize import tanh_scale | |
def process_data(data, use_spline=False, n_interpolation_points=5):
    """
    Encode tickers, forward-fill missing values and split the frame per ticker.

    Args:
        data: DataFrame with a 'ticker' column. The caller's frame is not
            modified; a filled copy is returned instead.
        use_spline: When true (and ``n_interpolation_points`` > 0), the numeric
            columns of every ticker are additionally passed through
            ``hermite_cubic_spline`` and the result is stored next to the
            original rows.
        n_interpolation_points: Forwarded to ``hermite_cubic_spline``.

    Returns:
        A tuple ``(data, ticker_encoder, ticker_data)`` where
        - ``data`` is the forward-filled frame with an added 'ticker_id' column,
        - ``ticker_encoder`` is the LabelEncoder fitted on the unique tickers,
        - ``ticker_data`` maps each ticker to a copy of its rows and, when the
          spline is enabled, ``f"{ticker}_spline"`` to a dict with the keys
          'data', 'times' and 'columns'.
    """
    # Map every ticker symbol to an integer id.
    ticker_encoder = LabelEncoder()
    all_tickers = data['ticker'].unique()
    ticker_encoder.fit(all_tickers)
    ticker_ids = ticker_encoder.transform(data['ticker'])

    # DataFrame.ffill() is the supported spelling of the deprecated
    # fillna(method='ffill'). It returns a new frame, so attaching the ids
    # afterwards keeps the caller's DataFrame untouched.
    # NOTE(review): the fill runs over the whole frame in row order, so when
    # rows of several tickers are interleaved a gap can be filled from another
    # ticker's row -- confirm this is intended.
    data = data.ffill()
    data['ticker_id'] = ticker_ids

    # Per-ticker storage: the original rows and, optionally, the spline output.
    ticker_data = {}
    apply_spline = use_spline and n_interpolation_points > 0

    for ticker in all_tickers:
        ticker_df = data[data['ticker'] == ticker].copy()
        ticker_data[ticker] = ticker_df

        if not apply_spline:
            continue

        # Integer view of the index floor-divided by 10**9; this yields seconds
        # if the index holds nanosecond-resolution datetimes -- TODO confirm.
        time_points = ticker_df.index.astype(np.int64) // 10**9

        # Only numeric columns are interpolated (this includes 'ticker_id').
        numeric_cols = ticker_df.select_dtypes(include=[np.number]).columns
        numeric_data = ticker_df[numeric_cols].values

        interpolated_data, interp_times = hermite_cubic_spline(
            numeric_data,
            n_interpolation_points=n_interpolation_points,
            time_points=time_points
        )

        ticker_data[f"{ticker}_spline"] = {
            'data': interpolated_data,
            'times': interp_times,
            'columns': numeric_cols
        }
        print(f"{ticker}: ์๋ณธ ๋ฐ์ดํฐ {len(ticker_df)}๊ฐ โ ๋ณด๊ฐ ํ {len(interpolated_data)}๊ฐ ํฌ์ธํธ")

    return data, ticker_encoder, ticker_data
def load_stock_data(ticker_string, data_dir="./data"):
    """
    Load the combined stock CSV that belongs to an underscore-joined ticker string.

    Args:
        ticker_string: Tickers joined with '_' (e.g. ``"AAA_BBB"``); a trailing
            ``"_data"`` suffix is accepted and stripped.
        data_dir: Directory that contains ``<tickers>_data.csv``. Defaults to
            ``"./data"``, the location that was previously hard-coded.

    Returns:
        A tuple ``(data, training_tickers)``: the CSV contents indexed and
        sorted by the parsed 'Date' column, and the list of ticker symbols
        obtained by splitting the string on '_'.

    Raises:
        FileNotFoundError: If the CSV file does not exist (raised by pandas).
    """
    suffix = '_data'
    if ticker_string.endswith(suffix):
        ticker_string = ticker_string[:-len(suffix)]

    training_tickers = ticker_string.split('_')

    # Splitting and re-joining on '_' reproduces the same string, so the
    # stripped ticker string is used for the file name directly.
    filename = f"{data_dir}/{ticker_string}_data.csv"

    data = pd.read_csv(filename, parse_dates=['Date'])
    # A stable sort keeps rows that share a date in their file order, which
    # makes the result deterministic when several tickers share one date.
    data = data.set_index('Date').sort_index(kind='mergesort')
    return data, training_tickers
def _build_windows(features, labels, ids, time_diffs, window_size):
    """Cut one ticker's arrays into aligned sliding windows; None if the series is too short."""
    n_windows = len(features) - window_size
    if n_windows <= 0:
        return None

    starts = range(n_windows)
    target_end = window_size + n_windows
    # Window i covers rows [i, i + window_size); its target is row i + window_size.
    seq_x = np.stack([features[i:i + window_size] for i in starts])
    seq_y = labels[window_size:target_end]
    seq_id = ids[window_size:target_end]
    # Time gaps are shifted by one row, so the last entry is the gap to the target row.
    seq_dt = np.stack([time_diffs[i + 1:i + window_size + 1] for i in starts])
    return seq_x, seq_y, seq_id, seq_dt


def _concat_or_empty(chunks, empty_shape, dtype=float):
    """Concatenate chunks along axis 0, or return an empty array of the given shape."""
    if chunks:
        return np.concatenate(chunks, axis=0)
    return np.empty(empty_shape, dtype=dtype)


def prepare_data(data, window_size=60, n_interpolation_points=None):
    """
    Turn a multi-ticker price frame into windowed train/val/test arrays.

    For every ticker the rows are sorted by index, the feature columns (all
    columns except 'ticker', 'Close', 'days_diff', 'ticker_id' and
    'log_return') are scaled with ``tanh_scale``, and sliding windows of
    ``window_size`` rows are built. The target of a window is the log return
    of the row that follows it. The windows of each ticker are split in order
    into roughly 70% train, 10% validation and 20% test.

    Args:
        data: DataFrame with at least the columns 'ticker' and 'Close'. It is
            modified in place: the columns 'ticker_id' and 'log_return' are
            added. The index is presumably a DatetimeIndex, since day
            differences are derived from it -- confirm against the caller.
        window_size: Number of consecutive rows per input window (>= 1).
        n_interpolation_points: Currently unused; kept for interface
            compatibility.

    Returns:
        A tuple ``(result_dict, ticker_encoder, data)``. ``result_dict`` holds
        'x_*', 'y_*', 'ticker_*', 'y_*_dt' and 'time_diffs_*' for the train,
        val and test splits, plus 'start_date', 'end_date', 'scalers' and
        'data'. 'y_val_dt' / 'y_test_dt' are None when the split is empty.

    Raises:
        ValueError: If ``window_size`` is smaller than 1, or if no ticker has
            more than ``window_size`` rows (no training sample can be built).
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    # Map every ticker symbol to an integer id.
    ticker_encoder = LabelEncoder()
    all_tickers = data['ticker'].unique()
    ticker_encoder.fit(all_tickers)
    data['ticker_id'] = ticker_encoder.transform(data['ticker'])

    # Per-ticker log returns in the frame's current row order; the first row of
    # each ticker has no predecessor and is set to 0.
    data['log_return'] = data.groupby('ticker')['Close'].transform(lambda x: np.log(x).diff())
    data['log_return'] = data['log_return'].fillna(0)

    non_feature_cols = {'ticker', 'Close', 'days_diff', 'ticker_id', 'log_return'}
    array_keys = ('x', 'y', 'ticker', 'dt')
    splits = {
        name: {key: [] for key in array_keys}
        for name in ('train', 'val', 'test')
    }
    scalers = {}

    for ticker in all_tickers:
        ticker_df = data[data['ticker'] == ticker].sort_index()
        # Gap in days to the previous row; the first row gets 1.0.
        ticker_df['days_diff'] = ticker_df.index.to_series().diff().dt.days.fillna(1.0)

        feature_cols = [col for col in ticker_df.columns if col not in non_feature_cols]
        if feature_cols:
            # NOTE(review): scaling happens before the split, so the scaler
            # presumably also sees validation/test rows -- confirm whether
            # that is acceptable.
            features, scaler = tanh_scale(
                ticker_df[feature_cols].values,
                verbose=False
            )
            scalers[ticker] = {
                'scaler': scaler,
                'feature_cols': feature_cols
            }
        else:
            # No feature columns: an empty array yields no windows below.
            features = np.array([])

        windows = _build_windows(
            features,
            ticker_df[['log_return']].values,
            ticker_df['ticker_id'].values,
            ticker_df['days_diff'].values,
            window_size,
        )
        if windows is None:
            continue

        # Order-preserving split: train first, then validation, test last.
        n = len(windows[0])
        test_size = int(n * 0.2)
        val_size = int(n * 0.1)
        train_end = n - test_size - val_size
        bounds = {
            'train': (0, train_end),
            'val': (train_end, train_end + val_size),
            'test': (n - test_size, n),
        }
        for name, (lo, hi) in bounds.items():
            if hi <= lo:
                continue
            for key, arr in zip(array_keys, windows):
                splits[name][key].append(arr[lo:hi])

    if not splits['train']['x']:
        raise ValueError("๋ฐ์ดํฐ ์ค๋น ์ค ์ค๋ฅ: ํ์ต ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค.")

    x_train = np.concatenate(splits['train']['x'], axis=0)
    y_train = np.concatenate(splits['train']['y'], axis=0)
    ticker_train = np.concatenate(splits['train']['ticker'], axis=0)
    time_diffs_train = np.concatenate(splits['train']['dt'], axis=0)

    # Empty splits still get arrays with the right trailing dimensions; empty
    # ticker-id arrays keep the integer dtype of the training ids.
    empty_shapes = {
        'x': (0, window_size, x_train.shape[2]),
        'y': (0, 1),
        'ticker': (0,),
        'dt': (0, window_size),
    }
    merged = {}
    for name in ('val', 'test'):
        merged[name] = {
            key: _concat_or_empty(
                splits[name][key],
                empty_shapes[key],
                dtype=ticker_train.dtype if key == 'ticker' else float,
            )
            for key in array_keys
        }

    x_val, y_val = merged['val']['x'], merged['val']['y']
    ticker_val, time_diffs_val = merged['val']['ticker'], merged['val']['dt']
    x_test, y_test = merged['test']['x'], merged['test']['y']
    ticker_test, time_diffs_test = merged['test']['ticker'], merged['test']['dt']

    # Discrete derivative of the labels; skipped for empty splits.
    y_train_dt = calculate_time_derivative(y_train)
    y_val_dt = calculate_time_derivative(y_val) if len(y_val) > 0 else None
    y_test_dt = calculate_time_derivative(y_test) if len(y_test) > 0 else None

    # Index range of the full input frame.
    date_min = data.index.min()
    date_max = data.index.max()

    print(f"์ ์ฒ๋ฆฌ ์๋ฃ: ํน์ฑ ์={x_train.shape[2]}, ํ์ต ์ํ ์={x_train.shape[0]}")

    result_dict = {
        'x_train': x_train,
        'y_train': y_train,
        'ticker_train': ticker_train,
        'y_train_dt': y_train_dt,
        'time_diffs_train': time_diffs_train,
        'x_val': x_val,
        'y_val': y_val,
        'ticker_val': ticker_val,
        'y_val_dt': y_val_dt,
        'time_diffs_val': time_diffs_val,
        'x_test': x_test,
        'y_test': y_test,
        'ticker_test': ticker_test,
        'y_test_dt': y_test_dt,
        'time_diffs_test': time_diffs_test,
        'start_date': date_min,
        'end_date': date_max,
        'scalers': scalers,
        'data': data
    }
    return result_dict, ticker_encoder, data