| import os | |
| import sys | |
| import pandas as pd | |
| import torch | |
| import torch.nn as nn | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| import json | |
# Put the parent of this file's directory on the import path.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# All data and output locations hang off the directory two levels above this file.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
orig_data_path = os.path.join(base_dir, 'data', 'orig_processed.parquet')
combined_data_path = os.path.join(base_dir, 'data', 'final_data.parquet')

# Saved model weights and metrics are written here; create it up front.
resources_dir = os.path.join(base_dir, 'resources')
os.makedirs(resources_dir, exist_ok=True)

original_df = pd.read_parquet(orig_data_path)
combined_df = pd.read_parquet(combined_data_path)

# Order both frames by ticker, then date, in place, so that the per-ticker
# shift used for labelling sees rows in date order.
for df in (original_df, combined_df):
    df.sort_values(['Ticker', 'Date'], inplace=True)
    df.reset_index(drop=True, inplace=True)
def add_trend_label(df):
    """Label each row with whether the same ticker's next close is higher.

    Adds two columns to ``df`` in place:

    * ``Next_Close`` - the ``Close`` of the following row within the same
      ``Ticker`` group (row order is taken as-is).
    * ``Trend`` - 1 when ``Next_Close`` is strictly greater than ``Close``,
      otherwise 0.

    Rows without a following close (the last row of every ticker) are
    dropped in place. The index is not reset.

    Returns:
        The same DataFrame object that was passed in.
    """
    following_close = df.groupby('Ticker')['Close'].shift(-1)
    df['Next_Close'] = following_close
    df['Trend'] = following_close.gt(df['Close']).astype(int)
    # Rows with no successor have NaN in Next_Close and carry no usable label.
    df.dropna(subset=['Next_Close'], inplace=True)
    return df
# Attach the next-close trend label to both datasets.
original_df = add_trend_label(original_df)
combined_df = add_trend_label(combined_df)

# Encode tickers as integer IDs. The encoder is fitted on the original data
# only, so LabelEncoder.transform raises ValueError if the combined data
# contains a ticker the original data does not.
le = LabelEncoder()
original_df['TickerID'] = le.fit_transform(original_df['Ticker'])
combined_df['TickerID'] = le.transform(combined_df['Ticker'])

num_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
feature_cols = num_cols + ['TickerID']
target_col = 'Trend'

original_df = original_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)
combined_df = combined_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)

X_orig = original_df[feature_cols]
y_orig = original_df[target_col]
X_mix = combined_df[feature_cols]
y_mix = combined_df[target_col]

# Positional 80/20 split. The test set always comes from the original data.
# NOTE(review): rows are ordered by TickerID then Date, so this cut holds out
# the trailing tickers (in encoded order) rather than the most recent dates
# of every ticker - confirm this is the intended evaluation protocol.
split_idx = int(len(X_orig) * 0.8)
split_idx_mix = int(len(X_mix) * 0.8)

X_train_orig = X_orig.iloc[:split_idx].copy()
X_test = X_orig.iloc[split_idx:].copy()
y_train_orig = y_orig.iloc[:split_idx].copy()
y_test = y_orig.iloc[split_idx:].copy()

# Only the leading 80% of the combined data is used; its tail is never
# evaluated, so it is not materialised.
X_train_mix = X_mix.iloc[:split_idx_mix].copy()
y_train_mix = y_mix.iloc[:split_idx_mix].copy()

# Standardise numeric features with statistics from the original training
# split only, then apply the same transform to every split.
scaler = StandardScaler()
scaler.fit(X_train_orig[num_cols])

# Replace the columns wholesale instead of writing through `.loc[:, cols]`:
# an in-place write of floats into an integer column (e.g. Volume) is a
# deprecated silent upcast in recent pandas releases.
X_train_orig[num_cols] = scaler.transform(X_train_orig[num_cols])
X_train_mix[num_cols] = scaler.transform(X_train_mix[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
def to_tensor(X, y):
    """Convert a feature frame and its target series into model tensors.

    Args:
        X: DataFrame containing the module-level ``num_cols`` columns and a
            ``TickerID`` column.
        y: Series of target values aligned with ``X``.

    Returns:
        A 3-tuple ``(numeric, ticker_ids, target)``:
        ``numeric`` is a float32 tensor of the ``num_cols`` columns,
        ``ticker_ids`` is an int64 tensor of the ``TickerID`` column, and
        ``target`` is a float32 tensor reshaped to a single column.
    """
    numeric = torch.tensor(X[num_cols].values, dtype=torch.float32)
    ticker_ids = torch.tensor(X['TickerID'].values, dtype=torch.long)
    # Column vector so the target matches the model's (batch, 1) output.
    target = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)
    return numeric, ticker_ids, target
# Tensor versions of every split.
X_train_orig_num, X_train_orig_ticker, y_train_orig_t = to_tensor(
    X_train_orig, y_train_orig
)
X_train_mix_num, X_train_mix_ticker, y_train_mix_t = to_tensor(
    X_train_mix, y_train_mix
)
X_test_num, X_test_ticker, y_test_t = to_tensor(X_test, y_test)

# The embedding table must cover the largest ticker ID seen in any split
# (IDs are zero-based, hence the +1).
n_tickers_total = 1 + max(
    ticker_ids.max().item()
    for ticker_ids in (X_train_orig_ticker, X_train_mix_ticker, X_test_ticker)
)
class TrendNN(nn.Module):
    """Feed-forward binary classifier with a learned ticker embedding.

    The ticker ID is looked up in an embedding table, concatenated with the
    numeric features, and passed through a ReLU MLP that ends in a sigmoid,
    so the output is a probability in [0, 1] with shape ``(batch, 1)``.

    Args:
        n_tickers: Number of rows in the embedding table; every ticker ID
            passed to ``forward`` must be in ``[0, n_tickers)``.
        input_dim: Number of numeric features per sample.
        embed_dim: Width of the ticker embedding. Defaults to 8.
        hidden_dims: Widths of the hidden layers, in order. Defaults to
            ``(64, 32)``.

    With the default ``embed_dim`` and ``hidden_dims`` the layer layout and
    ``state_dict`` keys are ``ticker_embed.weight`` and ``net.{0,2,4}.*``.
    """

    def __init__(self, n_tickers, input_dim, embed_dim=8, hidden_dims=(64, 32)):
        super().__init__()
        self.ticker_embed = nn.Embedding(n_tickers, embed_dim)

        layers = []
        in_features = input_dim + embed_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, 1))
        # Sigmoid output: pair with nn.BCELoss, not BCEWithLogitsLoss.
        layers.append(nn.Sigmoid())
        self.net = nn.Sequential(*layers)

    def forward(self, x_num, ticker_id):
        """Return per-sample probabilities.

        Args:
            x_num: Float tensor of numeric features, ``(batch, input_dim)``.
            ticker_id: Integer tensor of ticker IDs, ``(batch,)``.

        Returns:
            Float tensor of shape ``(batch, 1)`` with values in [0, 1].
        """
        ticker_vec = self.ticker_embed(ticker_id)
        x = torch.cat([x_num, ticker_vec], dim=1)
        return self.net(x)
def train_model(X_num, X_ticker, y, X_val, X_val_ticker, y_val, name,
                epochs=100, batch_size=1024, lr=0.001):
    """Train a fresh ``TrendNN`` and save its weights.

    The model is sized from the module-level ``n_tickers_total`` and
    ``num_cols``. It is optimised with Adam against binary cross-entropy,
    using freshly shuffled mini-batches every epoch. After each epoch the
    whole validation set is evaluated in a single forward pass. When
    training finishes the ``state_dict`` is written to
    ``<resources_dir>/model_<name.lower()>.pt``.

    Args:
        X_num: Float tensor of numeric training features.
        X_ticker: Integer tensor of training ticker IDs.
        y: Float tensor of training targets, one column.
        X_val: Float tensor of numeric validation features.
        X_val_ticker: Integer tensor of validation ticker IDs.
        y_val: Float tensor of validation targets, one column.
        name: Run label used in log lines and in the saved file name.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size; must be positive.
        lr: Adam learning rate.

    Returns:
        ``(model, history)`` where ``history`` maps ``"train_loss"``,
        ``"val_loss"`` and ``"val_acc"`` to lists with one float per epoch.

    Raises:
        ValueError: If the training set is empty or ``batch_size`` is not
            positive.
    """
    n_samples = len(X_num)
    if n_samples == 0:
        raise ValueError(f"[{name}] cannot train on an empty dataset")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    model = TrendNN(n_tickers=n_tickers_total, input_dim=len(num_cols))
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    history = {"train_loss": [], "val_loss": [], "val_acc": []}

    for epoch in range(epochs):
        model.train()
        perm = torch.randperm(n_samples)
        total_loss = 0.0
        for start in range(0, n_samples, batch_size):
            idx = perm[start:start + batch_size]
            batch_X_num, batch_ticker, batch_y = X_num[idx], X_ticker[idx], y[idx]

            optimizer.zero_grad()
            y_pred = model(batch_X_num, batch_ticker)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()

            # BCELoss returns the batch mean; weight it by the batch size so
            # a short final batch is neither over-counted nor dropped.
            total_loss += loss.item() * len(idx)

        # Per-sample mean over the epoch. Dividing by n_samples // batch_size
        # miscounts the batches and is zero when n_samples < batch_size.
        avg_train_loss = total_loss / n_samples

        model.eval()
        with torch.no_grad():
            y_val_pred = model(X_val, X_val_ticker)
            val_loss = criterion(y_val_pred, y_val).item()
            val_acc = ((y_val_pred > 0.5).float() == y_val).float().mean().item()

        history["train_loss"].append(avg_train_loss)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        if (epoch + 1) % 5 == 0:
            print(f"[{name}] Epoch {epoch+1}/{epochs} | "
                  f"Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    model_path = os.path.join(resources_dir, f"model_{name.lower()}.pt")
    torch.save(model.state_dict(), model_path)
    return model, history
# Baseline: train on the original data only.
model_orig, hist_orig = train_model(
    X_num=X_train_orig_num,
    X_ticker=X_train_orig_ticker,
    y=y_train_orig_t,
    X_val=X_test_num,
    X_val_ticker=X_test_ticker,
    y_val=y_test_t,
    name="Original",
)

# Comparison run: train on the combined data, validate on the same test split.
model_mix, hist_mix = train_model(
    X_num=X_train_mix_num,
    X_ticker=X_train_mix_ticker,
    y=y_train_mix_t,
    X_val=X_test_num,
    X_val_ticker=X_test_ticker,
    y_val=y_test_t,
    name="Combined",
)

# Persist both per-epoch histories side by side.
results = dict(original=hist_orig, combined=hist_mix)
with open(os.path.join(resources_dir, 'training_metrics.json'), "w") as f:
    json.dump(results, f, indent=4)