"""Train and compare next-day trend classifiers on two datasets.

Loads two preprocessed parquet datasets ("original" and "combined"),
derives a binary next-day up/down label per ticker, trains the same
small embedding+MLP network on each training split, evaluates both on
the original dataset's hold-out slice, and writes model weights plus
per-epoch metrics under ``resources/``.
"""

import json
import os
import sys

import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Make the project root importable when this file is run as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
orig_data_path = os.path.join(base_dir, 'data', 'orig_processed.parquet')
combined_data_path = os.path.join(base_dir, 'data', 'final_data.parquet')
resources_dir = os.path.join(base_dir, 'resources')
os.makedirs(resources_dir, exist_ok=True)

original_df = pd.read_parquet(orig_data_path)
combined_df = pd.read_parquet(combined_data_path)

# Chronological order within each ticker so that shift(-1) below really
# refers to the next trading day for that ticker.
for df in [original_df, combined_df]:
    df.sort_values(['Ticker', 'Date'], inplace=True)
    df.reset_index(drop=True, inplace=True)


def add_trend_label(df):
    """Attach a binary next-day trend label to *df* (mutates and returns it).

    ``Trend`` is 1 when the next close within the same ticker is strictly
    higher than today's close, else 0.  Each ticker's final row has no
    next close and is dropped.
    """
    df['Next_Close'] = df.groupby('Ticker')['Close'].shift(-1)
    df['Trend'] = (df['Next_Close'] > df['Close']).astype(int)
    df.dropna(subset=['Next_Close'], inplace=True)
    return df


original_df = add_trend_label(original_df)
combined_df = add_trend_label(combined_df)

# Integer ticker ids, fitted on the original universe only.
# NOTE(review): LabelEncoder.transform raises ValueError on tickers that
# appear only in the combined dataset — confirm both files share the same
# ticker universe.
le = LabelEncoder()
original_df['TickerID'] = le.fit_transform(original_df['Ticker'])
combined_df['TickerID'] = le.transform(combined_df['Ticker'])

num_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
feature_cols = num_cols + ['TickerID']
target_col = 'Trend'

original_df = original_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)
combined_df = combined_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)

X_orig = original_df[feature_cols]
y_orig = original_df[target_col]
X_mix = combined_df[feature_cols]
y_mix = combined_df[target_col]

# Positional 80/20 split.  NOTE(review): rows are sorted by
# (TickerID, Date), so this splits by ticker blocks rather than by time —
# confirm that is the intended evaluation protocol.
split_idx = int(len(X_orig) * 0.8)
split_idx_mix = int(len(X_mix) * 0.8)

X_train_orig = X_orig.iloc[:split_idx].copy()
X_test = X_orig.iloc[split_idx:].copy()
y_train_orig = y_orig.iloc[:split_idx].copy()
y_test = y_orig.iloc[split_idx:].copy()
# Only the training portion of the combined data is used; both models are
# evaluated on the original dataset's hold-out slice.
X_train_mix = X_mix.iloc[:split_idx_mix].copy()
y_train_mix = y_mix.iloc[:split_idx_mix].copy()

# Scale numeric features using statistics from the ORIGINAL training split
# only, so both models see identically scaled inputs and the test set never
# leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train_orig[num_cols])
X_train_orig.loc[:, num_cols] = scaler.transform(X_train_orig[num_cols])
X_train_mix.loc[:, num_cols] = scaler.transform(X_train_mix[num_cols])
X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])


def to_tensor(X, y):
    """Split a feature frame into (float features, ticker ids, target) tensors.

    Returns float32 numeric features, int64 ticker ids for the embedding
    lookup, and the target reshaped to a column vector for BCELoss.
    """
    X_num = torch.tensor(X[num_cols].values, dtype=torch.float32)
    X_ticker = torch.tensor(X['TickerID'].values, dtype=torch.long)
    y = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)
    return X_num, X_ticker, y


X_train_orig_num, X_train_orig_ticker, y_train_orig_t = to_tensor(X_train_orig, y_train_orig)
X_train_mix_num, X_train_mix_ticker, y_train_mix_t = to_tensor(X_train_mix, y_train_mix)
X_test_num, X_test_ticker, y_test_t = to_tensor(X_test, y_test)

# The embedding table must cover every ticker id seen in any split.
n_tickers_total = max(
    X_train_orig_ticker.max().item(),
    X_train_mix_ticker.max().item(),
    X_test_ticker.max().item(),
) + 1


class TrendNN(nn.Module):
    """MLP over numeric features concatenated with a learned ticker embedding.

    Outputs a per-row probability in (0, 1) via a final Sigmoid, matched
    with BCELoss in training.
    """

    def __init__(self, n_tickers, input_dim):
        super().__init__()
        self.ticker_embed = nn.Embedding(n_tickers, 8)
        self.net = nn.Sequential(
            nn.Linear(input_dim + 8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        )

    def forward(self, x_num, ticker_id):
        ticker_vec = self.ticker_embed(ticker_id)
        x = torch.cat([x_num, ticker_vec], dim=1)
        return self.net(x)


def train_model(X_num, X_ticker, y, X_val, X_val_ticker, y_val, name,
                epochs=100, batch_size=1024):
    """Train a fresh TrendNN on the given tensors and save its weights.

    Trains with mini-batch Adam/BCELoss for *epochs* epochs, evaluating
    loss and accuracy on the validation tensors after each epoch.  Weights
    are written to ``resources/model_<name>.pt``.

    Returns:
        (model, history) where history maps "train_loss"/"val_loss"/
        "val_acc" to per-epoch lists.
    """
    model = TrendNN(n_tickers=n_tickers_total, input_dim=len(num_cols))
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    history = {"train_loss": [], "val_loss": [], "val_acc": []}
    n_samples = len(X_num)
    # BUG FIX: the epoch average previously divided by n_samples // batch_size,
    # which ignores the partial final batch (wrong mean) and raises
    # ZeroDivisionError when n_samples < batch_size.  Use the true ceil
    # batch count instead.
    n_batches = max(1, -(-n_samples // batch_size))
    for epoch in range(epochs):
        model.train()
        # Fresh shuffle each epoch.
        perm = torch.randperm(n_samples)
        total_loss = 0.0
        for i in range(0, n_samples, batch_size):
            idx = perm[i:i + batch_size]
            batch_X_num, batch_ticker, batch_y = X_num[idx], X_ticker[idx], y[idx]
            optimizer.zero_grad()
            y_pred = model(batch_X_num, batch_ticker)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        model.eval()
        with torch.no_grad():
            y_val_pred = model(X_val, X_val_ticker)
            val_loss = criterion(y_val_pred, y_val).item()
            # 0.5 threshold on the sigmoid output.
            val_acc = ((y_val_pred > 0.5).float() == y_val).float().mean().item()
        avg_train_loss = total_loss / n_batches
        history["train_loss"].append(avg_train_loss)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)
        if (epoch + 1) % 5 == 0:
            print(f"[{name}] Epoch {epoch+1}/{epochs} | "
                  f"Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
    model_path = os.path.join(resources_dir, f"model_{name.lower()}.pt")
    torch.save(model.state_dict(), model_path)
    return model, history


# Both models are validated on the same original-data hold-out so their
# metrics are directly comparable.
model_orig, hist_orig = train_model(
    X_train_orig_num, X_train_orig_ticker, y_train_orig_t,
    X_test_num, X_test_ticker, y_test_t,
    "Original",
)
model_mix, hist_mix = train_model(
    X_train_mix_num, X_train_mix_ticker, y_train_mix_t,
    X_test_num, X_test_ticker, y_test_t,
    "Combined",
)

results = {
    "original": hist_orig,
    "combined": hist_mix,
}
with open(os.path.join(resources_dir, 'training_metrics.json'), "w") as f:
    json.dump(results, f, indent=4)