# Synthetic_Stock_Data / src / nn_model.py
# Author: Raheel Abdul Rehman  (commit bbf5d55, "Prod Publish")
# Trains two next-day trend classifiers — one on the original data, one on the
# original+synthetic combined data — and writes models and metrics to resources/.
import os
import sys
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, LabelEncoder
import json
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
# Resolve project-relative paths and make sure the output directory exists.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
orig_data_path = os.path.join(base_dir, 'data', 'orig_processed.parquet')
combined_data_path = os.path.join(base_dir, 'data', 'final_data.parquet')
resources_dir = os.path.join(base_dir, 'resources')
os.makedirs(resources_dir, exist_ok=True)

# Load both datasets and put each into (Ticker, Date) order with a clean index.
original_df = pd.read_parquet(orig_data_path)
combined_df = pd.read_parquet(combined_data_path)
for frame in (original_df, combined_df):
    frame.sort_values(['Ticker', 'Date'], inplace=True)
    frame.reset_index(drop=True, inplace=True)
def add_trend_label(df):
    """Attach next-day close and a binary up/down trend label, per ticker.

    ``Trend`` is 1 when the following row's ``Close`` (within the same
    ticker) is strictly higher than the current one, else 0. The last row
    of each ticker has no successor and is dropped in place. Returns the
    same DataFrame to allow reassignment at the call site.
    """
    grouped_close = df.groupby('Ticker')['Close']
    df['Next_Close'] = grouped_close.shift(-1)
    df['Trend'] = (df['Next_Close'] > df['Close']).astype(int)
    df.dropna(subset=['Next_Close'], inplace=True)
    return df
# Label both frames, then encode tickers as integer IDs for the embedding layer.
original_df = add_trend_label(original_df)
combined_df = add_trend_label(combined_df)
le = LabelEncoder()
# Fit on the original ticker universe only. NOTE(review): transform() raises
# ValueError if combined_df contains a ticker absent from original_df — this
# assumes the synthetic data reuses the same ticker set; confirm upstream.
original_df['TickerID'] = le.fit_transform(original_df['Ticker'])
combined_df['TickerID'] = le.transform(combined_df['Ticker'])
# Raw OHLCV columns are the scaled numeric inputs; TickerID is fed separately
# to the embedding layer.
num_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
feature_cols = num_cols + ['TickerID']
target_col = 'Trend'
# Deterministic ordering before the positional split: by ticker, then date.
original_df = original_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)
combined_df = combined_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)
X_orig = original_df[feature_cols]
y_orig = original_df[target_col]
X_mix = combined_df[feature_cols]
y_mix = combined_df[target_col]
# 80/20 split by row position. NOTE(review): since rows are sorted by
# (TickerID, Date), the cut falls between tickers — low-ID tickers end up
# wholly in train and high-ID ones wholly in test, rather than a per-ticker
# time split. Confirm this is the intended evaluation protocol.
split_idx = int(len(X_orig) * 0.8)
split_idx_mix = int(len(X_mix) * 0.8)
X_train_orig, X_test = X_orig.iloc[:split_idx].copy(), X_orig.iloc[split_idx:].copy()
y_train_orig, y_test = y_orig.iloc[:split_idx].copy(), y_orig.iloc[split_idx:].copy()
# The combined data contributes training rows only; both models are evaluated
# on the same held-out slice of the *original* data (see train_model calls).
X_train_mix, _ = X_mix.iloc[:split_idx_mix].copy(), X_mix.iloc[split_idx_mix:].copy()
y_train_mix, _ = y_mix.iloc[:split_idx_mix].copy(), y_mix.iloc[split_idx_mix:].copy()
# Standardize numeric features. Fitting on the original training split only
# avoids leaking test-set statistics and keeps both models on the same scale.
scaler = StandardScaler()
scaler.fit(X_train_orig[num_cols])
X_train_orig.loc[:, num_cols] = scaler.transform(X_train_orig[num_cols])
X_train_mix.loc[:, num_cols] = scaler.transform(X_train_mix[num_cols])
X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])
def to_tensor(X, y, cols=None):
    """Convert a feature frame and target series into model-ready tensors.

    Args:
        X: DataFrame holding the numeric feature columns plus 'TickerID'.
        y: Series of 0/1 trend labels aligned with X.
        cols: numeric column names to extract; defaults to the module-level
            ``num_cols`` (generalized from the original hard-coded global).

    Returns:
        (float32 numeric features, int64 ticker IDs, float32 targets of
        shape (N, 1) for BCELoss).
    """
    cols = num_cols if cols is None else cols
    x_num = torch.tensor(X[cols].values, dtype=torch.float32)
    x_ticker = torch.tensor(X['TickerID'].values, dtype=torch.long)
    y_t = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)
    return x_num, x_ticker, y_t
# Materialize every split as tensors once, up front.
X_train_orig_num, X_train_orig_ticker, y_train_orig_t = to_tensor(X_train_orig, y_train_orig)
X_train_mix_num, X_train_mix_ticker, y_train_mix_t = to_tensor(X_train_mix, y_train_mix)
X_test_num, X_test_ticker, y_test_t = to_tensor(X_test, y_test)
# Embedding table size: the largest ticker ID seen in any split, plus one.
n_tickers_total = max(
X_train_orig_ticker.max().item(),
X_train_mix_ticker.max().item(),
X_test_ticker.max().item()
) + 1
class TrendNN(nn.Module):
    """Feed-forward binary trend classifier with a per-ticker embedding.

    Numeric features are concatenated with an 8-dimensional learned
    embedding of the ticker ID and passed through a small MLP ending in a
    sigmoid, so the forward pass yields a probability in [0, 1] per row.
    """

    def __init__(self, n_tickers, input_dim):
        super().__init__()
        self.ticker_embed = nn.Embedding(n_tickers, 8)
        # Same layer sequence as before — keeping the Sequential composition
        # identical preserves the state_dict keys used by torch.save/load.
        layers = [
            nn.Linear(input_dim + 8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x_num, ticker_id):
        """Return per-row upward-move probabilities, shape (N, 1)."""
        embedded = self.ticker_embed(ticker_id)
        combined = torch.cat([x_num, embedded], dim=1)
        return self.net(combined)
def train_model(X_num, X_ticker, y, X_val, X_val_ticker, y_val, name, epochs=100, batch_size=1024):
    """Train a TrendNN on one dataset and validate each epoch.

    Args:
        X_num, X_ticker, y: training tensors (features, ticker IDs, labels).
        X_val, X_val_ticker, y_val: fixed validation tensors.
        name: run label; also used for the saved model filename.
        epochs, batch_size: training schedule.

    Returns:
        (trained model, history dict with per-epoch train_loss/val_loss/val_acc).

    Side effect: saves the final state_dict to resources_dir/model_<name>.pt.
    """
    model = TrendNN(n_tickers=n_tickers_total, input_dim=len(num_cols))
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    history = {"train_loss": [], "val_loss": [], "val_acc": []}
    n_samples = len(X_num)
    for epoch in range(epochs):
        model.train()
        # Fresh shuffle each epoch; mini-batches are slices of the permutation.
        perm = torch.randperm(n_samples)
        total_loss = 0.0
        n_batches = 0
        for i in range(0, n_samples, batch_size):
            idx = perm[i:i + batch_size]
            batch_X_num, batch_ticker, batch_y = X_num[idx], X_ticker[idx], y[idx]
            optimizer.zero_grad()
            y_pred = model(batch_X_num, batch_ticker)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            n_batches += 1
        model.eval()
        with torch.no_grad():
            y_val_pred = model(X_val, X_val_ticker)
            val_loss = criterion(y_val_pred, y_val).item()
            val_acc = ((y_val_pred > 0.5).float() == y_val).float().mean().item()
        # BUGFIX: the original divided by n_samples // batch_size, which is 0
        # when n_samples < batch_size (ZeroDivisionError) and ignores the
        # trailing partial batch. Divide by the actual batch count instead.
        avg_train_loss = total_loss / max(n_batches, 1)
        history["train_loss"].append(avg_train_loss)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)
        if (epoch + 1) % 5 == 0:
            print(f"[{name}] Epoch {epoch+1}/{epochs} | "
                  f"Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
    model_path = os.path.join(resources_dir, f"model_{name.lower()}.pt")
    torch.save(model.state_dict(), model_path)
    return model, history
# Train one model per dataset; both are validated on the same original test
# split so the effect of adding synthetic data can be compared directly.
model_orig, hist_orig = train_model(
X_train_orig_num, X_train_orig_ticker, y_train_orig_t,
X_test_num, X_test_ticker, y_test_t, "Original"
)
model_mix, hist_mix = train_model(
X_train_mix_num, X_train_mix_ticker, y_train_mix_t,
X_test_num, X_test_ticker, y_test_t, "Combined"
)
# Persist per-epoch metrics for both runs for downstream plotting/comparison.
results = {
    "original": hist_orig,
    "combined": hist_mix
}
metrics_path = os.path.join(resources_dir, 'training_metrics.json')
with open(metrics_path, "w") as f:
    json.dump(results, f, indent=4)