import os
import sys
import json
import warnings

import optuna
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the project root importable so `src` resolves when this file is run as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.logger import get_logger

logger = get_logger(__name__)


class QuarterlyStockDataset(Dataset):
    """Slices each ticker's OHLCV history into non-overlapping fixed-length
    windows (90 steps by default), each paired with the ticker's encoded ID."""

    def __init__(self, df, sequence_length=90):
        try:
            self.sequence_length = sequence_length
            self.samples = []

            df = df.sort_values(by=["Ticker", "Date"]).reset_index(drop=True)
            tickers = df['Ticker'].unique()
            feature_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

            for ticker in tickers:
                ticker_df = df[df['Ticker'] == ticker]
                data = ticker_df[feature_cols].values
                ticker_id = ticker_df['Ticker_Encoded'].iloc[0]

                # Step by sequence_length so windows do not overlap; any trailing
                # remainder shorter than one window is dropped.
                for i in range(0, len(data) - sequence_length + 1, sequence_length):
                    window = data[i:i + sequence_length]
                    self.samples.append((torch.tensor(window, dtype=torch.float32),
                                         torch.tensor(ticker_id, dtype=torch.long)))

            logger.info("Created %d quarterly sequences across %d tickers.",
                        len(self.samples), len(tickers))
        except Exception as e:
            logger.error("Error batching dataset: %s", e)
            raise

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


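# Minimal usage sketch (illustrative only: assumes a DataFrame with the columns
# referenced above; the constant prices below are toy values, not real data):
#
#   demo = pd.DataFrame({
#       'Ticker': ['AAA'] * 180, 'Ticker_Encoded': [0] * 180,
#       'Date': pd.date_range('2020-01-01', periods=180),
#       'Open': 1.0, 'High': 1.0, 'Low': 1.0, 'Close': 1.0, 'Volume': 1.0,
#   })
#   ds = QuarterlyStockDataset(demo, sequence_length=90)   # yields 2 windows
#   window, ticker_id = ds[0]                              # (90, 5) tensor, scalar long

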
class LSTMAutoEncoder(nn.Module):
    """Sequence autoencoder conditioned on a learned per-ticker embedding."""

    def __init__(self, input_dim, num_tickers, embed_dim=8, hidden_size=64, latent_dim=16, num_layers=1):
        super().__init__()
        self.ticker_embed = nn.Embedding(num_tickers, embed_dim)

        # Encoder: per-step features concatenated with the ticker embedding,
        # compressed into a single latent vector.
        self.encoder = nn.LSTM(input_dim + embed_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc_enc = nn.Linear(hidden_size, latent_dim)

        # Decoder: the latent vector, again concatenated with the ticker
        # embedding, is projected back up and unrolled to input_dim per step.
        self.fc_dec = nn.Linear(latent_dim + embed_dim, hidden_size)
        self.decoder = nn.LSTM(hidden_size, input_dim, num_layers=num_layers, batch_first=True)

    def forward(self, x, ticker_id):
        # Broadcast the ticker embedding across the time dimension.
        ticker_emb = self.ticker_embed(ticker_id).unsqueeze(1).repeat(1, x.size(1), 1)
        x_in = torch.cat([x, ticker_emb], dim=2)

        # The encoder's final time step summarises the whole window.
        enc_out, (h, c) = self.encoder(x_in)
        latent = self.fc_enc(enc_out[:, -1, :])

        # Repeat the conditioned latent across the sequence and decode.
        latent_cat = torch.cat([latent, self.ticker_embed(ticker_id)], dim=1)
        latent_cat = latent_cat.unsqueeze(1).repeat(1, x.size(1), 1)

        dec_input = self.fc_dec(latent_cat)
        out_dec, _ = self.decoder(dec_input)
        return out_dec


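# Shape sanity check (illustrative; random inputs and 10 hypothetical tickers):
#
#   m = LSTMAutoEncoder(input_dim=5, num_tickers=10)
#   x = torch.randn(4, 90, 5)                   # batch of 4 windows
#   ids = torch.randint(0, 10, (4,))            # one ticker ID per window
#   assert m(x, ids).shape == x.shape           # reconstruction matches input

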
def objective(trial, df, sequence_length=90, device='cpu'):
    """Optuna objective: train on pre-2023 data and score by 2023 validation loss."""
    try:
        num_layers = trial.suggest_int("num_layers", 1, 3)
        hidden_size = trial.suggest_categorical("hidden_size", [32, 64, 128])
        latent_dim = trial.suggest_categorical("latent_dim", [8, 16, 32])
        # suggest_loguniform is deprecated; suggest_float(..., log=True) is the
        # current equivalent.
        lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
        embed_dim = trial.suggest_categorical("embed_dim", [4, 8, 16])

        # Chronological split so validation data never precedes training data.
        train_df = df[df['Date'] < '2023-01-01']
        val_df = df[(df['Date'] >= '2023-01-01') & (df['Date'] < '2024-01-01')]

        train_dataset = QuarterlyStockDataset(train_df, sequence_length)
        val_dataset = QuarterlyStockDataset(val_df, sequence_length)

        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
        val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

        num_tickers = df['Ticker_Encoded'].nunique()
        model = LSTMAutoEncoder(
            input_dim=5, num_tickers=num_tickers, embed_dim=embed_dim,
            hidden_size=hidden_size, latent_dim=latent_dim, num_layers=num_layers
        ).to(device)

        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        epochs = 20
        for epoch in range(epochs):
            model.train()
            total_train_loss = 0
            for batch_x, batch_ticker in train_loader:
                batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
                optimizer.zero_grad()
                recon = model(batch_x, batch_ticker)
                loss = criterion(recon, batch_x)
                loss.backward()
                optimizer.step()
                total_train_loss += loss.item()

        # Score the trial by mean reconstruction loss on the validation year.
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch_x, batch_ticker in val_loader:
                batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
                recon = model(batch_x, batch_ticker)
                loss = criterion(recon, batch_x)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        return avg_val_loss
    except Exception as e:
        logger.error("Error training model: %s", e)
        raise


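# Quick smoke test of the objective without running a full study
# (optuna.trial.FixedTrial replays fixed parameter values; the values chosen
# here are arbitrary):
#
#   fixed = optuna.trial.FixedTrial({"num_layers": 1, "hidden_size": 32,
#                                    "latent_dim": 8, "lr": 1e-3, "embed_dim": 4})
#   print(objective(fixed, df, device='cpu'))

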
if __name__ == "__main__":
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    processed_data_path = os.path.join(base_dir, 'data', 'processed', 'stock_data.parquet')
    model_path = os.path.join(base_dir, 'models', 'lstm_autoencoder.pth')
    loss_path = os.path.join(base_dir, 'resources', 'loss_values.json')
    hyperparams_path = os.path.join(base_dir, 'models', 'hyperparameters.json')

    df = pd.read_parquet(processed_data_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Search hyperparameters, minimising validation reconstruction loss.
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, df, device=device), n_trials=10)

    best_trial = study.best_trial
    best_params = best_trial.params

    # Retrain the best configuration on all data up to 2024.
    train_df = df[df['Date'] < '2024-01-01']
    full_dataset = QuarterlyStockDataset(train_df, sequence_length=90)
    full_loader = DataLoader(full_dataset, batch_size=64, shuffle=False)

    num_tickers = df['Ticker_Encoded'].nunique()
    best_model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=num_tickers,
        embed_dim=best_params.get('embed_dim', 8),
        hidden_size=best_params['hidden_size'],
        latent_dim=best_params['latent_dim'],
        num_layers=best_params['num_layers']
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['lr'])

    epochs = 50
    train_losses = []
    for epoch in range(epochs):
        best_model.train()
        total_loss = 0
        for batch_x, batch_ticker in full_loader:
            batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
            optimizer.zero_grad()
            recon = best_model(batch_x, batch_ticker)
            loss = criterion(recon, batch_x)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(full_loader)
        train_losses.append(avg_loss)
        print(f"Epoch [{epoch+1}/{epochs}] Loss: {avg_loss:.6f}")

    # Ensure output directories exist before writing artifacts.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    os.makedirs(os.path.dirname(loss_path), exist_ok=True)

    torch.save(best_model.state_dict(), model_path)
    with open(loss_path, 'w') as f:
        json.dump(train_losses, f)
    with open(hyperparams_path, 'w') as f:
        json.dump(best_params, f)

    print("Model, losses, and hyperparameters saved successfully.")