Raheel Abdul Rehman
committed on
Commit
·
bbf5d55
0
Parent(s):
Prod Publish
Browse files- .gitattributes +37 -0
- .gitignore +0 -0
- README.md +3 -0
- app/app.py +86 -0
- charts/NN_train_loss_plot.png +3 -0
- charts/NN_val_accuracy_plot.png +3 -0
- charts/NN_val_loss_plot.png +3 -0
- charts/ae_table_evaluator.png +3 -0
- charts/au_training_loss.png +3 -0
- charts/gan_pca.png +3 -0
- charts/gan_table_evaluator.png +3 -0
- charts/gan_training_loss.png +3 -0
- data/final_data.parquet +3 -0
- data/orig_processed.parquet +3 -0
- models/ae_hyperparameters.json +3 -0
- models/ae_lstm_autoencoder.pth +3 -0
- models/conditional_gan_metrics.json +3 -0
- models/gan_config.json +3 -0
- models/latent_gan_discriminator_conditional.pth +3 -0
- models/latent_gan_generator_conditional.pth +3 -0
- models/model_combined.pt +3 -0
- models/model_original.pt +3 -0
- notebooks/AE_EDA.ipynb +0 -0
- notebooks/GAN_EDA.ipynb +0 -0
- notebooks/NN_EDA.ipynb +0 -0
- requirements.txt +7 -0
- src/ae_decoder.py +130 -0
- src/ae_evaluate.py +76 -0
- src/ae_latent_extract.py +84 -0
- src/ae_model.py +190 -0
- src/gan_evaluate.py +106 -0
- src/gan_generate_synthetic_latent.py +94 -0
- src/gan_model.py +214 -0
- src/nn_model.py +157 -0
.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
File without changes
|
README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
---
|
app/app.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt

# Resolve all data paths relative to the repository root (one level above app/).
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
orig_data_path = os.path.join(base_dir, 'data', 'orig_processed.parquet')
combined_data_path = os.path.join(base_dir, 'data', 'final_data.parquet')


def _load_frame(path):
    """Read a parquet file, parse its Date column (if present) and sort by ticker/date."""
    frame = pd.read_parquet(path)
    if 'Date' in frame.columns:
        frame['Date'] = pd.to_datetime(frame['Date'])
    return frame.sort_values(['Ticker', 'Date']).reset_index(drop=True)


orig_df = _load_frame(orig_data_path)
combined_df = _load_frame(combined_data_path)

# OHLCV columns actually present in the original dataset.
FEATURE_COLS = [col for col in ('Open', 'High', 'Low', 'Close', 'Volume') if col in orig_df.columns]
def plot_ticker_data(ticker, feature):
    """Plot the last 5 years of one feature for a ticker, original vs synthetic.

    Parameters
    ----------
    ticker : str
        Ticker symbol used to filter both module-level dataframes.
    feature : str
        One of FEATURE_COLS ('Open', 'High', 'Low', 'Close', 'Volume').

    Returns
    -------
    matplotlib.figure.Figure on success, or an error-message string when the
    feature or ticker is missing from either dataset.
    """
    if feature not in FEATURE_COLS:
        return f"Feature '{feature}' not found in dataset."

    orig_data = orig_df[orig_df['Ticker'] == ticker].sort_values('Date').reset_index(drop=True)
    synth_data = combined_df[combined_df['Ticker'] == ticker].sort_values('Date').reset_index(drop=True)

    if orig_data.empty and synth_data.empty:
        return f"No data found for ticker: {ticker}"
    if orig_data.empty:
        return f"No original data found for {ticker}"
    if synth_data.empty:
        return f"No combined/synthetic data found for {ticker}"

    # Use the earlier of the two datasets' last dates so both panels cover the
    # same window, then keep only the most recent 5 years.
    latest_date = min(orig_data['Date'].max(), synth_data['Date'].max())
    cutoff_date = latest_date - pd.DateOffset(years=5)

    orig_data = orig_data[orig_data['Date'] >= cutoff_date]
    synth_data = synth_data[synth_data['Date'] >= cutoff_date]

    orig_series = orig_data[['Date', feature]].dropna()
    synth_series = synth_data[['Date', feature]].dropna()

    fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=False)
    # BUG FIX: the title (and docstring) previously said "Last 10 Years" while
    # the cutoff above and the UI description both use 5 years.
    fig.suptitle(f"{ticker} — {feature} (Last 5 Years)", fontsize=14)

    axes[0].plot(orig_series['Date'], orig_series[feature], linewidth=1.0, alpha=0.9)
    axes[0].set_title("Original Data")
    axes[0].set_ylabel(feature)
    axes[0].grid(True)

    axes[1].plot(synth_series['Date'], synth_series[feature], linewidth=1.0, alpha=0.9)
    axes[1].set_title("Synthetic Data")
    axes[1].set_ylabel(feature)
    axes[1].grid(True)

    # Align both x-axes on the union of the two date ranges; best-effort only
    # (either series may be empty after the cutoff/dropna).
    try:
        min_date = min(orig_series['Date'].min(), synth_series['Date'].min())
        max_date = max(orig_series['Date'].max(), synth_series['Date'].max())
        axes[0].set_xlim(min_date, max_date)
        axes[1].set_xlim(min_date, max_date)
    except Exception:
        pass

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    return fig
# All tickers available in the original dataset, alphabetically ordered.
ticker_choices = sorted(orig_df['Ticker'].unique())

# The two dropdowns feed plot_ticker_data; its figure is rendered by gr.Plot.
_ticker_input = gr.Dropdown(ticker_choices, label="Select Stock Ticker")
_feature_input = gr.Dropdown(FEATURE_COLS, label="Select Feature (Open/High/Low/Close/Volume)")

demo = gr.Interface(
    fn=plot_ticker_data,
    inputs=[_ticker_input, _feature_input],
    outputs=gr.Plot(label="Time Series Comparison"),
    title="Real vs Synthetic Time Series Viewer",
    description="Pick a ticker and feature to view the last 5 years of data from original and synthetic datasets.",
)

if __name__ == "__main__":
    demo.launch()
charts/NN_train_loss_plot.png
ADDED
|
Git LFS Details
|
charts/NN_val_accuracy_plot.png
ADDED
|
Git LFS Details
|
charts/NN_val_loss_plot.png
ADDED
|
Git LFS Details
|
charts/ae_table_evaluator.png
ADDED
|
Git LFS Details
|
charts/au_training_loss.png
ADDED
|
Git LFS Details
|
charts/gan_pca.png
ADDED
|
Git LFS Details
|
charts/gan_table_evaluator.png
ADDED
|
Git LFS Details
|
charts/gan_training_loss.png
ADDED
|
Git LFS Details
|
data/final_data.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4244f7ab9fa70bfa9aa16a0c03796528c3744f63f5d27d8cfd13ac664646e133
|
| 3 |
+
size 44822597
|
data/orig_processed.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f2357e122e5ccaeaa0cdc08d0b593e947b72603ff96841ddf33674deec8567f
|
| 3 |
+
size 56514266
|
models/ae_hyperparameters.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5619bc38b522cbed5839c4f0ab9193e199ceae0fb901d45b5afecaf1fd7806e7
|
| 3 |
+
size 100
|
models/ae_lstm_autoencoder.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf07dca9322668dcad0023c8bed0ac604a1bba11bea98ac039e8e1e3afa4650e
|
| 3 |
+
size 289111
|
models/conditional_gan_metrics.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:07b4ba961a2be1762511554a33e26d2d555799fee7d6bb794966e0e36a7f05b0
|
| 3 |
+
size 12670
|
models/gan_config.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74d6b40cb6c6794bfe64995bfe5639cb2551cce7983c643e49531ef177ab4fe9
|
| 3 |
+
size 261
|
models/latent_gan_discriminator_conditional.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:666dfd51a80772671418dd2d6dbf36816869b7297349aeb4f878f3539548a099
|
| 3 |
+
size 127690
|
models/latent_gan_generator_conditional.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f538c56f0ffccb35235a6f9280b2452dc522ced62bf0106d1371c174c389ff6
|
| 3 |
+
size 160022
|
models/model_combined.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55baabc1a62fda48e5c3b344bdc9ea74d6e54a8971f8437a55a40ddb8f6cd5b7
|
| 3 |
+
size 31596
|
models/model_original.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:781876452c9e781bead24a6b4880ecc2e5a4f57407b8b78612c0f85fb961b26e
|
| 3 |
+
size 31596
|
notebooks/AE_EDA.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/GAN_EDA.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/NN_EDA.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
torch
pandas
scikit-learn
gradio
matplotlib
numpy
pyarrow
tqdm
scipy
optuna
src/ae_decoder.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

# Make the repository root importable so `src.*` modules resolve when this
# file is run as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# BUG FIX: this repo ships the autoencoder in src/ae_model.py (there is no
# src/model.py), so the old `from src.model import ...` raised ImportError.
from src.ae_model import LSTMAutoEncoder
def decode_latent_vectors(
    model_path,
    synthetic_latent_path,
    ticker_map_path,
    output_path,
    model_params,
    seq_len=90,
    device=None,
    batch_size=128
):
    """
    Decode latent vectors back into OHLCV sequences using the trained LSTM Autoencoder.

    Parameters
    ----------
    model_path : str
        Path to the trained LSTMAutoEncoder state dict.
    synthetic_latent_path : str
        .npy file of latent vectors; a sibling file named with
        "latent_tickers" instead of "latent_vectors" must hold the ticker ids.
    ticker_map_path : str
        Pickled mapping from encoded ticker ids back to symbols (a fitted
        LabelEncoder, an array/list indexed by id, or an id->symbol dict).
    output_path : str
        Destination parquet file for the decoded OHLCV rows.
    model_params : dict
        Architecture kwargs (num_tickers, embed_dim, hidden_size, latent_dim,
        num_layers); must match the checkpoint.
    seq_len : int
        Length of each decoded sequence.
    device : str or None
        Torch device; auto-selects cuda when available if None.
    batch_size : int
        Decoding batch size (GENERALIZED: previously hard-coded to 128).

    Returns
    -------
    pandas.DataFrame
        One row per (sequence, timestep) with Ticker, Ticker_Encoded,
        TimeStep and the five OHLCV columns.

    Raises
    ------
    FileNotFoundError
        If the sibling ticker-ids file is missing.
    TypeError
        If the ticker mapping pickle has an unrecognized format.
    """

    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=model_params["num_tickers"],
        embed_dim=model_params["embed_dim"],
        hidden_size=model_params["hidden_size"],
        latent_dim=model_params["latent_dim"],
        num_layers=model_params["num_layers"]
    ).to(device)

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"Loaded trained LSTM Autoencoder from {model_path}")

    latent_vectors = np.load(synthetic_latent_path)
    # Ticker ids are stored next to the latents under a parallel filename.
    ticker_ids_path = synthetic_latent_path.replace("latent_vectors", "latent_tickers")

    if not os.path.exists(ticker_ids_path):
        raise FileNotFoundError(f"Ticker mapping not found at {ticker_ids_path}")

    ticker_ids = np.load(ticker_ids_path)

    latent_t = torch.tensor(latent_vectors, dtype=torch.float32).to(device)
    ticker_t = torch.tensor(ticker_ids, dtype=torch.long).to(device)

    decoded_batches = []

    for i in tqdm(range(0, len(latent_t), batch_size), desc="Decoding latent sequences"):
        batch_latent = latent_t[i:i + batch_size]
        batch_ticker = ticker_t[i:i + batch_size]

        with torch.no_grad():
            # Mirror the decoder half of LSTMAutoEncoder.forward:
            # [latent ‖ ticker-embedding] repeated across the sequence.
            ticker_emb = model.ticker_embed(batch_ticker)
            latent_cat = torch.cat([batch_latent, ticker_emb], dim=1)
            latent_cat = latent_cat.unsqueeze(1).repeat(1, seq_len, 1)
            dec_input = model.fc_dec(latent_cat)
            reconstructed, _ = model.decoder(dec_input)

        decoded_batches.append(reconstructed.cpu().numpy())

    decoded = np.concatenate(decoded_batches, axis=0)
    print(f"Decoded {decoded.shape[0]} sequences of length {seq_len}")

    with open(ticker_map_path, "rb") as f:
        label_encoder = pickle.load(f)

    # Accept any of the mapping formats the pipeline has produced over time.
    if hasattr(label_encoder, "inverse_transform"):
        tickers = label_encoder.inverse_transform(ticker_ids)
    elif isinstance(label_encoder, (np.ndarray, list)):
        tickers = np.array(label_encoder)[ticker_ids]
    elif isinstance(label_encoder, dict):
        tickers = [label_encoder[int(i)] for i in ticker_ids]
    else:
        raise TypeError(f"Unrecognized ticker mapping format: {type(label_encoder)}")

    # PERFORMANCE: build the long-format frame with vectorized numpy ops
    # instead of a Python loop over every (sequence, timestep) pair.
    num_seqs = decoded.shape[0]
    flat = decoded.reshape(num_seqs * seq_len, -1)
    decoded_df = pd.DataFrame({
        "Ticker": np.repeat(np.asarray(tickers), seq_len),
        "Ticker_Encoded": np.repeat(ticker_ids.astype(int), seq_len),
        "TimeStep": np.tile(np.arange(seq_len), num_seqs),
        "Open": flat[:, 0],
        "High": flat[:, 1],
        "Low": flat[:, 2],
        "Close": flat[:, 3],
        "Volume": flat[:, 4],
    })
    decoded_df.to_parquet(output_path, index=False)
    print(f"Decoded OHLCV data saved to {output_path}")

    return decoded_df
if __name__ == "__main__":
    # NOTE(review): the repo ships models/ae_lstm_autoencoder.pth; confirm the
    # "lstm_autoencoder.pth" name below matches the actual artifact on disk.
    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    model_path = os.path.join(repo_root, "models", "lstm_autoencoder.pth")
    synthetic_latent_path = os.path.join(repo_root, "data", "latent", "synthetic_latent_vectors.npy")
    ticker_map_path = os.path.join(repo_root, "data", "processed", "ticker_label_encoder.pkl")
    output_path = os.path.join(repo_root, "data", "processed", "decoded_synthetic_ohlcv.parquet")

    # Architecture hyper-parameters the checkpoint was trained with.
    hyperparams = {
        "num_layers": 2,
        "hidden_size": 64,
        "latent_dim": 32,
        "embed_dim": 16,
        "num_tickers": 503,
    }

    decode_latent_vectors(
        model_path=model_path,
        synthetic_latent_path=synthetic_latent_path,
        ticker_map_path=ticker_map_path,
        output_path=output_path,
        model_params=hyperparams,
        seq_len=90,
    )
src/ae_evaluate.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
import torch
import json
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# BUG FIX: the dataset/model classes live in src/ae_model.py in this repo
# (there is no src/model.py), so import from there.
from src.ae_model import QuarterlyStockDataset, LSTMAutoEncoder  # uses updated version with embeddings

if __name__ == "__main__":
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    processed_data_path = os.path.join(base_dir, 'data', 'processed', 'stock_data.parquet')
    # NOTE(review): the published checkpoint is models/ae_lstm_autoencoder.pth;
    # confirm this filename matches the artifact on disk.
    model_path = os.path.join(base_dir, 'models', 'lstm_autoencoder.pth')
    metrics_path = os.path.join(base_dir, 'resources', 'metrics.json')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    df = pd.read_parquet(processed_data_path)

    # Hold-out split: everything from 2024 onwards is test data.
    test_df = df[df['Date'] >= '2024-01-01']

    sequence_length = 90
    test_dataset = QuarterlyStockDataset(test_df, sequence_length=sequence_length)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    # Architecture used at training time; must match the checkpoint.
    best_params = {
        'hidden_size': 64,
        'latent_dim': 32,
        'num_layers': 2,
        'embed_dim': 16
    }

    num_tickers = df['Ticker_Encoded'].nunique()

    model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=num_tickers,
        embed_dim=best_params['embed_dim'],
        hidden_size=best_params['hidden_size'],
        latent_dim=best_params['latent_dim'],
        num_layers=best_params['num_layers']
    ).to(device)

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Collect actual and reconstructed windows across the whole test set.
    all_actual, all_recon = [], []

    with torch.no_grad():
        for batch_x, batch_ticker in test_loader:
            batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
            recon = model(batch_x, batch_ticker)
            all_actual.append(batch_x.cpu().numpy())
            all_recon.append(recon.cpu().numpy())
    X_test = np.concatenate(all_actual, axis=0)
    X_recon = np.concatenate(all_recon, axis=0)
    # Flatten (num_windows, seq_len, 5) -> (num_windows*seq_len, 5) so the
    # sklearn metrics treat every timestep as one observation.
    X_test_flat = X_test.reshape(-1, X_test.shape[-1])
    X_recon_flat = X_recon.reshape(-1, X_recon.shape[-1])

    mae = mean_absolute_error(X_test_flat, X_recon_flat)
    rmse = np.sqrt(mean_squared_error(X_test_flat, X_recon_flat))
    r2 = r2_score(X_test_flat, X_recon_flat)

    metrics = {
        "MAE": float(mae),
        "RMSE": float(rmse),
        "R2": float(r2)
    }

    # ROBUSTNESS: make sure the output directory exists before writing.
    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
    with open(metrics_path, 'w') as f:
        json.dump(metrics, f, indent=4)
    print(f"✅ Test metrics saved at: {metrics_path}")
src/ae_latent_extract.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# BUG FIX: the model/dataset classes live in src/ae_model.py in this repo
# (there is no src/model.py).
from src.ae_model import LSTMAutoEncoder, QuarterlyStockDataset  # pylint: disable=import-error

try:
    from src.logger import get_logger
    logger = get_logger(__name__)
except ImportError:
    # BUG FIX fallback: src/logger.py is not shipped with this repo; use the
    # stdlib logger rather than failing at import time.
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

if __name__ == "__main__":
    try:
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        processed_data_path = os.path.join(base_dir, "data", "processed", "stock_data.parquet")
        # NOTE(review): the published checkpoint is models/ae_lstm_autoencoder.pth;
        # confirm this filename matches the artifact on disk.
        model_path = os.path.join(base_dir, "models", "lstm_autoencoder.pth")
        # NOTE(review): these outputs are written OUTSIDE the repo root
        # (../GAN/...); verify that layout still exists after restructuring.
        latent_vectors_path = os.path.join(base_dir, "..", "GAN", "data", "processed", "latent_vectors.npy")
        ticker_mapping_path = os.path.join(base_dir, "..", "GAN", "data", "processed", "ticker_mapping.npy")

        device = "cuda" if torch.cuda.is_available() else "cpu"

        df = pd.read_parquet(processed_data_path)
        tickers = df["Ticker"].unique()
        num_tickers = df["Ticker_Encoded"].nunique()

        # Must match the architecture the checkpoint was trained with.
        model = LSTMAutoEncoder(
            input_dim=5,
            num_tickers=num_tickers,
            embed_dim=16,
            hidden_size=64,
            latent_dim=32,
            num_layers=2
        ).to(device)

        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()

        def encode(model, x, ticker_id):
            """Run only the encoder half: (sequence, ticker id) -> latent vector."""
            ticker_emb = model.ticker_embed(ticker_id).unsqueeze(1).repeat(1, x.size(1), 1)
            x_in = torch.cat([x, ticker_emb], dim=2)
            enc_out, _ = model.encoder(x_in)
            latent = model.fc_enc(enc_out[:, -1, :])
            return latent

        all_latents = []
        all_tickers = []

        for ticker in tickers:
            ticker_df = df[df["Ticker"] == ticker].copy()
            if len(ticker_df) < 90:
                # Not enough history for even one 90-day window.
                continue

            dataset = QuarterlyStockDataset(ticker_df, sequence_length=90)
            loader = DataLoader(dataset, batch_size=64, shuffle=False)

            ticker_latents = []

            with torch.no_grad():
                for batch_x, batch_ticker in loader:
                    batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
                    latent = encode(model, batch_x, batch_ticker)
                    ticker_latents.append(latent.cpu().numpy())

            if ticker_latents:
                ticker_latents = np.concatenate(ticker_latents, axis=0)
                all_latents.append(ticker_latents)
                all_tickers.extend([ticker] * len(ticker_latents))
                logger.info(f"Extracted {len(ticker_latents)} latent vectors for {ticker}.")

        all_latents = np.concatenate(all_latents, axis=0)
        all_tickers = np.array(all_tickers)

        # ROBUSTNESS: create the target directory before saving.
        os.makedirs(os.path.dirname(latent_vectors_path), exist_ok=True)

        np.save(latent_vectors_path, all_latents)
        np.save(ticker_mapping_path, all_tickers)

        logger.info(f"Saved {len(all_latents)} latent vectors to {latent_vectors_path}")
        logger.info(f"Saved ticker mapping to {ticker_mapping_path}")

    except Exception as e:
        logger.error("Error extracting latent space vectors: %s", e)
        raise
src/ae_model.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
import json
import optuna
import warnings
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader

warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the repository root importable so `src.*` modules resolve when this
# file is run as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

try:
    from src.logger import get_logger  # pylint: disable=import-error
    logger = get_logger(__name__)
except ImportError:
    # BUG FIX fallback: src/logger.py is not part of this repo, so importing it
    # unconditionally made every src.* module unusable. Use stdlib logging.
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
class QuarterlyStockDataset(Dataset):
    """Torch dataset of non-overlapping OHLCV windows, sliced per ticker.

    Each item is a tuple (window, ticker_id) where window is a
    (sequence_length, 5) float32 tensor of Open/High/Low/Close/Volume and
    ticker_id is the long-encoded ticker label.
    """

    def __init__(self, df, sequence_length=90):
        try:
            self.sequence_length = sequence_length
            self.samples = []

            ordered = df.sort_values(by=["Ticker", "Date"]).reset_index(drop=True)
            symbols = ordered['Ticker'].unique()
            feature_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

            for symbol in symbols:
                symbol_rows = ordered[ordered['Ticker'] == symbol]
                values = symbol_rows[feature_cols].values
                encoded_id = symbol_rows['Ticker_Encoded'].iloc[0]

                # Step by a full window so consecutive samples never overlap;
                # a trailing partial window is dropped.
                for start in range(0, len(values) - sequence_length + 1, sequence_length):
                    chunk = values[start:start + sequence_length]
                    self.samples.append((
                        torch.tensor(chunk, dtype=torch.float32),
                        torch.tensor(encoded_id, dtype=torch.long),
                    ))

            print(f"Created {len(self.samples)} quarterly sequences across {len(symbols)} tickers.")
        except Exception as e:
            logger.error("Error batching dataset: %s", e)
            raise

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
class LSTMAutoEncoder(nn.Module):
    """Seq2seq LSTM autoencoder conditioned on a learned ticker embedding.

    The encoder consumes [features || ticker-embedding] per timestep and
    compresses the final encoder output to a latent vector; the decoder
    expands [latent || ticker-embedding], repeated across the sequence,
    back into the input feature space.
    """

    def __init__(self, input_dim, num_tickers, embed_dim=8, hidden_size=64, latent_dim=16, num_layers=1):
        super(LSTMAutoEncoder, self).__init__()
        # Submodule names and creation order are kept stable so existing
        # checkpoints load unchanged.
        self.ticker_embed = nn.Embedding(num_tickers, embed_dim)

        # Encoder: sequence -> latent
        self.encoder = nn.LSTM(input_dim + embed_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc_enc = nn.Linear(hidden_size, latent_dim)

        # Decoder: latent -> sequence
        self.fc_dec = nn.Linear(latent_dim + embed_dim, hidden_size)
        self.decoder = nn.LSTM(hidden_size, input_dim, num_layers=num_layers, batch_first=True)

    def forward(self, x, ticker_id):
        seq_len = x.size(1)
        emb = self.ticker_embed(ticker_id)

        # Condition every timestep of the input on the ticker embedding.
        enc_in = torch.cat([x, emb.unsqueeze(1).repeat(1, seq_len, 1)], dim=2)
        enc_out, _ = self.encoder(enc_in)

        # The latent code comes from the last encoder timestep only.
        latent = self.fc_enc(enc_out[:, -1, :])

        # Broadcast [latent || embedding] across the sequence for decoding.
        dec_seed = torch.cat([latent, emb], dim=1).unsqueeze(1).repeat(1, seq_len, 1)
        out, _ = self.decoder(self.fc_dec(dec_seed))
        return out
def objective(trial, df, sequence_length=90, device='cpu'):
    """Optuna objective: train an LSTMAutoEncoder with trial-sampled
    hyper-parameters and return the average validation reconstruction loss.

    Parameters
    ----------
    trial : optuna.Trial
        Supplies the hyper-parameter search space.
    df : pandas.DataFrame
        Full processed dataset; split by Date into train (<2023) and
        validation (2023) windows.
    sequence_length : int
        Window length fed to QuarterlyStockDataset.
    device : str
        'cpu' or 'cuda'.

    Returns
    -------
    float
        Mean MSE over the validation loader after the final epoch.
    """
    try:
        num_layers = trial.suggest_int("num_layers", 1, 3)
        hidden_size = trial.suggest_categorical("hidden_size", [32, 64, 128])
        latent_dim = trial.suggest_categorical("latent_dim", [8, 16, 32])
        # MODERNIZATION: suggest_loguniform is deprecated in Optuna;
        # suggest_float(..., log=True) samples the identical distribution.
        lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
        embed_dim = trial.suggest_categorical("embed_dim", [4, 8, 16])

        train_df = df[df['Date'] < '2023-01-01']
        val_df = df[(df['Date'] >= '2023-01-01') & (df['Date'] < '2024-01-01')]

        train_dataset = QuarterlyStockDataset(train_df, sequence_length)
        val_dataset = QuarterlyStockDataset(val_df, sequence_length)

        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
        val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

        num_tickers = df['Ticker_Encoded'].nunique()
        model = LSTMAutoEncoder(
            input_dim=5, num_tickers=num_tickers, embed_dim=embed_dim,
            hidden_size=hidden_size, latent_dim=latent_dim, num_layers=num_layers
        ).to(device)

        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        epochs = 20
        for epoch in range(epochs):
            model.train()
            # (the unused train-loss accumulator was removed; only the
            # validation loss drives the study)
            for batch_x, batch_ticker in train_loader:
                batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
                optimizer.zero_grad()
                recon = model(batch_x, batch_ticker)
                loss = criterion(recon, batch_x)
                loss.backward()
                optimizer.step()

            model.eval()
            total_val_loss = 0.0
            with torch.no_grad():
                for batch_x, batch_ticker in val_loader:
                    batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
                    recon = model(batch_x, batch_ticker)
                    total_val_loss += criterion(recon, batch_x).item()

        # Report the final epoch's mean validation loss.
        avg_val_loss = total_val_loss / len(val_loader)
        return avg_val_loss
    except Exception as e:
        logger.error("Error training Model : %s", e)
        raise
if __name__ == "__main__":
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    processed_data_path = os.path.join(base_dir, 'data', 'processed', 'stock_data.parquet')
    # NOTE(review): the published checkpoint is models/ae_lstm_autoencoder.pth
    # and models/ae_hyperparameters.json; confirm these artifact names match
    # what the other scripts load.
    model_path = os.path.join(base_dir, 'models', 'lstm_autoencoder.pth')
    loss_path = os.path.join(base_dir, 'resources', 'loss_values.json')
    hyperparams_path = os.path.join(base_dir, 'models', 'hyperparameters.json')

    df = pd.read_parquet(processed_data_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Hyper-parameter search using the held-out 2023 window for validation.
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, df, device=device), n_trials=10)

    best_params = study.best_trial.params

    # Retrain on everything before 2024 with the best configuration.
    train_df = df[df['Date'] < '2024-01-01']
    full_dataset = QuarterlyStockDataset(train_df, sequence_length=90)
    full_loader = DataLoader(full_dataset, batch_size=64, shuffle=False)

    num_tickers = df['Ticker_Encoded'].nunique()
    best_model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=num_tickers,
        embed_dim=best_params.get('embed_dim', 8),
        hidden_size=best_params['hidden_size'],
        latent_dim=best_params['latent_dim'],
        num_layers=best_params['num_layers']
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['lr'])

    epochs = 50
    train_losses = []
    for epoch in range(epochs):
        best_model.train()
        total_loss = 0
        for batch_x, batch_ticker in full_loader:
            batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
            optimizer.zero_grad()
            recon = best_model(batch_x, batch_ticker)
            loss = criterion(recon, batch_x)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(full_loader)
        train_losses.append(avg_loss)
        print(f"Epoch [{epoch+1}/{epochs}] Loss: {avg_loss:.6f}")

    # ROBUSTNESS: create output directories before writing artifacts.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    os.makedirs(os.path.dirname(loss_path), exist_ok=True)

    torch.save(best_model.state_dict(), model_path)
    with open(loss_path, 'w') as f:
        json.dump(train_losses, f)
    with open(hyperparams_path, 'w') as f:
        json.dump(best_params, f)

    # BUG FIX: was an f-string with no placeholders.
    print("Model, losses, and hyperparameters saved successfully.")
src/gan_evaluate.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate the conditional latent GAN: regenerate a synthetic latent set and
compare it to the real latents (per-dimension stats, KS tests, correlation)."""
import os
import sys
import json
import joblib  # moved up from mid-file: all imports belong at the top
import torch
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
from tqdm import tqdm

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# FIX: ConditionalGenerator is defined in src/gan_model.py; this commit adds no
# src/model.py, so the old `from src.model import ...` raised ImportError.
from src.gan_model import ConditionalGenerator

# --- Project-relative paths --------------------------------------------------
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
processed_data_path = os.path.join(base_dir, "data", "processed", "stock_data.parquet")
latent_dir = os.path.join(base_dir, "data", "processed")
resources_dir = os.path.join(base_dir, "resources")
models_dir = os.path.join(base_dir, "models")

latent_path = os.path.join(latent_dir, "latent_vectors.npy")
tickers_path = os.path.join(latent_dir, "sequence_tickers.npy")
label_encoder_path = os.path.join(base_dir, "data", "processed", "ticker_label_encoder.pkl")
gen_path = os.path.join(models_dir, "latent_gan_generator_conditional.pth")
gan_config_path = os.path.join(resources_dir, "gan_config.json")

# --- GAN hyperparameters saved at training time ------------------------------
with open(gan_config_path, "r") as f:
    params = json.load(f)

noise_dim = params["noise_dim"]
latent_dim = params["latent_dim"]
hidden_dim = params["hidden_dim"]
embed_dim = params.get("embed_dim", 8)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of conditioning classes comes from the fitted ticker label encoder.
label_encoder = joblib.load(label_encoder_path)
num_tickers = len(label_encoder.classes_)

G = ConditionalGenerator(
    noise_dim=noise_dim,
    latent_dim=latent_dim,
    embed_dim=embed_dim,
    num_tickers=num_tickers,
    hidden_dim=hidden_dim
).to(device)

G.load_state_dict(torch.load(gen_path, map_location=device))
G.eval()
print(f"Loaded conditional generator (latent_dim={latent_dim}, embed_dim={embed_dim})")

# --- Generate as many synthetic latents as there are real ones ---------------
real_latent = np.load(latent_path)
N_SAMPLES = real_latent.shape[0]
batch_size = 512

# Condition on uniformly random ticker ids (not the real mapping).
ticker_ids = np.random.randint(0, num_tickers, N_SAMPLES)
samples = []

# no_grad: inference only — avoids building an autograd graph for every batch.
with torch.no_grad():
    for i in tqdm(range((N_SAMPLES + batch_size - 1) // batch_size), desc="Generating synthetic latents"):
        b = min(batch_size, N_SAMPLES - i * batch_size)
        z = torch.randn(b, noise_dim).to(device)
        tickers_batch = torch.tensor(ticker_ids[i * batch_size: i * batch_size + b], dtype=torch.long).to(device)
        out = G(z, tickers_batch).cpu().numpy()
        samples.append(out)

synth_latent = np.vstack(samples)
np.save(os.path.join(latent_dir, "synthetic_latent_vectors.npy"), synth_latent)
np.save(os.path.join(latent_dir, "synthetic_latent_tickers.npy"), ticker_ids)

print(f"Saved synthetic latent vectors to {latent_dir}")

# --- Distribution comparison metrics -----------------------------------------
metrics = {"per_dimension": {}, "correlation": {}}
min_dim = min(real_latent.shape[1], synth_latent.shape[1])
real_latent = real_latent[:, :min_dim]
synth_latent = synth_latent[:, :min_dim]

for i in range(min_dim):
    r = real_latent[:, i]
    s = synth_latent[:, i]
    # Subsample both sides to a common size before the two-sample KS test.
    min_n = min(len(r), len(s))
    ks_stat, ks_p = ks_2samp(
        np.random.choice(r, min_n, replace=False),
        np.random.choice(s, min_n, replace=False)
    )
    metrics["per_dimension"][f"latent_{i}"] = {
        "real_mean": float(r.mean()),
        "synth_mean": float(s.mean()),
        "mean_diff": float(s.mean() - r.mean()),
        "real_std": float(r.std()),
        "synth_std": float(s.std()),
        "std_diff": float(s.std() - r.std()),
        "ks_stat": float(ks_stat),
        "ks_pvalue": float(ks_p),
    }

# Frobenius norm of the difference between latent correlation matrices:
# a single scalar for "how well did the GAN match cross-dimension structure".
real_corr = np.corrcoef(real_latent, rowvar=False)
synth_corr = np.corrcoef(synth_latent, rowvar=False)
metrics["correlation"]["frobenius_diff"] = float(np.linalg.norm(real_corr - synth_corr, ord='fro'))

# Save metrics
METRICS_JSON = os.path.join(resources_dir, "conditional_gan_metrics.json")
with open(METRICS_JSON, "w") as f:
    json.dump(metrics, f, indent=4)

print(f"Evaluation metrics saved to: {METRICS_JSON}")
|
src/gan_generate_synthetic_latent.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generate synthetic AE-latent vectors with the trained conditional WGAN-GP
generator, conditioned on the real sequence-to-ticker mapping."""
import os
import sys
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import logging

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

try:
    from src.logger import get_logger
    logger = get_logger(__name__)
except Exception:
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

# FIX: ConditionalGenerator is defined in src/gan_model.py; there is no
# src/model.py in this repository.
from src.gan_model import ConditionalGenerator


if __name__ == "__main__":
    try:
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        models_dir = os.path.join(base_dir, "models")
        resources_dir = os.path.join(base_dir, "resources")
        data_dir = os.path.join(base_dir, "data", "processed")

        model_path = os.path.join(models_dir, "latent_gan_generator_conditional.pth")
        config_path = os.path.join(resources_dir, "gan_config.json")
        latent_path = os.path.join(data_dir, "latent_vectors.npy")
        ticker_path = os.path.join(data_dir, "sequence_tickers.npy")

        output_latent_path = os.path.join(data_dir, "synthetic_latent_vectors.npy")
        output_ticker_path = os.path.join(data_dir, "synthetic_ticker_mapping.npy")

        # Load once — the old code re-loaded the tickers file a second time below.
        real_latent = np.load(latent_path)
        tickers = np.load(ticker_path)

        logger.info(f"Loaded {real_latent.shape[0]} real latent vectors with shape {real_latent.shape}")
        logger.info(f"Loaded ticker mapping shape: {tickers.shape}")

        with open(config_path, "r") as f:
            config = json.load(f)

        noise_dim = config["noise_dim"]
        latent_dim = config["latent_dim"]
        hidden_dim = config["hidden_dim"]
        embed_dim = config["embed_dim"]
        num_tickers = config["num_tickers"]

        # One synthetic sample per real sequence, conditioned on its ticker.
        n_samples = tickers.shape[0]
        logger.info(f"Loaded {n_samples} tickers for conditional generation")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # FIX: construct with keyword arguments. The signature is
        # (noise_dim, embed_dim, num_tickers, latent_dim, hidden_dim); the old
        # positional call `ConditionalGenerator(noise_dim, latent_dim,
        # hidden_dim, num_tickers, embed_dim)` built the wrong architecture,
        # so load_state_dict failed (or produced garbage) on real configs.
        G = ConditionalGenerator(
            noise_dim=noise_dim,
            embed_dim=embed_dim,
            num_tickers=num_tickers,
            latent_dim=latent_dim,
            hidden_dim=hidden_dim,
        ).to(device)
        G.load_state_dict(torch.load(model_path, map_location=device))
        G.eval()

        logger.info(
            f"Loaded Conditional WGAN-GP Generator "
            f"(noise_dim={noise_dim}, latent_dim={latent_dim}, num_tickers={num_tickers})"
        )

        batch_size = 512
        synthetic_latents = []
        synthetic_tickers = []

        for i in tqdm(range((n_samples + batch_size - 1) // batch_size), desc="Generating synthetic latent vectors"):
            b = min(batch_size, n_samples - i * batch_size)
            z = torch.randn(b, noise_dim).to(device)
            ticker_batch = torch.tensor(tickers[i * batch_size:i * batch_size + b], dtype=torch.long).to(device)

            with torch.no_grad():
                fake_latent = G(z, ticker_batch).cpu().numpy()

            synthetic_latents.append(fake_latent)
            synthetic_tickers.append(ticker_batch.cpu().numpy())

        synth_latent = np.vstack(synthetic_latents)
        synth_tickers = np.concatenate(synthetic_tickers)

        np.save(output_latent_path, synth_latent)
        np.save(output_ticker_path, synth_tickers)

        logger.info(f"Saved synthetic latent vectors: {output_latent_path} — shape {synth_latent.shape}")
        logger.info(f"Saved synthetic ticker mapping: {output_ticker_path} — shape {synth_tickers.shape}")

    except Exception as e:
        logger.error("Error generating synthetic latent vectors: %s", e)
        raise
|
src/gan_model.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# conditional_wgangp_train.py
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import json
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.optim as optim
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
from sklearn.preprocessing import StandardScaler
|
| 12 |
+
from torch.utils.data import Dataset, DataLoader
|
| 13 |
+
|
| 14 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
| 15 |
+
try:
|
| 16 |
+
from src.logger import get_logger
|
| 17 |
+
logger = get_logger(__name__)
|
| 18 |
+
except Exception:
|
| 19 |
+
import logging
|
| 20 |
+
logging.basicConfig(level=logging.INFO)
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
class LatentTickerDataset(Dataset):
    """Dataset of (latent vector, ticker id) pairs loaded from two .npy files.

    Row i of the latent array is conditioned on tickers[i]; both arrays are
    held fully in memory.
    """

    def __init__(self, latent_path, ticker_path):
        self.latents = np.load(latent_path)
        self.tickers = np.load(ticker_path)
        # Same guard as a shape check: one ticker id per latent row.
        assert self.latents.shape[0] == self.tickers.shape[0], "Latents and tickers length mismatch"

    def __len__(self):
        # One sample per latent row.
        return len(self.latents)

    def __getitem__(self, idx):
        # float32 for torch layers; plain int label for the embedding lookup.
        return self.latents[idx].astype(np.float32), int(self.tickers[idx])
|
| 36 |
+
|
| 37 |
+
class ConditionalGenerator(nn.Module):
    """MLP generator mapping (noise, ticker embedding) -> one AE latent vector."""

    def __init__(self, noise_dim, embed_dim, num_tickers, latent_dim, hidden_dim=128):
        super().__init__()
        # Learned embedding table: one row per ticker class.
        self.ticker_emb = nn.Embedding(num_tickers, embed_dim)
        self.net = nn.Sequential(
            nn.Linear(noise_dim + embed_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, latent_dim),
        )

    def forward(self, z, ticker_ids):
        # Condition by concatenating the ticker embedding onto the noise vector.
        conditioned = torch.cat([z, self.ticker_emb(ticker_ids)], dim=1)
        return self.net(conditioned)
|
| 54 |
+
|
| 55 |
+
class ConditionalDiscriminator(nn.Module):
    """Critic scoring a latent vector conditioned on its ticker embedding.

    Outputs an unbounded scalar (no sigmoid), as required by the WGAN-GP
    objective it is trained under.
    """

    def __init__(self, latent_dim, embed_dim, num_tickers, hidden_dim=128):
        super().__init__()
        self.ticker_emb = nn.Embedding(num_tickers, embed_dim)
        self.net = nn.Sequential(
            nn.Linear(latent_dim + embed_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, x, ticker_ids):
        # Same conditioning scheme as the generator: concat embedding to input.
        scored = torch.cat([x, self.ticker_emb(ticker_ids)], dim=1)
        return self.net(scored)
|
| 72 |
+
|
| 73 |
+
def gradient_penalty_cond(D, real, fake, ticker_ids, device):
    """WGAN-GP gradient penalty for a conditional critic D(x, ticker_ids).

    Samples a random point on each real/fake segment and penalizes the squared
    deviation of the critic's input-gradient norm from 1.
    """
    n = real.size(0)
    # One mixing coefficient per sample, broadcast across latent dimensions.
    mix = torch.rand(n, 1).to(device)
    x_hat = (mix * real + (1 - mix) * fake).requires_grad_(True)
    scores = D(x_hat, ticker_ids)
    # create_graph=True so the penalty itself is differentiable w.r.t. D's weights.
    grads = torch.autograd.grad(
        outputs=scores,
        inputs=x_hat,
        grad_outputs=torch.ones_like(scores).to(device),
        create_graph=True,
        retain_graph=True,
        only_inputs=True
    )[0].view(n, -1)
    return ((grads.norm(2, dim=1) - 1) ** 2).mean()
|
| 91 |
+
|
| 92 |
+
if __name__ == "__main__":
    # --- Inputs / output locations ------------------------------------------
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    latent_path = os.path.join(base_dir, "data", "processed", "latent_vectors.npy")
    ticker_path = os.path.join(base_dir, "data", "processed", "sequence_tickers.npy")
    models_dir = os.path.join(base_dir, "models")
    resources_dir = os.path.join(base_dir, "resources")
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(resources_dir, exist_ok=True)

    logger.info("Loading latent vectors from: %s", latent_path)
    latent_vectors = np.load(latent_path)
    logger.info("Loaded latent vectors shape: %s", latent_vectors.shape)

    logger.info("Loading sequence ticker IDs from: %s", ticker_path)
    sequence_tickers = np.load(ticker_path)
    logger.info("Loaded ticker IDs shape: %s", sequence_tickers.shape)

    # Standardize latents; persist mean/scale so generation can invert it later.
    scaler = StandardScaler()
    latent_scaled = scaler.fit_transform(latent_vectors)
    scaler_save = {"mean": scaler.mean_.tolist(), "scale": scaler.scale_.tolist()}
    # NOTE: np.save on a dict stores a pickled 0-d object array; read it back
    # with np.load(..., allow_pickle=True).item().
    np.save(os.path.join(resources_dir, "latent_scaler.npy"), scaler_save)
    logger.info("Saved latent scaler params to resources.")

    dataset = LatentTickerDataset(latent_path, ticker_path)
    dataset.latents = latent_scaled  # train on the standardized latents
    batch_size = 256
    # FIX: shuffle=True. With shuffle=False + drop_last=True every epoch saw
    # identical, fixed-composition batches and the same tail samples were
    # always dropped — bad for SGD-style GAN training.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=2)

    # --- Hyperparameters -----------------------------------------------------
    noise_dim = 64
    hidden_dim = 128
    n_epochs = 300
    lr = 1e-4
    lambda_gp = 10      # gradient-penalty weight (WGAN-GP paper default)
    n_critic = 5        # critic steps per generator step
    embed_dim = 16

    latent_dim = latent_scaled.shape[1]
    num_tickers = int(sequence_tickers.max()) + 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    G = ConditionalGenerator(noise_dim=noise_dim, embed_dim=embed_dim,
                             num_tickers=num_tickers, latent_dim=latent_dim,
                             hidden_dim=hidden_dim).to(device)
    D = ConditionalDiscriminator(latent_dim=latent_dim, embed_dim=embed_dim,
                                 num_tickers=num_tickers, hidden_dim=hidden_dim).to(device)

    opt_G = optim.Adam(G.parameters(), lr=lr, betas=(0.5, 0.9))
    opt_D = optim.Adam(D.parameters(), lr=lr, betas=(0.5, 0.9))

    losses = {"epoch": [], "D_loss": [], "G_loss": []}

    logger.info("Starting Conditional WGAN-GP training...")
    for epoch in range(n_epochs):
        D_losses_epoch = []
        G_losses_epoch = []

        for real_batch, tickers_batch in tqdm(loader, desc=f"Epoch {epoch+1}/{n_epochs}", leave=False):
            real = real_batch.to(device)
            tickers = tickers_batch.to(device).long()
            bsize = real.size(0)

            # --- Critic: n_critic updates per generator update ---------------
            for _ in range(n_critic):
                z = torch.randn(bsize, noise_dim).to(device)
                fake = G(z, tickers)

                d_real = D(real, tickers)
                d_fake = D(fake.detach(), tickers)

                gp = gradient_penalty_cond(D, real, fake.detach(), tickers, device)
                # Wasserstein critic loss + gradient penalty.
                d_loss = -(d_real.mean() - d_fake.mean()) + lambda_gp * gp

                opt_D.zero_grad()
                d_loss.backward()
                opt_D.step()

            # --- Generator: maximize critic score on fresh fakes -------------
            z = torch.randn(bsize, noise_dim).to(device)
            fake = G(z, tickers)
            g_loss = -D(fake, tickers).mean()

            opt_G.zero_grad()
            g_loss.backward()
            opt_G.step()

            D_losses_epoch.append(d_loss.item())
            G_losses_epoch.append(g_loss.item())

        mean_D = float(np.mean(D_losses_epoch)) if len(D_losses_epoch) else 0.0
        mean_G = float(np.mean(G_losses_epoch)) if len(G_losses_epoch) else 0.0

        losses["epoch"].append(epoch + 1)
        losses["D_loss"].append(mean_D)
        losses["G_loss"].append(mean_G)

        logger.info(f"[{epoch+1}/{n_epochs}] D_loss={mean_D:.4f}, G_loss={mean_G:.4f}")

    # --- Persist artifacts ---------------------------------------------------
    losses_df = pd.DataFrame(losses)
    losses_csv_path = os.path.join(resources_dir, "latent_gan_losses.csv")
    losses_df.to_csv(losses_csv_path, index=False)
    logger.info("Saved training losses to %s", losses_csv_path)

    torch.save(G.state_dict(), os.path.join(models_dir, "latent_gan_generator_conditional.pth"))
    torch.save(D.state_dict(), os.path.join(models_dir, "latent_gan_discriminator_conditional.pth"))
    logger.info("Saved GAN models to models/")

    with open(os.path.join(resources_dir, "gan_config.json"), "w") as f:
        json.dump({
            "model": "WGAN-GP-conditional",
            "noise_dim": noise_dim,
            "latent_dim": latent_dim,
            "hidden_dim": hidden_dim,
            "epochs": n_epochs,
            "batch_size": batch_size,
            "lr": lr,
            "lambda_gp": lambda_gp,
            "n_critic": n_critic,
            "embed_dim": embed_dim,
            "num_tickers": num_tickers
        }, f, indent=4)

    logger.info("Saved GAN config to resources/gan_config.json")
    logger.info("Training completed successfully.")
|
src/nn_model.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
| 10 |
+
|
| 11 |
+
# Resolve project-relative paths: real-only data, real+synthetic data, outputs.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
orig_data_path = os.path.join(base_dir, 'data', 'orig_processed.parquet')
combined_data_path = os.path.join(base_dir, 'data', 'final_data.parquet')
resources_dir = os.path.join(base_dir, 'resources')
os.makedirs(resources_dir, exist_ok=True)

original_df = pd.read_parquet(orig_data_path)
combined_df = pd.read_parquet(combined_data_path)

# Put each frame in chronological order per ticker with a clean integer index;
# the later positional 80/20 split relies on this ordering.
for df in [original_df, combined_df]:
    df.sort_values(['Ticker', 'Date'], inplace=True)
    df.reset_index(drop=True, inplace=True)
|
| 23 |
+
|
| 24 |
+
def add_trend_label(df):
    """Attach a binary Trend label: 1 when the next close (per ticker) is higher.

    Mutates *df* in place and returns it. The last row of each ticker has no
    next close and is dropped.
    """
    next_close = df.groupby('Ticker')['Close'].shift(-1)
    df['Next_Close'] = next_close
    df['Trend'] = (next_close > df['Close']).astype(int)
    # Drop each ticker's final row (NaN Next_Close — nothing to predict).
    df.dropna(subset=['Next_Close'], inplace=True)
    return df
|
| 29 |
+
|
| 30 |
+
original_df = add_trend_label(original_df)
combined_df = add_trend_label(combined_df)

# Integer-encode tickers. The encoder is fit on the original universe only;
# assumes the combined frame contains no unseen tickers (transform would
# raise ValueError otherwise) — TODO confirm against the synthesis pipeline.
le = LabelEncoder()
original_df['TickerID'] = le.fit_transform(original_df['Ticker'])
combined_df['TickerID'] = le.transform(combined_df['Ticker'])

num_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
feature_cols = num_cols + ['TickerID']
target_col = 'Trend'

# Re-sort by encoded ticker then date so both frames share the same layout.
original_df = original_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)
combined_df = combined_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)

X_orig = original_df[feature_cols]
y_orig = original_df[target_col]
X_mix = combined_df[feature_cols]
y_mix = combined_df[target_col]

# Positional 80/20 split on the sorted data (not a random split).
split_idx = int(len(X_orig) * 0.8)
split_idx_mix = int(len(X_mix) * 0.8)

X_train_orig, X_test = X_orig.iloc[:split_idx].copy(), X_orig.iloc[split_idx:].copy()
y_train_orig, y_test = y_orig.iloc[:split_idx].copy(), y_orig.iloc[split_idx:].copy()

# Only the training slice of the combined data is used; its tail is discarded
# so both models are evaluated on the same original-data test split.
X_train_mix, _ = X_mix.iloc[:split_idx_mix].copy(), X_mix.iloc[split_idx_mix:].copy()
y_train_mix, _ = y_mix.iloc[:split_idx_mix].copy(), y_mix.iloc[split_idx_mix:].copy()

# Fit the scaler on the original training split only, then apply it everywhere
# so both models and the shared test set see the same feature scale.
scaler = StandardScaler()
scaler.fit(X_train_orig[num_cols])

X_train_orig.loc[:, num_cols] = scaler.transform(X_train_orig[num_cols])
X_train_mix.loc[:, num_cols] = scaler.transform(X_train_mix[num_cols])
X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])
|
| 64 |
+
|
| 65 |
+
def to_tensor(X, y, cols=None):
    """Convert a feature frame and label series into model-ready tensors.

    Parameters
    ----------
    X : DataFrame holding the numeric feature columns plus a 'TickerID' column.
    y : Series of binary trend labels.
    cols : optional list of numeric column names. Defaults to the module-level
        `num_cols`, so all existing call sites behave exactly as before.

    Returns
    -------
    (float32 feature tensor, long ticker-id tensor, float32 (N, 1) label tensor)
    """
    if cols is None:
        cols = num_cols  # module-level default preserves the original behavior
    X_num = torch.tensor(X[cols].values, dtype=torch.float32)
    X_ticker = torch.tensor(X['TickerID'].values, dtype=torch.long)
    y_t = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)
    return X_num, X_ticker, y_t
|
| 70 |
+
|
| 71 |
+
# Tensorize all three splits (numeric features, ticker ids, labels).
X_train_orig_num, X_train_orig_ticker, y_train_orig_t = to_tensor(X_train_orig, y_train_orig)
X_train_mix_num, X_train_mix_ticker, y_train_mix_t = to_tensor(X_train_mix, y_train_mix)
X_test_num, X_test_ticker, y_test_t = to_tensor(X_test, y_test)

# Embedding-table size: one row per ticker id observed in any split.
n_tickers_total = max(
    X_train_orig_ticker.max().item(),
    X_train_mix_ticker.max().item(),
    X_test_ticker.max().item()
) + 1
|
| 80 |
+
|
| 81 |
+
class TrendNN(nn.Module):
    """Up/down trend classifier: numeric features + ticker embedding -> P(up)."""

    def __init__(self, n_tickers, input_dim):
        super().__init__()
        # 8-dimensional learned embedding per ticker.
        self.ticker_embed = nn.Embedding(n_tickers, 8)
        self.net = nn.Sequential(
            nn.Linear(input_dim + 8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),  # probability output, paired with BCELoss in training
        )

    def forward(self, x_num, ticker_id):
        # Concatenate the ticker embedding onto the numeric feature vector.
        features = torch.cat([x_num, self.ticker_embed(ticker_id)], dim=1)
        return self.net(features)
|
| 98 |
+
|
| 99 |
+
def train_model(X_num, X_ticker, y, X_val, X_val_ticker, y_val, name, epochs=100, batch_size=1024):
    """Train a TrendNN on one data variant; evaluate each epoch on a fixed val set.

    Interface unchanged: returns (trained model, history dict) and writes the
    state dict to resources/model_<name>.pt.
    """
    model = TrendNN(n_tickers=n_tickers_total, input_dim=len(num_cols))
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    history = {"train_loss": [], "val_loss": [], "val_acc": []}
    n_samples = len(X_num)
    # FIX: number of mini-batches, rounded UP. The old denominator
    # `n_samples // batch_size` under-counted a partial final batch (skewing
    # the reported average loss) and raised ZeroDivisionError whenever
    # n_samples < batch_size.
    n_batches = max(1, (n_samples + batch_size - 1) // batch_size)

    for epoch in range(epochs):
        model.train()
        perm = torch.randperm(n_samples)  # fresh shuffle every epoch
        total_loss = 0

        for i in range(0, n_samples, batch_size):
            idx = perm[i:i+batch_size]
            batch_X_num, batch_ticker, batch_y = X_num[idx], X_ticker[idx], y[idx]

            optimizer.zero_grad()
            y_pred = model(batch_X_num, batch_ticker)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation pass without gradients.
        model.eval()
        with torch.no_grad():
            y_val_pred = model(X_val, X_val_ticker)
            val_loss = criterion(y_val_pred, y_val).item()
            val_acc = ((y_val_pred > 0.5).float() == y_val).float().mean().item()

        avg_train_loss = total_loss / n_batches
        history["train_loss"].append(avg_train_loss)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        if (epoch + 1) % 5 == 0:
            print(f"[{name}] Epoch {epoch+1}/{epochs} | "
                  f"Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    model_path = os.path.join(resources_dir, f"model_{name.lower()}.pt")
    torch.save(model.state_dict(), model_path)
    return model, history
|
| 141 |
+
|
| 142 |
+
# Train once on real-only data and once on real+synthetic data, evaluating
# both against the same held-out test split from the original data.
model_orig, hist_orig = train_model(
    X_train_orig_num, X_train_orig_ticker, y_train_orig_t,
    X_test_num, X_test_ticker, y_test_t, "Original"
)

model_mix, hist_mix = train_model(
    X_train_mix_num, X_train_mix_ticker, y_train_mix_t,
    X_test_num, X_test_ticker, y_test_t, "Combined"
)

# Persist both training histories for the comparison charts.
results = {
    "original": hist_orig,
    "combined": hist_mix
}
with open(os.path.join(resources_dir, 'training_metrics.json'), "w") as f:
    json.dump(results, f, indent=4)
|