Raheel Abdul Rehman committed on
Commit
bbf5d55
·
0 Parent(s):

Prod Publish

Browse files
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
File without changes
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
app/app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt

# Paths are resolved relative to the repository root (the parent of app/).
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
orig_data_path = os.path.join(base_dir, 'data', 'orig_processed.parquet')
combined_data_path = os.path.join(base_dir, 'data', 'final_data.parquet')

# Both datasets are loaded once at startup; the UI callback only filters them.
orig_df = pd.read_parquet(orig_data_path)
combined_df = pd.read_parquet(combined_data_path)

# Coerce the Date column to datetime64 wherever it exists.
for frame in (orig_df, combined_df):
    if 'Date' in frame.columns:
        frame['Date'] = pd.to_datetime(frame['Date'])

orig_df = orig_df.sort_values(['Ticker', 'Date']).reset_index(drop=True)
combined_df = combined_df.sort_values(['Ticker', 'Date']).reset_index(drop=True)

# Only offer the OHLCV columns that are actually present in the original data.
FEATURE_COLS = [col for col in ('Open', 'High', 'Low', 'Close', 'Volume') if col in orig_df.columns]
def plot_ticker_data(ticker, feature):
    """
    Plot the last 5 years of the selected feature for a ticker, comparing
    the original dataset against the combined/synthetic dataset.

    Parameters
    ----------
    ticker : str
        Ticker symbol looked up in both module-level DataFrames.
    feature : str
        One of FEATURE_COLS (Open/High/Low/Close/Volume).

    Returns
    -------
    matplotlib.figure.Figure or str
        A two-panel figure on success, or a human-readable error string
        when the ticker/feature has no data.
    """
    if feature not in FEATURE_COLS:
        return f"Feature '{feature}' not found in dataset."

    orig_data = orig_df[orig_df['Ticker'] == ticker].sort_values('Date').reset_index(drop=True)
    synth_data = combined_df[combined_df['Ticker'] == ticker].sort_values('Date').reset_index(drop=True)

    if orig_data.empty and synth_data.empty:
        return f"No data found for ticker: {ticker}"
    if orig_data.empty:
        return f"No original data found for {ticker}"
    if synth_data.empty:
        return f"No combined/synthetic data found for {ticker}"

    # Anchor the window on the earlier of the two datasets' latest dates so
    # both panels cover the same span.
    latest_date = min(orig_data['Date'].max(), synth_data['Date'].max())
    cutoff_date = latest_date - pd.DateOffset(years=5)

    orig_data = orig_data[orig_data['Date'] >= cutoff_date]
    synth_data = synth_data[synth_data['Date'] >= cutoff_date]

    orig_series = orig_data[['Date', feature]].dropna()
    synth_series = synth_data[['Date', feature]].dropna()

    fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=False)
    # FIX: the title previously claimed "Last 10 Years" while the cutoff above
    # is 5 years (and the UI description says 5 years); label now matches.
    fig.suptitle(f"{ticker} — {feature} (Last 5 Years)", fontsize=14)

    axes[0].plot(orig_series['Date'], orig_series[feature], linewidth=1.0, alpha=0.9)
    axes[0].set_title("Original Data")
    axes[0].set_ylabel(feature)
    axes[0].grid(True)

    axes[1].plot(synth_series['Date'], synth_series[feature], linewidth=1.0, alpha=0.9)
    axes[1].set_title("Synthetic Data")
    axes[1].set_ylabel(feature)
    axes[1].grid(True)

    # Align both x-axes on a shared date range; best-effort only (either
    # series may be empty after dropna, in which case min()/max() raise).
    try:
        min_date = min(orig_series['Date'].min(), synth_series['Date'].min())
        max_date = max(orig_series['Date'].max(), synth_series['Date'].max())
        axes[0].set_xlim(min_date, max_date)
        axes[1].set_xlim(min_date, max_date)
    except Exception:
        pass

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    return fig
unique_tickers = sorted(orig_df['Ticker'].unique())

# Two dropdowns in, one plot out.
demo = gr.Interface(
    fn=plot_ticker_data,
    inputs=[
        gr.Dropdown(unique_tickers, label="Select Stock Ticker"),
        gr.Dropdown(FEATURE_COLS, label="Select Feature (Open/High/Low/Close/Volume)"),
    ],
    outputs=gr.Plot(label="Time Series Comparison"),
    title="Real vs Synthetic Time Series Viewer",
    description="Pick a ticker and feature to view the last 5 years of data from original and synthetic datasets.",
)

if __name__ == "__main__":
    demo.launch()
charts/NN_train_loss_plot.png ADDED

Git LFS Details

  • SHA256: 580c52f8b62c5fd5857eeaaa22e4be253d340fffedb9a80002563915e0205802
  • Pointer size: 130 Bytes
  • Size of remote file: 41.8 kB
charts/NN_val_accuracy_plot.png ADDED

Git LFS Details

  • SHA256: f2ecb0276ff49fcf5fc2862ba0673ec7527aa62389054af483a1c1bdf5969636
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
charts/NN_val_loss_plot.png ADDED

Git LFS Details

  • SHA256: bb6060f55133362b95a3ea398f2e849f213fcebd0cc8a26ae04e8ecece48383f
  • Pointer size: 130 Bytes
  • Size of remote file: 82.6 kB
charts/ae_table_evaluator.png ADDED

Git LFS Details

  • SHA256: 7260641ab4ae48a107145d1c26368adad4256663494be1c36528786301acd88a
  • Pointer size: 130 Bytes
  • Size of remote file: 93.3 kB
charts/au_training_loss.png ADDED

Git LFS Details

  • SHA256: f22315b776ff8d0f8944cdc18d6e103e876dffb9b1163f17a9b64db74831ac24
  • Pointer size: 130 Bytes
  • Size of remote file: 23.3 kB
charts/gan_pca.png ADDED

Git LFS Details

  • SHA256: eb4dce411fae95c8e134e84bc7e5ba2d4b6b01970c9bc27d9312ad293842ac1b
  • Pointer size: 131 Bytes
  • Size of remote file: 189 kB
charts/gan_table_evaluator.png ADDED

Git LFS Details

  • SHA256: 05f6ddd006969ab6d054585c1d822e8aca15c3b71c1ce7db1667358de65a8a2e
  • Pointer size: 131 Bytes
  • Size of remote file: 433 kB
charts/gan_training_loss.png ADDED

Git LFS Details

  • SHA256: 08ae7313f8d29dbacb5fa9ebc03966687480dfa6eb88d0e0b7dd37c30123ca7a
  • Pointer size: 130 Bytes
  • Size of remote file: 42.8 kB
data/final_data.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4244f7ab9fa70bfa9aa16a0c03796528c3744f63f5d27d8cfd13ac664646e133
3
+ size 44822597
data/orig_processed.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f2357e122e5ccaeaa0cdc08d0b593e947b72603ff96841ddf33674deec8567f
3
+ size 56514266
models/ae_hyperparameters.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5619bc38b522cbed5839c4f0ab9193e199ceae0fb901d45b5afecaf1fd7806e7
3
+ size 100
models/ae_lstm_autoencoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf07dca9322668dcad0023c8bed0ac604a1bba11bea98ac039e8e1e3afa4650e
3
+ size 289111
models/conditional_gan_metrics.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07b4ba961a2be1762511554a33e26d2d555799fee7d6bb794966e0e36a7f05b0
3
+ size 12670
models/gan_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74d6b40cb6c6794bfe64995bfe5639cb2551cce7983c643e49531ef177ab4fe9
3
+ size 261
models/latent_gan_discriminator_conditional.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:666dfd51a80772671418dd2d6dbf36816869b7297349aeb4f878f3539548a099
3
+ size 127690
models/latent_gan_generator_conditional.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f538c56f0ffccb35235a6f9280b2452dc522ced62bf0106d1371c174c389ff6
3
+ size 160022
models/model_combined.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55baabc1a62fda48e5c3b344bdc9ea74d6e54a8971f8437a55a40ddb8f6cd5b7
3
+ size 31596
models/model_original.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:781876452c9e781bead24a6b4880ecc2e5a4f57407b8b78612c0f85fb961b26e
3
+ size 31596
notebooks/AE_EDA.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/GAN_EDA.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/NN_EDA.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ pandas
3
+ scikit-learn
4
+ gradio
5
+ matplotlib
6
+ numpy
7
+ pyarrow
src/ae_decoder.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ import pickle
5
+ import numpy as np
6
+ import pandas as pd
7
+ from tqdm import tqdm
8
+
9
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10
+
11
+ from src.model import LSTMAutoEncoder
12
+
13
+
def decode_latent_vectors(
    model_path,
    synthetic_latent_path,
    ticker_map_path,
    output_path,
    model_params,
    seq_len=90,
    device=None
):
    """
    Decode latent vectors back into OHLCV sequences using the trained LSTM Autoencoder.

    Parameters
    ----------
    model_path : str
        Path to the trained LSTMAutoEncoder state dict.
    synthetic_latent_path : str
        .npy file of latent vectors; a sibling file with "latent_vectors"
        replaced by "latent_tickers" must hold the matching ticker ids.
    ticker_map_path : str
        Pickled mapping from encoded ticker ids back to symbols (a fitted
        LabelEncoder, an array/list, or a dict).
    output_path : str
        Destination parquet file for the decoded OHLCV rows.
    model_params : dict
        Architecture parameters the checkpoint was trained with.
    seq_len : int
        Length of each decoded sequence.
    device : str or None
        Torch device; auto-detected when falsy.

    Returns
    -------
    pandas.DataFrame
        Long-format decoded data, one row per (sequence, timestep).
    """
    device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")

    # Rebuild the autoencoder with the exact architecture of the checkpoint.
    autoencoder = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=model_params["num_tickers"],
        embed_dim=model_params["embed_dim"],
        hidden_size=model_params["hidden_size"],
        latent_dim=model_params["latent_dim"],
        num_layers=model_params["num_layers"]
    ).to(device)
    autoencoder.load_state_dict(torch.load(model_path, map_location=device))
    autoencoder.eval()
    print(f"Loaded trained LSTM Autoencoder from {model_path}")

    latent_vectors = np.load(synthetic_latent_path)
    # Ticker ids are expected next to the latent file, named by convention.
    ticker_ids_path = synthetic_latent_path.replace("latent_vectors", "latent_tickers")
    if not os.path.exists(ticker_ids_path):
        raise FileNotFoundError(f"Ticker mapping not found at {ticker_ids_path}")
    ticker_ids = np.load(ticker_ids_path)

    latent_t = torch.tensor(latent_vectors, dtype=torch.float32).to(device)
    ticker_t = torch.tensor(ticker_ids, dtype=torch.long).to(device)

    batch_size = 128
    decoded_chunks = []

    for start in tqdm(range(0, len(latent_t), batch_size), desc="Decoding latent sequences"):
        lat_batch = latent_t[start:start + batch_size]
        tick_batch = ticker_t[start:start + batch_size]

        # Replicate the decoder half of the autoencoder's forward pass:
        # concat latent with the ticker embedding, broadcast over time,
        # project and run the decoder LSTM.
        with torch.no_grad():
            emb = autoencoder.ticker_embed(tick_batch)
            fused = torch.cat([lat_batch, emb], dim=1)
            fused = fused.unsqueeze(1).repeat(1, seq_len, 1)
            reconstructed, _ = autoencoder.decoder(autoencoder.fc_dec(fused))

        decoded_chunks.append(reconstructed.cpu().numpy())

    decoded = np.concatenate(decoded_chunks, axis=0)
    print(f"Decoded {decoded.shape[0]} sequences of length {seq_len}")

    with open(ticker_map_path, "rb") as f:
        encoder_obj = pickle.load(f)

    # The mapping file has shipped in several formats; support all of them.
    if hasattr(encoder_obj, "inverse_transform"):
        tickers = encoder_obj.inverse_transform(ticker_ids)
    elif isinstance(encoder_obj, (np.ndarray, list)):
        tickers = np.array(encoder_obj)[ticker_ids]
    elif isinstance(encoder_obj, dict):
        tickers = [encoder_obj[int(i)] for i in ticker_ids]
    else:
        raise TypeError(f"Unrecognized ticker mapping format: {type(encoder_obj)}")

    # Flatten (sequence, timestep, feature) into long-format rows.
    rows = []
    for seq_idx, sequence in enumerate(decoded):
        symbol = tickers[seq_idx]
        encoded_id = int(ticker_ids[seq_idx])
        for step, (o, h, l, c, v) in enumerate(sequence):
            rows.append({
                "Ticker": symbol,
                "Ticker_Encoded": encoded_id,
                "TimeStep": step,
                "Open": o,
                "High": h,
                "Low": l,
                "Close": c,
                "Volume": v
            })

    decoded_df = pd.DataFrame(rows)
    decoded_df.to_parquet(output_path, index=False)
    print(f"Decoded OHLCV data saved to {output_path}")

    return decoded_df
if __name__ == "__main__":
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Architecture must match the checkpoint that will be loaded.
    decode_latent_vectors(
        model_path=os.path.join(base_dir, "models", "lstm_autoencoder.pth"),
        synthetic_latent_path=os.path.join(base_dir, "data", "latent", "synthetic_latent_vectors.npy"),
        ticker_map_path=os.path.join(base_dir, "data", "processed", "ticker_label_encoder.pkl"),
        output_path=os.path.join(base_dir, "data", "processed", "decoded_synthetic_ohlcv.parquet"),
        model_params={
            "num_layers": 2,
            "hidden_size": 64,
            "latent_dim": 32,
            "embed_dim": 16,
            "num_tickers": 503
        },
        seq_len=90
    )
src/ae_evaluate.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ import json
5
+ import numpy as np
6
+ import pandas as pd
7
+ from torch.utils.data import DataLoader
8
+ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
9
+ import matplotlib.pyplot as plt
10
+
11
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
12
+
13
+ from src.model import QuarterlyStockDataset, LSTMAutoEncoder # uses updated version with embeddings
14
+
15
+ if __name__ == "__main__":
16
+ base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17
+ processed_data_path = os.path.join(base_dir, 'data', 'processed', 'stock_data.parquet')
18
+ model_path = os.path.join(base_dir, 'models', 'lstm_autoencoder.pth')
19
+ metrics_path = os.path.join(base_dir, 'resources', 'metrics.json')
20
+
21
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
22
+ df = pd.read_parquet(processed_data_path)
23
+
24
+ test_df = df[df['Date'] >= '2024-01-01']
25
+
26
+ sequence_length = 90
27
+ test_dataset = QuarterlyStockDataset(test_df, sequence_length=sequence_length)
28
+ test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
29
+
30
+ best_params = {
31
+ 'hidden_size': 64,
32
+ 'latent_dim': 32,
33
+ 'num_layers': 2,
34
+ 'embed_dim': 16
35
+ }
36
+
37
+ num_tickers = df['Ticker_Encoded'].nunique()
38
+
39
+ model = LSTMAutoEncoder(
40
+ input_dim=5,
41
+ num_tickers=num_tickers,
42
+ embed_dim=best_params['embed_dim'],
43
+ hidden_size=best_params['hidden_size'],
44
+ latent_dim=best_params['latent_dim'],
45
+ num_layers=best_params['num_layers']
46
+ ).to(device)
47
+
48
+ model.load_state_dict(torch.load(model_path, map_location=device))
49
+ model.eval()
50
+
51
+ all_actual, all_recon = [], []
52
+
53
+ with torch.no_grad():
54
+ for batch_x, batch_ticker in test_loader:
55
+ batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
56
+ recon = model(batch_x, batch_ticker)
57
+ all_actual.append(batch_x.cpu().numpy())
58
+ all_recon.append(recon.cpu().numpy())
59
+ X_test = np.concatenate(all_actual, axis=0)
60
+ X_recon = np.concatenate(all_recon, axis=0)
61
+ X_test_flat = X_test.reshape(-1, X_test.shape[-1])
62
+ X_recon_flat = X_recon.reshape(-1, X_recon.shape[-1])
63
+
64
+ mae = mean_absolute_error(X_test_flat, X_recon_flat)
65
+ rmse = np.sqrt(mean_squared_error(X_test_flat, X_recon_flat))
66
+ r2 = r2_score(X_test_flat, X_recon_flat)
67
+
68
+ metrics = {
69
+ "MAE": float(mae),
70
+ "RMSE": float(rmse),
71
+ "R2": float(r2)
72
+ }
73
+
74
+ with open(metrics_path, 'w') as f:
75
+ json.dump(metrics, f, indent=4)
76
+ print(f"✅ Test metrics saved at: {metrics_path}")
src/ae_latent_extract.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ import pandas as pd
5
+ import numpy as np
6
+ from torch.utils.data import DataLoader
7
+
8
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ from src.model import LSTMAutoEncoder, QuarterlyStockDataset # pylint: disable=import-error
11
+ from src.logger import get_logger
12
+
13
+ logger = get_logger(__name__)
14
+
if __name__ == "__main__":
    try:
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        processed_data_path = os.path.join(base_dir, "data", "processed", "stock_data.parquet")
        model_path = os.path.join(base_dir, "models", "lstm_autoencoder.pth")
        # Latent artefacts are written into the sibling GAN project tree.
        latent_vectors_path = os.path.join(base_dir, "..", "GAN", "data", "processed", "latent_vectors.npy")
        ticker_mapping_path = os.path.join(base_dir, "..", "GAN", "data", "processed", "ticker_mapping.npy")

        device = "cuda" if torch.cuda.is_available() else "cpu"

        df = pd.read_parquet(processed_data_path)
        tickers = df["Ticker"].unique()

        # Architecture must mirror the training configuration of the checkpoint.
        model = LSTMAutoEncoder(
            input_dim=5,
            num_tickers=df["Ticker_Encoded"].nunique(),
            embed_dim=16,
            hidden_size=64,
            latent_dim=32,
            num_layers=2
        ).to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()

        def encode(model, x, ticker_id):
            # Encoder half only: embed the ticker, run the LSTM encoder and
            # project the final hidden state into latent space.
            emb = model.ticker_embed(ticker_id).unsqueeze(1).repeat(1, x.size(1), 1)
            enc_out, _ = model.encoder(torch.cat([x, emb], dim=2))
            return model.fc_enc(enc_out[:, -1, :])

        all_latents = []
        all_tickers = []

        for ticker in tickers:
            ticker_df = df[df["Ticker"] == ticker].copy()
            # Not enough history for a single 90-day window.
            if len(ticker_df) < 90:
                continue

            loader = DataLoader(
                QuarterlyStockDataset(ticker_df, sequence_length=90),
                batch_size=64,
                shuffle=False
            )

            per_ticker = []
            with torch.no_grad():
                for batch_x, batch_ticker in loader:
                    batch_x, batch_ticker = batch_x.to(device), batch_ticker.to(device)
                    per_ticker.append(encode(model, batch_x, batch_ticker).cpu().numpy())

            if per_ticker:
                stacked = np.concatenate(per_ticker, axis=0)
                all_latents.append(stacked)
                all_tickers.extend([ticker] * len(stacked))
                logger.info(f"Extracted {len(stacked)} latent vectors for {ticker}.")

        latent_matrix = np.concatenate(all_latents, axis=0)
        ticker_array = np.array(all_tickers)

        np.save(latent_vectors_path, latent_matrix)
        np.save(ticker_mapping_path, ticker_array)

        logger.info(f"Saved {len(latent_matrix)} latent vectors to {latent_vectors_path}")
        logger.info(f"Saved ticker mapping to {ticker_mapping_path}")

    except Exception as e:
        logger.error("Error extracting latent space vectors: %s", e)
        raise
src/ae_model.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import sys
import json
import warnings

import optuna
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Keep the console readable during long Optuna runs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Allow "python src/ae_model.py" to resolve project-root imports.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.logger import get_logger  # pylint: disable=import-error

logger = get_logger(__name__)
class QuarterlyStockDataset(Dataset):
    """
    Fixed-length, non-overlapping OHLCV windows, one stream per ticker.

    Each sample is a tuple ``(window, ticker_id)`` where ``window`` is a
    float32 tensor of shape ``(sequence_length, 5)`` and ``ticker_id`` is
    the long-encoded ticker label shared by every row of the window.
    """

    def __init__(self, df, sequence_length=90):
        try:
            self.sequence_length = sequence_length
            self.samples = []

            ordered = df.sort_values(by=["Ticker", "Date"]).reset_index(drop=True)
            symbols = ordered['Ticker'].unique()
            feature_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

            for symbol in symbols:
                sub = ordered[ordered['Ticker'] == symbol]
                values = sub[feature_cols].values
                encoded_id = sub['Ticker_Encoded'].iloc[0]

                # Non-overlapping windows: step by a full sequence_length;
                # any trailing remainder shorter than a window is dropped.
                for start in range(0, len(values) - sequence_length + 1, sequence_length):
                    chunk = values[start:start + sequence_length]
                    self.samples.append((
                        torch.tensor(chunk, dtype=torch.float32),
                        torch.tensor(encoded_id, dtype=torch.long)
                    ))

            print(f"Created {len(self.samples)} quarterly sequences across {len(symbols)} tickers.")
        except Exception as e:
            logger.error("Error batching dataset: %s", e)
            raise

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
class LSTMAutoEncoder(nn.Module):
    """
    LSTM autoencoder conditioned on a per-ticker embedding.

    The ticker embedding is concatenated to every timestep on the way into
    the encoder, and again to the latent vector on the way into the decoder,
    so reconstructions are ticker-specific.
    """

    def __init__(self, input_dim, num_tickers, embed_dim=8, hidden_size=64, latent_dim=16, num_layers=1):
        super(LSTMAutoEncoder, self).__init__()
        self.ticker_embed = nn.Embedding(num_tickers, embed_dim)

        # Encoder: (features + embedding) per timestep -> latent vector.
        self.encoder = nn.LSTM(input_dim + embed_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc_enc = nn.Linear(hidden_size, latent_dim)

        # Decoder: (latent + embedding) -> hidden input -> reconstruction.
        self.fc_dec = nn.Linear(latent_dim + embed_dim, hidden_size)
        self.decoder = nn.LSTM(hidden_size, input_dim, num_layers=num_layers, batch_first=True)

    def forward(self, x, ticker_id):
        seq_len = x.size(1)
        emb = self.ticker_embed(ticker_id)

        # Attach the ticker embedding to every timestep of the input.
        enc_in = torch.cat([x, emb.unsqueeze(1).repeat(1, seq_len, 1)], dim=2)
        enc_out, _ = self.encoder(enc_in)

        # The last encoder timestep summarises the whole sequence.
        latent = self.fc_enc(enc_out[:, -1, :])

        # Re-attach the embedding and broadcast across the sequence length.
        dec_in = torch.cat([latent, emb], dim=1).unsqueeze(1).repeat(1, seq_len, 1)
        recon, _ = self.decoder(self.fc_dec(dec_in))
        return recon
def objective(trial, df, sequence_length=90, device='cpu'):
    """
    Optuna objective: train a candidate LSTM autoencoder and report its
    validation reconstruction loss (MSE).

    Parameters
    ----------
    trial : optuna.Trial
        Supplies the hyper-parameter suggestions.
    df : pandas.DataFrame
        Full dataset; split chronologically inside this function.
    sequence_length : int
        Window length passed to QuarterlyStockDataset.
    device : str
        Torch device string.

    Returns
    -------
    float
        Average validation loss from the final training epoch.
    """
    try:
        # Search space — keep the suggestion order stable so existing
        # Optuna studies stay comparable.
        num_layers = trial.suggest_int("num_layers", 1, 3)
        hidden_size = trial.suggest_categorical("hidden_size", [32, 64, 128])
        latent_dim = trial.suggest_categorical("latent_dim", [8, 16, 32])
        lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
        embed_dim = trial.suggest_categorical("embed_dim", [4, 8, 16])

        # Chronological split: everything before 2023 trains, 2023 validates.
        train_df = df[df['Date'] < '2023-01-01']
        val_df = df[(df['Date'] >= '2023-01-01') & (df['Date'] < '2024-01-01')]

        train_loader = DataLoader(QuarterlyStockDataset(train_df, sequence_length),
                                  batch_size=64, shuffle=False)
        val_loader = DataLoader(QuarterlyStockDataset(val_df, sequence_length),
                                batch_size=64, shuffle=False)

        net = LSTMAutoEncoder(
            input_dim=5, num_tickers=df['Ticker_Encoded'].nunique(), embed_dim=embed_dim,
            hidden_size=hidden_size, latent_dim=latent_dim, num_layers=num_layers
        ).to(device)

        loss_fn = nn.MSELoss()
        optimiser = torch.optim.Adam(net.parameters(), lr=lr)

        avg_val_loss = float('inf')
        for _ in range(20):
            # One pass over the training data.
            net.train()
            for seqs, tids in train_loader:
                seqs, tids = seqs.to(device), tids.to(device)
                optimiser.zero_grad()
                recon_loss = loss_fn(net(seqs, tids), seqs)
                recon_loss.backward()
                optimiser.step()

            # Validation pass; the value from the final epoch is returned.
            net.eval()
            running = 0.0
            with torch.no_grad():
                for seqs, tids in val_loader:
                    seqs, tids = seqs.to(device), tids.to(device)
                    running += loss_fn(net(seqs, tids), seqs).item()
            avg_val_loss = running / len(val_loader)

        return avg_val_loss
    except Exception as e:
        logger.error("Error training Model : %s", e)
        raise
if __name__ == "__main__":
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    processed_data_path = os.path.join(base_dir, 'data', 'processed', 'stock_data.parquet')
    model_path = os.path.join(base_dir, 'models', 'lstm_autoencoder.pth')
    loss_path = os.path.join(base_dir, 'resources', 'loss_values.json')
    hyperparams_path = os.path.join(base_dir, 'models', 'hyperparameters.json')

    df = pd.read_parquet(processed_data_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Hyper-parameter search.
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, df, device=device), n_trials=10)
    best_params = study.best_trial.params

    # Retrain on all data before 2024 with the winning configuration.
    train_df = df[df['Date'] < '2024-01-01']
    full_loader = DataLoader(QuarterlyStockDataset(train_df, sequence_length=90),
                             batch_size=64, shuffle=False)

    best_model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=df['Ticker_Encoded'].nunique(),
        embed_dim=best_params.get('embed_dim', 8),
        hidden_size=best_params['hidden_size'],
        latent_dim=best_params['latent_dim'],
        num_layers=best_params['num_layers']
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['lr'])

    epochs = 50
    train_losses = []
    for epoch in range(epochs):
        best_model.train()
        running = 0.0
        for seqs, tids in full_loader:
            seqs, tids = seqs.to(device), tids.to(device)
            optimizer.zero_grad()
            loss = criterion(best_model(seqs, tids), seqs)
            loss.backward()
            optimizer.step()
            running += loss.item()
        avg_loss = running / len(full_loader)
        train_losses.append(avg_loss)
        print(f"Epoch [{epoch+1}/{epochs}] Loss: {avg_loss:.6f}")

    # Persist weights, the loss curve and the chosen hyper-parameters.
    torch.save(best_model.state_dict(), model_path)
    with open(loss_path, 'w') as f:
        json.dump(train_losses, f)
    with open(hyperparams_path, 'w') as f:
        json.dump(best_params, f)

    print("Model, losses, and hyperparameters saved successfully.")
src/gan_evaluate.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import sys
import json
import torch
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
from tqdm import tqdm

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.model import ConditionalGenerator

# ---- Paths -----------------------------------------------------------------
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
processed_data_path = os.path.join(base_dir, "data", "processed", "stock_data.parquet")
latent_dir = os.path.join(base_dir, "data", "processed")
resources_dir = os.path.join(base_dir, "resources")
models_dir = os.path.join(base_dir, "models")

latent_path = os.path.join(latent_dir, "latent_vectors.npy")
tickers_path = os.path.join(latent_dir, "sequence_tickers.npy")
label_encoder_path = os.path.join(base_dir, "data", "processed", "ticker_label_encoder.pkl")
gen_path = os.path.join(models_dir, "latent_gan_generator_conditional.pth")
gan_config_path = os.path.join(resources_dir, "gan_config.json")

# ---- Generator configuration ----------------------------------------------
with open(gan_config_path, "r") as f:
    params = json.load(f)

noise_dim = params["noise_dim"]
latent_dim = params["latent_dim"]
hidden_dim = params["hidden_dim"]
embed_dim = params.get("embed_dim", 8)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import joblib  # only needed here, to deserialise the fitted label encoder

label_encoder = joblib.load(label_encoder_path)
num_tickers = len(label_encoder.classes_)

G = ConditionalGenerator(
    noise_dim=noise_dim,
    latent_dim=latent_dim,
    embed_dim=embed_dim,
    num_tickers=num_tickers,
    hidden_dim=hidden_dim
).to(device)
G.load_state_dict(torch.load(gen_path, map_location=device))
G.eval()
print(f"Loaded conditional generator (latent_dim={latent_dim}, embed_dim={embed_dim})")

# ---- Generate as many synthetic latents as there are real ones -------------
real_latent = np.load(latent_path)
N_SAMPLES = real_latent.shape[0]
batch_size = 512

# Condition each sample on a uniformly random ticker id.
ticker_ids = np.random.randint(0, num_tickers, N_SAMPLES)
samples = []

n_batches = (N_SAMPLES + batch_size - 1) // batch_size
for batch_idx in tqdm(range(n_batches), desc="Generating synthetic latents"):
    offset = batch_idx * batch_size
    count = min(batch_size, N_SAMPLES - offset)
    z = torch.randn(count, noise_dim).to(device)
    cond = torch.tensor(ticker_ids[offset:offset + count], dtype=torch.long).to(device)
    samples.append(G(z, cond).detach().cpu().numpy())

synth_latent = np.vstack(samples)
np.save(os.path.join(latent_dir, "synthetic_latent_vectors.npy"), synth_latent)
np.save(os.path.join(latent_dir, "synthetic_latent_tickers.npy"), ticker_ids)

print(f"Saved synthetic latent vectors to {latent_dir}")

# ---- Distribution comparison: per-dimension stats + KS test ----------------
metrics = {"per_dimension": {}, "correlation": {}}
min_dim = min(real_latent.shape[1], synth_latent.shape[1])
real_latent = real_latent[:, :min_dim]
synth_latent = synth_latent[:, :min_dim]

for dim in range(min_dim):
    r = real_latent[:, dim]
    s = synth_latent[:, dim]
    min_n = min(len(r), len(s))
    # Subsample both sides to equal size before the two-sample KS test.
    ks_stat, ks_p = ks_2samp(
        np.random.choice(r, min_n, replace=False),
        np.random.choice(s, min_n, replace=False)
    )
    metrics["per_dimension"][f"latent_{dim}"] = {
        "real_mean": float(r.mean()),
        "synth_mean": float(s.mean()),
        "mean_diff": float(s.mean() - r.mean()),
        "real_std": float(r.std()),
        "synth_std": float(s.std()),
        "std_diff": float(s.std() - r.std()),
        "ks_stat": float(ks_stat),
        "ks_pvalue": float(ks_p),
    }

# Correlation-structure gap summarised as a single Frobenius norm.
real_corr = np.corrcoef(real_latent, rowvar=False)
synth_corr = np.corrcoef(synth_latent, rowvar=False)
metrics["correlation"]["frobenius_diff"] = float(np.linalg.norm(real_corr - synth_corr, ord='fro'))

# Save metrics
METRICS_JSON = os.path.join(resources_dir, "conditional_gan_metrics.json")
with open(METRICS_JSON, "w") as f:
    json.dump(metrics, f, indent=4)

print(f"Evaluation metrics saved to: {METRICS_JSON}")
src/gan_generate_synthetic_latent.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import json
import logging

import numpy as np
import torch
from tqdm import tqdm

# Make the project root importable so `src.*` modules resolve when this file
# is executed directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Prefer the project logger; fall back to stdlib logging if it is unavailable.
try:
    from src.logger import get_logger
    logger = get_logger(__name__)
except Exception:
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

from src.model import ConditionalGenerator


if __name__ == "__main__":
    try:
        # --- Resolve project paths relative to this file ---
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        models_dir = os.path.join(base_dir, "models")
        resources_dir = os.path.join(base_dir, "resources")
        data_dir = os.path.join(base_dir, "data", "processed")

        model_path = os.path.join(models_dir, "latent_gan_generator_conditional.pth")
        config_path = os.path.join(resources_dir, "gan_config.json")
        latent_path = os.path.join(data_dir, "latent_vectors.npy")
        ticker_path = os.path.join(data_dir, "sequence_tickers.npy")

        output_latent_path = os.path.join(data_dir, "synthetic_latent_vectors.npy")
        output_ticker_path = os.path.join(data_dir, "synthetic_ticker_mapping.npy")

        # Real latents are loaded only for shape reporting; one synthetic
        # vector is generated per real sequence ticker below.
        # FIX: load each array exactly once (the original loaded the ticker
        # array twice and recomputed n_samples).
        real_latent = np.load(latent_path)
        tickers = np.load(ticker_path)  # integer ticker ID per sequence
        n_samples = real_latent.shape[0]

        logger.info(f"Loaded {n_samples} real latent vectors with shape {real_latent.shape}")
        logger.info(f"Loaded ticker mapping shape: {tickers.shape}")

        # Generator hyperparameters as persisted by the training script.
        with open(config_path, "r") as f:
            config = json.load(f)

        noise_dim = config["noise_dim"]
        latent_dim = config["latent_dim"]
        hidden_dim = config["hidden_dim"]
        embed_dim = config["embed_dim"]
        num_tickers = config["num_tickers"]

        # One synthetic latent vector per real sequence ticker.
        n_samples = tickers.shape[0]
        logger.info(f"Loaded {n_samples} tickers for conditional generation")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # FIX: use keyword arguments. The original passed
        # (noise_dim, latent_dim, hidden_dim, num_tickers, embed_dim)
        # positionally, which does not match the training-time signature
        # (noise_dim, embed_dim, num_tickers, latent_dim, hidden_dim) and
        # would silently build a wrong-shaped network.
        G = ConditionalGenerator(
            noise_dim=noise_dim,
            embed_dim=embed_dim,
            num_tickers=num_tickers,
            latent_dim=latent_dim,
            hidden_dim=hidden_dim,
        ).to(device)
        G.load_state_dict(torch.load(model_path, map_location=device))
        G.eval()

        logger.info(
            f"Loaded Conditional WGAN-GP Generator "
            f"(noise_dim={noise_dim}, latent_dim={latent_dim}, num_tickers={num_tickers})"
        )

        batch_size = 512
        synthetic_latents = []
        synthetic_tickers = []

        # Generate in fixed-size batches; the last batch may be smaller.
        for i in tqdm(range((n_samples + batch_size - 1) // batch_size), desc="Generating synthetic latent vectors"):
            b = min(batch_size, n_samples - i * batch_size)
            z = torch.randn(b, noise_dim).to(device)
            ticker_batch = torch.tensor(tickers[i * batch_size:i * batch_size + b], dtype=torch.long).to(device)

            with torch.no_grad():
                fake_latent = G(z, ticker_batch).cpu().numpy()

            synthetic_latents.append(fake_latent)
            synthetic_tickers.append(ticker_batch.cpu().numpy())

        synth_latent = np.vstack(synthetic_latents)
        synth_tickers = np.concatenate(synthetic_tickers)

        np.save(output_latent_path, synth_latent)
        np.save(output_ticker_path, synth_tickers)

        logger.info(f"Saved synthetic latent vectors: {output_latent_path} — shape {synth_latent.shape}")
        logger.info(f"Saved synthetic ticker mapping: {output_ticker_path} — shape {synth_tickers.shape}")

    except Exception as e:
        logger.error("Error generating synthetic latent vectors: %s", e)
        raise
src/gan_model.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# conditional_wgangp_train.py
import os
import sys
import json

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

# Make the project root importable so `src.*` resolves when run as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Prefer the project logger; fall back to the stdlib if it is unavailable.
try:
    from src.logger import get_logger
    logger = get_logger(__name__)
except Exception:
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
22
+
23
class LatentTickerDataset(Dataset):
    """Pairs each autoencoder latent vector with its integer ticker ID.

    Both arrays are loaded eagerly from `.npy` files and must have the same
    number of rows.
    """

    def __init__(self, latent_path, ticker_path):
        self.latents = np.load(latent_path)
        self.tickers = np.load(ticker_path)
        assert self.latents.shape[0] == self.tickers.shape[0], "Latents and tickers length mismatch"

    def __len__(self):
        # One sample per latent row.
        return len(self.latents)

    def __getitem__(self, idx):
        # float32 latent vector, plain-int ticker ID (DataLoader collates
        # the int into a LongTensor).
        return self.latents[idx].astype(np.float32), int(self.tickers[idx])
36
+
37
class ConditionalGenerator(nn.Module):
    """MLP generator conditioned on a ticker ID via a learned embedding.

    Maps a noise vector concatenated with the ticker embedding to a vector
    in the autoencoder's latent space.
    """

    def __init__(self, noise_dim, embed_dim, num_tickers, latent_dim, hidden_dim=128):
        super().__init__()
        self.ticker_emb = nn.Embedding(num_tickers, embed_dim)
        self.net = nn.Sequential(
            nn.Linear(noise_dim + embed_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, latent_dim),
        )

    def forward(self, z, ticker_ids):
        """Generate latent vectors for noise `z` conditioned on `ticker_ids`."""
        conditioned = torch.cat([z, self.ticker_emb(ticker_ids)], dim=1)
        return self.net(conditioned)
54
+
55
class ConditionalDiscriminator(nn.Module):
    """WGAN critic conditioned on a ticker ID via a learned embedding.

    Outputs an unbounded scalar score per sample (no sigmoid — this is a
    Wasserstein critic, not a probability classifier).
    """

    def __init__(self, latent_dim, embed_dim, num_tickers, hidden_dim=128):
        super().__init__()
        self.ticker_emb = nn.Embedding(num_tickers, embed_dim)
        self.net = nn.Sequential(
            nn.Linear(latent_dim + embed_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, x, ticker_ids):
        """Score latent vectors `x` conditioned on `ticker_ids`; shape (batch, 1)."""
        scored_input = torch.cat([x, self.ticker_emb(ticker_ids)], dim=1)
        return self.net(scored_input)
72
+
73
def gradient_penalty_cond(D, real, fake, ticker_ids, device):
    """WGAN-GP gradient penalty for a conditional critic D(x, ticker_ids).

    Penalizes deviation of the critic's gradient norm from 1, evaluated at
    random per-sample convex combinations of real and fake samples.
    """
    n = real.size(0)
    # One interpolation coefficient per sample, broadcast across features.
    eps = torch.rand(n, 1).to(device)
    mixed = (eps * real + (1 - eps) * fake).requires_grad_(True)
    critic_out = D(mixed, ticker_ids)
    grads = torch.autograd.grad(
        outputs=critic_out,
        inputs=mixed,
        grad_outputs=torch.ones_like(critic_out).to(device),
        create_graph=True,   # penalty itself is differentiated in d_loss.backward()
        retain_graph=True,
        only_inputs=True,
    )[0].view(n, -1)
    return ((grads.norm(2, dim=1) - 1) ** 2).mean()
91
+
92
if __name__ == "__main__":
    # --- Paths & output directories ---
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    latent_path = os.path.join(base_dir, "data", "processed", "latent_vectors.npy")
    ticker_path = os.path.join(base_dir, "data", "processed", "sequence_tickers.npy")
    models_dir = os.path.join(base_dir, "models")
    resources_dir = os.path.join(base_dir, "resources")
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(resources_dir, exist_ok=True)

    logger.info("Loading latent vectors from: %s", latent_path)
    latent_vectors = np.load(latent_path)
    logger.info("Loaded latent vectors shape: %s", latent_vectors.shape)

    logger.info("Loading sequence ticker IDs from: %s", ticker_path)
    sequence_tickers = np.load(ticker_path)
    logger.info("Loaded ticker IDs shape: %s", sequence_tickers.shape)

    # Standardize the latents; persist mean/scale so synthetic latents can be
    # inverse-transformed later. NOTE: np.save pickles this dict, so reading
    # it back requires np.load(..., allow_pickle=True).
    scaler = StandardScaler()
    latent_scaled = scaler.fit_transform(latent_vectors)
    scaler_save = {"mean": scaler.mean_.tolist(), "scale": scaler.scale_.tolist()}
    np.save(os.path.join(resources_dir, "latent_scaler.npy"), scaler_save)
    logger.info("Saved latent scaler params to resources.")

    # Build the dataset from disk, then swap in the standardized latents.
    dataset = LatentTickerDataset(latent_path, ticker_path)
    dataset.latents = latent_scaled
    batch_size = 256
    # FIX: shuffle=True. The latent/ticker arrays are stored grouped by
    # ticker, so with shuffle=False every batch was dominated by a single
    # ticker and batches were identical across all 300 epochs, which
    # degrades conditional GAN training.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=2)

    # --- Hyperparameters ---
    noise_dim = 64
    hidden_dim = 128
    n_epochs = 300
    lr = 1e-4
    lambda_gp = 10   # gradient-penalty weight (standard WGAN-GP value)
    n_critic = 5     # critic updates per generator update
    embed_dim = 16

    latent_dim = latent_scaled.shape[1]
    num_tickers = int(sequence_tickers.max()) + 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    G = ConditionalGenerator(noise_dim=noise_dim, embed_dim=embed_dim,
                             num_tickers=num_tickers, latent_dim=latent_dim,
                             hidden_dim=hidden_dim).to(device)
    D = ConditionalDiscriminator(latent_dim=latent_dim, embed_dim=embed_dim,
                                 num_tickers=num_tickers, hidden_dim=hidden_dim).to(device)

    # Adam betas (0.5, 0.9) as recommended for WGAN-GP training.
    opt_G = optim.Adam(G.parameters(), lr=lr, betas=(0.5, 0.9))
    opt_D = optim.Adam(D.parameters(), lr=lr, betas=(0.5, 0.9))

    losses = {"epoch": [], "D_loss": [], "G_loss": []}

    logger.info("Starting Conditional WGAN-GP training...")
    for epoch in range(n_epochs):
        D_losses_epoch = []
        G_losses_epoch = []

        for real_batch, tickers_batch in tqdm(loader, desc=f"Epoch {epoch+1}/{n_epochs}", leave=False):
            real = real_batch.to(device)
            tickers = tickers_batch.to(device).long()
            bsize = real.size(0)

            # Critic: maximize D(real) - D(fake), regularized by the
            # gradient penalty (fake is detached — only D updates here).
            for _ in range(n_critic):
                z = torch.randn(bsize, noise_dim).to(device)
                fake = G(z, tickers)

                d_real = D(real, tickers)
                d_fake = D(fake.detach(), tickers)

                gp = gradient_penalty_cond(D, real, fake.detach(), tickers, device)
                d_loss = -(d_real.mean() - d_fake.mean()) + lambda_gp * gp

                opt_D.zero_grad()
                d_loss.backward()
                opt_D.step()

            # Generator: minimize -D(fake) with a fresh noise batch.
            z = torch.randn(bsize, noise_dim).to(device)
            fake = G(z, tickers)
            g_loss = -D(fake, tickers).mean()

            opt_G.zero_grad()
            g_loss.backward()
            opt_G.step()

            D_losses_epoch.append(d_loss.item())
            G_losses_epoch.append(g_loss.item())

        # Per-epoch means; guard against an empty loader (drop_last=True).
        mean_D = float(np.mean(D_losses_epoch)) if len(D_losses_epoch) else 0.0
        mean_G = float(np.mean(G_losses_epoch)) if len(G_losses_epoch) else 0.0

        losses["epoch"].append(epoch + 1)
        losses["D_loss"].append(mean_D)
        losses["G_loss"].append(mean_G)

        logger.info(f"[{epoch+1}/{n_epochs}] D_loss={mean_D:.4f}, G_loss={mean_G:.4f}")


    # --- Persist losses, weights, and the config needed at generation time ---
    losses_df = pd.DataFrame(losses)
    losses_csv_path = os.path.join(resources_dir, "latent_gan_losses.csv")
    losses_df.to_csv(losses_csv_path, index=False)
    logger.info("Saved training losses to %s", losses_csv_path)

    torch.save(G.state_dict(), os.path.join(models_dir, "latent_gan_generator_conditional.pth"))
    torch.save(D.state_dict(), os.path.join(models_dir, "latent_gan_discriminator_conditional.pth"))
    logger.info("Saved GAN models to models/")

    with open(os.path.join(resources_dir, "gan_config.json"), "w") as f:
        json.dump({
            "model": "WGAN-GP-conditional",
            "noise_dim": noise_dim,
            "latent_dim": latent_dim,
            "hidden_dim": hidden_dim,
            "epochs": n_epochs,
            "batch_size": batch_size,
            "lr": lr,
            "lambda_gp": lambda_gp,
            "n_critic": n_critic,
            "embed_dim": embed_dim,
            "num_tickers": num_tickers
        }, f, indent=4)

    logger.info("Saved GAN config to resources/gan_config.json")
    logger.info("Training completed successfully.")
src/nn_model.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import json

import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Make the project root importable when running this file as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
orig_data_path = os.path.join(base_dir, 'data', 'orig_processed.parquet')
combined_data_path = os.path.join(base_dir, 'data', 'final_data.parquet')
resources_dir = os.path.join(base_dir, 'resources')
os.makedirs(resources_dir, exist_ok=True)

# Real-only data and the real+synthetic combined data.
original_df = pd.read_parquet(orig_data_path)
combined_df = pd.read_parquet(combined_data_path)

# Chronological order within each ticker is required for the trend label
# and the time-based train/test split below.
for df in (original_df, combined_df):
    df.sort_values(['Ticker', 'Date'], inplace=True)
    df.reset_index(drop=True, inplace=True)
23
+
24
def add_trend_label(df):
    """Append a binary `Trend` column: 1 if the next close within the same
    ticker is higher than today's, else 0.

    Rows with no next close (each ticker's last row) are dropped in place;
    the mutated frame is also returned for convenience.
    """
    next_close = df.groupby('Ticker')['Close'].shift(-1)
    df['Next_Close'] = next_close
    df['Trend'] = (next_close > df['Close']).astype(int)
    df.dropna(subset=['Next_Close'], inplace=True)
    return df
29
+
30
# Label both frames, then encode tickers with an encoder fitted on the
# original data so IDs stay consistent across both datasets.
original_df = add_trend_label(original_df)
combined_df = add_trend_label(combined_df)

le = LabelEncoder()
original_df['TickerID'] = le.fit_transform(original_df['Ticker'])
# NOTE(review): transform() raises on tickers unseen in the original data —
# presumably the synthetic rows reuse the same tickers; verify upstream.
combined_df['TickerID'] = le.transform(combined_df['Ticker'])

num_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
feature_cols = num_cols + ['TickerID']
target_col = 'Trend'

# Keep rows grouped by ticker and ordered in time before the 80/20 split.
original_df = original_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)
combined_df = combined_df.sort_values(['TickerID', 'Date']).reset_index(drop=True)

X_orig = original_df[feature_cols]
y_orig = original_df[target_col]
X_mix = combined_df[feature_cols]
y_mix = combined_df[target_col]

# Positional (non-shuffled) 80/20 split for each dataset.
split_idx = int(len(X_orig) * 0.8)
split_idx_mix = int(len(X_mix) * 0.8)

X_train_orig, X_test = X_orig.iloc[:split_idx].copy(), X_orig.iloc[split_idx:].copy()
y_train_orig, y_test = y_orig.iloc[:split_idx].copy(), y_orig.iloc[split_idx:].copy()

# Only the training portion of the combined data is used; both models are
# evaluated on the original test split.
X_train_mix = X_mix.iloc[:split_idx_mix].copy()
y_train_mix = y_mix.iloc[:split_idx_mix].copy()

# Fit the scaler on the ORIGINAL training split only, then apply it to all
# feature sets so they share one reference distribution.
scaler = StandardScaler()
scaler.fit(X_train_orig[num_cols])

X_train_orig.loc[:, num_cols] = scaler.transform(X_train_orig[num_cols])
X_train_mix.loc[:, num_cols] = scaler.transform(X_train_mix[num_cols])
X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])
64
+
65
def to_tensor(X, y):
    """Convert a feature frame and target series into model-ready tensors:
    (float32 numeric features, long ticker IDs, float32 targets as a column
    vector). Relies on the module-level `num_cols` feature list.
    """
    numeric = torch.tensor(X[num_cols].values, dtype=torch.float32)
    ticker_ids = torch.tensor(X['TickerID'].values, dtype=torch.long)
    targets = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)
    return numeric, ticker_ids, targets
70
+
71
# Tensor conversion for each split.
X_train_orig_num, X_train_orig_ticker, y_train_orig_t = to_tensor(X_train_orig, y_train_orig)
X_train_mix_num, X_train_mix_ticker, y_train_mix_t = to_tensor(X_train_mix, y_train_mix)
X_test_num, X_test_ticker, y_test_t = to_tensor(X_test, y_test)

# The embedding table must cover every ticker ID seen in any split.
n_tickers_total = 1 + max(
    X_train_orig_ticker.max().item(),
    X_train_mix_ticker.max().item(),
    X_test_ticker.max().item(),
)
80
+
81
class TrendNN(nn.Module):
    """Binary trend classifier: numeric features plus an 8-dim ticker
    embedding, fed through a small MLP with a sigmoid output (probability
    of an up-move).
    """

    def __init__(self, n_tickers, input_dim):
        super().__init__()
        self.ticker_embed = nn.Embedding(n_tickers, 8)
        self.net = nn.Sequential(
            nn.Linear(input_dim + 8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        )

    def forward(self, x_num, ticker_id):
        """Return P(trend=1) per row; shape (batch, 1)."""
        features = torch.cat([x_num, self.ticker_embed(ticker_id)], dim=1)
        return self.net(features)
98
+
99
def train_model(X_num, X_ticker, y, X_val, X_val_ticker, y_val, name, epochs=100, batch_size=1024):
    """Train a TrendNN on the given tensors, validating after every epoch.

    Args:
        X_num, X_ticker, y: training numeric features, ticker IDs, 0/1 targets.
        X_val, X_val_ticker, y_val: validation counterparts.
        name: run label used in log lines and the saved model filename.
        epochs, batch_size: training schedule.

    Returns:
        (trained model, history dict with train_loss/val_loss/val_acc lists).

    Side effects:
        Saves the final weights to resources/model_<name>.pt.
    """
    model = TrendNN(n_tickers=n_tickers_total, input_dim=len(num_cols))
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    history = {"train_loss": [], "val_loss": [], "val_acc": []}
    n_samples = len(X_num)

    for epoch in range(epochs):
        model.train()
        perm = torch.randperm(n_samples)  # fresh shuffle every epoch
        total_loss = 0.0
        n_batches = 0

        for i in range(0, n_samples, batch_size):
            idx = perm[i:i+batch_size]
            batch_X_num, batch_ticker, batch_y = X_num[idx], X_ticker[idx], y[idx]

            optimizer.zero_grad()
            y_pred = model(batch_X_num, batch_ticker)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            n_batches += 1

        model.eval()
        with torch.no_grad():
            y_val_pred = model(X_val, X_val_ticker)
            val_loss = criterion(y_val_pred, y_val).item()
            val_acc = ((y_val_pred > 0.5).float() == y_val).float().mean().item()

        # FIX: average over the batches actually run. The original divided by
        # n_samples // batch_size, which raises ZeroDivisionError when
        # n_samples < batch_size and ignores the trailing partial batch that
        # still contributed to total_loss.
        avg_train_loss = total_loss / max(n_batches, 1)
        history["train_loss"].append(avg_train_loss)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        if (epoch + 1) % 5 == 0:
            print(f"[{name}] Epoch {epoch+1}/{epochs} | "
                  f"Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    model_path = os.path.join(resources_dir, f"model_{name.lower()}.pt")
    torch.save(model.state_dict(), model_path)
    return model, history
141
+
142
# Train one model on real data only and one on real+synthetic data; both
# share the same (real) test split so their metrics are directly comparable.
model_orig, hist_orig = train_model(
    X_train_orig_num, X_train_orig_ticker, y_train_orig_t,
    X_test_num, X_test_ticker, y_test_t, "Original"
)

model_mix, hist_mix = train_model(
    X_train_mix_num, X_train_mix_ticker, y_train_mix_t,
    X_test_num, X_test_ticker, y_test_t, "Combined"
)

# Persist both training histories side by side for comparison plots.
results = {"original": hist_orig, "combined": hist_mix}
with open(os.path.join(resources_dir, 'training_metrics.json'), "w") as f:
    json.dump(results, f, indent=4)