# Synthetic_Stock_Data / src / ae_decoder.py
# Raheel Abdul Rehman
# Prod Publish
# bbf5d55
import os
import sys
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.model import LSTMAutoEncoder
def _resolve_ticker_labels(label_encoder, ticker_ids):
    """Translate encoded ticker IDs into ticker labels using the loaded mapping."""
    # Encoder-like object (anything exposing ``inverse_transform``).
    if hasattr(label_encoder, "inverse_transform"):
        return label_encoder.inverse_transform(ticker_ids)
    # Positional mapping: index i holds the label for ID i.
    if isinstance(label_encoder, (np.ndarray, list)):
        return np.array(label_encoder)[ticker_ids]
    # Explicit {id: label} mapping.
    if isinstance(label_encoder, dict):
        return [label_encoder[int(i)] for i in ticker_ids]
    raise TypeError(f"Unrecognized ticker mapping format: {type(label_encoder)}")


def _build_ohlcv_frame(decoded, tickers, ticker_ids):
    """Flatten a (sequences, steps, 5) array into a long-format OHLCV DataFrame."""
    if decoded.ndim != 3 or decoded.shape[2] != 5:
        raise ValueError(
            "Expected decoded output of shape (n_sequences, seq_len, 5), "
            f"got {decoded.shape}"
        )
    n_sequences, n_steps, n_features = decoded.shape

    # C-order reshape yields rows ordered by sequence first, then time step,
    # i.e. row index = sequence * n_steps + step.
    # Casting to float64 is lossless and keeps the column dtype independent of
    # the precision the model happens to emit.
    flat = decoded.reshape(n_sequences * n_steps, n_features).astype(np.float64)

    columns = {
        "Ticker": np.repeat(np.asarray(tickers, dtype=object), n_steps),
        "Ticker_Encoded": np.repeat(np.asarray(ticker_ids).astype(np.int64), n_steps),
        "TimeStep": np.tile(np.arange(n_steps, dtype=np.int64), n_sequences),
    }
    for col_idx, col_name in enumerate(("Open", "High", "Low", "Close", "Volume")):
        columns[col_name] = flat[:, col_idx]
    return pd.DataFrame(columns)


def decode_latent_vectors(
    model_path,
    synthetic_latent_path,
    ticker_map_path,
    output_path,
    model_params,
    seq_len=90,
    device=None,
    batch_size=128,
):
    """
    Decode latent vectors back into OHLCV sequences using the trained LSTM Autoencoder.

    The function loads the model weights, reads the latent vectors and their
    companion ticker-ID array, runs the decoder path of the model
    (``ticker_embed`` -> ``fc_dec`` -> ``decoder``) in batches, converts the
    result to a long-format table (one row per sequence and time step) and
    writes it to ``output_path`` as Parquet.

    Args:
        model_path: Path to the saved ``state_dict`` of the autoencoder.
        synthetic_latent_path: Path to the ``.npy`` file with latent vectors.
            The ticker-ID file is located by replacing ``"latent_vectors"``
            with ``"latent_tickers"`` in this path.
        ticker_map_path: Path to a pickled ticker mapping: an object with
            ``inverse_transform``, a list/ndarray indexed by ID, or a dict
            keyed by integer ID.
        output_path: Destination Parquet file; its parent directory is
            created if missing.
        model_params: Dict with keys ``num_tickers``, ``embed_dim``,
            ``hidden_size``, ``latent_dim`` and ``num_layers``.
        seq_len: Number of time steps to decode per latent vector.
        device: Torch device string; defaults to CUDA when available, else CPU.
        batch_size: Number of latent vectors decoded per forward pass.

    Returns:
        pandas.DataFrame with columns ``Ticker``, ``Ticker_Encoded``,
        ``TimeStep``, ``Open``, ``High``, ``Low``, ``Close``, ``Volume``.

    Raises:
        FileNotFoundError: If the companion ticker-ID file does not exist.
        ValueError: If the ticker-ID path cannot be derived, the input is
            empty, the two arrays differ in length, or the decoder output
            does not have 5 features per step.
        TypeError: If the pickled ticker mapping has an unsupported type.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=model_params["num_tickers"],
        embed_dim=model_params["embed_dim"],
        hidden_size=model_params["hidden_size"],
        latent_dim=model_params["latent_dim"],
        num_layers=model_params["num_layers"],
    ).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"Loaded trained LSTM Autoencoder from {model_path}")

    # Accept both str and os.PathLike inputs for the latent file.
    latent_path = os.fspath(synthetic_latent_path)
    latent_vectors = np.load(latent_path)

    ticker_ids_path = latent_path.replace("latent_vectors", "latent_tickers")
    # If the substring is absent, str.replace returns the input unchanged and
    # the latent file itself would be read as ticker IDs; fail loudly instead.
    if ticker_ids_path == latent_path:
        raise ValueError(
            "Cannot derive the ticker-ID file: 'latent_vectors' does not occur "
            f"in {latent_path}"
        )
    if not os.path.exists(ticker_ids_path):
        raise FileNotFoundError(f"Ticker ID array not found at {ticker_ids_path}")
    ticker_ids = np.load(ticker_ids_path)

    if len(latent_vectors) == 0:
        raise ValueError(f"No latent vectors found in {latent_path}")
    if len(ticker_ids) != len(latent_vectors):
        raise ValueError(
            f"Mismatched inputs: {len(latent_vectors)} latent vectors but "
            f"{len(ticker_ids)} ticker IDs"
        )

    latent_t = torch.tensor(latent_vectors, dtype=torch.float32)
    ticker_t = torch.tensor(ticker_ids, dtype=torch.long)

    decoded_batches = []
    # Inference only: a single no_grad scope covers every batch.
    with torch.no_grad():
        for start in tqdm(range(0, len(latent_t), batch_size), desc="Decoding latent sequences"):
            # Move one batch at a time so device memory scales with batch_size.
            batch_latent = latent_t[start:start + batch_size].to(device)
            batch_ticker = ticker_t[start:start + batch_size].to(device)

            ticker_emb = model.ticker_embed(batch_ticker)
            latent_cat = torch.cat([batch_latent, ticker_emb], dim=1)
            # Feed the same conditioned latent vector at every time step.
            latent_cat = latent_cat.unsqueeze(1).repeat(1, seq_len, 1)
            dec_input = model.fc_dec(latent_cat)
            reconstructed, _ = model.decoder(dec_input)
            decoded_batches.append(reconstructed.cpu().numpy())

    decoded = np.concatenate(decoded_batches, axis=0)
    print(f"Decoded {decoded.shape[0]} sequences of length {seq_len}")

    # NOTE(security): pickle.load executes arbitrary code embedded in the file;
    # only point ticker_map_path at artifacts produced by a trusted pipeline.
    with open(ticker_map_path, "rb") as f:
        label_encoder = pickle.load(f)
    tickers = _resolve_ticker_labels(label_encoder, ticker_ids)

    decoded_df = _build_ohlcv_frame(decoded, tickers, ticker_ids)

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    decoded_df.to_parquet(output_path, index=False)
    print(f"Decoded OHLCV data saved to {output_path}")
    return decoded_df
if __name__ == "__main__":
    # The project root is taken to be the parent of this script's directory.
    script_file = os.path.abspath(__file__)
    base_dir = os.path.dirname(os.path.dirname(script_file))
    data_dir = os.path.join(base_dir, "data")
    processed_dir = os.path.join(data_dir, "processed")

    # Architecture hyperparameters handed to the model constructor
    # (presumably these must match the values used at training time — confirm).
    model_params = dict(
        num_layers=2,
        hidden_size=64,
        latent_dim=32,
        embed_dim=16,
        num_tickers=503,
    )

    # Decode the synthetic latent vectors and write the result as Parquet.
    decode_latent_vectors(
        model_path=os.path.join(base_dir, "models", "lstm_autoencoder.pth"),
        synthetic_latent_path=os.path.join(data_dir, "latent", "synthetic_latent_vectors.npy"),
        ticker_map_path=os.path.join(processed_dir, "ticker_label_encoder.pkl"),
        output_path=os.path.join(processed_dir, "decoded_synthetic_ohlcv.parquet"),
        model_params=model_params,
        seq_len=90,
    )