import os
import sys
import pickle

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

# Make the project root importable when running this file as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.model import LSTMAutoEncoder


def decode_latent_vectors(
    model_path,
    synthetic_latent_path,
    ticker_map_path,
    output_path,
    model_params,
    seq_len=90,
    device=None,
):
    """Decode latent vectors back into OHLCV sequences with the trained LSTM Autoencoder.

    Parameters
    ----------
    model_path : str
        Path to the trained autoencoder state dict (``.pth``).
    synthetic_latent_path : str
        Path to a ``.npy`` array of latent vectors. A sibling file with
        ``latent_vectors`` replaced by ``latent_tickers`` in its name must
        exist and hold the matching integer ticker ids.
    ticker_map_path : str
        Pickle holding the ticker id -> symbol mapping. Accepted formats:
        a fitted ``LabelEncoder``-like object (has ``inverse_transform``),
        an array/list indexed by id, or a ``dict`` keyed by id.
    output_path : str
        Destination parquet file for the decoded OHLCV rows.
    model_params : dict
        Must contain ``num_tickers``, ``embed_dim``, ``hidden_size``,
        ``latent_dim`` and ``num_layers`` matching the trained checkpoint.
    seq_len : int, optional
        Length of each decoded sequence (default 90).
    device : str, optional
        Torch device; auto-selects CUDA when available if omitted.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns Ticker, Ticker_Encoded, TimeStep,
        Open, High, Low, Close, Volume (one row per sequence per time step).

    Raises
    ------
    FileNotFoundError
        If the companion ticker-id ``.npy`` file is missing.
    TypeError
        If the pickled ticker mapping has an unrecognized format.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=model_params["num_tickers"],
        embed_dim=model_params["embed_dim"],
        hidden_size=model_params["hidden_size"],
        latent_dim=model_params["latent_dim"],
        num_layers=model_params["num_layers"],
    ).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"Loaded trained LSTM Autoencoder from {model_path}")

    latent_vectors = np.load(synthetic_latent_path)

    # The ticker ids live in a companion file derived by naming convention.
    ticker_ids_path = synthetic_latent_path.replace("latent_vectors", "latent_tickers")
    if not os.path.exists(ticker_ids_path):
        raise FileNotFoundError(f"Ticker mapping not found at {ticker_ids_path}")
    ticker_ids = np.load(ticker_ids_path)

    latent_t = torch.tensor(latent_vectors, dtype=torch.float32).to(device)
    ticker_t = torch.tensor(ticker_ids, dtype=torch.long).to(device)

    # Drive the decoder half of the model manually: condition the latent on the
    # ticker embedding, tile it across the sequence axis, project to the
    # decoder input size, then run the decoder LSTM.
    # NOTE(review): this mirrors the decode path assumed from the attributes
    # ticker_embed / fc_dec / decoder on LSTMAutoEncoder — confirm against src.model.
    decoded_batches = []
    batch_size = 128
    with torch.no_grad():  # hoisted around the whole loop; inference only
        for start in tqdm(range(0, len(latent_t), batch_size), desc="Decoding latent sequences"):
            batch_latent = latent_t[start:start + batch_size]
            batch_ticker = ticker_t[start:start + batch_size]

            ticker_emb = model.ticker_embed(batch_ticker)
            latent_cat = torch.cat([batch_latent, ticker_emb], dim=1)
            latent_cat = latent_cat.unsqueeze(1).repeat(1, seq_len, 1)
            dec_input = model.fc_dec(latent_cat)
            reconstructed, _ = model.decoder(dec_input)
            decoded_batches.append(reconstructed.cpu().numpy())

    decoded = np.concatenate(decoded_batches, axis=0)
    print(f"Decoded {decoded.shape[0]} sequences of length {seq_len}")

    tickers = _resolve_tickers(ticker_map_path, ticker_ids)
    decoded_df = _to_long_frame(decoded, tickers, ticker_ids)

    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    decoded_df.to_parquet(output_path, index=False)
    print(f"Decoded OHLCV data saved to {output_path}")
    return decoded_df


def _resolve_tickers(ticker_map_path, ticker_ids):
    """Map integer ticker ids to symbols using whichever mapping format was pickled."""
    with open(ticker_map_path, "rb") as f:
        label_encoder = pickle.load(f)

    if hasattr(label_encoder, "inverse_transform"):
        return label_encoder.inverse_transform(ticker_ids)
    if isinstance(label_encoder, (np.ndarray, list)):
        return np.array(label_encoder)[ticker_ids]
    if isinstance(label_encoder, dict):
        return [label_encoder[int(i)] for i in ticker_ids]
    raise TypeError(f"Unrecognized ticker mapping format: {type(label_encoder)}")


def _to_long_frame(decoded, tickers, ticker_ids):
    """Flatten (N, seq_len, 5) OHLCV sequences into a long-format DataFrame.

    Vectorized replacement for the original per-row dict-append loop:
    identical columns and row order, built at C speed with repeat/tile.
    """
    n_seq, seq_len, _ = decoded.shape
    frame = pd.DataFrame(
        decoded.reshape(n_seq * seq_len, 5),
        columns=["Open", "High", "Low", "Close", "Volume"],
    )
    # Insert id columns in the original leading order: Ticker, Ticker_Encoded, TimeStep.
    frame.insert(0, "TimeStep", np.tile(np.arange(seq_len), n_seq))
    frame.insert(0, "Ticker_Encoded", np.repeat(np.asarray(ticker_ids, dtype=np.int64), seq_len))
    frame.insert(0, "Ticker", np.repeat(np.asarray(tickers), seq_len))
    return frame


if __name__ == "__main__":
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    model_path = os.path.join(base_dir, "models", "lstm_autoencoder.pth")
    synthetic_latent_path = os.path.join(base_dir, "data", "latent", "synthetic_latent_vectors.npy")
    ticker_map_path = os.path.join(base_dir, "data", "processed", "ticker_label_encoder.pkl")
    output_path = os.path.join(base_dir, "data", "processed", "decoded_synthetic_ohlcv.parquet")

    # Must match the hyperparameters the checkpoint was trained with.
    model_params = {
        "num_layers": 2,
        "hidden_size": 64,
        "latent_dim": 32,
        "embed_dim": 16,
        "num_tickers": 503,
    }

    decode_latent_vectors(
        model_path=model_path,
        synthetic_latent_path=synthetic_latent_path,
        ticker_map_path=ticker_map_path,
        output_path=output_path,
        model_params=model_params,
        seq_len=90,
    )