# Standard library
import os
import pickle
import sys

# Third-party
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

# Make the repository root importable so `src.*` resolves when this file
# is run as a script (must happen before the local import below).
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Local
from src.model import LSTMAutoEncoder
def decode_latent_vectors(
    model_path,
    synthetic_latent_path,
    ticker_map_path,
    output_path,
    model_params,
    seq_len=90,
    device=None,
    batch_size=128,
):
    """
    Decode latent vectors back into OHLCV sequences using the trained LSTM Autoencoder.

    Parameters
    ----------
    model_path : str
        Path to the trained ``LSTMAutoEncoder`` state dict (``.pth``).
    synthetic_latent_path : str
        ``.npy`` file of latent vectors. A sibling file whose name has
        ``latent_vectors`` replaced by ``latent_tickers`` must hold the
        matching integer ticker ids.
    ticker_map_path : str
        Pickled ticker mapping: a fitted encoder exposing
        ``inverse_transform``, an array/list indexed by ticker id, or an
        ``id -> ticker`` dict.
    output_path : str
        Destination parquet file for the decoded OHLCV rows.
    model_params : dict
        Must contain ``num_tickers``, ``embed_dim``, ``hidden_size``,
        ``latent_dim`` and ``num_layers``.
    seq_len : int
        Length of each decoded sequence.
    device : str or None
        Torch device; auto-selects CUDA when available if None.
    batch_size : int
        Number of latent vectors decoded per forward pass.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns Ticker, Ticker_Encoded, TimeStep,
        Open, High, Low, Close, Volume (one row per sequence per time step).

    Raises
    ------
    FileNotFoundError
        If the parallel ticker-id file does not exist.
    TypeError
        If the pickled ticker mapping has an unrecognized format.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=model_params["num_tickers"],
        embed_dim=model_params["embed_dim"],
        hidden_size=model_params["hidden_size"],
        latent_dim=model_params["latent_dim"],
        num_layers=model_params["num_layers"],
    ).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"Loaded trained LSTM Autoencoder from {model_path}")

    latent_vectors = np.load(synthetic_latent_path)
    # Ticker ids live in a parallel file (latent_vectors -> latent_tickers).
    ticker_ids_path = synthetic_latent_path.replace("latent_vectors", "latent_tickers")
    if not os.path.exists(ticker_ids_path):
        raise FileNotFoundError(f"Ticker mapping not found at {ticker_ids_path}")
    ticker_ids = np.load(ticker_ids_path)

    latent_t = torch.tensor(latent_vectors, dtype=torch.float32).to(device)
    ticker_t = torch.tensor(ticker_ids, dtype=torch.long).to(device)

    decoded_batches = []
    # Enter no_grad once around the whole loop instead of per batch.
    with torch.no_grad():
        for i in tqdm(range(0, len(latent_t), batch_size), desc="Decoding latent sequences"):
            batch_latent = latent_t[i:i + batch_size]
            batch_ticker = ticker_t[i:i + batch_size]
            # Rebuild the decoder input: latent + ticker embedding,
            # repeated across every time step of the sequence.
            ticker_emb = model.ticker_embed(batch_ticker)
            latent_cat = torch.cat([batch_latent, ticker_emb], dim=1)
            latent_cat = latent_cat.unsqueeze(1).repeat(1, seq_len, 1)
            dec_input = model.fc_dec(latent_cat)
            reconstructed, _ = model.decoder(dec_input)
            decoded_batches.append(reconstructed.cpu().numpy())
    decoded = np.concatenate(decoded_batches, axis=0)
    print(f"Decoded {decoded.shape[0]} sequences of length {seq_len}")

    # NOTE(review): pickle.load is unsafe on untrusted files — this assumes
    # the mapping was produced by our own preprocessing pipeline.
    with open(ticker_map_path, "rb") as f:
        label_encoder = pickle.load(f)
    if hasattr(label_encoder, "inverse_transform"):
        tickers = np.asarray(label_encoder.inverse_transform(ticker_ids))
    elif isinstance(label_encoder, (np.ndarray, list)):
        tickers = np.array(label_encoder)[ticker_ids]
    elif isinstance(label_encoder, dict):
        tickers = np.array([label_encoder[int(i)] for i in ticker_ids])
    else:
        raise TypeError(f"Unrecognized ticker mapping format: {type(label_encoder)}")

    # Build the long-format frame vectorized instead of appending
    # n_sequences * seq_len per-row dicts in a Python loop.
    # Columns and ordering match the previous implementation exactly.
    n_seq = decoded.shape[0]
    flat = decoded.reshape(n_seq * seq_len, 5)
    decoded_df = pd.DataFrame({
        "Ticker": np.repeat(tickers, seq_len),
        "Ticker_Encoded": np.repeat(ticker_ids.astype(int), seq_len),
        "TimeStep": np.tile(np.arange(seq_len), n_seq),
        "Open": flat[:, 0],
        "High": flat[:, 1],
        "Low": flat[:, 2],
        "Close": flat[:, 3],
        "Volume": flat[:, 4],
    })
    decoded_df.to_parquet(output_path, index=False)
    print(f"Decoded OHLCV data saved to {output_path}")
    return decoded_df
if __name__ == "__main__":
    # Repository root = parent of the directory containing this script.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Architecture hyperparameters — must match the trained checkpoint.
    hyperparams = {
        "num_layers": 2,
        "hidden_size": 64,
        "latent_dim": 32,
        "embed_dim": 16,
        "num_tickers": 503,
    }

    decode_latent_vectors(
        model_path=os.path.join(project_root, "models", "lstm_autoencoder.pth"),
        synthetic_latent_path=os.path.join(project_root, "data", "latent", "synthetic_latent_vectors.npy"),
        ticker_map_path=os.path.join(project_root, "data", "processed", "ticker_label_encoder.pkl"),
        output_path=os.path.join(project_root, "data", "processed", "decoded_synthetic_ohlcv.parquet"),
        model_params=hyperparams,
        seq_len=90,
    )