File size: 4,251 Bytes
bbf5d55 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import os
import sys
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.model import LSTMAutoEncoder
def decode_latent_vectors(
    model_path,
    synthetic_latent_path,
    ticker_map_path,
    output_path,
    model_params,
    seq_len=90,
    device=None,
    batch_size=128,
):
    """
    Decode latent vectors back into OHLCV sequences using the trained LSTM Autoencoder.

    Args:
        model_path: Path to the saved state dict loaded into ``LSTMAutoEncoder``.
        synthetic_latent_path: Path to a ``.npy`` file of latent vectors. The
            matching ticker-id file is located by replacing ``"latent_vectors"``
            with ``"latent_tickers"`` in this path.
        ticker_map_path: Path to a pickled ticker mapping. Supported forms are
            an object exposing ``inverse_transform``, a list / ndarray indexed
            by ticker id, or a dict keyed by integer ticker id.
        output_path: Destination parquet file for the decoded rows.
        model_params: Dict providing ``num_tickers``, ``embed_dim``,
            ``hidden_size``, ``latent_dim`` and ``num_layers``.
        seq_len: Number of time steps to decode for every latent vector.
        device: Torch device; defaults to ``"cuda"`` when available, else ``"cpu"``.
        batch_size: Number of latent vectors decoded per forward pass.

    Returns:
        pandas.DataFrame with one row per (sequence, time step) and the columns
        ``Ticker``, ``Ticker_Encoded``, ``TimeStep``, ``Open``, ``High``,
        ``Low``, ``Close``, ``Volume``. The same frame is written to
        ``output_path``.

    Raises:
        ValueError: If ``batch_size`` is not positive, the ticker-id path cannot
            be derived, the latent file is empty, the latent / ticker-id counts
            differ, or the decoder output is not ``(N, seq_len, 5)``.
        FileNotFoundError: If the derived ticker-id file does not exist.
        TypeError: If the pickled ticker mapping has an unsupported type.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=model_params["num_tickers"],
        embed_dim=model_params["embed_dim"],
        hidden_size=model_params["hidden_size"],
        latent_dim=model_params["latent_dim"],
        num_layers=model_params["num_layers"],
    ).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"Loaded trained LSTM Autoencoder from {model_path}")

    latent_vectors = np.load(synthetic_latent_path)

    # os.fspath lets path-like objects through; str.replace needs a plain string.
    latent_path_str = os.fspath(synthetic_latent_path)
    ticker_ids_path = latent_path_str.replace("latent_vectors", "latent_tickers")
    if ticker_ids_path == latent_path_str:
        # Without this guard the latent file itself would be loaded as ticker ids.
        raise ValueError(
            "Cannot derive the ticker-id file: 'latent_vectors' does not occur "
            f"in {latent_path_str}"
        )
    if not os.path.exists(ticker_ids_path):
        raise FileNotFoundError(f"Ticker mapping not found at {ticker_ids_path}")
    ticker_ids = np.load(ticker_ids_path)

    if len(latent_vectors) == 0:
        raise ValueError(f"No latent vectors found in {latent_path_str}")
    if len(ticker_ids) != len(latent_vectors):
        # A mismatch would mislabel rows or fail part-way through decoding.
        raise ValueError(
            f"Found {len(latent_vectors)} latent vectors but "
            f"{len(ticker_ids)} ticker ids in {ticker_ids_path}"
        )

    latent_t = torch.tensor(latent_vectors, dtype=torch.float32).to(device)
    ticker_t = torch.tensor(ticker_ids, dtype=torch.long).to(device)

    decoded_batches = []
    with torch.no_grad():
        for start in tqdm(range(0, len(latent_t), batch_size), desc="Decoding latent sequences"):
            batch_latent = latent_t[start:start + batch_size]
            batch_ticker = ticker_t[start:start + batch_size]

            # Append the ticker embedding to each latent vector, then feed the
            # same conditioned vector to the decoder at every time step.
            ticker_emb = model.ticker_embed(batch_ticker)
            latent_cat = torch.cat([batch_latent, ticker_emb], dim=1)
            latent_cat = latent_cat.unsqueeze(1).repeat(1, seq_len, 1)

            dec_input = model.fc_dec(latent_cat)
            reconstructed, _ = model.decoder(dec_input)
            decoded_batches.append(reconstructed.cpu().numpy())

    decoded = np.concatenate(decoded_batches, axis=0)
    if decoded.ndim != 3 or decoded.shape[1:] != (seq_len, 5):
        raise ValueError(
            f"Expected decoder output of shape (N, {seq_len}, 5), got {decoded.shape}"
        )
    print(f"Decoded {decoded.shape[0]} sequences of length {seq_len}")

    # NOTE(security): pickle.load can execute arbitrary code; only point
    # ticker_map_path at files produced by a trusted pipeline.
    with open(ticker_map_path, "rb") as f:
        label_encoder = pickle.load(f)

    if hasattr(label_encoder, "inverse_transform"):
        tickers = label_encoder.inverse_transform(ticker_ids)
    elif isinstance(label_encoder, (np.ndarray, list)):
        tickers = np.array(label_encoder)[ticker_ids]
    elif isinstance(label_encoder, dict):
        tickers = [label_encoder[int(i)] for i in ticker_ids]
    else:
        raise TypeError(f"Unrecognized ticker mapping format: {type(label_encoder)}")

    # Build the long-format table with array operations instead of one Python
    # dict per row. Row order is sequence-major, then time step.
    num_sequences = decoded.shape[0]
    # float64 / int64 keep the column dtypes expected from the former
    # row-by-row construction (NOTE(review): confirm against existing output).
    flat = decoded.reshape(num_sequences * seq_len, 5).astype(np.float64)
    decoded_df = pd.DataFrame({
        "Ticker": np.repeat(np.asarray(tickers), seq_len),
        "Ticker_Encoded": np.repeat(ticker_ids.astype(np.int64), seq_len),
        "TimeStep": np.tile(np.arange(seq_len, dtype=np.int64), num_sequences),
        "Open": flat[:, 0],
        "High": flat[:, 1],
        "Low": flat[:, 2],
        "Close": flat[:, 3],
        "Volume": flat[:, 4],
    })

    decoded_df.to_parquet(output_path, index=False)
    print(f"Decoded OHLCV data saved to {output_path}")
    return decoded_df
if __name__ == "__main__":
    # The project root is the parent of the directory that holds this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    processed_dir = os.path.join(project_root, "data", "processed")

    # Hyperparameters passed to the model constructor; the saved state dict
    # is loaded into a model built with these values.
    autoencoder_config = {
        "num_layers": 2,
        "hidden_size": 64,
        "latent_dim": 32,
        "embed_dim": 16,
        "num_tickers": 503,
    }

    decode_latent_vectors(
        model_path=os.path.join(project_root, "models", "lstm_autoencoder.pth"),
        synthetic_latent_path=os.path.join(
            project_root, "data", "latent", "synthetic_latent_vectors.npy"
        ),
        ticker_map_path=os.path.join(processed_dir, "ticker_label_encoder.pkl"),
        output_path=os.path.join(processed_dir, "decoded_synthetic_ohlcv.parquet"),
        model_params=autoencoder_config,
        seq_len=90,
    )
|