Synthetic_Stock_Data / src /ae_decoder.py

Raheel Abdul Rehman

Prod Publish

bbf5d55 about 2 months ago

4.25 kB

	import os
	import sys
	import torch
	import pickle
	import numpy as np
	import pandas as pd
	from tqdm import tqdm

	# Append the parent of this file's directory to the import search path;
	# the `from src.model import ...` line below relies on the `src` package
	# being importable when this file is executed directly.
	sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

	from src.model import LSTMAutoEncoder


	def decode_latent_vectors(
	    model_path,
	    synthetic_latent_path,
	    ticker_map_path,
	    output_path,
	    model_params,
	    seq_len=90,
	    device=None,
	    batch_size=128,
	):
	    """
	    Decode latent vectors back into OHLCV sequences using the trained LSTM Autoencoder.

	    Args:
	        model_path: Path to the saved ``state_dict`` of the autoencoder.
	        synthetic_latent_path: Path to the ``.npy`` file holding the latent
	            vectors. The ticker-id array is read from the same path with
	            ``"latent_vectors"`` replaced by ``"latent_tickers"``.
	        ticker_map_path: Path to a pickled ticker mapping. Supported forms:
	            an object with ``inverse_transform``, a list/ndarray indexed by
	            ticker id, or a dict keyed by integer ticker id.
	        output_path: Destination of the Parquet file that is written.
	        model_params: Dict providing ``num_tickers``, ``embed_dim``,
	            ``hidden_size``, ``latent_dim`` and ``num_layers``.
	        seq_len: Number of time steps decoded for every latent vector.
	        device: Torch device; when falsy, "cuda" is used if available,
	            otherwise "cpu".
	        batch_size: Number of latent vectors decoded per forward pass.

	    Returns:
	        pandas.DataFrame with one row per (sequence, time step) and columns
	        ``Ticker``, ``Ticker_Encoded``, ``TimeStep``, ``Open``, ``High``,
	        ``Low``, ``Close``, ``Volume``. The same frame is written to
	        ``output_path``.

	    Raises:
	        FileNotFoundError: If the derived ticker-id file does not exist.
	        ValueError: If ``batch_size`` is not positive, the ticker-id path
	            cannot be derived, the inputs are empty or misaligned, or the
	            decoder output does not have five features.
	        TypeError: If the pickled ticker mapping has an unsupported type.
	    """
	    if batch_size <= 0:
	        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

	    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

	    model = LSTMAutoEncoder(
	        input_dim=5,
	        num_tickers=model_params["num_tickers"],
	        embed_dim=model_params["embed_dim"],
	        hidden_size=model_params["hidden_size"],
	        latent_dim=model_params["latent_dim"],
	        num_layers=model_params["num_layers"],
	    ).to(device)

	    model.load_state_dict(torch.load(model_path, map_location=device))
	    model.eval()
	    print(f"Loaded trained LSTM Autoencoder from {model_path}")

	    # os.fspath lets callers pass pathlib.Path objects as well as strings.
	    latent_path = os.fspath(synthetic_latent_path)
	    latent_vectors = np.load(latent_path)
	    ticker_ids_path = latent_path.replace("latent_vectors", "latent_tickers")

	    # str.replace is a no-op when the marker is absent; without this guard
	    # the latent-vector file itself would be loaded as the ticker ids.
	    if ticker_ids_path == latent_path:
	        raise ValueError(
	            "Cannot derive the ticker-id file: 'latent_vectors' does not "
	            f"occur in {latent_path}"
	        )

	    if not os.path.exists(ticker_ids_path):
	        raise FileNotFoundError(f"Ticker ID array not found at {ticker_ids_path}")

	    ticker_ids = np.load(ticker_ids_path)

	    # Every latent vector needs exactly one ticker id, in the same order.
	    if ticker_ids.ndim != 1 or len(ticker_ids) != len(latent_vectors):
	        raise ValueError(
	            f"Ticker ids with shape {ticker_ids.shape} do not align with "
	            f"latent vectors with shape {latent_vectors.shape}"
	        )
	    if len(latent_vectors) == 0:
	        raise ValueError(f"No latent vectors to decode in {latent_path}")

	    latent_t = torch.tensor(latent_vectors, dtype=torch.float32).to(device)
	    ticker_t = torch.tensor(ticker_ids, dtype=torch.long).to(device)

	    decoded_batches = []

	    # Inference only: a single no_grad scope covers the whole loop.
	    with torch.no_grad():
	        for start in tqdm(range(0, len(latent_t), batch_size), desc="Decoding latent sequences"):
	            batch_latent = latent_t[start:start + batch_size]
	            batch_ticker = ticker_t[start:start + batch_size]

	            ticker_emb = model.ticker_embed(batch_ticker)
	            latent_cat = torch.cat([batch_latent, ticker_emb], dim=1)
	            # Present the same conditioned latent vector at every time step.
	            latent_cat = latent_cat.unsqueeze(1).repeat(1, seq_len, 1)
	            dec_input = model.fc_dec(latent_cat)
	            reconstructed, _ = model.decoder(dec_input)

	            decoded_batches.append(reconstructed.cpu().numpy())

	    decoded = np.concatenate(decoded_batches, axis=0)
	    print(f"Decoded {decoded.shape[0]} sequences of length {seq_len}")

	    # NOTE(security): pickle.load can execute arbitrary code; only point
	    # ticker_map_path at files produced by a trusted pipeline.
	    with open(ticker_map_path, "rb") as f:
	        label_encoder = pickle.load(f)

	    tickers = _lookup_ticker_labels(label_encoder, ticker_ids)

	    decoded_df = _build_ohlcv_frame(decoded, tickers, ticker_ids)
	    decoded_df.to_parquet(output_path, index=False)
	    print(f"Decoded OHLCV data saved to {output_path}")

	    return decoded_df


	def _lookup_ticker_labels(label_encoder, ticker_ids):
	    """Translate encoded ticker ids to labels using the unpickled mapping."""
	    if hasattr(label_encoder, "inverse_transform"):
	        return label_encoder.inverse_transform(ticker_ids)
	    if isinstance(label_encoder, (np.ndarray, list)):
	        return np.array(label_encoder)[ticker_ids]
	    if isinstance(label_encoder, dict):
	        return [label_encoder[int(i)] for i in ticker_ids]
	    raise TypeError(f"Unrecognized ticker mapping format: {type(label_encoder)}")


	def _build_ohlcv_frame(decoded, tickers, ticker_ids):
	    """Flatten an (n_sequences, n_steps, 5) array into one row per time step."""
	    feature_names = ("Open", "High", "Low", "Close", "Volume")
	    if decoded.ndim != 3 or decoded.shape[2] != len(feature_names):
	        raise ValueError(
	            f"Expected decoded output of shape (n, seq_len, {len(feature_names)}), "
	            f"got {decoded.shape}"
	        )

	    n_sequences, n_steps, n_features = decoded.shape

	    # Row-major reshape keeps rows ordered by sequence, then by time step.
	    # Cast to 64-bit so the numeric columns are float64/int64 regardless of
	    # the dtypes produced by the model or stored in the .npy files.
	    flat = decoded.reshape(n_sequences * n_steps, n_features).astype(np.float64)

	    columns = {
	        "Ticker": np.repeat(np.asarray(tickers, dtype=object), n_steps),
	        "Ticker_Encoded": np.repeat(np.asarray(ticker_ids, dtype=np.int64), n_steps),
	        "TimeStep": np.tile(np.arange(n_steps, dtype=np.int64), n_sequences),
	    }
	    for position, feature in enumerate(feature_names):
	        columns[feature] = flat[:, position]

	    return pd.DataFrame(columns)


	if __name__ == "__main__":
	    # Project root is the parent of the directory that contains this file.
	    script_dir = os.path.dirname(os.path.abspath(__file__))
	    project_root = os.path.dirname(script_dir)
	    processed_dir = os.path.join(project_root, "data", "processed")

	    # Default artifact locations and hyperparameters forwarded to
	    # decode_latent_vectors.
	    decode_latent_vectors(
	        model_path=os.path.join(project_root, "models", "lstm_autoencoder.pth"),
	        synthetic_latent_path=os.path.join(
	            project_root, "data", "latent", "synthetic_latent_vectors.npy"
	        ),
	        ticker_map_path=os.path.join(processed_dir, "ticker_label_encoder.pkl"),
	        output_path=os.path.join(processed_dir, "decoded_synthetic_ohlcv.parquet"),
	        model_params=dict(
	            num_layers=2,
	            hidden_size=64,
	            latent_dim=32,
	            embed_dim=16,
	            num_tickers=503,
	        ),
	        seq_len=90,
	    )