File size: 4,251 Bytes

bbf5d55

import os
import sys
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.model import LSTMAutoEncoder


def _resolve_ticker_names(ticker_mapping, ticker_ids):
    """Translate encoded ticker ids into ticker names using the unpickled mapping.

    Supported mapping objects: anything exposing ``inverse_transform``
    (presumably a scikit-learn ``LabelEncoder`` -- confirm against the
    training pipeline), a list / ndarray indexed by id, or a dict keyed by
    integer id.

    Raises:
        TypeError: If the mapping object is none of the supported kinds.
    """
    if hasattr(ticker_mapping, "inverse_transform"):
        return ticker_mapping.inverse_transform(ticker_ids)
    if isinstance(ticker_mapping, (np.ndarray, list)):
        return np.array(ticker_mapping)[ticker_ids]
    if isinstance(ticker_mapping, dict):
        return [ticker_mapping[int(i)] for i in ticker_ids]
    raise TypeError(f"Unrecognized ticker mapping format: {type(ticker_mapping)}")


def _build_ohlcv_frame(decoded, tickers, ticker_ids, seq_len):
    """Flatten decoded ``(n, seq_len, 5)`` sequences into one row per time step.

    Rows are ordered sequence-major (all time steps of sequence 0, then
    sequence 1, ...), and the five features along the last axis are written
    to the Open/High/Low/Close/Volume columns in that order.

    Raises:
        ValueError: If ``decoded`` is not shaped ``(n, seq_len, 5)``.
    """
    if decoded.ndim != 3 or decoded.shape[1] != seq_len or decoded.shape[2] != 5:
        raise ValueError(
            f"Expected decoded sequences of shape (n, {seq_len}, 5), "
            f"got {decoded.shape}"
        )

    n_sequences = decoded.shape[0]
    # One vectorised reshape instead of building n * seq_len Python dicts.
    values = decoded.reshape(n_sequences * seq_len, 5).astype(np.float64)

    return pd.DataFrame({
        "Ticker": np.repeat(np.asarray(tickers), seq_len),
        "Ticker_Encoded": np.repeat(ticker_ids, seq_len),
        "TimeStep": np.tile(np.arange(seq_len, dtype=np.int64), n_sequences),
        "Open": values[:, 0],
        "High": values[:, 1],
        "Low": values[:, 2],
        "Close": values[:, 3],
        "Volume": values[:, 4],
    })


def decode_latent_vectors(
    model_path,
    synthetic_latent_path,
    ticker_map_path,
    output_path,
    model_params,
    seq_len=90,
    device=None
):
    """
    Decode latent vectors back into OHLCV sequences using the trained LSTM Autoencoder.

    The ticker ids that accompany the latent vectors are read from a sibling
    file whose path is ``synthetic_latent_path`` with ``"latent_vectors"``
    replaced by ``"latent_tickers"``.

    Args:
        model_path: Path to the saved state dict of the autoencoder.
        synthetic_latent_path: Path to a ``.npy`` file of latent vectors
            (assumed to be shaped ``(n, latent_dim)`` -- TODO confirm).
        ticker_map_path: Path to a pickled id -> ticker mapping (an object
            with ``inverse_transform``, a list / ndarray, or a dict).
        output_path: Destination parquet file for the decoded table.
        model_params: Dict with keys ``num_tickers``, ``embed_dim``,
            ``hidden_size``, ``latent_dim`` and ``num_layers`` used to
            rebuild the model before loading its weights.
        seq_len: Number of time steps to decode per latent vector.
        device: Torch device; defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.

    Returns:
        The decoded ``pandas.DataFrame`` (also written to ``output_path``)
        with columns Ticker, Ticker_Encoded, TimeStep, Open, High, Low,
        Close, Volume and one row per (sequence, time step).

    Raises:
        FileNotFoundError: If the derived ticker-id file does not exist.
        ValueError: If the ticker-id path cannot be derived from
            ``synthetic_latent_path``, if the latent vectors and ticker ids
            differ in length, or if the decoder output is not
            ``(n, seq_len, 5)``.
        TypeError: If the pickled ticker mapping has an unsupported type.
    """

    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    model = LSTMAutoEncoder(
        input_dim=5,
        num_tickers=model_params["num_tickers"],
        embed_dim=model_params["embed_dim"],
        hidden_size=model_params["hidden_size"],
        latent_dim=model_params["latent_dim"],
        num_layers=model_params["num_layers"]
    ).to(device)

    # NOTE(security): torch.load unpickles the checkpoint -- only load files
    # from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"Loaded trained LSTM Autoencoder from {model_path}")

    latent_vectors = np.load(synthetic_latent_path)
    ticker_ids_path = synthetic_latent_path.replace("latent_vectors", "latent_tickers")

    # If the substitution did nothing, the "ticker ids" would silently be the
    # latent vectors themselves; fail loudly instead.
    if ticker_ids_path == synthetic_latent_path:
        raise ValueError(
            "Cannot derive the ticker-id file: expected 'latent_vectors' in "
            f"{synthetic_latent_path}"
        )

    if not os.path.exists(ticker_ids_path):
        raise FileNotFoundError(f"Ticker mapping not found at {ticker_ids_path}")

    # Normalise to integers once so tensor lookup, NumPy indexing and the
    # output column all see the same ids.
    ticker_ids = np.asarray(np.load(ticker_ids_path)).astype(np.int64)

    if len(latent_vectors) != len(ticker_ids):
        raise ValueError(
            f"Got {len(latent_vectors)} latent vectors but "
            f"{len(ticker_ids)} ticker ids"
        )

    latent_t = torch.tensor(latent_vectors, dtype=torch.float32).to(device)
    ticker_t = torch.tensor(ticker_ids, dtype=torch.long).to(device)

    decoded_batches = []
    batch_size = 128

    with torch.no_grad():
        for start in tqdm(range(0, len(latent_t), batch_size), desc="Decoding latent sequences"):
            batch_latent = latent_t[start:start + batch_size]
            batch_ticker = ticker_t[start:start + batch_size]

            # Condition each latent vector on its ticker embedding, then feed
            # the same conditioned vector to the decoder at every time step.
            ticker_emb = model.ticker_embed(batch_ticker)
            latent_cat = torch.cat([batch_latent, ticker_emb], dim=1)
            latent_cat = latent_cat.unsqueeze(1).repeat(1, seq_len, 1)
            dec_input = model.fc_dec(latent_cat)
            # NOTE(review): the decoder output is used directly as the five
            # OHLCV features -- confirm the model has no separate output
            # projection that should be applied here.
            reconstructed, _ = model.decoder(dec_input)

            decoded_batches.append(reconstructed.cpu().numpy())

    if decoded_batches:
        decoded = np.concatenate(decoded_batches, axis=0)
    else:
        # No latent vectors: produce an empty table rather than crashing in
        # np.concatenate.
        decoded = np.empty((0, seq_len, 5), dtype=np.float32)
    print(f"Decoded {decoded.shape[0]} sequences of length {seq_len}")

    # NOTE(security): pickle.load executes arbitrary code from the file --
    # only use mapping files produced by this project.
    with open(ticker_map_path, "rb") as f:
        label_encoder = pickle.load(f)

    tickers = _resolve_ticker_names(label_encoder, ticker_ids)

    decoded_df = _build_ohlcv_frame(decoded, tickers, ticker_ids, seq_len)
    decoded_df.to_parquet(output_path, index=False)
    print(f"Decoded OHLCV data saved to {output_path}")

    return decoded_df


if __name__ == "__main__":
    # The project root is the parent of the directory holding this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    processed_dir = os.path.join(project_root, "data", "processed")

    # Architecture settings used to rebuild the autoencoder before its saved
    # weights are loaded.
    autoencoder_config = dict(
        num_layers=2,
        hidden_size=64,
        latent_dim=32,
        embed_dim=16,
        num_tickers=503,
    )

    decode_latent_vectors(
        model_path=os.path.join(project_root, "models", "lstm_autoencoder.pth"),
        synthetic_latent_path=os.path.join(
            project_root, "data", "latent", "synthetic_latent_vectors.npy"
        ),
        ticker_map_path=os.path.join(processed_dir, "ticker_label_encoder.pkl"),
        output_path=os.path.join(processed_dir, "decoded_synthetic_ohlcv.parquet"),
        model_params=autoencoder_config,
        seq_len=90,
    )