"""Generate synthetic latent vectors with a trained conditional GAN and
evaluate them against the real latents.

Pipeline:
  1. Load generator config, ticker label encoder, and generator weights.
  2. Sample one synthetic latent per real latent, conditioned on random tickers.
  3. Save the synthetic latents/ticker ids next to the real ones.
  4. Compare real vs. synthetic per latent dimension (mean/std/KS test) and
     via the Frobenius distance between correlation matrices; dump to JSON.
"""

import json
import os
import sys

import joblib
import matplotlib.pyplot as plt  # noqa: F401  (kept from original script)
import numpy as np
import pandas as pd  # noqa: F401  (kept from original script)
import torch
from scipy.stats import ks_2samp
from tqdm import tqdm

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.model import ConditionalGenerator

# --- Project path layout (script lives one level below the repo root) -------
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
processed_data_path = os.path.join(base_dir, "data", "processed", "stock_data.parquet")
latent_dir = os.path.join(base_dir, "data", "processed")
resources_dir = os.path.join(base_dir, "resources")
models_dir = os.path.join(base_dir, "models")

latent_path = os.path.join(latent_dir, "latent_vectors.npy")
tickers_path = os.path.join(latent_dir, "sequence_tickers.npy")
label_encoder_path = os.path.join(base_dir, "data", "processed", "ticker_label_encoder.pkl")
gen_path = os.path.join(models_dir, "latent_gan_generator_conditional.pth")
gan_config_path = os.path.join(resources_dir, "gan_config.json")


def _load_generator(device: torch.device):
    """Build the conditional generator from config + checkpoint.

    Returns:
        (G, num_tickers, noise_dim): the eval-mode generator on `device`,
        the number of ticker classes, and the noise dimensionality.
    """
    with open(gan_config_path, "r") as f:
        params = json.load(f)
    noise_dim = params["noise_dim"]
    latent_dim = params["latent_dim"]
    hidden_dim = params["hidden_dim"]
    embed_dim = params.get("embed_dim", 8)  # default matches training fallback

    # Number of conditioning classes comes from the fitted label encoder.
    label_encoder = joblib.load(label_encoder_path)
    num_tickers = len(label_encoder.classes_)

    G = ConditionalGenerator(
        noise_dim=noise_dim,
        latent_dim=latent_dim,
        embed_dim=embed_dim,
        num_tickers=num_tickers,
        hidden_dim=hidden_dim,
    ).to(device)
    G.load_state_dict(torch.load(gen_path, map_location=device))
    G.eval()
    print(f"Loaded conditional generator (latent_dim={latent_dim}, embed_dim={embed_dim})")
    return G, num_tickers, noise_dim


def _generate_synthetic(G, num_tickers, noise_dim, n_samples, device, batch_size=512):
    """Sample `n_samples` latent vectors from G under random ticker conditions.

    Returns:
        (synth_latent, ticker_ids): (n_samples, latent_dim) float array and the
        (n_samples,) int array of ticker ids each row was conditioned on.
    """
    ticker_ids = np.random.randint(0, num_tickers, n_samples)
    batches = []
    n_batches = (n_samples + batch_size - 1) // batch_size
    # Inference only: no_grad avoids building autograd graphs per batch.
    with torch.no_grad():
        for i in tqdm(range(n_batches), desc="Generating synthetic latents"):
            b = min(batch_size, n_samples - i * batch_size)
            z = torch.randn(b, noise_dim).to(device)
            tickers_batch = torch.tensor(
                ticker_ids[i * batch_size: i * batch_size + b], dtype=torch.long
            ).to(device)
            batches.append(G(z, tickers_batch).cpu().numpy())
    return np.vstack(batches), ticker_ids


def _evaluate(real_latent, synth_latent):
    """Compare real vs. synthetic latents.

    Per shared dimension: means, stds, their differences, and a two-sample
    KS test. Globally: Frobenius norm of the correlation-matrix difference.

    Returns:
        dict with "per_dimension" and "correlation" sections (JSON-safe floats).
    """
    metrics = {"per_dimension": {}, "correlation": {}}

    # Only compare dimensions present in both arrays.
    min_dim = min(real_latent.shape[1], synth_latent.shape[1])
    real_latent = real_latent[:, :min_dim]
    synth_latent = synth_latent[:, :min_dim]

    for i in range(min_dim):
        r = real_latent[:, i]
        s = synth_latent[:, i]
        # ks_2samp handles unequal sample sizes and the statistic is invariant
        # to ordering, so no random subsampling/permutation is needed (the
        # original subsample was a pure permutation since both arrays have the
        # same length by construction).
        ks_stat, ks_p = ks_2samp(r, s)
        metrics["per_dimension"][f"latent_{i}"] = {
            "real_mean": float(r.mean()),
            "synth_mean": float(s.mean()),
            "mean_diff": float(s.mean() - r.mean()),
            "real_std": float(r.std()),
            "synth_std": float(s.std()),
            "std_diff": float(s.std() - r.std()),
            "ks_stat": float(ks_stat),
            "ks_pvalue": float(ks_p),
        }

    real_corr = np.corrcoef(real_latent, rowvar=False)
    synth_corr = np.corrcoef(synth_latent, rowvar=False)
    metrics["correlation"]["frobenius_diff"] = float(
        np.linalg.norm(real_corr - synth_corr, ord='fro')
    )
    return metrics


def main():
    """Run generation + evaluation end to end and persist the artifacts."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    G, num_tickers, noise_dim = _load_generator(device)

    real_latent = np.load(latent_path)
    synth_latent, ticker_ids = _generate_synthetic(
        G, num_tickers, noise_dim, real_latent.shape[0], device
    )

    # Persist synthetic latents before any dimension truncation.
    np.save(os.path.join(latent_dir, "synthetic_latent_vectors.npy"), synth_latent)
    np.save(os.path.join(latent_dir, "synthetic_latent_tickers.npy"), ticker_ids)
    print(f"Saved synthetic latent vectors to {latent_dir}")

    metrics = _evaluate(real_latent, synth_latent)

    # Save metrics
    METRICS_JSON = os.path.join(resources_dir, "conditional_gan_metrics.json")
    with open(METRICS_JSON, "w") as f:
        json.dump(metrics, f, indent=4)
    print(f"Evaluation metrics saved to: {METRICS_JSON}")


if __name__ == "__main__":
    main()