# Synthetic_Stock_Data / src / gan_evaluate.py
# Author: Raheel Abdul Rehman
# Prod Publish — commit bbf5d55
import json
import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from scipy.stats import ks_2samp
from tqdm import tqdm

# Make the project root importable when this file is run directly.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.model import ConditionalGenerator
# --- Filesystem layout ---------------------------------------------------
# Every path is resolved relative to the repository root, i.e. the parent
# of the directory containing this file.
_this_file = os.path.abspath(__file__)
base_dir = os.path.dirname(os.path.dirname(_this_file))

latent_dir = os.path.join(base_dir, "data", "processed")
resources_dir = os.path.join(base_dir, "resources")
models_dir = os.path.join(base_dir, "models")

# Artifacts produced by earlier pipeline stages.
processed_data_path = os.path.join(latent_dir, "stock_data.parquet")
latent_path = os.path.join(latent_dir, "latent_vectors.npy")
tickers_path = os.path.join(latent_dir, "sequence_tickers.npy")
label_encoder_path = os.path.join(latent_dir, "ticker_label_encoder.pkl")

# Trained generator weights and the GAN hyper-parameter config.
gen_path = os.path.join(models_dir, "latent_gan_generator_conditional.pth")
gan_config_path = os.path.join(resources_dir, "gan_config.json")
# Load the hyper-parameters that were saved when the GAN was trained so the
# generator can be rebuilt with an identical architecture.
with open(gan_config_path, "r") as cfg_file:
    params = json.load(cfg_file)

noise_dim = params["noise_dim"]
latent_dim = params["latent_dim"]
hidden_dim = params["hidden_dim"]
# Older configs may lack this key; 8 is the historical default used in training.
embed_dim = params.get("embed_dim", 8)

# Run on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Recover the ticker vocabulary size from the label encoder fitted during
# preprocessing; the generator conditions on ticker ids in [0, num_tickers).
# (joblib is imported with the rest of the dependencies at the top of the file.)
label_encoder = joblib.load(label_encoder_path)
num_tickers = len(label_encoder.classes_)

# Rebuild the generator with the exact training-time architecture, restore
# its trained weights, and switch to inference mode.
G = ConditionalGenerator(
    noise_dim=noise_dim,
    latent_dim=latent_dim,
    embed_dim=embed_dim,
    num_tickers=num_tickers,
    hidden_dim=hidden_dim,
).to(device)
# NOTE(review): torch.load on a project-local checkpoint; the path is not
# user-controlled here, but prefer weights_only=True on torch >= 2.0.
G.load_state_dict(torch.load(gen_path, map_location=device))
G.eval()
print(f"Loaded conditional generator (latent_dim={latent_dim}, embed_dim={embed_dim})")
# Generate one synthetic latent vector per real one, each conditioned on a
# uniformly sampled ticker id, and persist the results for downstream use.
real_latent = np.load(latent_path)
N_SAMPLES = real_latent.shape[0]
batch_size = 512
ticker_ids = np.random.randint(0, num_tickers, N_SAMPLES)

samples = []
n_batches = (N_SAMPLES + batch_size - 1) // batch_size  # ceil division
# Inference only: no_grad() skips autograd graph construction, saving time
# and memory versus calling .detach() on graph-carrying outputs.
with torch.no_grad():
    for i in tqdm(range(n_batches), desc="Generating synthetic latents"):
        start = i * batch_size
        b = min(batch_size, N_SAMPLES - start)  # last batch may be short
        z = torch.randn(b, noise_dim, device=device)
        tickers_batch = torch.as_tensor(
            ticker_ids[start:start + b], dtype=torch.long
        ).to(device)
        samples.append(G(z, tickers_batch).cpu().numpy())
synth_latent = np.vstack(samples)

np.save(os.path.join(latent_dir, "synthetic_latent_vectors.npy"), synth_latent)
np.save(os.path.join(latent_dir, "synthetic_latent_tickers.npy"), ticker_ids)
print(f"Saved synthetic latent vectors to {latent_dir}")
# --- Distributional comparison of real vs. synthetic latents -------------
metrics = {"per_dimension": {}, "correlation": {}}

# Guard against a dimensionality mismatch between the stored real latents
# and what the generator emits: compare only the shared leading dimensions.
min_dim = min(real_latent.shape[1], synth_latent.shape[1])
real_latent = real_latent[:, :min_dim]
synth_latent = synth_latent[:, :min_dim]

for i in range(min_dim):
    r = real_latent[:, i]
    s = synth_latent[:, i]
    # ks_2samp accepts samples of unequal size, so the previous random
    # subsampling was unnecessary and made the metric nondeterministic;
    # use the full samples directly.
    ks_stat, ks_p = ks_2samp(r, s)
    metrics["per_dimension"][f"latent_{i}"] = {
        "real_mean": float(r.mean()),
        "synth_mean": float(s.mean()),
        "mean_diff": float(s.mean() - r.mean()),
        "real_std": float(r.std()),
        "synth_std": float(s.std()),
        "std_diff": float(s.std() - r.std()),
        "ks_stat": float(ks_stat),
        "ks_pvalue": float(ks_p),
    }

# Second-order structure: Frobenius norm of the difference between the
# feature-wise (column-wise) correlation matrices.
real_corr = np.corrcoef(real_latent, rowvar=False)
synth_corr = np.corrcoef(synth_latent, rowvar=False)
metrics["correlation"]["frobenius_diff"] = float(np.linalg.norm(real_corr - synth_corr, ord='fro'))
# Persist the evaluation metrics next to the other pipeline resources.
METRICS_JSON = os.path.join(resources_dir, "conditional_gan_metrics.json")
with open(METRICS_JSON, "w") as out_file:
    out_file.write(json.dumps(metrics, indent=4))
print(f"Evaluation metrics saved to: {METRICS_JSON}")