File size: 3,711 Bytes
bbf5d55 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import os
import sys
import json

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from scipy.stats import ks_2samp
from tqdm import tqdm

# Make the project root importable before pulling in local modules.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.model import ConditionalGenerator
# Resolve project-relative locations; this script lives one level below the repo root.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
latent_dir = os.path.join(base_dir, "data", "processed")
resources_dir = os.path.join(base_dir, "resources")
models_dir = os.path.join(base_dir, "models")

# Inputs produced by earlier pipeline stages.
processed_data_path = os.path.join(latent_dir, "stock_data.parquet")
latent_path = os.path.join(latent_dir, "latent_vectors.npy")
tickers_path = os.path.join(latent_dir, "sequence_tickers.npy")
label_encoder_path = os.path.join(latent_dir, "ticker_label_encoder.pkl")

# Trained generator weights and its training-time configuration.
gen_path = os.path.join(models_dir, "latent_gan_generator_conditional.pth")
gan_config_path = os.path.join(resources_dir, "gan_config.json")
# Load the GAN hyperparameters that were saved at training time so the
# generator can be rebuilt with an identical architecture.
with open(gan_config_path, "r") as config_file:
    params = json.load(config_file)

noise_dim, latent_dim, hidden_dim = (
    params["noise_dim"],
    params["latent_dim"],
    params["hidden_dim"],
)
# Older configs predate the ticker embedding; fall back to its historical default.
embed_dim = params.get("embed_dim", 8)
# Prefer GPU when available; everything below works on CPU as well.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Recover the ticker -> integer-id mapping fitted during preprocessing;
# its class count sizes the generator's conditioning embedding.
# (import joblib moved to the top-of-file import block.)
label_encoder = joblib.load(label_encoder_path)
num_tickers = len(label_encoder.classes_)
# Rebuild the conditional generator with the training-time hyperparameters
# and restore its learned weights for inference.
gen_kwargs = dict(
    noise_dim=noise_dim,
    latent_dim=latent_dim,
    embed_dim=embed_dim,
    num_tickers=num_tickers,
    hidden_dim=hidden_dim,
)
G = ConditionalGenerator(**gen_kwargs).to(device)
state_dict = torch.load(gen_path, map_location=device)
G.load_state_dict(state_dict)
G.eval()  # inference mode: freeze dropout / norm-layer statistics
print(f"Loaded conditional generator (latent_dim={latent_dim}, embed_dim={embed_dim})")
# Sample one synthetic latent vector per real one, conditioned on random tickers.
real_latent = np.load(latent_path)
N_SAMPLES = real_latent.shape[0]
batch_size = 512

# NOTE(review): ticker ids are drawn uniformly, not matched to the real
# per-ticker distribution — confirm this is intentional for evaluation.
ticker_ids = np.random.randint(0, num_tickers, N_SAMPLES)

samples = []
n_batches = (N_SAMPLES + batch_size - 1) // batch_size
# Inference only: disable autograd so no computation graph is built for the
# generator forward pass (the original detached after the fact, still paying
# the graph-construction cost on every batch).
with torch.no_grad():
    for i in tqdm(range(n_batches), desc="Generating synthetic latents"):
        start = i * batch_size
        b = min(batch_size, N_SAMPLES - start)  # last batch may be short
        z = torch.randn(b, noise_dim, device=device)
        tickers_batch = torch.tensor(
            ticker_ids[start:start + b], dtype=torch.long, device=device
        )
        out = G(z, tickers_batch).cpu().numpy()
        samples.append(out)
synth_latent = np.vstack(samples)

np.save(os.path.join(latent_dir, "synthetic_latent_vectors.npy"), synth_latent)
np.save(os.path.join(latent_dir, "synthetic_latent_tickers.npy"), ticker_ids)
print(f"Saved synthetic latent vectors to {latent_dir}")
# Per-dimension distributional comparison plus a correlation-structure check.
metrics = {"per_dimension": {}, "correlation": {}}

# Compare only the dimensions present in both sets (they should match, but be safe).
min_dim = min(real_latent.shape[1], synth_latent.shape[1])
real_latent = real_latent[:, :min_dim]
synth_latent = synth_latent[:, :min_dim]

for dim in range(min_dim):
    r = real_latent[:, dim]
    s = synth_latent[:, dim]
    # ks_2samp supports unequal sample sizes, so compare the full samples
    # directly. The original randomly subsampled both sides to a common
    # length first — here that was a pure permutation (both arrays already
    # have N_SAMPLES rows, and the KS statistic is order-invariant), so it
    # only added nondeterminism without changing the result.
    ks_stat, ks_p = ks_2samp(r, s)
    metrics["per_dimension"][f"latent_{dim}"] = {
        "real_mean": float(r.mean()),
        "synth_mean": float(s.mean()),
        "mean_diff": float(s.mean() - r.mean()),
        "real_std": float(r.std()),
        "synth_std": float(s.std()),
        "std_diff": float(s.std() - r.std()),
        "ks_stat": float(ks_stat),
        "ks_pvalue": float(ks_p),
    }

# How far apart are the real vs. synthetic dimension-correlation matrices?
real_corr = np.corrcoef(real_latent, rowvar=False)
synth_corr = np.corrcoef(synth_latent, rowvar=False)
metrics["correlation"]["frobenius_diff"] = float(np.linalg.norm(real_corr - synth_corr, ord='fro'))
# Persist the evaluation metrics for downstream reporting.
METRICS_JSON = os.path.join(resources_dir, "conditional_gan_metrics.json")
serialized = json.dumps(metrics, indent=4)
with open(METRICS_JSON, "w") as out_file:
    out_file.write(serialized)
print(f"Evaluation metrics saved to: {METRICS_JSON}")
|