|
|
import json
import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from scipy.stats import ks_2samp
from tqdm import tqdm
|
|
|
|
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) |
|
|
|
|
|
from src.model import ConditionalGenerator |
|
|
|
|
|
# --- Project layout ------------------------------------------------------
# Everything is resolved relative to the repository root, which sits one
# directory above this script.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Shared directories.
latent_dir = os.path.join(base_dir, "data", "processed")
resources_dir = os.path.join(base_dir, "resources")
models_dir = os.path.join(base_dir, "models")

# Individual artifacts produced by earlier pipeline stages.
processed_data_path = os.path.join(latent_dir, "stock_data.parquet")
latent_path = os.path.join(latent_dir, "latent_vectors.npy")
tickers_path = os.path.join(latent_dir, "sequence_tickers.npy")
label_encoder_path = os.path.join(latent_dir, "ticker_label_encoder.pkl")
gen_path = os.path.join(models_dir, "latent_gan_generator_conditional.pth")
gan_config_path = os.path.join(resources_dir, "gan_config.json")
|
|
|
|
|
# Load the GAN hyper-parameters that were saved at training time, so the
# generator is rebuilt with the exact architecture it was trained with.
with open(gan_config_path, "r") as f:
    params = json.load(f)

noise_dim, latent_dim, hidden_dim = (
    params[k] for k in ("noise_dim", "latent_dim", "hidden_dim")
)
# Older configs may predate the ticker embedding; fall back to 8.
embed_dim = params.get("embed_dim", 8)
|
|
|
|
|
# Run on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the ticker label encoder fitted during preprocessing; its class
# list defines the valid range of ticker ids the generator can condition on.
# (joblib is imported at the top of the file, where module imports belong.)
label_encoder = joblib.load(label_encoder_path)
num_tickers = len(label_encoder.classes_)
|
|
|
|
|
# Rebuild the conditional generator with the training-time architecture and
# restore its learned weights.
gen_kwargs = dict(
    noise_dim=noise_dim,
    latent_dim=latent_dim,
    embed_dim=embed_dim,
    num_tickers=num_tickers,
    hidden_dim=hidden_dim,
)
G = ConditionalGenerator(**gen_kwargs).to(device)

state_dict = torch.load(gen_path, map_location=device)
G.load_state_dict(state_dict)
G.eval()  # switch to inference mode before sampling

print(f"Loaded conditional generator (latent_dim={latent_dim}, embed_dim={embed_dim})")
|
|
|
|
|
# Generate exactly as many synthetic latents as there are real ones, and
# draw a random ticker id for each sample to condition on.
# NOTE(review): no RNG seed is set, so the drawn ticker ids differ run-to-run.
real_latent = np.load(latent_path)
N_SAMPLES = len(real_latent)
batch_size = 512

ticker_ids = np.random.randint(0, num_tickers, size=N_SAMPLES)
samples = []
|
|
|
|
|
# Generate synthetic latents in fixed-size batches. Inference only, so run
# under no_grad(): the original built an autograd graph for every forward
# pass and immediately threw it away with .detach(), wasting memory/time.
n_batches = (N_SAMPLES + batch_size - 1) // batch_size  # ceil division
with torch.no_grad():
    for i in tqdm(range(n_batches), desc="Generating synthetic latents"):
        start = i * batch_size
        b = min(batch_size, N_SAMPLES - start)  # last batch may be short
        z = torch.randn(b, noise_dim, device=device)
        tickers_batch = torch.as_tensor(
            ticker_ids[start:start + b], dtype=torch.long
        ).to(device)
        out = G(z, tickers_batch).cpu().numpy()
        samples.append(out)
|
|
|
|
|
# Stack per-batch outputs into one array and persist it together with the
# ticker ids each sample was conditioned on.
synth_latent = np.vstack(samples)

for fname, arr in (
    ("synthetic_latent_vectors.npy", synth_latent),
    ("synthetic_latent_tickers.npy", ticker_ids),
):
    np.save(os.path.join(latent_dir, fname), arr)

print(f"Saved synthetic latent vectors to {latent_dir}")
|
|
|
|
|
# Evaluate real vs. synthetic latents on a common dimensionality: if the
# two arrays disagree on width, compare only the shared leading dims.
metrics = {"per_dimension": {}, "correlation": {}}
min_dim = min(real_latent.shape[1], synth_latent.shape[1])
real_latent, synth_latent = real_latent[:, :min_dim], synth_latent[:, :min_dim]
|
|
|
|
|
# Per-dimension distribution comparison: first/second moments plus a
# two-sample Kolmogorov-Smirnov test between real and synthetic marginals.
for dim in range(min_dim):
    r = real_latent[:, dim]
    s = synth_latent[:, dim]
    # ks_2samp handles unequal sample sizes natively, so the original
    # random subsampling (np.random.choice without a seed) was unnecessary
    # and made the reported statistics nondeterministic. Use full samples.
    ks_stat, ks_p = ks_2samp(r, s)
    metrics["per_dimension"][f"latent_{dim}"] = {
        "real_mean": float(r.mean()),
        "synth_mean": float(s.mean()),
        "mean_diff": float(s.mean() - r.mean()),
        "real_std": float(r.std()),
        "synth_std": float(s.std()),
        "std_diff": float(s.std() - r.std()),
        "ks_stat": float(ks_stat),
        "ks_pvalue": float(ks_p),
    }
|
|
|
|
|
# Correlation-structure comparison: Frobenius norm of the gap between the
# real and synthetic dimension-wise correlation matrices (0 == identical).
real_corr = np.corrcoef(real_latent, rowvar=False)
synth_corr = np.corrcoef(synth_latent, rowvar=False)
corr_gap = real_corr - synth_corr
metrics["correlation"]["frobenius_diff"] = float(np.linalg.norm(corr_gap, ord="fro"))
|
|
|
|
|
|
|
|
# Write the full evaluation report to resources/ as pretty-printed JSON.
METRICS_JSON = os.path.join(resources_dir, "conditional_gan_metrics.json")

with open(METRICS_JSON, "w") as out_file:
    json.dump(metrics, out_file, indent=4)

print(f"Evaluation metrics saved to: {METRICS_JSON}")
|
|
|