# Evaluate synthetic tabular data against real data with synthcity's
# AlphaPrecision metric (naive alpha-precision / beta-recall scores).
import numpy as np
import pandas as pd
import os
import sys
import json

# Make the parent of this file's directory importable.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from sklearn.preprocessing import OneHotEncoder
from synthcity.metrics import eval_detection, eval_performance, eval_statistical
from synthcity.plugins.core.dataloader import GenericDataLoader

pd.options.mode.chained_assignment = None

import argparse


def _split_column_indices(info):
    """Return ``(num_col_idx, cat_col_idx)`` with the target columns appended.

    The target columns are added to the numerical group when
    ``info['task_type'] == 'regression'`` and to the categorical group
    otherwise. New lists are built so that the lists stored in ``info`` are
    never extended in place (doing so would append the target columns again
    on every subsequent call with the same ``info``).
    """
    num_col_idx = list(info['num_col_idx'])
    cat_col_idx = list(info['cat_col_idx'])
    target_col_idx = list(info['target_col_idx'])

    if info['task_type'] == 'regression':
        num_col_idx += target_col_idx
    else:
        cat_col_idx += target_col_idx

    return num_col_idx, cat_col_idx


def _one_hot_encode(cat_real_np, cat_syn_np, fit_on_synthetic):
    """One-hot encode the real and synthetic categorical string arrays.

    Args:
        cat_real_np: 2-D string array of real categorical values.
        cat_syn_np: 2-D string array of synthetic categorical values.
        fit_on_synthetic: If true, the encoder is fitted on the real and
            synthetic rows stacked together; otherwise on the real rows only,
            in which case a synthetic category that never occurs in the real
            data makes ``transform`` raise (OneHotEncoder's default handling
            of unknown categories).

    Returns:
        ``(cat_real_oh, cat_syn_oh)`` dense arrays. When there are no
        categorical columns, two arrays with zero columns are returned and no
        encoder is fitted.
    """
    if cat_real_np.shape[1] == 0:
        # OneHotEncoder cannot be fitted on an array without features.
        return (
            np.empty((cat_real_np.shape[0], 0)),
            np.empty((cat_syn_np.shape[0], 0)),
        )

    encoder = OneHotEncoder()
    if fit_on_synthetic:
        encoder.fit(np.concatenate((cat_real_np, cat_syn_np), axis=0))
    else:
        encoder.fit(cat_real_np)

    cat_real_oh = encoder.transform(cat_real_np).toarray()
    cat_syn_oh = encoder.transform(cat_syn_np).toarray()
    return cat_real_oh, cat_syn_oh


def _alpha_precision_beta_recall(num_real_np, cat_real_oh, num_syn_np, cat_syn_oh):
    """Run synthcity's AlphaPrecision on the assembled feature matrices.

    Numerical and one-hot columns are concatenated per data set, cast to
    float, wrapped in ``GenericDataLoader`` objects and evaluated. The scores
    are printed and returned.

    Returns:
        The values stored under ``'delta_precision_alpha_naive'`` and
        ``'delta_coverage_beta_naive'`` in the evaluator's result, in that
        order.
    """
    le_real_data = pd.DataFrame(
        np.concatenate((num_real_np, cat_real_oh), axis=1)
    ).astype(float)
    le_syn_data = pd.DataFrame(
        np.concatenate((num_syn_np, cat_syn_oh), axis=1)
    ).astype(float)

    np.set_printoptions(precision=4)

    print('=========== All Features ===========')
    print('Data shape: ', le_syn_data.shape)

    X_syn_loader = GenericDataLoader(le_syn_data)
    X_real_loader = GenericDataLoader(le_real_data)

    quality_evaluator = eval_statistical.AlphaPrecision()
    qual_res = quality_evaluator.evaluate(X_real_loader, X_syn_loader)

    # Only the "naive" variants of the scores are reported.
    alpha_precision = qual_res['delta_precision_alpha_naive']
    beta_recall = qual_res['delta_coverage_beta_naive']

    print('alpha precision: {:.6f}, beta recall: {:.6f}'.format(
        alpha_precision, beta_recall))

    return alpha_precision, beta_recall


def _stringify_synthetic_categories(cat_syn_data, cat_real_data, dataname, model):
    """Convert synthetic categorical columns to strings, with model-specific fixes.

    By default every value is simply converted with ``astype('str')``. Some
    model/dataset combinations are first cast to ``int`` (and, for 'great',
    clipped to the range of the real data) before being turned into strings:

    * models starting with 'codi' on 'default' / 'news': all columns cast to int;
    * models starting with 'great' on 'shoppers': columns labelled 11-14 are
      cast to int, column 14 additionally capped at the real maximum;
    * models starting with 'great' on 'default' / 'faults' / 'beijing': every
      column whose real counterpart has an integer dtype is clipped to the
      real [min, max] and cast to int.

    Returns:
        2-D numpy string array with one column per categorical column.
    """
    # Work on a private copy so the clipping below never touches the caller's frame.
    cat_syn_data = cat_syn_data.copy()
    cat_syn_np = cat_syn_data.to_numpy().astype('str')

    if dataname in ('default', 'news') and model.startswith('codi'):
        return cat_syn_data.astype('int').to_numpy().astype('str')

    if not model.startswith('great'):
        return cat_syn_np

    if dataname == 'shoppers':
        # (position in the categorical array, column label in the data frame)
        for pos, col in ((1, 11), (2, 12), (3, 13)):
            cat_syn_np[:, pos] = cat_syn_data[col].astype('int').to_numpy().astype('str')

        # Cap column 14 at the largest value seen in the real data.
        max_data = cat_real_data[14].max()
        cat_syn_data.loc[cat_syn_data[14] > max_data, 14] = max_data
        cat_syn_np[:, 4] = cat_syn_data[14].astype('int').to_numpy().astype('str')

    elif dataname in ('default', 'faults', 'beijing'):
        for i, col in enumerate(cat_real_data.columns):
            # is_integer_dtype instead of `dtype == 'int'`: the latter depends
            # on the platform's default integer width.
            if pd.api.types.is_integer_dtype(cat_real_data[col]):
                max_data = cat_real_data[col].max()
                min_data = cat_real_data[col].min()

                cat_syn_data.loc[cat_syn_data[col] > max_data, col] = max_data
                cat_syn_data.loc[cat_syn_data[col] < min_data, col] = min_data

                cat_syn_np[:, i] = cat_syn_data[col].astype('int').to_numpy().astype('str')

    return cat_syn_np


def eval_metrics(syn_data, real_data, info):
    """Compute naive alpha-precision and beta-recall of synthetic vs. real data.

    Args:
        syn_data: DataFrame with the synthetic samples.
        real_data: DataFrame with the real samples.
        info: Mapping with the keys ``'num_col_idx'``, ``'cat_col_idx'``,
            ``'target_col_idx'`` (lists of positional column indices) and
            ``'task_type'``. It is not modified.

    Returns:
        Tuple ``(alpha_precision, beta_recall)``.

    Note:
        The columns of ``real_data`` and ``syn_data`` are relabelled in place
        to ``0..n-1``. Categorical columns are one-hot encoded with an encoder
        fitted on the real and synthetic values together.
    """
    real_data.columns = range(len(real_data.columns))
    syn_data.columns = range(len(syn_data.columns))

    num_col_idx, cat_col_idx = _split_column_indices(info)

    num_real_data_np = real_data[num_col_idx].to_numpy()
    cat_real_data_np = real_data[cat_col_idx].to_numpy().astype('str')

    num_syn_data_np = syn_data[num_col_idx].to_numpy()
    cat_syn_data_np = syn_data[cat_col_idx].to_numpy().astype('str')

    cat_real_data_oh, cat_syn_data_oh = _one_hot_encode(
        cat_real_data_np, cat_syn_data_np, fit_on_synthetic=True)

    return _alpha_precision_beta_recall(
        num_real_data_np, cat_real_data_oh, num_syn_data_np, cat_syn_data_oh)


def main():
    """Command-line entry point: evaluate one synthetic CSV and save the scores.

    Reads ``data/<dataname>/info.json``, ``synthetic/<dataname>/real.csv`` and
    the synthetic CSV (``--path`` or ``synthetic/<dataname>/<model>.csv``),
    then writes alpha-precision and beta-recall, one per line, to
    ``eval/quality/<dataname>/<model>.txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataname', type=str, default='adult')
    parser.add_argument('--model', type=str, default='model')
    parser.add_argument('--path', type=str, default = None, help='The file path of the synthetic data')

    args = parser.parse_args()

    dataname = args.dataname
    model = args.model

    syn_path = args.path if args.path else f'synthetic/{dataname}/{model}.csv'
    real_path = f'synthetic/{dataname}/real.csv'
    data_dir = f'data/{dataname}'

    print(syn_path)

    with open(f'{data_dir}/info.json', 'r') as f:
        info = json.load(f)

    syn_data = pd.read_csv(syn_path)
    real_data = pd.read_csv(real_path)

    real_data.columns = range(len(real_data.columns))
    syn_data.columns = range(len(syn_data.columns))

    num_col_idx, cat_col_idx = _split_column_indices(info)

    num_real_data_np = real_data[num_col_idx].to_numpy()
    cat_real_data = real_data[cat_col_idx]
    cat_real_data_np = cat_real_data.to_numpy().astype('str')

    num_syn_data_np = syn_data[num_col_idx].to_numpy()

    # Special treatment for default dataset and CoDi model (and for GReaT
    # outputs on a few data sets) so that synthetic categories are formatted
    # like the real ones.
    cat_syn_data_np = _stringify_synthetic_categories(
        syn_data[cat_col_idx], cat_real_data, dataname, model)

    # The encoder is fitted on the real data only, so an unseen synthetic
    # category raises instead of silently widening the feature space.
    cat_real_data_oh, cat_syn_data_oh = _one_hot_encode(
        cat_real_data_np, cat_syn_data_np, fit_on_synthetic=False)

    Alpha_Precision_all, Beta_Recall_all = _alpha_precision_beta_recall(
        num_real_data_np, cat_real_data_oh, num_syn_data_np, cat_syn_data_oh)

    save_dir = f'eval/quality/{dataname}'
    os.makedirs(save_dir, exist_ok=True)

    with open(f'{save_dir}/{model}.txt', 'w') as f:
        f.write(f'{Alpha_Precision_all}\n')
        f.write(f'{Beta_Recall_all}\n')


if __name__ == '__main__':
    main()