Spaces:
Running
Running
| import json | |
| import os | |
| from collections import defaultdict | |
| import numpy as np | |
| import pandas as pd | |
| import seaborn as sns | |
| import yaml | |
| from matplotlib import pyplot as plt | |
| from lightgbm import LGBMClassifier | |
| from scipy.stats import ks_2samp | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.metrics import accuracy_score, f1_score, roc_auc_score | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler | |
| from sklearn.tree import DecisionTreeClassifier | |
| from xgboost import XGBClassifier | |
def format_by_rank(
    data: pd.DataFrame, maximize: bool = True, warning_threshold: float = None, stds: pd.DataFrame = None
):
    """Render a metric table as LaTeX strings, bolding the best value per row.

    Ranking is done on absolute values along each row; with ``maximize`` the
    largest magnitude is "best", otherwise the smallest.

    Args:
        data: metric values, one column per model.
        maximize: whether larger magnitudes rank better.
        warning_threshold: when given, values on the wrong side of this
            threshold are wrapped in ``\\textcolor{red}{...}``.
        stds: optional per-cell standard deviations; when given, cells are
            rendered in math mode as ``mean_{\\pm std}`` with ``\\boldsymbol``
            instead of ``\\textbf`` for the best entry.

    Returns:
        A DataFrame of LaTeX-formatted strings with the same shape as *data*.
    """
    rank_table = data.abs().rank(axis=1, method="min", ascending=not maximize)
    best_mask = rank_table <= 1.0  # rank 1 == best (ties all bolded)

    def _blank() -> pd.DataFrame:
        # empty string frame aligned with the data, used as token holders
        return pd.DataFrame("", index=rank_table.index, columns=rank_table.columns)

    open_tok, close_tok = _blank(), _blank()
    close_tok[best_mask] = r"}"
    cells = data.applymap(lambda v: f"{v:.3f}")
    if stds is None:
        open_tok[best_mask] = r"\textbf{"
        rendered = open_tok + cells + close_tok
    else:
        open_tok[best_mask] = r"\boldsymbol{"
        err_cells = stds.applymap(lambda v: f"{v:.3f}")
        rendered = ("$" + open_tok + cells + r"_{\pm "
                    + err_cells + "}" + close_tok + "$")
    if warning_threshold is not None:
        magnitudes = data.abs()
        bad_mask = magnitudes < warning_threshold if maximize else magnitudes > warning_threshold
        red_open, red_close = _blank(), _blank()
        red_open[bad_mask] = r"\textcolor{red}{"
        red_close[bad_mask] = r"}"
        rendered = red_open + rendered + red_close
    return rendered
class RelationalEvaluator:
    """Evaluates synthetic relational-data generators against the real dataset.

    Produces CSV and LaTeX reports (schema validity, table sizes, FK degree
    distributions, timing, downstream-ML utility, query distributions) under
    ``datasets/<dataset_name>/reports``.
    """
    # model directory name -> display label used in tables and plots
    renames = {
        "sdv": "HMA",
        "ind": "IND",
        "rctgan": "RCT",
        "clava": "CLD",
        "irg": "IRG"
    }
    # Plot/legend order for seaborn's hue_order.
    # BUG FIX: this entry was "RCTGAN", which never matches the "RCT" label
    # produced by `renames`, so the RCTGAN curves were silently dropped from
    # every plot that filters by hue_order.
    hue_order = ["Real", "IRG", "HMA", "IND", "RCT", "CLD"]
    palette = "tab10"
| def __init__( | |
| self, | |
| dataset_name: str, | |
| models: list = ["sdv", "ind", "rctgan", "clava", "irg"], | |
| sdv_scale: float = 1.0 | |
| ): | |
| self.dataset_name = dataset_name | |
| with open(os.path.join("datasets", dataset_name, "schema", "irg.yaml"), "r") as f: | |
| self.schema = yaml.safe_load(f) | |
| self.real_path = os.path.join("datasets", dataset_name, "preprocessed") | |
| self.model_paths = { | |
| m: os.path.join("datasets", dataset_name, "out", m, "generated") for m in models | |
| } | |
| self.report_path = os.path.join("datasets", dataset_name, "reports") | |
| os.makedirs(self.report_path, exist_ok=True) | |
| self.sdv_scale = sdv_scale | |
| plt.rcParams["font.family"] = "DejaVu Serif" | |
| def evaluate_schema(self): | |
| pk_uniqueness = pd.DataFrame() | |
| fk_validity = pd.DataFrame() | |
| ineq_validity = pd.DataFrame() | |
| for table_name, table_args in self.schema["tables"].items(): | |
| primary_key = table_args.get("primary_key") | |
| foreign_keys = table_args.get("foreign_keys", []) | |
| inequalities = table_args.get("inequality", []) | |
| for m, p in self.model_paths.items(): | |
| table = pd.read_csv(os.path.join(p, f"{table_name}.csv")) | |
| if primary_key is not None and not isinstance(primary_key, str): | |
| pk_uniqueness.loc[table_name, m] = (1 - | |
| table[primary_key].drop_duplicates().shape[0] / table.shape[0]) | |
| for fk in foreign_keys: | |
| parent_table_name, child_column_names, parent_column_names = ( | |
| fk["parent_table_name"], fk["child_column_names"], fk["parent_column_names"]) | |
| parent = pd.read_csv(os.path.join(p, f"{parent_table_name}.csv")) | |
| fk_validity.loc[ | |
| f"{table_name}.{child_column_names} -> {parent_table_name}.{parent_column_names}", m | |
| ] = table[ | |
| [child_column_names] if isinstance(child_column_names, str) else child_column_names | |
| ].dropna().merge( | |
| parent, left_on=child_column_names, right_on=parent_column_names, how="left", | |
| indicator="__merge__" | |
| )["__merge__"].value_counts(normalize=True).to_dict().get("left_only", 0.0) | |
| for ineq in inequalities: | |
| l, r = ineq | |
| ineq_validity.loc[f"{table_name}{ineq}", m] = (table[l] == table[r]).mean() | |
| pk_uniqueness.loc["Avg.", :] = pk_uniqueness.mean() | |
| fk_validity.loc["Avg.", :] = fk_validity.mean() | |
| ineq_validity.loc["Avg.", :] = ineq_validity.mean() | |
| pk_uniqueness.loc["# Vio.", :] = (pk_uniqueness.iloc[:-1] > 0).sum() | |
| fk_validity.loc["# Vio.", :] = (fk_validity.iloc[:-1] > 0).sum() | |
| ineq_validity.loc["# Vio.", :] = (ineq_validity.iloc[:-1] > 0).sum() | |
| pk_uniqueness.to_csv(os.path.join(self.report_path, "pk_uniqueness.csv")) | |
| fk_validity.to_csv(os.path.join(self.report_path, "fk_validity.csv")) | |
| ineq_validity.to_csv(os.path.join(self.report_path, "ineq_validity.csv")) | |
| formatted_pk_uniqueness = format_by_rank(pk_uniqueness, maximize=False, warning_threshold=0.) | |
| formatted_fk_validity = format_by_rank(fk_validity, maximize=False, warning_threshold=0.) | |
| formatted_ineq_validity = format_by_rank(ineq_validity, maximize=False, warning_threshold=0.) | |
| formatted_pk_uniqueness.to_latex(os.path.join(self.report_path, "pk_uniqueness.tex"), escape=False) | |
| formatted_fk_validity.to_latex(os.path.join(self.report_path, "fk_validity.tex"), escape=False) | |
| formatted_ineq_validity.to_latex(os.path.join(self.report_path, "ineq_validity.tex"), escape=False) | |
| pd.concat({ | |
| "PK": formatted_pk_uniqueness, "FK": fk_validity, "INEQ": formatted_ineq_validity | |
| }, axis=0).to_latex(os.path.join(self.report_path, "schema.tex"), escape=False) | |
| def evaluate_shapes(self): | |
| sizes = pd.DataFrame() | |
| for table_name in self.schema["tables"]: | |
| real_size = pd.read_csv(os.path.join(self.real_path, f"{table_name}.csv")).shape[0] | |
| for m, p in self.model_paths.items(): | |
| exp_size = real_size if m != "sdv" else real_size * self.sdv_scale | |
| model_size = pd.read_csv(os.path.join(p, f"{table_name}.csv")).shape[0] | |
| sizes.loc[table_name, m] = (model_size - exp_size) / exp_size | |
| sizes.loc["Avg.", :] = sizes.abs().mean() | |
| sizes.to_csv(os.path.join(self.report_path, "sizes.csv")) | |
| sizes = format_by_rank(sizes, maximize=False, warning_threshold=0.5) | |
| sizes.to_latex(os.path.join(self.report_path, "sizes.tex"), escape=False) | |
| def evaluate_degrees(self, visualize: bool = False): | |
| degrees = pd.DataFrame() | |
| isna = pd.DataFrame() | |
| n_fks = 0 | |
| for table_name, table_args in self.schema["tables"].items(): | |
| n_fks += len(table_args.get("foreign_keys", [])) | |
| ncols = 3 | |
| nrows = int(np.ceil(n_fks / ncols)) | |
| fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 2.5 * nrows), squeeze=False) | |
| idx = 0 | |
| handles, labels = None, None | |
| for table_name, table_args in self.schema["tables"].items(): | |
| for fk in table_args.get("foreign_keys", []): | |
| parent_table_name, child_column_names, parent_column_names = ( | |
| fk["parent_table_name"], fk["child_column_names"], fk["parent_column_names"]) | |
| fk_descr = f"{table_name}.{child_column_names} -> {parent_table_name}.{parent_column_names}" | |
| real_degrees, real_isna = self._extract_degrees( | |
| self.real_path, parent_table_name, table_name, parent_column_names, child_column_names | |
| ) | |
| if real_isna is not None: | |
| real_isna = real_isna.mean() | |
| all_degrees = { | |
| "Real": real_degrees | |
| } | |
| for m, p in self.model_paths.items(): | |
| syn_degrees, syn_isna = self._extract_degrees( | |
| p, parent_table_name, table_name, parent_column_names, child_column_names | |
| ) | |
| degrees.loc[fk_descr, m] = ks_2samp(real_degrees.values, syn_degrees.values).statistic | |
| if real_isna is not None: | |
| syn_isna = 0. if syn_isna is None else syn_isna.mean() | |
| isna.loc[fk_descr, m] = syn_isna - real_isna | |
| all_degrees[self.renames[m]] = syn_degrees | |
| if visualize: | |
| combined_degrees = pd.concat(all_degrees).reset_index().rename(columns={ | |
| "level_0": "Model", 0: "Degrees" | |
| }) | |
| ax: plt.Axes = axes[idx // ncols][idx % ncols] | |
| combined_degrees["Degrees"] += np.random.normal(0, 0.1, combined_degrees.shape[0]) | |
| sns.kdeplot( | |
| combined_degrees, x="Degrees", hue="Model", common_norm=True, ax=ax, | |
| hue_order=self.hue_order, palette=self.palette, clip=(0, combined_degrees["Degrees"].max()) | |
| ) | |
| if handles is None: | |
| legend = ax.get_legend() | |
| handles, labels = legend.legendHandles, [text.get_text() for text in legend.texts] | |
| ax.legend_.remove() | |
| ax.set_ylabel("") | |
| ax.set_xlabel("") | |
| ax.set_title(f"FK {idx + 1}") | |
| idx += 1 | |
| degrees.loc["Avg.", :] = degrees.mean() | |
| isna.loc["Avg.", :] = isna.abs().mean() | |
| degrees.to_csv(os.path.join(self.report_path, "degrees.csv")) | |
| degrees = format_by_rank(degrees) | |
| degrees.to_latex(os.path.join(self.report_path, "degrees.tex"), escape=False) | |
| isna.to_csv(os.path.join(self.report_path, "isna.csv")) | |
| isna = format_by_rank(isna, maximize=False) | |
| isna.to_csv(os.path.join(self.report_path, "isna.tex")) | |
| if visualize: | |
| for i in range(nrows): | |
| axes[i][0].set_ylabel("Density", va="center", fontsize=20, labelpad=10) | |
| fig.legend( | |
| handles, labels, columnspacing=5.5, loc="upper center", ncol=len(self.hue_order), | |
| bbox_to_anchor=(0.5, 1.08), frameon=False, fontsize=24 | |
| ) | |
| plt.tight_layout() | |
| fig.savefig(os.path.join(self.report_path, "degrees.png"), bbox_inches='tight') | |
| fig.savefig(os.path.join(self.report_path, "degrees.pdf"), bbox_inches='tight') | |
| def _extract_degrees(base_path: str, parent: str, child: str, parent_columns, child_columns): | |
| parent_table = pd.read_csv(os.path.join(base_path, f"{parent}.csv")) | |
| child_table = pd.read_csv(os.path.join(base_path, f"{child}.csv")) | |
| child_table["__index"] = child_table.index.astype(int) | |
| mi = parent_table.merge( | |
| child_table, left_on=parent_columns, right_on=child_columns, how="right", indicator="__merge" | |
| ) | |
| mi = mi.groupby("__index").head(1).set_index("__index").loc[child_table.index] | |
| child_table = child_table[ | |
| (mi["__merge"] == "both") | ( | |
| child_table[child_columns].isna() if isinstance(child_columns, str) | |
| else child_table[child_columns].isna().any(axis=1) | |
| ) | |
| ].drop(columns=["__index"]) | |
| has_child_degrees = child_table.groupby(child_columns).size() | |
| if isinstance(parent_columns, str): | |
| parent_table = parent_table.rename(columns={parent_columns: child_columns}) | |
| else: | |
| parent_table = parent_table.rename(columns={p: c for p, c in zip(parent_columns, child_columns)}) | |
| degrees = pd.Series(0, index=parent_table.set_index(child_columns).index) | |
| degrees[has_child_degrees.index] = has_child_degrees | |
| isna = child_table[child_columns].isna() | |
| if np.any(isna.values): | |
| isna = isna if len(isna.shape) == 1 else isna.any(axis=1) | |
| else: | |
| isna = None | |
| return degrees, isna | |
| def evaluate_time(self): | |
| timing = pd.DataFrame() | |
| for m in self.model_paths: | |
| with open(os.path.join("datasets", self.dataset_name, "out", m, "timing.json"), "r") as f: | |
| model_timing = json.load(f) | |
| fit_time = model_timing["fit"] + model_timing.get("preprocess", 0) | |
| gen_time = model_timing["sample"] + model_timing.get("postprocess", 0) | |
| timing.loc["fit", m] = fit_time | |
| timing.loc["sample", m] = gen_time | |
| timing.to_csv(os.path.join(self.report_path, "timing.csv")) | |
| timing.to_latex(os.path.join(self.report_path, "timing.tex"), escape=False) | |
    def evaluate_ml(self):
        """Evaluate downstream ML utility: classifiers trained on synthetic
        data are tested on real data, alongside a train-on-real baseline,
        with 3 repetitions each.

        Depends on ``self._extract_ml`` returning (features, labels); that
        helper is currently a stub, so this method will fail until it is
        implemented. Results go to ``ml-perf.csv`` and ``ml-perf.tex``.
        """
        # rows are indexed by (classifier name, repetition, metric)
        perf = pd.DataFrame(index=pd.MultiIndex.from_frame(pd.DataFrame(columns=["ML", "rep", "metric"])))
        real_x, real_y = self._extract_ml(self.real_path)
        raw_columns = real_x.columns
        real_dtypes = real_x.dtypes
        le = LabelEncoder()
        real_y = le.fit_transform(real_y)
        # numeric columns with > 10 distinct values are treated as continuous;
        # the rest are encoded as categoricals below
        num_columns = [c for c in real_x.select_dtypes(include="number").columns if real_x[c].nunique() > 10]
        num_scaler = StandardScaler()
        real_num_x = num_scaler.fit_transform(real_x[num_columns].fillna(0))
        # categories unseen at fit time (possible in synthetic data) map to -1
        cat_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        real_cat_x = cat_encoder.fit_transform(real_x.drop(columns=num_columns).fillna("NA!").astype(str))
        real_x = np.concatenate([real_num_x, real_cat_x], axis=1)
        # baseline: train and test on stratified splits of the real data
        for i in range(3):
            train_x, test_x, train_y, test_y = train_test_split(real_x, real_y, stratify=real_y)
            self._eval_downstream(test_x, test_y, train_x, train_y, perf, "Real", i)
        for m, p in self.model_paths.items():
            syn_x, syn_y = self._extract_ml(p)
            if syn_x.shape[-1] == 0:
                # model produced no usable feature columns: keep empty frames
                # so the downstream evaluation records NaNs for this model
                syn_x = pd.DataFrame(columns=raw_columns)
                syn_num_x = syn_x[num_columns]
                syn_cat_x = syn_x.drop(columns=num_columns)
            else:
                # reuse the encoders fitted on real data (no refitting)
                syn_x = syn_x[raw_columns].astype(real_dtypes)
                syn_num_x = num_scaler.transform(syn_x[num_columns].fillna(0))
                syn_cat_x = cat_encoder.transform(syn_x.drop(columns=num_columns).fillna("NA!").astype(str))
            syn_y = le.transform(syn_y)
            syn_x = np.concatenate([syn_num_x, syn_cat_x], axis=1)
            # train on synthetic, test on the full (encoded) real dataset
            for i in range(3):
                self._eval_downstream(real_x, real_y, syn_x, syn_y, perf, self.renames[m], i)
        perf.to_csv(os.path.join(self.report_path, "ml-perf.csv"))
        # aggregate over repetitions: mean ± std per (classifier, metric)
        grouped = perf.groupby(level=(0, 2))
        format_by_rank(
            grouped.mean(), maximize=True, stds=grouped.std()
        ).to_latex(os.path.join(self.report_path, "ml-perf.tex"), escape=False)
| def _eval_downstream(test_x, test_y, train_x, train_y, perf: pd.DataFrame, key: str, rep: int): | |
| if train_x.shape[0] == 0: | |
| perf.loc[:, key] = np.nan | |
| else: | |
| acc, f1, auc = RelationalEvaluator._run_downstream(DecisionTreeClassifier, test_x, test_y, train_x, train_y) | |
| perf.loc[("DT", rep, "Acc."), key] = acc | |
| perf.loc[("DT", rep, "F1"), key] = f1 | |
| perf.loc[("DT", rep, "AUC"), key] = auc | |
| acc, f1, auc = RelationalEvaluator._run_downstream(RandomForestClassifier, test_x, test_y, train_x, train_y) | |
| perf.loc[("RF", rep, "Acc."), key] = acc | |
| perf.loc[("RF", rep, "F1"), key] = f1 | |
| perf.loc[("RF", rep, "AUC"), key] = auc | |
| acc, f1, auc = RelationalEvaluator._run_downstream(XGBClassifier, test_x, test_y, train_x, train_y) | |
| perf.loc[("XGB", rep, "Acc."), key] = acc | |
| perf.loc[("XGB", rep, "F1"), key] = f1 | |
| perf.loc[("XGB", rep, "AUC"), key] = auc | |
| acc, f1, auc = RelationalEvaluator._run_downstream(LGBMClassifier, test_x, test_y, train_x, train_y) | |
| perf.loc[("LGBM", rep, "Acc."), key] = acc | |
| perf.loc[("LGBM", rep, "F1"), key] = f1 | |
| perf.loc[("LGBM", rep, "AUC"), key] = auc | |
| def _run_downstream(model, test_x, test_y, train_x, train_y): | |
| model = model() | |
| le = LabelEncoder() | |
| train_y = le.fit_transform(train_y) | |
| model.fit(train_x, train_y) | |
| pred = model.predict(test_x) | |
| pred = le.inverse_transform(pred) | |
| proba = model.predict_proba(test_x) | |
| if len(proba.shape) == 1: | |
| proba = np.stack([1 - proba, proba]).T | |
| elif proba.shape[-1] == 1: | |
| proba = np.concatenate([1 - proba, proba], axis=1) | |
| if len(np.unique(test_y)) > len(np.unique(train_y)): | |
| placeholder_probs = np.zeros((test_x.shape[0], len(np.unique(test_y)))) | |
| for i, j in enumerate(np.unique(train_y)): | |
| placeholder_probs[:, j] = proba[:, i] | |
| proba = placeholder_probs | |
| return accuracy_score(test_y, pred), f1_score(test_y, pred, average="weighted"), roc_auc_score( | |
| test_y, proba, average="weighted", multi_class="ovr", labels=sorted(np.unique(test_y)) | |
| ) | |
    def _extract_ml(self, path: str):
        """Load the downstream-ML dataset from the tables under *path*.

        Expected (by ``evaluate_ml``) to return ``(features, labels)`` with
        ``features`` a DataFrame and ``labels`` a 1-D array-like. Currently an
        unimplemented stub that returns None, so ``evaluate_ml`` will fail
        until a dataset-specific implementation is provided.
        """
        pass
| def evaluate_query(self): | |
| all_queries = { | |
| "Real": self._get_query_results(self.real_path), | |
| } | |
| for m, p in self.model_paths.items(): | |
| if m == "ind": | |
| continue | |
| all_queries[self.renames[m]] = self._get_query_results(p) | |
| swapped_queries = defaultdict(dict) | |
| for k, v in all_queries.items(): | |
| for kk, vv in v.items(): | |
| swapped_queries[kk][k] = vv | |
| n_rows = int(np.ceil(len(swapped_queries) / 4)) | |
| fig, axes = plt.subplots(n_rows, 4, figsize=(16, 2 * n_rows), squeeze=False) | |
| axes = axes.flatten() | |
| handles, labels = None, None | |
| for i, (k, v) in enumerate(swapped_queries.items()): | |
| if (v["Real"].mean() - v["Real"].min()) * 10 < v["Real"].max() - v["Real"].mean(): | |
| log_scale = True | |
| else: | |
| log_scale = False | |
| if pd.api.types.is_integer_dtype(v["Real"].dtype): | |
| vc = pd.concat(v).round().dropna().astype(int) | |
| else: | |
| vc = pd.concat(v).astype(np.float64).dropna() | |
| vc = vc.reset_index().rename(columns={"level_0": "Model", 0: "Value"}) | |
| ax = axes[i] | |
| sns.kdeplot( | |
| vc, x="Value", hue="Model", hue_order=[h for h in self.hue_order if h != "IND"], | |
| common_norm=False, ax=ax, palette=self.palette, | |
| clip=(min(vc["Value"].min(), 0), vc["Value"].max()), bw_adjust=3, log_scale=log_scale, #cut=0 | |
| ) | |
| if handles is None: | |
| legend = ax.get_legend() | |
| handles, labels = legend.legendHandles, [text.get_text() for text in legend.texts] | |
| ax.legend_.remove() | |
| ax.set_ylabel("") | |
| ax.set_xlabel("") | |
| ax.set_title(k) | |
| axes[0].set_ylabel("Density", va="center", fontsize=20, labelpad=10) | |
| plt.tight_layout(rect=(0, 0, 1, 0.8)) | |
| fig.legend( | |
| handles, labels, columnspacing=3.5, loc="upper center", ncol=5, | |
| bbox_to_anchor=(0.5, 1.05), frameon=False, fontsize=24 | |
| ) | |
| fig.savefig(os.path.join(self.report_path, "queries.png")) | |
| fig.savefig(os.path.join(self.report_path, "queries.pdf")) | |
| def _get_query_results(path: str): | |
| pass | |