| """ |
| Comprehensive data augmentation and model improvement pipeline. |
| |
| Augmentation strategies: |
| 1. Variable subsampling: randomly drop variables to create new graph topologies |
| 2. Sample-size variation: subsample rows from existing large-N datasets |
| 3. Noise injection: add random noise to some variables |
| |
| Then trains multiple model architectures and does a full comparison. |
| """ |
| import os |
| import sys |
| import numpy as np |
| import pandas as pd |
| import json |
| import logging |
| import warnings |
| import time |
| from itertools import combinations |
|
|
| warnings.filterwarnings('ignore') |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s') |
| logger = logging.getLogger(__name__) |
| logging.getLogger('causallearn').setLevel(logging.ERROR) |
|
|
| sys.path.insert(0, '/app') |
| from causal_selection.data.generator import ( |
| load_bn_model, get_true_dag_adjmat, dag_to_cpdag, sample_dataset, |
| ALL_NETWORKS, MEDIUM_NETWORKS, LARGE_NETWORKS, get_network_tier |
| ) |
| from causal_selection.discovery.algorithms import run_algorithm, ALGORITHM_POOL |
| from causal_selection.discovery.evaluator import evaluate_algorithm_result |
| from causal_selection.features.extractor import extract_all_features, FEATURE_NAMES |
| from causal_selection.meta_learner.trainer import ( |
| load_meta_dataset, evaluate_lono_cv, train_meta_learner, |
| save_model, get_feature_importance, ALGO_NAMES, RESULTS_DIR |
| ) |
|
|
| from sklearn.ensemble import ( |
| RandomForestRegressor, GradientBoostingRegressor, |
| RandomForestClassifier, GradientBoostingClassifier |
| ) |
| from sklearn.multioutput import MultiOutputRegressor |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.metrics import mean_squared_error |
| import joblib |
|
|
|
|
| |
| |
| |
|
|
| def augment_all(networks_for_varsub=None, n_varsub=3, drop_frac=0.3, |
| networks_for_samplesub=None, n_samplesub=2): |
| """Run all augmentation strategies and return combined augmented data.""" |
| |
| all_feats, all_shds, all_nshds, all_cfgs = [], [], [], [] |
| |
| |
| logger.info("="*60) |
| logger.info("AUGMENTATION: Variable Subsampling") |
| logger.info("="*60) |
| |
| if networks_for_varsub is None: |
| |
| networks_for_varsub = ['sachs', 'alarm', 'child', 'insurance', |
| 'water', 'barley', 'mildew', |
| 'hailfinder', 'hepar2'] |
| |
| for net_name in networks_for_varsub: |
| try: |
| model = load_bn_model(net_name) |
| true_dag, node_names = get_true_dag_adjmat(model) |
| n_vars = len(node_names) |
| |
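| # Skip networks with too few variables to subsample meaningfully. |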
| if n_vars < 8: |
| continue |
| |
| tier = get_network_tier(net_name) |
| timeout = {'small': 60, 'medium': 90, 'large': 120}[tier] |
| |
| for aug_i in range(n_varsub): |
| # hash() of a str is not stable across Python runs; use a fixed checksum for the seed offset. |
| rng = np.random.RandomState(200 + aug_i * 100 + sum(ord(ch) for ch in net_name) % 100) |
| |
| |
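| # Keep roughly 50-85% of the variables, jittered around (1 - drop_frac). |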
| keep_frac = 1.0 - drop_frac + rng.uniform(-0.1, 0.1) |
| keep_frac = max(0.5, min(0.85, keep_frac)) |
| n_keep = max(5, int(n_vars * keep_frac)) |
| keep_idx = sorted(rng.choice(n_vars, n_keep, replace=False)) |
| |
| sub_dag = true_dag[np.ix_(keep_idx, keep_idx)] |
| sub_cpdag = dag_to_cpdag(sub_dag) |
| sub_names = [node_names[i] for i in keep_idx] |
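| # Caveat: the induced sub-DAG is used as ground truth for the kept variables; |
| # dropping a common parent can create dependencies the sub-DAG does not encode, |
| # so the sub-CPDAG labels are an approximation of the true marginal structure. |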
| |
| n_samples = rng.choice([500, 1000, 2000]) |
| df_full = sample_dataset(model, n_samples, seed=200 + aug_i) |
| df_sub = df_full[sub_names].copy() |
| df_sub.columns = [f'X{i}' for i in range(len(sub_names))] |
| |
| logger.info(f" VarSub {net_name} #{aug_i}: {n_vars}->{n_keep} vars, N={n_samples}") |
| |
| f, s, ns, c = _run_single(df_sub, sub_cpdag, |
| f'{net_name}_vs{aug_i}', n_samples, |
| 200+aug_i, n_keep, timeout) |
| if f is not None: |
| all_feats.append(f) |
| all_shds.append(s) |
| all_nshds.append(ns) |
| all_cfgs.append(c) |
| |
| except Exception as e: |
| logger.error(f"VarSub failed for {net_name}: {e}") |
| |
| |
| logger.info("\n" + "="*60) |
| logger.info("AUGMENTATION: Sample Size Variation") |
| logger.info("="*60) |
| |
| if networks_for_samplesub is None: |
| networks_for_samplesub = ['asia', 'cancer', 'earthquake', 'sachs', |
| 'survey', 'alarm', 'child'] |
| |
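| # Fixed grid of sample sizes; note that the n_samplesub argument is currently |
| # not used to trim this list. |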
| sub_sample_sizes = [300, 750, 1500, 3000] |
| |
| for net_name in networks_for_samplesub: |
| try: |
| model = load_bn_model(net_name) |
| true_dag, node_names = get_true_dag_adjmat(model) |
| true_cpdag = dag_to_cpdag(true_dag) |
| n_vars = len(node_names) |
| tier = get_network_tier(net_name) |
| timeout = {'small': 60, 'medium': 90, 'large': 120}[tier] |
| |
| for ss_i, n_samples in enumerate(sub_sample_sizes): |
| seed = 300 + ss_i |
| df = sample_dataset(model, n_samples, seed=seed) |
| |
| logger.info(f" SampleSub {net_name} N={n_samples} seed={seed}") |
| |
| f, s, ns, c = _run_single(df, true_cpdag, |
| f'{net_name}_ss{ss_i}', n_samples, |
| seed, n_vars, timeout) |
| if f is not None: |
| all_feats.append(f) |
| all_shds.append(s) |
| all_nshds.append(ns) |
| all_cfgs.append(c) |
| |
| except Exception as e: |
| logger.error(f"SampleSub failed for {net_name}: {e}") |
| |
| |
| logger.info("\n" + "="*60) |
| logger.info("AUGMENTATION: Noise Injection") |
| logger.info("="*60) |
| |
| noise_networks = ['asia', 'sachs', 'survey', 'cancer', 'earthquake'] |
| |
| for net_name in noise_networks: |
| try: |
| model = load_bn_model(net_name) |
| true_dag, node_names = get_true_dag_adjmat(model) |
| true_cpdag = dag_to_cpdag(true_dag) |
| n_vars = len(node_names) |
| timeout = 60 |
| |
| for noise_i, noise_frac in enumerate([0.05, 0.10]): |
| seed = 400 + noise_i |
| n_samples = 1000 |
| df = sample_dataset(model, n_samples, seed=seed) |
| |
| |
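| # Corrupt roughly noise_frac of all cells: each selected cell is redrawn uniformly |
| # from {0, ..., column max}. Cells can be hit more than once, and a redraw may |
| # reproduce the original value, so noise_frac is an upper bound on the corruption. |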
| rng = np.random.RandomState(seed) |
| n_flip = int(n_samples * n_vars * noise_frac) |
| for _ in range(n_flip): |
| r = rng.randint(n_samples) |
| c = rng.randint(n_vars) |
| max_val = df.iloc[:, c].max() |
| df.iloc[r, c] = rng.randint(0, max_val + 1) |
| |
| logger.info(f" Noise {net_name} frac={noise_frac}") |
| |
| f, s, ns, c = _run_single(df, true_cpdag, |
| f'{net_name}_n{noise_i}', n_samples, |
| seed, n_vars, timeout) |
| if f is not None: |
| all_feats.append(f) |
| all_shds.append(s) |
| all_nshds.append(ns) |
| all_cfgs.append(c) |
| |
| except Exception as e: |
| logger.error(f"Noise failed for {net_name}: {e}") |
| |
| return all_feats, all_shds, all_nshds, all_cfgs |
|
|
|
|
| def _run_single(df, true_cpdag, net_label, n_samples, seed, n_vars, timeout): |
| """Run feature extraction + all algorithms on one config.""" |
| try: |
| features = extract_all_features(df, n_probe_triplets=60) |
| |
| shd_row = {} |
| nshd_row = {} |
| |
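| # Run every candidate algorithm on this dataset and score it against the true CPDAG. |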
| for algo_name in ALGO_NAMES: |
| result = run_algorithm(algo_name, df, timeout_sec=timeout) |
| metrics = evaluate_algorithm_result(result, true_cpdag) |
| shd_row[algo_name] = metrics['shd'] |
| nshd_row[algo_name] = metrics['normalized_shd'] |
| |
| feat_row = {name: features.get(name, 0.0) for name in FEATURE_NAMES} |
| config = { |
| 'network': net_label, |
| 'n_samples': n_samples, |
| 'seed': seed, |
| 'n_variables': n_vars, |
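| # n_true_edges counts skeleton edges: (A + A^T) > 0 marks every edge in both directions, hence the // 2. |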
| 'n_true_edges': int(((true_cpdag + true_cpdag.T) > 0).sum() // 2), |
| } |
| |
| |
| best = min(shd_row, key=shd_row.get) |
| logger.info(f" Best: {best} SHD={shd_row[best]}") |
| |
| return feat_row, shd_row, nshd_row, config |
| |
| except Exception as e: |
| logger.error(f" Failed: {e}") |
| return None, None, None, None |
|
|
|
|
| |
| |
| |
|
|
| def train_pairwise_ranking(X, Y_nshd, configs): |
| """Train pairwise ranking classifiers: for each (algo_i, algo_j) pair, |
| train a classifier to predict whether algo_i beats algo_j. |
| |
| At inference: count wins for each algorithm, rank by win count. |
| """ |
| n_algos = len(ALGO_NAMES) |
| scaler = StandardScaler() |
| X_scaled = scaler.fit_transform(X) |
| |
| pair_models = {} |
| pair_accuracies = {} |
| |
| for i in range(n_algos): |
| for j in range(i+1, n_algos): |
| algo_i, algo_j = ALGO_NAMES[i], ALGO_NAMES[j] |
| |
| |
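| # Label is 1 when algo_i (column i of Y_nshd, ordered as ALGO_NAMES) strictly beats algo_j; |
| # ties count as a loss for algo_i. |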
| y = (Y_nshd.iloc[:, i] < Y_nshd.iloc[:, j]).astype(int).values |
| |
| |
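| # Degenerate pair: one algorithm wins every training row, so no classifier is |
| # fitted and the pair contributes no votes at prediction time. |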
| if y.mean() == 0 or y.mean() == 1: |
| pair_models[(i,j)] = None |
| pair_accuracies[(i,j)] = y.mean() |
| continue |
| |
| clf = GradientBoostingClassifier( |
| n_estimators=200, max_depth=3, learning_rate=0.05, |
| random_state=42 |
| ) |
| clf.fit(X_scaled, y) |
| |
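| # Accuracy on the training rows themselves; an optimistic diagnostic, not a CV estimate. |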
| train_acc = clf.score(X_scaled, y) |
| pair_models[(i,j)] = clf |
| pair_accuracies[(i,j)] = train_acc |
| |
| return pair_models, scaler, pair_accuracies |
|
|
|
|
| def predict_pairwise_ranking(pair_models, scaler, X_new, k=3): |
| """Use pairwise models to rank algorithms via win-count.""" |
| X_scaled = scaler.transform(X_new) |
| n_algos = len(ALGO_NAMES) |
| n_samples = X_scaled.shape[0] |
| |
| results = [] |
| for idx in range(n_samples): |
| wins = np.zeros(n_algos) |
| x = X_scaled[idx:idx+1] |
| |
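| # Round-robin over all pairs: the predicted winner of each pair receives one vote. |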
| for i in range(n_algos): |
| for j in range(i+1, n_algos): |
| model = pair_models.get((i,j)) |
| if model is None: |
| continue |
| pred = model.predict(x)[0] |
| if pred == 1: |
| wins[i] += 1 |
| else: |
| wins[j] += 1 |
| |
| ranking = np.argsort(-wins) |
| results.append(ranking[:k]) |
| |
| return np.array(results) |
|
|
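| # Minimal illustrative sketch (not used anywhere in the pipeline): how win-count |
| # aggregation turns pairwise outcomes into a ranking. The algorithm names and the |
| # outcome table below are made up for illustration only. |
| def _demo_win_count_ranking(): |
|     demo_algos = ['A', 'B', 'C'] |
|     # (i, j) -> 1 means algorithm i beats algorithm j. |
|     demo_outcomes = {(0, 1): 1, (0, 2): 0, (1, 2): 0} |
|     wins = np.zeros(len(demo_algos)) |
|     for (i, j), i_wins in demo_outcomes.items(): |
|         wins[i if i_wins else j] += 1 |
|     # C beats A and B, and A beats B, so the ranking is ['C', 'A', 'B']. |
|     return [demo_algos[r] for r in np.argsort(-wins)] |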
|
|
| def evaluate_pairwise_lono(X, Y_nshd, configs, k=3): |
| """LONO-CV for pairwise ranking model.""" |
| networks = configs['network'].values |
| unique_nets = sorted(configs['network'].unique()) |
| |
| base_nets = configs['network'].apply(lambda x: x.split('_')[0]).values |
| unique_base = sorted(set(base_nets)) |
| |
| top_k_hits = 0 |
| regrets = [] |
| total = 0 |
| |
| for test_base in unique_base: |
| test_mask = base_nets == test_base |
| train_mask = ~test_mask |
| |
| if train_mask.sum() < 5 or test_mask.sum() == 0: |
| continue |
| |
| X_train = X.values[train_mask] |
| Y_train = Y_nshd[train_mask] |
| X_test = X.values[test_mask] |
| Y_test = Y_nshd.values[test_mask] |
| |
| |
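| # Fit the scaler and all pairwise classifiers on the training folds only. |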
| scaler = StandardScaler() |
| X_train_s = scaler.fit_transform(X_train) |
| |
| n_algos = len(ALGO_NAMES) |
| pair_models = {} |
| |
| for i in range(n_algos): |
| for j in range(i+1, n_algos): |
| y = (Y_train.iloc[:, i] < Y_train.iloc[:, j]).astype(int).values |
| if y.mean() == 0 or y.mean() == 1: |
| pair_models[(i,j)] = None |
| continue |
| clf = GradientBoostingClassifier( |
| n_estimators=100, max_depth=3, learning_rate=0.05, |
| random_state=42 |
| ) |
| clf.fit(X_train_s, y) |
| pair_models[(i,j)] = clf |
| |
| |
| X_test_s = scaler.transform(X_test) |
| |
| for idx in range(len(X_test_s)): |
| wins = np.zeros(n_algos) |
| x = X_test_s[idx:idx+1] |
| |
| for i in range(n_algos): |
| for j in range(i+1, n_algos): |
| m = pair_models.get((i,j)) |
| if m is None: |
| continue |
| if m.predict(x)[0] == 1: |
| wins[i] += 1 |
| else: |
| wins[j] += 1 |
| |
| pred_top_k = np.argsort(-wins)[:k] |
| true_best = np.argmin(Y_test[idx]) |
| |
| if true_best in pred_top_k: |
| top_k_hits += 1 |
| |
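| # Regret: best achievable normalized SHD within the predicted top-k minus the oracle (true best) value. |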
| oracle = Y_test[idx, true_best] |
| selected = min(Y_test[idx, a] for a in pred_top_k) |
| regrets.append(selected - oracle) |
| total += 1 |
| |
| hit_rate = top_k_hits / total if total > 0 else 0 |
| mean_regret = np.mean(regrets) if regrets else 0 |
| |
| return { |
| 'top_k_hit_rate': hit_rate, |
| 'mean_regret': mean_regret, |
| 'median_regret': np.median(regrets) if regrets else 0, |
| 'n_evaluated': total, |
| } |
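| # Worked example of the metrics above (illustrative numbers): if the true normalized |
| # SHDs for one test row are [0.30, 0.10, 0.25, 0.40] and the predicted top-3 is |
| # algorithms {0, 2, 3}, the true best (index 1) is missed (no top-k hit) and the |
| # regret is min(0.30, 0.25, 0.40) - 0.10 = 0.15. |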
|
|
|
|
| |
| |
| |
|
|
| if __name__ == '__main__': |
| start_time = time.time() |
| |
| |
| print("="*80) |
| print("STEP 1: DATA AUGMENTATION") |
| print("="*80) |
| |
| feats, shds, nshds, cfgs = augment_all( |
| n_varsub=2, drop_frac=0.3, |
| n_samplesub=2, |
| ) |
| |
| print(f"\nGenerated {len(cfgs)} augmented configs") |
| |
| |
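| # Load the existing meta-dataset and append the augmented rows to it. |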
| X_orig, Y_shd_orig, Y_nshd_orig, configs_orig = load_meta_dataset() |
| |
| X_aug = pd.DataFrame(feats, columns=FEATURE_NAMES) |
| Y_shd_aug = pd.DataFrame(shds, columns=ALGO_NAMES) |
| Y_nshd_aug = pd.DataFrame(nshds, columns=ALGO_NAMES) |
| configs_aug = pd.DataFrame(cfgs) |
| |
| X_all = pd.concat([X_orig, X_aug], ignore_index=True) |
| Y_shd_all = pd.concat([Y_shd_orig, Y_shd_aug], ignore_index=True) |
| Y_nshd_all = pd.concat([Y_nshd_orig, Y_nshd_aug], ignore_index=True) |
| configs_all = pd.concat([configs_orig, configs_aug], ignore_index=True) |
| |
| print(f"Total dataset: {len(configs_all)} configs " |
| f"({len(configs_orig)} original + {len(configs_aug)} augmented)") |
| |
| |
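| # Persist the combined (original + augmented) meta-dataset CSVs. |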
| X_all.to_csv(os.path.join(RESULTS_DIR, 'meta_features.csv'), index=False) |
| Y_shd_all.to_csv(os.path.join(RESULTS_DIR, 'shd_matrix.csv'), index=False) |
| Y_nshd_all.to_csv(os.path.join(RESULTS_DIR, 'normalized_shd_matrix.csv'), index=False) |
| configs_all.to_csv(os.path.join(RESULTS_DIR, 'configs.csv'), index=False) |
| |
| |
| print("\n" + "="*80) |
| print("STEP 2: MODEL COMPARISON (LONO-CV)") |
| print("="*80) |
| |
| |
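| # Reload from disk so the comparison below runs on the combined dataset just saved. |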
| X, Y_shd, Y_nshd, configs = load_meta_dataset() |
| |
| print(f"\n{'Model':25s} {'Top3Hit':>8s} {'NDCG@3':>8s} {'Regret':>8s}") |
| print("-"*55) |
| |
| model_configs = [ |
| ('RF-200', 'rf', {'n_estimators': 200}), |
| ('RF-500', 'rf', {'n_estimators': 500}), |
| ('GBM-500-lr05', 'gbm', {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.05}), |
| ('GBM-300-lr01', 'gbm', {'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.01}), |
| ('GBM-200-lr1', 'gbm', {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1}), |
| ] |
| |
| # Start below any attainable hit rate so the first evaluated model is always recorded. |
| best_hit = -1.0 |
| best_config = None |
| |
| for name, mtype, kwargs in model_configs: |
| r = evaluate_lono_cv(X, Y_nshd, configs, model_type=mtype, k=3, **kwargs) |
| o = r['overall'] |
| print(f"{name:25s} {o['top_k_hit_rate']:8.3f} {o['ndcg_at_k']:8.3f} {o['mean_regret']:8.4f}") |
| if o['top_k_hit_rate'] > best_hit: |
| best_hit = o['top_k_hit_rate'] |
| best_config = (name, mtype, kwargs, o) |
| |
| |
| print(f"\n{'Pairwise-GBM':25s}", end="") |
| pw_results = evaluate_pairwise_lono(X, Y_nshd, configs, k=3) |
| print(f" {pw_results['top_k_hit_rate']:8.3f} {'N/A':>8s} {pw_results['mean_regret']:8.4f}") |
| |
| if pw_results['top_k_hit_rate'] > best_hit: |
| best_hit = pw_results['top_k_hit_rate'] |
| best_config = ('Pairwise-GBM', 'pairwise', {}, pw_results) |
| |
| print(f"\n{'='*55}") |
| print(f"BEST MODEL: {best_config[0]} (hit rate={best_hit:.3f})") |
| print(f"{'='*55}") |
| |
| |
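| # Retrain the winning model on the full meta-dataset and save it to disk. |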
| if best_config[1] != 'pairwise': |
| model, scaler = train_meta_learner(X, Y_nshd, |
| model_type=best_config[1], |
| **best_config[2]) |
| save_model(model, scaler) |
| |
| avg_imp, _ = get_feature_importance(model) |
| print("\nTop 10 Features:") |
| for feat, imp in sorted(avg_imp.items(), key=lambda x: -x[1])[:10]: |
| print(f" {feat:30s}: {imp:.4f}") |
| else: |
| |
| print("Pairwise model is best - training final version...") |
| pair_models, scaler, _ = train_pairwise_ranking(X, Y_nshd, configs) |
| os.makedirs('/app/causal_selection/models', exist_ok=True) |
| joblib.dump({'pair_models': pair_models, 'scaler': scaler}, |
| '/app/causal_selection/models/pairwise_model.pkl') |
| |
| # Additionally re-evaluate the multi-output configs and persist the best one via |
| # save_model, so the standard model artifact is refreshed as well. |
| best_mo_hit = 0 |
| best_mo_cfg = model_configs[0] |
| for name, mtype, kwargs in model_configs: |
| r = evaluate_lono_cv(X, Y_nshd, configs, model_type=mtype, k=3, **kwargs) |
| if r['overall']['top_k_hit_rate'] > best_mo_hit: |
| best_mo_hit = r['overall']['top_k_hit_rate'] |
| best_mo_cfg = (name, mtype, kwargs) |
| model, scaler = train_meta_learner(X, Y_nshd, model_type=best_mo_cfg[1], **best_mo_cfg[2]) |
| save_model(model, scaler) |
| |
| elapsed = time.time() - start_time |
| print(f"\nTotal time: {elapsed/60:.1f} minutes") |
| |
| |
| summary = { |
| 'n_configs_original': int(len(configs_orig)), |
| 'n_configs_augmented': int(len(configs_aug)), |
| 'n_configs_total': int(len(configs_all)), |
| 'best_model': best_config[0], |
| 'best_top3_hit_rate': float(best_hit), |
| # Cast numpy scalars so the summary stays JSON-serializable. |
| 'best_metrics': {k: (float(v) if isinstance(v, (float, np.floating)) |
| else int(v) if isinstance(v, (int, np.integer)) else v) |
| for k, v in best_config[3].items()}, |
| } |
| with open(os.path.join(RESULTS_DIR, 'improvement_summary.json'), 'w') as f: |
| json.dump(summary, f, indent=2) |
| |
| print(f"\nSummary saved to {RESULTS_DIR}/improvement_summary.json") |
|
|