# Source: TWLab - "Add publication-ready ML project structure with full source code" (commit e2b220f, verified)
"""
Main experiment runner.

Orchestrates the complete ML pipeline:
    1. Load and preprocess data
    2. Feature engineering
    3. Train models (XGBoost, NN, Ensemble)
    4. Cross-validation with statistical testing
    5. Ablation studies (the helper is imported, but main() does not run it yet)
    6. Per-material evaluation
    7. Generate publication figures
    8. Save all results

Usage:
    python scripts/run_experiment.py --config configs/experiment.yaml
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
from pathlib import Path
import numpy as np
import pandas as pd
import yaml
# Add project root to path
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from src.data.dataset import (
clean_dataset,
get_feature_target_arrays,
load_dataset,
split_dataset,
)
from src.features.engineering import compute_all_derived_features, get_feature_groups
from src.models.models import (
NeuralNetworkRegressor,
WeightedEnsemble,
XGBoostMultiOutput,
cross_validate_model,
)
from src.evaluation.metrics import (
compare_models_statistical,
compute_cv_summary,
compute_metrics,
per_material_evaluation,
run_ablation_study,
)
from src.visualization.plots import (
plot_feature_importance,
plot_model_comparison,
plot_per_material_performance,
plot_predicted_vs_actual,
plot_residual_analysis,
plot_training_curves,
)
# Configure logging once at import time: INFO level, with each record
# rendered as "<timestamp> [<LEVEL>] <logger name>: <message>".
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-level logger, named after this module; used by load_config() and main().
logger = logging.getLogger(__name__)
def load_config(config_path: str) -> dict:
    """Load the experiment configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration mapping.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the top-level YAML node is not a mapping (for example
            an empty file, which ``yaml.safe_load`` parses as ``None``).
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale default encoding.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # Fail here with a clear message instead of letting the caller hit an
    # opaque TypeError on the first ``config[...]`` lookup.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {config_path} must contain a YAML mapping at the top "
            f"level, got {type(config).__name__}"
        )

    logger.info("Loaded config: %s", config_path)
    return config
def _log_step(title: str) -> None:
    """Log a three-line banner announcing a pipeline step."""
    rule = "=" * 60
    logger.info(rule)
    logger.info(title)
    logger.info(rule)


def _ensure_dir(path) -> Path:
    """Return *path* as a ``Path``, creating the directory (and parents) if needed."""
    directory = Path(path)
    directory.mkdir(parents=True, exist_ok=True)
    return directory


def main(config_path: str):
    """Run the complete experiment pipeline described by a YAML config.

    The pipeline loads and cleans the dataset, derives features, trains the
    XGBoost and neural-network models, wraps them in a weighted ensemble,
    evaluates all three on the held-out test set, cross-validates the two base
    models on train+validation data, compares them statistically, evaluates
    the ensemble per material, renders figures, and writes a JSON summary.

    Args:
        config_path: Path to the experiment configuration YAML.

    Raises:
        KeyError: If a required configuration key is absent.
        ValueError: If a configured feature or target column is missing from
            the loaded dataset.
    """
    start_time = time.time()
    config = load_config(config_path)

    # Setup paths. The figures directory is resolved once and reused for
    # every plot written in step 7.
    fig_dir = _ensure_dir(config["paths"]["figures"])
    results_dir = _ensure_dir(config["paths"]["results"])
    _ensure_dir(config["paths"]["models"])

    seed = config["experiment"]["random_seed"]
    # NOTE(review): only NumPy's global RNG is seeded here; the models are
    # assumed to handle their own seeding (XGBoost gets ``random_state``
    # below) - confirm for the neural network.
    np.random.seed(seed)

    # =========================================================================
    # Step 1: Load and preprocess data
    # =========================================================================
    _log_step("STEP 1: DATA LOADING & PREPROCESSING")
    data_cfg = config["data"]
    df = load_dataset(
        source=data_cfg["source"],
        local_path=data_cfg.get("local_path"),
        random_state=seed,
    )
    df = clean_dataset(df)
    df = compute_all_derived_features(df)

    # Feature and target columns
    all_features = (
        data_cfg["laser_features"]
        + data_cfg["material_features"]
        + data_cfg["derived_features"]
    )
    target_cols = data_cfg["target_columns"]

    # Verify columns exist before any model sees the data.
    missing = [c for c in all_features + target_cols if c not in df.columns]
    if missing:
        logger.error(f"Missing columns: {missing}")
        raise ValueError(f"Missing columns in dataset: {missing}")

    # The same grouping column drives both the train/val/test split and the
    # cross-validation folds, so the two agree on what a "group" is.
    group_col = data_cfg.get("group_column", "material_type")

    # Split data
    train_df, val_df, test_df = split_dataset(
        df,
        test_size=config["experiment"]["test_size"],
        val_size=config["experiment"]["validation_size"],
        group_column=group_col,
        random_state=seed,
    )
    X_train, y_train = get_feature_target_arrays(train_df, all_features, target_cols)
    X_val, y_val = get_feature_target_arrays(val_df, all_features, target_cols)
    X_test, y_test = get_feature_target_arrays(test_df, all_features, target_cols)
    logger.info(f"Features: {len(all_features)}, Targets: {len(target_cols)}")
    logger.info(f"Shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

    # =========================================================================
    # Step 2: Train Models
    # =========================================================================
    _log_step("STEP 2: MODEL TRAINING")

    # XGBoost
    logger.info("Training XGBoost...")
    xgb_params = config["models"]["xgboost"].copy()
    xgb_params["random_state"] = seed
    xgb_model = XGBoostMultiOutput(xgb_params, target_cols)
    xgb_model.fit(X_train, y_train, X_val, y_val)

    # Neural Network. The constructor arguments are built once so that the
    # cross-validated model in step 4 is configured exactly like this one
    # (including weight_decay and batch_size).
    logger.info("Training Neural Network...")
    nn_config = config["models"]["neural_network"]
    nn_kwargs = {
        "n_features": len(all_features),
        "n_outputs": len(target_cols),
        "hidden_layers": nn_config["hidden_layers"],
        "dropout": nn_config["dropout"],
        "learning_rate": nn_config["learning_rate"],
        "weight_decay": nn_config["weight_decay"],
        "batch_size": nn_config["batch_size"],
        "max_epochs": nn_config["max_epochs"],
        "patience": nn_config["patience"],
    }
    nn_model = NeuralNetworkRegressor(**nn_kwargs)
    nn_model.fit(X_train, y_train, X_val, y_val)

    # Ensemble
    ens_config = config["models"]["ensemble"]
    ensemble = WeightedEnsemble(
        xgb_model,
        nn_model,
        xgb_weight=ens_config["xgboost_weight"],
        nn_weight=ens_config["nn_weight"],
    )

    # =========================================================================
    # Step 3: Evaluate on Test Set
    # =========================================================================
    _log_step("STEP 3: TEST SET EVALUATION")
    predictions = {
        "XGBoost": xgb_model.predict(X_test),
        "Neural Network": nn_model.predict(X_test),
        "Ensemble": ensemble.predict(X_test),
    }
    metrics_all = {}
    for model_name, y_pred in predictions.items():
        metrics = compute_metrics(y_test, y_pred, target_cols)
        metrics_all[model_name] = metrics
        logger.info(f"\n{model_name} Test Metrics:\n{metrics.to_string()}")

    # Save metrics
    for model_name, metrics_df in metrics_all.items():
        metrics_df.to_csv(results_dir / f"metrics_{model_name.lower().replace(' ', '_')}.csv")

    # =========================================================================
    # Step 4: Cross-Validation
    # =========================================================================
    _log_step("STEP 4: CROSS-VALIDATION")
    n_folds = config["experiment"]["n_cv_folds"]
    X_full = np.vstack([X_train, X_val])
    y_full = np.vstack([y_train, y_val])
    # Assumes get_feature_target_arrays keeps row order and drops no rows, so
    # the group labels line up with X_full / y_full - TODO confirm.
    if group_col is not None and group_col in train_df.columns:
        groups_full = pd.concat([train_df, val_df])[group_col].values
    else:
        groups_full = None

    # XGBoost CV
    logger.info("XGBoost cross-validation...")
    xgb_cv = cross_validate_model(
        model_factory=lambda: XGBoostMultiOutput(xgb_params, target_cols),
        X=X_full, y=y_full, n_folds=n_folds, groups=groups_full, random_state=seed,
    )

    # NN CV
    logger.info("Neural Network cross-validation...")
    nn_cv = cross_validate_model(
        model_factory=lambda: NeuralNetworkRegressor(**nn_kwargs),
        X=X_full, y=y_full, n_folds=n_folds, groups=groups_full, random_state=seed,
    )

    # CV summaries
    xgb_summary = compute_cv_summary(xgb_cv, target_cols)
    nn_summary = compute_cv_summary(nn_cv, target_cols)
    logger.info(f"\nXGBoost CV Summary:\n{xgb_summary.to_string()}")
    logger.info(f"\nNeural Network CV Summary:\n{nn_summary.to_string()}")
    xgb_summary.to_csv(results_dir / "cv_xgboost.csv")
    nn_summary.to_csv(results_dir / "cv_neural_network.csv")

    # =========================================================================
    # Step 5: Statistical Comparison
    # =========================================================================
    _log_step("STEP 5: STATISTICAL SIGNIFICANCE TESTING")
    stat_test = config["evaluation"]["statistical_tests"]
    comparison = compare_models_statistical(
        xgb_cv, nn_cv,
        model_name_a="XGBoost",
        model_name_b="Neural Network",
        target_names=target_cols,
        metric="r2",
        test=stat_test["method"],
        significance_level=stat_test["significance_level"],
    )
    logger.info(f"\nStatistical Comparison (R²):\n{comparison.to_string()}")
    comparison.to_csv(results_dir / "statistical_comparison.csv")

    # =========================================================================
    # Step 6: Per-Material Evaluation
    # =========================================================================
    _log_step("STEP 6: PER-MATERIAL EVALUATION")
    # Evaluated once and reused for the per-material figure in step 7.
    has_material = "material_type" in test_df.columns
    mat_results = None
    if has_material:
        mat_labels = test_df["material_type"].values
        mat_results = per_material_evaluation(
            y_test, predictions["Ensemble"], mat_labels, target_cols
        )
        logger.info(f"\nPer-Material (Ensemble):\n{mat_results.to_string()}")
        mat_results.to_csv(results_dir / "per_material_evaluation.csv")

    # =========================================================================
    # Step 7: Generate Figures
    # =========================================================================
    _log_step("STEP 7: GENERATING PUBLICATION FIGURES")
    fig_format = config["visualization"].get("figure_format", "png")

    # Label the ensemble with the weights actually configured rather than a
    # fixed "60% / 40%"; weights of 0.6 / 0.4 render exactly as before.
    ensemble_label = (
        f"Ensemble (XGBoost {ens_config['xgboost_weight']:.0%} "
        f"+ NN {ens_config['nn_weight']:.0%})"
    )

    # Predicted vs Actual
    plot_predicted_vs_actual(
        y_test, predictions["Ensemble"], target_cols,
        model_name=ensemble_label,
        save_path=fig_dir / f"predicted_vs_actual.{fig_format}",
    )

    # Residual analysis
    plot_residual_analysis(
        y_test, predictions["Ensemble"], target_cols,
        save_path=fig_dir / f"residual_analysis.{fig_format}",
    )

    # Feature importance
    importances = xgb_model.get_feature_importance(all_features)
    plot_feature_importance(
        importances, top_n=12,
        save_path=fig_dir / f"feature_importance.{fig_format}",
    )

    # Model comparison
    plot_model_comparison(
        metrics_all, metric="R²", target_names=target_cols,
        save_path=fig_dir / f"model_comparison_r2.{fig_format}",
    )

    # Training curves
    plot_training_curves(
        nn_model.train_losses, nn_model.val_losses,
        save_path=fig_dir / f"training_curves.{fig_format}",
    )

    # Per-material
    if has_material:
        plot_per_material_performance(
            mat_results, target_names=target_cols,
            save_path=fig_dir / f"per_material_performance.{fig_format}",
        )

    # =========================================================================
    # Step 8: Save Final Summary
    # =========================================================================
    elapsed = time.time() - start_time
    summary = {
        "experiment_name": config["experiment"]["name"],
        "dataset_size": len(df),
        "n_features": len(all_features),
        "n_targets": len(target_cols),
        "n_cv_folds": n_folds,
        "random_seed": seed,
        # NOTE(review): reported as a fixed label, not selected from the metrics.
        "best_model": "Ensemble",
        "test_metrics": {
            name: frame.to_dict() for name, frame in metrics_all.items()
        },
        "elapsed_seconds": elapsed,
    }
    with open(results_dir / "experiment_summary.json", "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(summary, f, indent=2, default=str)

    logger.info("=" * 60)
    logger.info(f"EXPERIMENT COMPLETE ({elapsed:.1f}s)")
    logger.info(f"Results saved to: {results_dir}")
    logger.info(f"Figures saved to: {fig_dir}")
    logger.info("=" * 60)
if __name__ == "__main__":
    # Command-line entry point: read the config path (falling back to the
    # default experiment config) and hand it to main().
    cli = argparse.ArgumentParser(
        description="Run fs-laser hydrogel etching ML experiment",
    )
    cli.add_argument(
        "--config",
        default="configs/experiment.yaml",
        type=str,
        help="Path to experiment configuration YAML",
    )
    main(cli.parse_args().config)