#!/bin/bash
#SBATCH --job-name=ml-walltime
#SBATCH --partition=b200-mig45
#SBATCH --gpus=1
#SBATCH --cpus-per-task=5
#SBATCH --mem=50G
#SBATCH --time=6:00:00
#SBATCH --output=%x_%j.out

# =============================================================================
# Unified Bootstrap CI + Uncertainty + Wall-time Refit
# wt, smiles, chemberta embeddings
#
# For every objective / embedding / model combination this script runs, in
# order: the bootstrap CI pass, the uncertainty pass, then the wall-time refit.
# Combinations whose inputs are missing are skipped with a message. Python
# steps that exit non-zero are counted, and the job exits 1 if any failed.
# =============================================================================

# The trailing slash is kept as in the original configuration; the resulting
# "//" in derived paths is treated as a single separator by the filesystem.
HOME_LOC=~/
SCRIPT_LOC=$HOME_LOC/PeptiVerse/training_classifiers
ALT_EMB_LOC=$HOME_LOC/PeptiVerse/training_data_cleaned
LOG_LOC=$SCRIPT_LOC/src_bash/logs
DATE=$(date +%m_%d)

# Number of python steps that exited non-zero; drives the final exit status.
FAILED_STEPS=0

# Every step appends to a file under LOG_LOC, so it has to exist.
mkdir -p "$LOG_LOC" || {
  echo "ERROR: cannot create log directory: $LOG_LOC" >&2
  exit 1
}

# The python entry points are referenced by relative name, so running from
# any other directory would execute nothing useful.
cd "$SCRIPT_LOC" || {
  echo "ERROR: cannot cd to script directory: $SCRIPT_LOC" >&2
  exit 1
}

# =============================================================================
# Helper functions
# =============================================================================

#######################################
# Bootstrap CI + uncertainty for one objective / embedding / model.
# Globals:   SCRIPT_LOC, LOG_LOC, DATE (read)
# Arguments: $1=OBJECTIVE $2=WT $3=UNCERTAINTY_SCRIPT $4=MODEL_TYPE $5=UNC_MODE
# Outputs:   progress lines on stdout, failures on stderr; python output is
#            appended to ${LOG_LOC}/${DATE}_ci_<model>_<objective>_<wt>.log
# Returns:   0 when skipped or when both passes succeed, 1 otherwise
#######################################
run_bootstrap() {
  local OBJECTIVE=$1
  local WT=$2
  local SCRIPT=$3
  local MODEL_TYPE=$4
  local UNC_MODE=$5

  local OUT_DIR="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}"
  local VAL_PREDS="${OUT_DIR}/val_predictions.csv"
  local LOG_FILE="${LOG_LOC}/${DATE}_ci_${MODEL_TYPE}_${OBJECTIVE}_${WT}.log"
  local status=0

  if [[ ! -f "$VAL_PREDS" ]]; then
    echo " [SKIP bootstrap] val_predictions.csv not found: $VAL_PREDS"
    return 0
  fi

  echo " [bootstrap ci] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT}"
  if ! python -u "$SCRIPT" \
    --mode ci \
    --val_preds "$VAL_PREDS" \
    --out_dir "$OUT_DIR" \
    --model_name "${MODEL_TYPE}_${WT}" \
    >> "$LOG_FILE" 2>&1; then
    echo " [FAIL bootstrap ci] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} - see $LOG_FILE" >&2
    status=1
  fi

  # The uncertainty pass is still attempted after a CI failure, as before.
  echo " [bootstrap unc] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} (${UNC_MODE})"
  if ! python -u "$SCRIPT" \
    --mode "$UNC_MODE" \
    --val_preds "$VAL_PREDS" \
    --out_dir "$OUT_DIR" \
    --model_name "${MODEL_TYPE}_${WT}" \
    >> "$LOG_FILE" 2>&1; then
    echo " [FAIL bootstrap unc] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} - see $LOG_FILE" >&2
    status=1
  fi

  # Only point at the output directory when both passes actually succeeded.
  if (( status == 0 )); then
    echo " ${OUT_DIR}/"
  fi
  return "$status"
}

#######################################
# Wall-time refit for one objective / embedding / model.
# Globals:   SCRIPT_LOC, LOG_LOC, DATE (read)
# Arguments: $1=OBJECTIVE $2=WT $3=MODEL_TYPE $4=DATASET_PATH (may be empty)
# Outputs:   progress lines on stdout, failures on stderr; python output is
#            appended to ${LOG_LOC}/${DATE}_walltime_<model>_<objective>_<wt>.log
# Returns:   0 when skipped or on success, 1 when the refit exits non-zero
#######################################
run_walltime() {
  local OBJECTIVE=$1
  local WT=$2
  local MODEL_TYPE=$3
  local DATASET_PATH=$4

  local MODEL_DIR="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}"
  local LOG_FILE="${LOG_LOC}/${DATE}_walltime_${MODEL_TYPE}_${OBJECTIVE}_${WT}.log"

  if [[ ! -d "$MODEL_DIR" ]]; then
    echo " [SKIP walltime] model_dir not found: $MODEL_DIR"
    return 0
  fi
  # An empty path means get_dataset_path has no entry for this pair; say so
  # explicitly instead of reporting a blank "dataset not found".
  if [[ -z "$DATASET_PATH" ]]; then
    echo " [SKIP walltime] no dataset path configured for ${OBJECTIVE} / ${WT}"
    return 0
  fi
  if [[ ! -d "$DATASET_PATH" ]]; then
    echo " [SKIP walltime] dataset not found: $DATASET_PATH"
    return 0
  fi

  echo " [walltime] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT}"
  if ! python -u refit_ml_walltime.py \
    --model_dir "$MODEL_DIR" \
    --dataset_path "$DATASET_PATH" \
    --logs_dir "$LOG_LOC" \
    >> "$LOG_FILE" 2>&1; then
    echo " [FAIL walltime] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} - see $LOG_FILE" >&2
    return 1
  fi
  echo " logged to ${LOG_LOC}/${DATE}_wall_clock_ml.jsonl"
}

# =============================================================================
# Dataset path lookup
# =============================================================================

#######################################
# Map an objective / embedding pair to its dataset directory.
# Globals:   HOME_LOC, ALT_EMB_LOC (read)
# Arguments: $1=OBJECTIVE $2=WT
# Outputs:   the dataset path on stdout, or an empty line for an unknown pair
# NOTE(review): "solubility|chemberta" has no entry although the driver below
# iterates it, so its wall-time refit is always skipped - confirm intended.
# NOTE(review): "permeability_penetrance|peptideclm" is defined here but the
# driver below never requests it - confirm whether it is still needed.
#######################################
get_dataset_path() {
  local OBJECTIVE=$1
  local WT=$2
  local DATA_LOC=$HOME_LOC/projects/Classifier_Weight/training_data_cleaned

  case "${OBJECTIVE}|${WT}" in
    # -- wt embeddings (ESM2 / original) ------------------------------
    "hemolysis|wt")
      echo "${DATA_LOC}/hemolysis/hemo_wt_with_embeddings" ;;
    "nf|wt")
      echo "${DATA_LOC}/nf/nf_wt_with_embeddings" ;;
    "solubility|wt")
      echo "${DATA_LOC}/solubility/sol_wt_with_embeddings" ;;
    "permeability_penetrance|wt")
      echo "${DATA_LOC}/permeability_penetrance/perm_wt_with_embeddings_pooled" ;;
    # -- smiles embeddings (PeptideCLM) -------------------------------
    "hemolysis|smiles")
      echo "${ALT_EMB_LOC}/hemolysis_peptideclm/hemo_smiles_with_embeddings" ;;
    "nf|smiles")
      echo "${ALT_EMB_LOC}/nf_peptideclm/nf_smiles_with_embeddings" ;;
    "permeability_pampa|smiles")
      echo "${ALT_EMB_LOC}/permeability_pampa_peptideclm/pampa_smiles_with_embeddings" ;;
    "permeability_caco2|smiles")
      echo "${ALT_EMB_LOC}/permeability_caco2_peptideclm/caco2_smiles_with_embeddings" ;;
    # -- chemberta embeddings -----------------------------------------
    "hemolysis|chemberta")
      echo "${ALT_EMB_LOC}/hemolysis_chemberta/hemo_smiles_with_embeddings" ;;
    "nf|chemberta")
      echo "${ALT_EMB_LOC}/nf_chemberta/nf_smiles_with_embeddings" ;;
    "permeability_penetrance|chemberta")
      echo "${ALT_EMB_LOC}/permeability_chemberta/perm_smiles_with_embeddings" ;;
    "permeability_penetrance|peptideclm")
      echo "${ALT_EMB_LOC}/permeability_peptideclm/perm_smiles_with_embeddings" ;;
    "permeability_pampa|chemberta")
      echo "${ALT_EMB_LOC}/permeability_pampa_chemberta/pampa_smiles_with_embeddings" ;;
    "permeability_caco2|chemberta")
      echo "${ALT_EMB_LOC}/permeability_caco2_chemberta/caco2_smiles_with_embeddings" ;;
    *)
      echo "" ;;
  esac
}

#######################################
# Run bootstrap + wall-time refit for one combination and count failures.
# Globals:   FAILED_STEPS (written)
# Arguments: $1=OBJECTIVE $2=WT $3=UNCERTAINTY_SCRIPT $4=MODEL_TYPE $5=UNC_MODE
#######################################
run_combination() {
  local objective=$1
  local wt=$2
  local unc_script=$3
  local model_type=$4
  local unc_mode=$5
  local dataset_path

  echo ""
  echo "-- ${objective} / ${wt} / ${model_type} --"
  run_bootstrap "$objective" "$wt" "$unc_script" "$model_type" "$unc_mode" \
    || FAILED_STEPS=$((FAILED_STEPS + 1))
  dataset_path=$(get_dataset_path "$objective" "$wt")
  run_walltime "$objective" "$wt" "$model_type" "$dataset_path" \
    || FAILED_STEPS=$((FAILED_STEPS + 1))
}

# =============================================================================
# SECTION 1 - Classification tasks
# =============================================================================
echo ""
echo "============================================================"
echo " SECTION 1: Classification bootstrap + walltime"
echo "============================================================"

CLS_MODEL_TYPES=("svm_gpu" "enet_gpu" "xgb")

# hemolysis, nf - wt + smiles + chemberta
for OBJECTIVE in "hemolysis" "nf"; do
  for WT in "wt" "smiles" "chemberta"; do
    for MODEL_TYPE in "${CLS_MODEL_TYPES[@]}"; do
      run_combination "$OBJECTIVE" "$WT" "ml_uncertainty.py" "$MODEL_TYPE" "uncertainty_prob"
    done
  done
done

# solubility, permeability_penetrance - wt + chemberta (no smiles embeddings)
for OBJECTIVE in "solubility" "permeability_penetrance"; do
  for WT in "wt" "chemberta"; do
    for MODEL_TYPE in "${CLS_MODEL_TYPES[@]}"; do
      run_combination "$OBJECTIVE" "$WT" "ml_uncertainty.py" "$MODEL_TYPE" "uncertainty_prob"
    done
  done
done

# =============================================================================
# SECTION 2 - Regression tasks (PAMPA, Caco-2)
# =============================================================================
echo ""
echo "============================================================"
echo " SECTION 2: Regression bootstrap + walltime"
echo "============================================================"

REG_MODEL_TYPES=("svr" "enet_gpu" "xgb")

for OBJECTIVE in "permeability_pampa" "permeability_caco2"; do
  for WT in "smiles" "chemberta"; do
    for MODEL_TYPE in "${REG_MODEL_TYPES[@]}"; do
      run_combination "$OBJECTIVE" "$WT" "ml_uncertainty_reg.py" "$MODEL_TYPE" "uncertainty_residual"
    done
  done
done

echo ""
echo "============================================================"
echo "All runs completed at $(date)"
echo "============================================================"

# This script never activates an environment itself. Only deactivate when the
# conda shell function is actually loaded, and never let its status become the
# job's exit status.
if declare -F conda > /dev/null 2>&1; then
  conda deactivate || true
fi

# Make the job's exit status reflect whether any python step failed.
if (( FAILED_STEPS > 0 )); then
  echo "${FAILED_STEPS} step(s) failed; see logs under ${LOG_LOC}" >&2
  exit 1
fi