#!/bin/bash
#SBATCH --job-name=ml-walltime
#SBATCH --partition=b200-mig45
#SBATCH --gpus=1
#SBATCH --cpus-per-task=5
#SBATCH --mem=50G
#SBATCH --time=6:00:00
#SBATCH --output=%x_%j.out
# =============================================================================
# Unified Bootstrap CI + Uncertainty + Wall-time Refit
# wt, smiles, chemberta embeddings
# Runs sequentially: bootstrap/uncertainty first, then wall-time refit
# =============================================================================
# Base locations used by every helper below. HOME_LOC is left unquoted on
# purpose so that the tilde is expanded at assignment time.
HOME_LOC=~/
SCRIPT_LOC="$HOME_LOC/PeptiVerse/training_classifiers"
ALT_EMB_LOC="$HOME_LOC/PeptiVerse/training_data_cleaned"
LOG_LOC="$SCRIPT_LOC/src_bash/logs"

# Every python invocation redirects into LOG_LOC, so stop early if the log
# directory cannot be created.
mkdir -p -- "$LOG_LOC" || {
  echo "ERROR: cannot create log directory: $LOG_LOC" >&2
  exit 1
}

# Month_day stamp used as a prefix for per-run log file names.
DATE=$(date +%m_%d)

# The python scripts are invoked by relative name, so running from any other
# directory would fail (or pick up the wrong files). Abort if cd fails.
cd -- "$SCRIPT_LOC" || {
  echo "ERROR: cannot cd to script directory: $SCRIPT_LOC" >&2
  exit 1
}
# =============================================================================
# Helper functions
# =============================================================================
#######################################
# Bootstrap CI + uncertainty for one objective / embedding / model combination.
# Runs the given script twice against the same val_predictions.csv: first with
# "--mode ci", then with "--mode <UNC_MODE>". Both runs append stdout and
# stderr to the same per-combination log file.
# Globals:   SCRIPT_LOC, LOG_LOC, DATE (read)
# Arguments: $1=OBJECTIVE $2=WT $3=UNCERTAINTY_SCRIPT $4=MODEL_TYPE $5=UNC_MODE
# Outputs:   progress lines on stdout; failure notices on stderr
# Returns:   0 when skipped (no val_predictions.csv) or when both stages
#            succeed; 1 when either stage exits non-zero. Never exits the
#            script, so the caller's loop keeps going.
#######################################
run_bootstrap() {
  local OBJECTIVE=$1
  local WT=$2
  local SCRIPT=$3
  local MODEL_TYPE=$4
  local UNC_MODE=$5
  local VAL_PREDS="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}/val_predictions.csv"
  local OUT_DIR="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}"
  local LOG_FILE="${LOG_LOC}/${DATE}_ci_${MODEL_TYPE}_${OBJECTIVE}_${WT}.log"
  local status=0

  if [ ! -f "$VAL_PREDS" ]; then
    echo " [SKIP bootstrap] val_predictions.csv not found: $VAL_PREDS"
    return 0
  fi

  echo " [bootstrap ci] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT}"
  if ! python -u "$SCRIPT" \
    --mode ci \
    --val_preds "$VAL_PREDS" \
    --out_dir "$OUT_DIR" \
    --model_name "${MODEL_TYPE}_${WT}" \
    >> "$LOG_FILE" 2>&1; then
    echo " [FAIL bootstrap ci] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} (see ${LOG_FILE})" >&2
    status=1
  fi

  # The uncertainty stage is still attempted after a CI failure: it only
  # needs val_predictions.csv, which was checked above.
  echo " [bootstrap unc] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} (${UNC_MODE})"
  if ! python -u "$SCRIPT" \
    --mode "$UNC_MODE" \
    --val_preds "$VAL_PREDS" \
    --out_dir "$OUT_DIR" \
    --model_name "${MODEL_TYPE}_${WT}" \
    >> "$LOG_FILE" 2>&1; then
    echo " [FAIL bootstrap unc] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} (see ${LOG_FILE})" >&2
    status=1
  fi

  # Only advertise the output directory when both stages succeeded.
  if [ "$status" -eq 0 ]; then
    echo " ${OUT_DIR}/"
  fi
  return "$status"
}
#######################################
# Wall-time refit for one objective / embedding / model combination.
# Invokes refit_ml_walltime.py (resolved relative to the current directory)
# with the model directory, dataset path and logs directory, appending its
# stdout and stderr to a per-combination log file.
# Globals:   SCRIPT_LOC, LOG_LOC, DATE (read)
# Arguments: $1=OBJECTIVE $2=WT $3=MODEL_TYPE $4=DATASET_PATH
# Outputs:   progress lines on stdout; failure notice on stderr
# Returns:   0 when skipped (model dir or dataset dir missing, including an
#            empty DATASET_PATH) or when the refit succeeds; 1 when the
#            python script exits non-zero. Never exits the script.
#######################################
run_walltime() {
  local OBJECTIVE=$1
  local WT=$2
  local MODEL_TYPE=$3
  local DATASET_PATH=$4
  local MODEL_DIR="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}"
  local LOG_FILE="${LOG_LOC}/${DATE}_walltime_${MODEL_TYPE}_${OBJECTIVE}_${WT}.log"

  if [ ! -d "$MODEL_DIR" ]; then
    echo " [SKIP walltime] model_dir not found: $MODEL_DIR"
    return 0
  fi
  if [ ! -d "$DATASET_PATH" ]; then
    echo " [SKIP walltime] dataset not found: $DATASET_PATH"
    return 0
  fi

  echo " [walltime] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT}"
  if ! python -u refit_ml_walltime.py \
    --model_dir "$MODEL_DIR" \
    --dataset_path "$DATASET_PATH" \
    --logs_dir "$LOG_LOC" \
    >> "$LOG_FILE" 2>&1; then
    # Do not claim anything was logged when the refit itself failed.
    echo " [FAIL walltime] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} (see ${LOG_FILE})" >&2
    return 1
  fi

  # NOTE(review): the jsonl file name is presumably chosen by
  # refit_ml_walltime.py from --logs_dir; confirm it matches this message.
  echo " logged to ${LOG_LOC}/${DATE}_wall_clock_ml.jsonl"
}
# =============================================================================
# Dataset path lookup
# $1=OBJECTIVE $2=WT
# Prints the dataset directory for the given objective / embedding pair on
# stdout, or an empty line when the pair has no entry in the table.
# Reads globals HOME_LOC and ALT_EMB_LOC.
# NOTE(review): there is no solubility/chemberta entry, and the "peptideclm"
# embedding key is only defined for permeability_penetrance - confirm both
# are intentional.
# =============================================================================
get_dataset_path() {
  local task=$1
  local emb=$2
  local wt_root=$HOME_LOC/projects/Classifier_Weight/training_data_cleaned
  local resolved=""

  case "$emb" in
    # -- wt embeddings (ESM2 / original) ------------------------------
    wt)
      case "$task" in
        hemolysis) resolved="${wt_root}/hemolysis/hemo_wt_with_embeddings" ;;
        nf) resolved="${wt_root}/nf/nf_wt_with_embeddings" ;;
        solubility) resolved="${wt_root}/solubility/sol_wt_with_embeddings" ;;
        permeability_penetrance) resolved="${wt_root}/permeability_penetrance/perm_wt_with_embeddings_pooled" ;;
      esac
      ;;
    # -- smiles embeddings (PeptideCLM) -------------------------------
    smiles)
      case "$task" in
        hemolysis) resolved="${ALT_EMB_LOC}/hemolysis_peptideclm/hemo_smiles_with_embeddings" ;;
        nf) resolved="${ALT_EMB_LOC}/nf_peptideclm/nf_smiles_with_embeddings" ;;
        permeability_pampa) resolved="${ALT_EMB_LOC}/permeability_pampa_peptideclm/pampa_smiles_with_embeddings" ;;
        permeability_caco2) resolved="${ALT_EMB_LOC}/permeability_caco2_peptideclm/caco2_smiles_with_embeddings" ;;
      esac
      ;;
    # -- chemberta embeddings -----------------------------------------
    chemberta)
      case "$task" in
        hemolysis) resolved="${ALT_EMB_LOC}/hemolysis_chemberta/hemo_smiles_with_embeddings" ;;
        nf) resolved="${ALT_EMB_LOC}/nf_chemberta/nf_smiles_with_embeddings" ;;
        permeability_penetrance) resolved="${ALT_EMB_LOC}/permeability_chemberta/perm_smiles_with_embeddings" ;;
        permeability_pampa) resolved="${ALT_EMB_LOC}/permeability_pampa_chemberta/pampa_smiles_with_embeddings" ;;
        permeability_caco2) resolved="${ALT_EMB_LOC}/permeability_caco2_chemberta/caco2_smiles_with_embeddings" ;;
      esac
      ;;
    # -- explicit peptideclm key (penetrance only) --------------------
    peptideclm)
      case "$task" in
        permeability_penetrance) resolved="${ALT_EMB_LOC}/permeability_peptideclm/perm_smiles_with_embeddings" ;;
      esac
      ;;
  esac

  # Unknown pairs fall through with resolved still empty.
  printf '%s\n' "$resolved"
}
# =============================================================================
# SECTION 1 - Classification tasks
# =============================================================================
printf '%s\n' \
  "" \
  "============================================================" \
  " SECTION 1: Classification bootstrap + walltime" \
  "============================================================"

CLS_MODEL_TYPES=("svm_gpu" "enet_gpu" "xgb")

# One pass over all classification objectives, in the same order as before:
#   hemolysis, nf                       - wt + smiles + chemberta
#   solubility, permeability_penetrance - wt + chemberta (no smiles embeddings)
for OBJECTIVE in "hemolysis" "nf" "solubility" "permeability_penetrance"; do
  case "$OBJECTIVE" in
    hemolysis | nf) CLS_EMBEDDINGS=("wt" "smiles" "chemberta") ;;
    *) CLS_EMBEDDINGS=("wt" "chemberta") ;;
  esac

  for WT in "${CLS_EMBEDDINGS[@]}"; do
    for MODEL_TYPE in "${CLS_MODEL_TYPES[@]}"; do
      printf '\n-- %s / %s / %s --\n' "$OBJECTIVE" "$WT" "$MODEL_TYPE"
      # Bootstrap CI + probability-based uncertainty first, then the refit.
      run_bootstrap "$OBJECTIVE" "$WT" "ml_uncertainty.py" "$MODEL_TYPE" "uncertainty_prob"
      DPATH=$(get_dataset_path "$OBJECTIVE" "$WT")
      run_walltime "$OBJECTIVE" "$WT" "$MODEL_TYPE" "$DPATH"
    done
  done
done
# =============================================================================
# SECTION 2 - Regression tasks (PAMPA, Caco-2)
# =============================================================================
printf '%s\n' \
  "" \
  "============================================================" \
  " SECTION 2: Regression bootstrap + walltime" \
  "============================================================"

REG_MODEL_TYPES=("svr" "enet_gpu" "xgb")
REG_OBJECTIVES=("permeability_pampa" "permeability_caco2")
REG_EMBEDDINGS=("smiles" "chemberta")

for OBJECTIVE in "${REG_OBJECTIVES[@]}"; do
  for WT in "${REG_EMBEDDINGS[@]}"; do
    for MODEL_TYPE in "${REG_MODEL_TYPES[@]}"; do
      printf '\n-- %s / %s / %s --\n' "$OBJECTIVE" "$WT" "$MODEL_TYPE"
      # Regression uses the residual-based uncertainty mode of the *_reg script.
      run_bootstrap "$OBJECTIVE" "$WT" "ml_uncertainty_reg.py" "$MODEL_TYPE" "uncertainty_residual"
      DPATH=$(get_dataset_path "$OBJECTIVE" "$WT")
      run_walltime "$OBJECTIVE" "$WT" "$MODEL_TYPE" "$DPATH"
    done
  done
done

printf '%s\n' \
  "" \
  "============================================================" \
  "All runs completed at $(date)" \
  "============================================================"

# NOTE(review): no matching "conda activate" is visible in this script;
# presumably the environment is activated before submission - confirm.
conda deactivate
# End of script.