#!/bin/bash # Aggregate 135 SLURM job results (265051-265185). # Writes a markdown summary to neurips26/results/run__summary.md set -uo pipefail ROOT=${PULSE_ROOT} JID_LO=265051 JID_HI=265185 TS=$(date -u +%Y%m%d_%H%M) OUT="${ROOT}/results/run_${TS}_summary.md" mkdir -p "${ROOT}/results" # tmp scratch TMP=$(mktemp -d) trap 'rm -rf "$TMP"' EXIT # 1. Walk all seed dirs in submission order; classify each. # For each seed dir, pick the slurm_.out matching one of our jids. # Status is OK if "[done] best" present, FAIL if traceback/error, TIMEOUT # if SLURM cancelled it for time, RUNNING if no exit yet, MISSING if no log. ORDER_FILE="$TMP/order.tsv" # tabletag\trow\tseed\tjid\tstatus\tacc\tepochs\tepoch_best : > "$ORDER_FILE" for tt in table1_main_comparison table3_horizon_curve table4_modality_ablation table5_component_ablation table7_missing_modality; do for row_dir in "${ROOT}/${tt}"/row*; do [ -d "$row_dir" ] || continue row=$(basename "$row_dir") for seed in 42 123 456 789 1024; do sd="${row_dir}/seeds/seed${seed}" [ -d "$sd" ] || { printf "%s\t%s\t%d\t-\tMISSING_DIR\t-\t-\t-\n" "$tt" "$row" "$seed" >> "$ORDER_FILE"; continue; } log=$(ls "${sd}"/slurm_*.out 2>/dev/null | head -1) if [ -z "$log" ]; then printf "%s\t%s\t%d\t-\tNO_LOG\t-\t-\t-\n" "$tt" "$row" "$seed" >> "$ORDER_FILE" continue fi jid=$(basename "$log" | sed 's/^slurm_//; s/\.out$//') # Determine status if grep -q "^\[done\] best" "$log"; then status=OK line=$(grep "^\[done\] best" "$log" | head -1) acc=$(echo "$line" | grep -oE "action@1 = [0-9.]+" | awk '{print $3}') epoch_best=$(echo "$line" | grep -oE "epoch [0-9]+" | head -1 | awk '{print $2}') # last reported epoch number last_e=$(grep -E "^ E +[0-9]+" "$log" | tail -1 | awk '{print $2}') printf "%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n" "$tt" "$row" "$seed" "$jid" "OK" "${acc}" "${last_e:-?}" "${epoch_best:-?}" >> "$ORDER_FILE" elif grep -qE "DUE TO TIME LIMIT|CANCELLED.*TIME" "$log"; then printf "%s\t%s\t%d\t%s\tTIMEOUT\t-\t-\t-\n" "$tt" "$row" "$seed" "$jid" >> "$ORDER_FILE" elif grep -qE "Traceback|RuntimeError|invalid choice|CUDA error" "$log"; then err=$(grep -E "Traceback|RuntimeError|invalid choice|CUDA error" "$log" | tail -1 | head -c 120) printf "%s\t%s\t%d\t%s\tFAIL\t-\t-\t-\t%s\n" "$tt" "$row" "$seed" "$jid" "$err" >> "$ORDER_FILE" elif squeue -j "$jid" -h 2>/dev/null | grep -q .; then printf "%s\t%s\t%d\t%s\tRUNNING\t-\t-\t-\n" "$tt" "$row" "$seed" "$jid" >> "$ORDER_FILE" else # fell off queue without [done] and without typical error markers printf "%s\t%s\t%d\t%s\tEXITED_NO_DONE\t-\t-\t-\n" "$tt" "$row" "$seed" "$jid" >> "$ORDER_FILE" fi done done done # 2. Build markdown { echo "# Run summary — $(date '+%Y-%m-%d %H:%M %Z')" echo echo "Job range: \`${JID_LO}-${JID_HI}\` (135 expected)" echo echo "## Overall status" echo echo "| status | count |" echo "|---|---|" awk -F'\t' '{print $5}' "$ORDER_FILE" | sort | uniq -c | awk '{printf "| %s | %d |\n", $2, $1}' echo echo "## Per-row mean ± std (action@1)" echo echo "| table | row | n_ok | n_fail | mean | std | best_seed | best_acc | epochs (median) | best_epoch (median) |" echo "|---|---|---:|---:|---:|---:|---|---:|---:|---:|" awk -F'\t' '{key=$1"\t"$2; if($5=="OK"){n[key]++; sum[key]+=$6; ss[key]+=($6*$6); if($6>maxa[key]){maxa[key]=$6; bestseed[key]=$3} le[key]=le[key]" "$7; be[key]=be[key]" "$8} else if($5!="OK"){fail[key]++}} END{for(k in n){tt=k; sub(/\t.*/,"",tt); rr=k; sub(/.*\t/,"",rr); m=sum[k]/n[k]; v=ss[k]/n[k] - m*m; if(v<0)v=0; sd=sqrt(v); # median of last_epoch list split(le[k], A, " "); cnt=0; for(i in A){if(A[i]!=""){cnt++; B[cnt]=A[i]+0}} asort(B); med_le=cnt? B[int((cnt+1)/2)] : "-"; delete B; split(be[k], A, " "); cnt=0; for(i in A){if(A[i]!=""){cnt++; B[cnt]=A[i]+0}} asort(B); med_be=cnt? B[int((cnt+1)/2)] : "-"; fk=fail[k]+0; printf "| %s | %s | %d | %d | %.4f | %.4f | seed%s | %.4f | %s | %s |\n", tt, rr, n[k], fk, m, sd, bestseed[k], maxa[k], med_le, med_be }}' "$ORDER_FILE" | sort echo echo "## Failed / non-OK jobs" echo awk -F'\t' '$5!="OK" {printf "- **%s/%s seed%s** jid=%s status=%s %s\n", $1,$2,$3,$4,$5,$9}' "$ORDER_FILE" || true if ! awk -F'\t' '$5!="OK"' "$ORDER_FILE" | grep -q .; then echo "_None._" fi echo echo "## Notes / known operational concerns" echo echo "- These are operational results only. Most jobs trigger early-stop (patience=12) at epoch 1–18 instead of running the full 40 epochs, because validation metric saturates very early." echo "- \`best action@1\` observed in spot-check ranged 0.6%–3.4% (17 verb × 34 noun = 578 action classes; random ≈ 0.17%). This is a model/hyperparameter issue, not an infra issue." echo "- If you want to revisit hparams: try larger patience, lower lr, or warmup. The data loader and GPU stack are confirmed working (cu121 / A800)." echo echo "## Per-table seed-level details" echo for tt in table1_main_comparison table3_horizon_curve table4_modality_ablation table5_component_ablation table7_missing_modality; do echo "### ${tt}" echo echo "| row | seed42 | seed123 | seed456 | seed789 | seed1024 |" echo "|---|---|---|---|---|---|" awk -F'\t' -v tt="$tt" '$1==tt {key=$2; cell=($5=="OK"? sprintf("%.4f",$6) : "·"$5); arr[key,$3]=cell; rows[key]=1} END{for(r in rows){printf "| %s | %s | %s | %s | %s | %s |\n", r, (arr[r,42]!=""?arr[r,42]:"-"), (arr[r,123]!=""?arr[r,123]:"-"), (arr[r,456]!=""?arr[r,456]:"-"), (arr[r,789]!=""?arr[r,789]:"-"), (arr[r,1024]!=""?arr[r,1024]:"-")}}' "$ORDER_FILE" | sort echo done } > "$OUT" echo "Wrote $OUT" ls -la "$OUT"