PULSE-code / scripts /summarize_135.sh
velvet-pine-22's picture
Upload folder using huggingface_hub
b4b2877 verified
#!/bin/bash
# Aggregate 135 SLURM job results (265051-265185).
# Writes a markdown summary to ${PULSE_ROOT}/results/run_<UTC-tag>_summary.md
#
# Required environment:
#   PULSE_ROOT  experiment root; the table*/row*/seeds/seed<N> directories are
#               looked up beneath it and results/ is created inside it.
#
# NOTE: -e is not enabled. Several pipelines later in the script (log lookup,
# epoch extraction) return non-zero during normal operation, so failures that
# matter are checked explicitly instead.
set -uo pipefail

# Refuse an unset *or empty* root: an empty value would otherwise make the
# script read from and write to paths directly under "/".
ROOT=${PULSE_ROOT:?PULSE_ROOT must be set to the experiment root directory}
if [ ! -d "$ROOT" ]; then
  printf 'error: PULSE_ROOT=%s is not a directory\n' "$ROOT" >&2
  exit 1
fi

# Inclusive SLURM job-id range covered by this summary.
JID_LO=265051
JID_HI=265185

# UTC timestamp used to make the output file name unique per run (minute
# resolution).
TS=$(date -u +%Y%m%d_%H%M)
OUT="${ROOT}/results/run_${TS}_summary.md"
if ! mkdir -p "${ROOT}/results"; then
  printf 'error: cannot create %s\n' "${ROOT}/results" >&2
  exit 1
fi

# tmp scratch; removed on every exit path by the trap below.
if ! TMP=$(mktemp -d); then
  printf 'error: mktemp -d failed\n' >&2
  exit 1
fi
trap 'rm -rf "$TMP"' EXIT
# 1. Walk all seed dirs in submission order; classify each.
# For each seed dir, pick the slurm_<jid>.out whose <jid> lies in
# JID_LO..JID_HI; logs left behind by other submissions are ignored.
# Status (first match wins):
#   MISSING_DIR     seed directory does not exist
#   NO_LOG          no slurm_<jid>.out from our job range in the seed dir
#   OK              log contains a "[done] best" line
#   TIMEOUT         SLURM cancelled the job for exceeding its time limit
#   FAIL            log contains a traceback / known error marker
#   RUNNING         none of the above and squeue still lists the job
#   EXITED_NO_DONE  none of the above and the job is no longer queued
ORDER_FILE="$TMP/order.tsv" # tabletag\trow\tseed\tjid\tstatus\tacc\tepochs\tepoch_best[\terr]
: > "$ORDER_FILE"

#######################################
# Print the path of the log in a seed directory that belongs to this run.
# Globals:   JID_LO, JID_HI (read) - inclusive job-id range
# Arguments: $1 - seed directory
# Outputs:   path of the matching slurm_<jid>.out on stdout; nothing if no
#            log in the directory has a job id inside the range. When several
#            match, the one with the highest job id is chosen.
#######################################
pick_seed_log() {
  local dir=$1 path name num chosen="" chosen_num=-1
  for path in "$dir"/slurm_*.out; do
    [ -f "$path" ] || continue # an unmatched glob stays literal
    name=${path##*/slurm_}
    name=${name%.out}
    # Compare on the leading digits only so ids with a suffix (e.g.
    # 265051_3) are still recognised; 10# avoids octal parsing.
    num=${name%%[!0-9]*}
    [ -n "$num" ] || continue
    if ((10#$num >= JID_LO && 10#$num <= JID_HI && 10#$num > chosen_num)); then
      chosen=$path
      chosen_num=$((10#$num))
    fi
  done
  if [ -n "$chosen" ]; then
    printf '%s\n' "$chosen"
  fi
}

#######################################
# Classify one log file and print its TSV record.
# Arguments: $1 - table tag, $2 - row name, $3 - seed, $4 - log path
# Outputs:   one line: tabletag, row, seed, jid, status, acc, epochs,
#            epoch_best (tab separated); FAIL records carry a ninth field
#            with the last matching error line, capped at 120 bytes.
#######################################
classify_log() {
  local tt=$1 row=$2 seed=$3 log=$4
  local jid done_line acc epoch_best last_e err

  jid=${log##*/}
  jid=${jid#slurm_}
  jid=${jid%.out}

  done_line=$(grep '^\[done\] best' "$log" | head -n 1)
  if [ -n "$done_line" ]; then
    # head -n 1 keeps each value on a single line so the record stays
    # one row even if the pattern occurs more than once.
    acc=$(printf '%s\n' "$done_line" | grep -oE 'action@1 = [0-9.]+' | head -n 1 | awk '{print $3}')
    epoch_best=$(printf '%s\n' "$done_line" | grep -oE 'epoch [0-9]+' | head -n 1 | awk '{print $2}')
    # last reported epoch number
    last_e=$(grep -E '^ E +[0-9]+' "$log" | tail -n 1 | awk '{print $2}')
    printf '%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' \
      "$tt" "$row" "$seed" "$jid" OK "$acc" "${last_e:-?}" "${epoch_best:-?}"
  elif grep -qE 'DUE TO TIME LIMIT|CANCELLED.*TIME' "$log"; then
    printf '%s\t%s\t%d\t%s\tTIMEOUT\t-\t-\t-\n' "$tt" "$row" "$seed" "$jid"
  elif grep -qE 'Traceback|RuntimeError|invalid choice|CUDA error' "$log"; then
    # Tabs in the message would shift TSV columns, so flatten them to spaces.
    err=$(grep -E 'Traceback|RuntimeError|invalid choice|CUDA error' "$log" \
      | tail -n 1 | tr '\t' ' ' | head -c 120)
    printf '%s\t%s\t%d\t%s\tFAIL\t-\t-\t-\t%s\n' "$tt" "$row" "$seed" "$jid" "$err"
  elif squeue -j "$jid" -h 2>/dev/null | grep -q .; then
    printf '%s\t%s\t%d\t%s\tRUNNING\t-\t-\t-\n' "$tt" "$row" "$seed" "$jid"
  else
    # fell off queue without [done] and without typical error markers
    printf '%s\t%s\t%d\t%s\tEXITED_NO_DONE\t-\t-\t-\n' "$tt" "$row" "$seed" "$jid"
  fi
}

for tt in table1_main_comparison table3_horizon_curve table4_modality_ablation table5_component_ablation table7_missing_modality; do
  for row_dir in "${ROOT}/${tt}"/row*; do
    [ -d "$row_dir" ] || continue
    row=${row_dir##*/}
    for seed in 42 123 456 789 1024; do
      sd="${row_dir}/seeds/seed${seed}"
      if [ ! -d "$sd" ]; then
        printf '%s\t%s\t%d\t-\tMISSING_DIR\t-\t-\t-\n' "$tt" "$row" "$seed" >> "$ORDER_FILE"
        continue
      fi
      log=$(pick_seed_log "$sd")
      if [ -z "$log" ]; then
        printf '%s\t%s\t%d\t-\tNO_LOG\t-\t-\t-\n' "$tt" "$row" "$seed" >> "$ORDER_FILE"
        continue
      fi
      classify_log "$tt" "$row" "$seed" "$log" >> "$ORDER_FILE"
    done
  done
done
# 2. Build markdown
#
# Every function below reads the TSV in $ORDER_FILE
# (tabletag, row, seed, jid, status, acc, epochs, epoch_best[, err])
# and writes markdown to stdout; render_summary stitches them together and
# its output is redirected to $OUT at the end of this section.

# Emit one "| status | count |" table row per distinct status.
summary_status_counts() {
  awk -F'\t' '{print $5}' "$ORDER_FILE" | sort | uniq -c \
    | awk '{printf "| %s | %d |\n", $2, $1}'
}

# Emit one table row per (table, row) pair with mean/std of action@1 over the
# OK seeds, the best seed, and the medians of last epoch and best epoch.
# Pairs without a single OK seed are still listed (with "-" placeholders) so
# fully failed rows stay visible. Uses only portable awk (no gawk asort).
summary_per_row_stats() {
  awk -F'\t' '
    # Lower median of the entries of a space-separated list that start with a
    # digit; placeholder entries such as "?" are skipped. Returns "-" if none.
    # vals is a function-local array, so nothing leaks between calls.
    function median(list,    parts, vals, total, cnt, i, j, held) {
      total = split(list, parts, " ")
      cnt = 0
      for (i = 1; i <= total; i++)
        if (parts[i] ~ /^[0-9]/) vals[++cnt] = parts[i] + 0
      if (cnt == 0) return "-"
      # insertion sort; at most one entry per seed, so the list is tiny
      for (i = 2; i <= cnt; i++) {
        held = vals[i]
        for (j = i - 1; j >= 1 && vals[j] > held; j--) vals[j + 1] = vals[j]
        vals[j + 1] = held
      }
      return vals[int((cnt + 1) / 2)]
    }
    {
      key = $1 "\t" $2
      seen[key] = 1
      if ($5 == "OK") {
        acc = $6 + 0
        n[key]++
        sum[key] += acc
        ss[key] += acc * acc
        # the explicit "in" test records a best seed even when acc is 0
        if (!(key in maxa) || acc > maxa[key]) { maxa[key] = acc; bestseed[key] = $3 }
        le[key] = le[key] " " $7
        be[key] = be[key] " " $8
      } else {
        fail[key]++
      }
    }
    END {
      for (k in seen) {
        tt = k; sub(/\t.*/, "", tt)
        rr = k; sub(/.*\t/, "", rr)
        bad = fail[k] + 0
        if (!(k in n)) {
          printf "| %s | %s | 0 | %d | - | - | - | - | - | - |\n", tt, rr, bad
          continue
        }
        m = sum[k] / n[k]
        v = ss[k] / n[k] - m * m   # population variance (divides by n)
        if (v < 0) v = 0           # guard against rounding below zero
        printf "| %s | %s | %d | %d | %.4f | %.4f | seed%s | %.4f | %s | %s |\n", tt, rr, n[k], bad, m, sqrt(v), bestseed[k], maxa[k], median(le[k]), median(be[k])
      }
    }
  ' "$ORDER_FILE" | sort
}

# Emit one bullet per non-OK record, or "_None._" when every record is OK.
summary_failed_jobs() {
  awk -F'\t' '
    $5 != "OK" {
      printf "- **%s/%s seed%s** jid=%s status=%s %s\n", $1, $2, $3, $4, $5, $9
      shown++
    }
    END { if (!shown) print "_None._" }
  ' "$ORDER_FILE"
}

# Emit the per-seed table rows for one table tag ($1): the accuracy for OK
# seeds, "·<status>" for anything else, "-" when the seed has no record.
summary_seed_table() {
  awk -F'\t' -v tt="$1" '
    BEGIN { nseeds = split("42 123 456 789 1024", seeds, " ") }
    $1 == tt {
      cells[$2, $3] = ($5 == "OK" ? sprintf("%.4f", $6) : "·" $5)
      rows[$2] = 1
    }
    END {
      for (r in rows) {
        line = "| " r
        for (i = 1; i <= nseeds; i++) {
          c = cells[r, seeds[i]]
          line = line " | " (c != "" ? c : "-")
        }
        print line " |"
      }
    }
  ' "$ORDER_FILE" | sort
}

# Write the complete markdown report to stdout.
# Globals: JID_LO, JID_HI, ORDER_FILE (read)
render_summary() {
  local tt
  echo "# Run summary — $(date '+%Y-%m-%d %H:%M %Z')"
  echo
  printf 'Job range: `%s-%s` (%d expected)\n' "$JID_LO" "$JID_HI" "$((JID_HI - JID_LO + 1))"
  printf '%s\n' '' '## Overall status' '' '| status | count |' '|---|---|'
  summary_status_counts
  printf '%s\n' '' '## Per-row mean ± std (action@1)' '' \
    '| table | row | n_ok | n_fail | mean | std | best_seed | best_acc | epochs (median) | best_epoch (median) |' \
    '|---|---|---:|---:|---:|---:|---|---:|---:|---:|'
  summary_per_row_stats
  printf '%s\n' '' '## Failed / non-OK jobs' ''
  summary_failed_jobs
  printf '%s\n' '' '## Notes / known operational concerns' '' \
    '- These are operational results only. Most jobs trigger early-stop (patience=12) at epoch 1–18 instead of running the full 40 epochs, because validation metric saturates very early.' \
    '- `best action@1` observed in spot-check ranged 0.6%–3.4% (17 verb × 34 noun = 578 action classes; random ≈ 0.17%). This is a model/hyperparameter issue, not an infra issue.' \
    '- If you want to revisit hparams: try larger patience, lower lr, or warmup. The data loader and GPU stack are confirmed working (cu121 / A800).' \
    '' '## Per-table seed-level details' ''
  for tt in table1_main_comparison table3_horizon_curve table4_modality_ablation table5_component_ablation table7_missing_modality; do
    printf '%s\n' "### ${tt}" '' \
      '| row | seed42 | seed123 | seed456 | seed789 | seed1024 |' \
      '|---|---|---|---|---|---|'
    summary_seed_table "$tt"
    echo
  done
}

render_summary > "$OUT"
# Tell the caller where the summary was written and show its directory entry.
printf 'Wrote %s\n' "$OUT"
ls -l -a "$OUT"