Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16
Browse files
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/bench.slurm
CHANGED
|
@@ -31,6 +31,12 @@ update_status() {
|
|
| 31 |
done
|
| 32 |
}
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# Misc initializations.
|
| 35 |
echo "========================"
|
| 36 |
echo "START TIME: $(date)"
|
|
@@ -75,9 +81,21 @@ job_id=${SLURM_JOB_ID}
|
|
| 75 |
update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt &
|
| 76 |
|
| 77 |
# Run the main command
|
| 78 |
-
srun -u $LAUNCHER $CMD
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
exit_status=$?
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# Update status based on the exit status of `srun`
|
| 82 |
if [ $exit_status -eq 0 ]; then
|
| 83 |
printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
|
|
@@ -99,7 +117,6 @@ if [ $exit_status -eq 0 ]; then
|
|
| 99 |
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --is_profiler
|
| 100 |
fi
|
| 101 |
|
| 102 |
-
|
| 103 |
# Push to hub the folder using huggingface_cli
|
| 104 |
huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16"
|
| 105 |
|
|
|
|
| 31 |
done
|
| 32 |
}
|
| 33 |
|
| 34 |
+
#######################################
# Write a py-spy stack dump of a running process to a file.
# Arguments:
#   $1 - PID of the process to inspect
#   $2 - path of the file that receives py-spy's output
# Outputs:
#   py-spy's stdout is written to $2; diagnostics go to stderr.
# Returns:
#   2 when an argument is missing, 1 when the PID is not running,
#   otherwise the exit status of `py-spy dump`.
#######################################
dump_stack_trace() {
    local pid=${1:-}
    local output_file=${2:-}

    if [ -z "$pid" ] || [ -z "$output_file" ]; then
        echo "dump_stack_trace: usage: dump_stack_trace <pid> <output_file>" >&2
        return 2
    fi

    # py-spy can only attach to a live process. The redirection below creates
    # the output file before py-spy even starts, so checking first avoids
    # leaving behind an empty dump file for a PID that has already exited.
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "dump_stack_trace: process $pid is not running; no stack trace captured" >&2
        return 1
    fi

    # Quoted so that paths containing spaces or glob characters work.
    py-spy dump --pid "$pid" > "$output_file"
}
|
| 39 |
+
|
| 40 |
# Misc initializations.
|
| 41 |
echo "========================"
|
| 42 |
echo "START TIME: $(date)"
|
|
|
|
| 81 |
update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt &
|
| 82 |
|
| 83 |
# Run the main command in the background so that its PID is available for
# diagnostics.
# NOTE(review): $LAUNCHER and $CMD are defined outside this section; they are
# presumably strings that rely on word splitting to become a command line, so
# they are intentionally left unquoted - confirm.
# shellcheck disable=SC2086
srun -u $LAUNCHER $CMD &

main_pid=$!

# Wait for the main process to finish
wait "$main_pid"
exit_status=$?

# If the exit status is non-zero, try to dump the stack trace
if [ "$exit_status" -ne 0 ]; then
    dump_file="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/crash_dump_${SLURM_JOB_ID}_${SLURM_PROCID}.txt"
    # `wait` normally returns only once the process has exited and been
    # reaped, so there is usually nothing left for py-spy to attach to and
    # the dump would just be an empty file. Only attempt the dump when the
    # PID is still alive (e.g. `wait` was interrupted by a trapped signal).
    # NOTE(review): $main_pid is the PID of `srun` itself - confirm py-spy
    # can produce a useful trace from it.
    if kill -0 "$main_pid" 2>/dev/null; then
        echo "Job crashed. Dumping stack trace to $dump_file"
        dump_stack_trace "$main_pid" "$dump_file"
    else
        echo "Job crashed (exit status $exit_status). Process $main_pid has already exited; skipping stack trace dump." >&2
    fi
fi
|
| 98 |
+
|
| 99 |
# Update status based on the exit status of `srun`
|
| 100 |
if [ $exit_status -eq 0 ]; then
|
| 101 |
printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
|
|
|
|
| 117 |
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --is_profiler
|
| 118 |
fi
|
| 119 |
|
|
|
|
| 120 |
# Push to hub the folder using huggingface_cli
|
| 121 |
huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16"
|
| 122 |
|
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/crash_dump_7398594_0.txt
ADDED
|
File without changes
|
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|