#!/bin/bash
#
# test_models_parallel_gnu.sh - Run multiple ATLAS models in parallel using GNU parallel
#
# This script processes multiple LLM models for ATLAS H→γγ analysis.
# For each model, it runs 5 independent Snakemake workflows in parallel.
#
# Usage:
# ./test_models_parallel_gnu.sh [output_name] [max_concurrent_models] [tasks_per_model]
#
# Arguments:
# output_name: Name of output directory (default: "test")
# max_concurrent_models: Maximum models to run simultaneously (default: 5)
# tasks_per_model: Number of parallel tasks per model (default: 5)
#
# Examples:
# ./test_models_parallel_gnu.sh # Basic usage
# ./test_models_parallel_gnu.sh experiment1 # Custom output name
# ./test_models_parallel_gnu.sh test 3 5 # 3 models at once, 5 tasks each
# ./test_models_parallel_gnu.sh large_test 10 5 # 10 models, 5 tasks each
#
# Requirements:
# - GNU parallel must be installed
# - models.txt file with list of models to test
# - All workflow/*.smk files must be present
# - Python environment with required packages
#
# Features:
# - Scales to 20-30 models with 10 jobs each (200-300 total jobs)
# - Independent task execution - order doesn't matter
# - Automatic resource management via GNU parallel
# - Comprehensive error handling and logging
# - Temporary workspace in /dev/shm for fast I/O
#
# Output Structure:
# output_name/
# ├── model1_timestamp1/
# │   ├── generated_code/
# │   ├── logs/
# │   ├── plots/
# │   ├── prompt_pairs/
# │   ├── snakemake_log/
# │   └── stats.csv
# ├── model2_timestamp2/
# └── ...
#
module load python

# Remember the directory the script was launched from so we can return later.
ORIG_DIR=$(pwd)

# Unique RAM-backed scratch workspace in /dev/shm for fast I/O.
TMPDIR=$(mktemp -d /dev/shm/llm_run_temp_XXXXXX) || { echo "ERROR: failed to create temp dir in /dev/shm" >&2; exit 1; }
WORKDIR="$TMPDIR/llm_for_analysis"

# Remove the scratch workspace on every exit path so /dev/shm is not leaked.
cleanup() { rm -rf -- "$TMPDIR"; }
trap cleanup EXIT

# NOTE(review): 'conda activate' only works non-interactively if conda's shell
# hook has been sourced (possibly done by 'module load python') -- confirm on
# the target cluster.
conda activate llm_env

# Root of the current Git repository; everything is copied from here.
SRC_DIR=$(git rev-parse --show-toplevel) || { echo "ERROR: not inside a Git repository" >&2; exit 1; }
echo "Using Git repository root: $SRC_DIR"

# Copy the repo into the scratch workspace, excluding VCS metadata and
# previous outputs that would only slow the copy down.
rsync -av \
    --exclude='.git' \
    --exclude='results/' \
    --exclude='.snakemake/' \
    --exclude='test/' \
    "$SRC_DIR/" \
    "$WORKDIR/" || { echo "ERROR: rsync of repository into $WORKDIR failed" >&2; exit 1; }

chmod +x "$WORKDIR/test_stats_parallel.sh"
cd "$WORKDIR" || exit 1
mkdir -p results

MODEL_LIST="models.txt"
OUT_NAME="${1:-test}"   # Output directory name (first argument, default "test")
MAX_JOBS="${2:-5}"      # Maximum number of models run concurrently
TASK_JOBS="${3:-5}"     # Parallel tasks inside each model's workflow
# NOTE(review): TASK_JOBS is never exported or referenced below; if
# test_stats_parallel.sh is meant to read it, it must be exported -- verify.

echo "Configuration:"
echo "  Output directory: $OUT_NAME"
echo "  Max concurrent models: $MAX_JOBS"
echo "  Tasks per model: $TASK_JOBS"
echo "  Total potential jobs: $((MAX_JOBS * TASK_JOBS))"
echo ""
# Function to process a single model
#######################################
# Run the full workflow for one model and copy its results back to the repo.
# Globals:   writes config.yml and reads stats.csv in the current directory;
#            exports MODEL_NAME for the child workflow
# Arguments: $1 - model identifier (may contain '/')
#            $2 - output directory name under the repo root
#            $3 - source (repo) directory
#            $4 - scratch working directory
# Returns:   0 on success, 1 if the workflow failed
#######################################
process_model() {
    local model=$1
    local out_name=$2
    local src_dir=$3
    local work_dir=$4

    # Separate declaration from assignment so a failing $(date) is not masked
    # by 'local' (which always returns 0).
    local timestamp
    timestamp=$(date +"%Y%m%d_%H%M%S")

    # '/' is not allowed in directory names; the timestamp keeps reruns unique.
    local MODEL_SAFE="${model//\//_}_$timestamp"
    export MODEL_NAME="$model"

    echo "Starting model [$timestamp]: $model"

    # WARNING(review): config.yml and stats.csv live in the SHARED working
    # directory, so concurrently running models overwrite each other's files.
    # This is only safe if test_stats_parallel.sh snapshots config.yml before
    # the next model starts -- confirm, or give each model its own subdir.
    cat > config.yml <<EOF
model: '$model'
name: '$MODEL_SAFE'
out_dir: '$work_dir/results/$MODEL_SAFE'
EOF

    # Run the per-model workflow; a non-zero exit marks this model as failed.
    if bash test_stats_parallel.sh; then
        echo "Model $model completed successfully"

        # Copy results, Snakemake logs, and stats back into the repo tree.
        local DEST_DIR="$src_dir/$out_name/"
        mkdir -p "$DEST_DIR"
        cp -r "$work_dir/results/$MODEL_SAFE" "$DEST_DIR/"
        mkdir -p "$DEST_DIR/$MODEL_SAFE/snakemake_log/"
        # Best-effort copies: missing logs/stats should not fail the model.
        cp -r "$work_dir/.snakemake/log/"* "$DEST_DIR/$MODEL_SAFE/snakemake_log/" || true
        cp stats.csv "$DEST_DIR/$MODEL_SAFE/stats.csv" || true
        return 0
    else
        echo "ERROR: Model $model failed" >&2
        return 1
    fi
}
# Export the function and the variables it needs so they are visible inside
# the bash subshells that GNU parallel spawns.
export -f process_model
export SRC_DIR
export WORKDIR

# Read the model list; fail loudly if it is missing, and skip blank lines so
# they do not become empty-model jobs.
if [[ ! -r "$MODEL_LIST" ]]; then
    echo "ERROR: model list '$MODEL_LIST' not found or not readable" >&2
    exit 1
fi
mapfile -t MODELS < <(grep -v '^[[:space:]]*$' "$MODEL_LIST")

echo "Starting GNU parallel processing of ${#MODELS[@]} models..."
echo "Models to process: ${MODELS[*]}"
echo ""

# --halt soon,fail=1 : stop launching new jobs as soon as one fails
# --jobs             : cap on concurrently running models
# --line-buffer      : interleave output line-by-line as it is produced
if parallel --no-notice --halt soon,fail=1 --jobs "$MAX_JOBS" --line-buffer \
    process_model {} "$OUT_NAME" "$SRC_DIR" "$WORKDIR" ::: "${MODELS[@]}"; then
    echo ""
    echo "SUCCESS: All models processed successfully!"
    echo "Results saved to: $SRC_DIR/$OUT_NAME/"
else
    echo ""
    echo "ERROR: Some models failed to process" >&2
    exit 1
fi

# Return to the directory the script was launched from.
cd "$ORIG_DIR"