File size: 4,791 Bytes
cfcbbc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/bin/bash
#
# test_models_parallel_gnu.sh - Run multiple ATLAS models in parallel using GNU parallel
#
# This script processes multiple LLM models for ATLAS H→γγ analysis.
# For each model, it runs 5 independent Snakemake workflows in parallel.
#
# Usage:
#   ./test_models_parallel_gnu.sh [output_name] [max_concurrent_models] [tasks_per_model]
#
# Arguments:
#   output_name: Name of output directory (default: "test")
#   max_concurrent_models: Maximum models to run simultaneously (default: 5)
#   tasks_per_model: Number of parallel tasks per model (default: 5)
#
# Examples:
#   ./test_models_parallel_gnu.sh                    # Basic usage
#   ./test_models_parallel_gnu.sh experiment1       # Custom output name
#   ./test_models_parallel_gnu.sh test 3 5          # 3 models at once, 5 tasks each
#   ./test_models_parallel_gnu.sh large_test 10 5   # 10 models, 5 tasks each
#
# Requirements:
#   - GNU parallel must be installed
#   - models.txt file with list of models to test
#   - All workflow/*.smk files must be present
#   - Python environment with required packages
#
# Features:
#   - Scales to 20-30 models with 10 jobs each (200-300 total jobs)
#   - Independent task execution - order doesn't matter
#   - Automatic resource management via GNU parallel
#   - Comprehensive error handling and logging
#   - Temporary workspace in /dev/shm for fast I/O
#
# Output Structure:
#   output_name/
#   β”œβ”€β”€ model1_timestamp1/
#   β”‚   β”œβ”€β”€ generated_code/
#   β”‚   β”œβ”€β”€ logs/
#   β”‚   β”œβ”€β”€ plots/
#   β”‚   β”œβ”€β”€ prompt_pairs/
#   β”‚   β”œβ”€β”€ snakemake_log/
#   β”‚   └── stats.csv
#   └── model2_timestamp2/
#       └── ...
#

module load python

# Save the directory where the script was started so we can return at the end.
ORIG_DIR=$(pwd)

# Create a unique scratch folder in /dev/shm (RAM-backed) for fast I/O.
# Named RUN_TMPDIR rather than TMPDIR so we don't shadow the conventional
# temp-dir environment variable used by mktemp and other tools.
RUN_TMPDIR=$(mktemp -d /dev/shm/llm_run_temp_XXXXXX) || {
    echo "ERROR: could not create scratch directory in /dev/shm" >&2
    exit 1
}
WORKDIR="$RUN_TMPDIR/llm_for_analysis"

# Remove the scratch folder on every exit path; /dev/shm is RAM, leaking it
# eats memory until reboot. ':?' guards against an empty variable.
cleanup() { rm -rf -- "${RUN_TMPDIR:?}"; }
trap cleanup EXIT

# 'conda activate' only works in a non-interactive script after the conda
# shell hook has been sourced; do that first, then fail loudly if the env
# cannot be activated (nothing downstream would work anyway).
if command -v conda >/dev/null 2>&1; then
    source "$(conda info --base)/etc/profile.d/conda.sh"
fi
conda activate llm_env || {
    echo "ERROR: could not activate conda environment 'llm_env'" >&2
    exit 1
}

# Get the root of the current Git repository
SRC_DIR=$(git rev-parse --show-toplevel) || {
    echo "ERROR: not inside a Git repository" >&2
    exit 1
}
echo "Using Git repository root: $SRC_DIR"

# Copy files from the Git repo root, excluding .git, results/, and .snakemake/
rsync -av \
    --exclude='.git' \
    --exclude='results/' \
    --exclude='.snakemake/' \
    --exclude='test/' \
    "$SRC_DIR/" \
    "$WORKDIR/" || {
    echo "ERROR: rsync of repository into scratch directory failed" >&2
    exit 1
}

chmod +x "$WORKDIR/test_stats_parallel.sh"
cd "$WORKDIR" || { echo "ERROR: cannot cd to $WORKDIR" >&2; exit 1; }
mkdir -p results

MODEL_LIST="models.txt"
OUT_NAME="${1:-test}"   # Take from first argument, default to "test"
MAX_JOBS="${2:-5}"     # Maximum concurrent models (default 5)
TASK_JOBS="${3:-5}"    # Jobs per model task (default 5)

echo "Configuration:"
echo "  Output directory: $OUT_NAME"
echo "  Max concurrent models: $MAX_JOBS"
echo "  Tasks per model: $TASK_JOBS"
echo "  Total potential jobs: $((MAX_JOBS * TASK_JOBS))"
echo ""

#######################################
# Process a single model end-to-end: write its config.yml, run the parallel
# test script, and on success archive the results back into the source tree.
# Globals:   MODEL_NAME (exported for the child workflow)
# Arguments: $1 - model identifier (may contain '/', which is sanitized)
#            $2 - output directory name under $src_dir
#            $3 - source (git) directory results are copied back to
#            $4 - working directory containing results/
# Outputs:   progress/error messages to stdout
# Returns:   0 on success, 1 if test_stats_parallel.sh failed
#######################################
process_model() {
    local model=$1
    local out_name=$2
    local src_dir=$3
    local work_dir=$4

    # Timestamp makes each run directory unique; declaration is split from
    # assignment so a failing $(date) would not be masked by 'local'.
    local timestamp
    timestamp=$(date +"%Y%m%d_%H%M%S")
    # '/' is not valid in a directory name, so flatten it to '_'.
    local MODEL_SAFE="${model//\//_}_$timestamp"
    export MODEL_NAME="$model"

    echo "Starting model [$timestamp]: $model"

    # NOTE(review): config.yml is written into the shared working directory,
    # so concurrently running models overwrite each other's config. This is a
    # race unless test_stats_parallel.sh snapshots the file immediately —
    # confirm, or give each model its own working copy.
    cat > config.yml <<EOF
model: '$model'
name: '$MODEL_SAFE'
out_dir: '$work_dir/results/$MODEL_SAFE'
EOF

    # Run the parallel test script
    if bash test_stats_parallel.sh; then
        echo "Model $model completed successfully"

        # Save results for this model back under the source tree.
        local DEST_DIR="$src_dir/$out_name/"
        mkdir -p "$DEST_DIR"
        cp -r "$work_dir/results/$MODEL_SAFE" "$DEST_DIR/"
        mkdir -p "$DEST_DIR/$MODEL_SAFE/snakemake_log/"
        # Best-effort copies: logs or stats may legitimately be absent.
        cp -r "$work_dir/.snakemake/log/"* "$DEST_DIR/$MODEL_SAFE/snakemake_log/" || true
        cp stats.csv "$DEST_DIR/$MODEL_SAFE/stats.csv" || true

        return 0
    else
        echo "ERROR: Model $model failed"
        return 1
    fi
}
# Exported so GNU parallel can invoke the function from its child shells.
export -f process_model

# Export variables for the function (parallel re-invokes bash per job, so
# only exported names are visible inside process_model).
export SRC_DIR
export WORKDIR

# Read models into an array, one per line. Fail early if the list is missing
# or empty — otherwise 'parallel' would run zero jobs and report success.
[[ -f "$MODEL_LIST" ]] || {
    echo "ERROR: model list '$MODEL_LIST' not found" >&2
    exit 1
}
mapfile -t MODELS < "$MODEL_LIST"
(( ${#MODELS[@]} > 0 )) || {
    echo "ERROR: model list '$MODEL_LIST' is empty" >&2
    exit 1
}

echo "Starting GNU parallel processing of ${#MODELS[@]} models..."
echo "Models to process: ${MODELS[*]}"
echo ""

# Use GNU parallel to process models
# --halt soon,fail=1: after the first failure, start no new jobs and let
#                     already-running jobs finish (use 'now,fail=1' to kill
#                     running jobs immediately instead)
# --jobs $MAX_JOBS:   limit concurrent models
# --line-buffer:      interleave output line-by-line as it arrives
# The exit status is checked on the command itself rather than via a
# separate '$?' test, which is fragile (any intervening command clobbers it).
if parallel --no-notice --halt soon,fail=1 --jobs "$MAX_JOBS" --line-buffer \
    process_model {} "$OUT_NAME" "$SRC_DIR" "$WORKDIR" ::: "${MODELS[@]}"; then
    echo ""
    echo "SUCCESS: All models processed successfully!"
    echo "Results saved to: $SRC_DIR/$OUT_NAME/"
else
    echo ""
    echo "ERROR: Some models failed to process"
    exit 1
fi

# Return to the original directory
cd "$ORIG_DIR"