File size: 4,791 Bytes
cfcbbc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/bin/bash
#
# test_models_parallel_gnu.sh - Run multiple ATLAS models in parallel using GNU parallel
#
# This script processes multiple LLM models for ATLAS H→γγ analysis.
# For each model, it runs 5 independent Snakemake workflows in parallel.
#
# Usage:
#   ./test_models_parallel_gnu.sh [output_name] [max_concurrent_models] [tasks_per_model]
#
# Arguments:
#   output_name: Name of output directory (default: "test")
#   max_concurrent_models: Maximum models to run simultaneously (default: 5)
#   tasks_per_model: Number of parallel tasks per model (default: 5)
#
# Examples:
#   ./test_models_parallel_gnu.sh                    # Basic usage
#   ./test_models_parallel_gnu.sh experiment1       # Custom output name
#   ./test_models_parallel_gnu.sh test 3 5          # 3 models at once, 5 tasks each
#   ./test_models_parallel_gnu.sh large_test 10 5   # 10 models, 5 tasks each
#
# Requirements:
#   - GNU parallel must be installed
#   - models.txt file with list of models to test
#   - All workflow/*.smk files must be present
#   - Python environment with required packages
#
# Features:
#   - Scales to 20-30 models with 10 jobs each (200-300 total jobs)
#   - Independent task execution - order doesn't matter
#   - Automatic resource management via GNU parallel
#   - Comprehensive error handling and logging
#   - Temporary workspace in /dev/shm for fast I/O
#
# Output Structure:
#   output_name/
#   β”œβ”€β”€ model1_timestamp1/
#   β”‚   β”œβ”€β”€ generated_code/
#   β”‚   β”œβ”€β”€ logs/
#   β”‚   β”œβ”€β”€ plots/
#   β”‚   β”œβ”€β”€ prompt_pairs/
#   β”‚   β”œβ”€β”€ snakemake_log/
#   β”‚   └── stats.csv
#   └── model2_timestamp2/
#       └── ...
#

module load python

# Save the directory where the script was started so we can return at the end.
ORIG_DIR=$(pwd)

# Create a unique scratch folder in /dev/shm (RAM-backed) for fast I/O.
# Named RUN_TMPDIR rather than TMPDIR so we don't shadow the conventional
# temp-dir environment variable used by mktemp and other tools.
RUN_TMPDIR=$(mktemp -d /dev/shm/llm_run_temp_XXXXXX) || {
    echo "ERROR: could not create scratch directory in /dev/shm" >&2
    exit 1
}
WORKDIR="$RUN_TMPDIR/llm_for_analysis"

# Remove the scratch folder on every exit path; /dev/shm is RAM, leaking it
# eats memory until reboot. ':?' guards against an empty variable.
cleanup() { rm -rf -- "${RUN_TMPDIR:?}"; }
trap cleanup EXIT

# 'conda activate' only works in a non-interactive script after the conda
# shell hook has been sourced; do that first, then fail loudly if the env
# cannot be activated (nothing downstream would work anyway).
if command -v conda >/dev/null 2>&1; then
    source "$(conda info --base)/etc/profile.d/conda.sh"
fi
conda activate llm_env || {
    echo "ERROR: could not activate conda environment 'llm_env'" >&2
    exit 1
}

# Get the root of the current Git repository
SRC_DIR=$(git rev-parse --show-toplevel) || {
    echo "ERROR: not inside a Git repository" >&2
    exit 1
}
echo "Using Git repository root: $SRC_DIR"

# Copy files from the Git repo root, excluding .git, results/, and .snakemake/
rsync -av \
    --exclude='.git' \
    --exclude='results/' \
    --exclude='.snakemake/' \
    --exclude='test/' \
    "$SRC_DIR/" \
    "$WORKDIR/" || {
    echo "ERROR: rsync of repository into scratch directory failed" >&2
    exit 1
}

chmod +x "$WORKDIR/test_stats_parallel.sh"
cd "$WORKDIR" || { echo "ERROR: cannot cd to $WORKDIR" >&2; exit 1; }
mkdir -p results

MODEL_LIST="models.txt"
OUT_NAME="${1:-test}"   # Take from first argument, default to "test"
MAX_JOBS="${2:-5}"     # Maximum concurrent models (default 5)
TASK_JOBS="${3:-5}"    # Jobs per model task (default 5)

echo "Configuration:"
echo "  Output directory: $OUT_NAME"
echo "  Max concurrent models: $MAX_JOBS"
echo "  Tasks per model: $TASK_JOBS"
echo "  Total potential jobs: $((MAX_JOBS * TASK_JOBS))"
echo ""

#######################################
# Process a single model end-to-end: write its config.yml, run the parallel
# test script, and on success archive the results back into the source tree.
# Globals:   MODEL_NAME (exported for the child workflow)
# Arguments: $1 - model identifier (may contain '/', which is sanitized)
#            $2 - output directory name under $src_dir
#            $3 - source (git) directory results are copied back to
#            $4 - working directory containing results/
# Outputs:   progress/error messages to stdout
# Returns:   0 on success, 1 if test_stats_parallel.sh failed
#######################################
process_model() {
    local model=$1
    local out_name=$2
    local src_dir=$3
    local work_dir=$4

    # Timestamp makes each run directory unique; declaration is split from
    # assignment so a failing $(date) would not be masked by 'local'.
    local timestamp
    timestamp=$(date +"%Y%m%d_%H%M%S")
    # '/' is not valid in a directory name, so flatten it to '_'.
    local MODEL_SAFE="${model//\//_}_$timestamp"
    export MODEL_NAME="$model"

    echo "Starting model [$timestamp]: $model"

    # NOTE(review): config.yml is written into the shared working directory,
    # so concurrently running models overwrite each other's config. This is a
    # race unless test_stats_parallel.sh snapshots the file immediately —
    # confirm, or give each model its own working copy.
    cat > config.yml <<EOF
model: '$model'
name: '$MODEL_SAFE'
out_dir: '$work_dir/results/$MODEL_SAFE'
EOF

    # Run the parallel test script
    if bash test_stats_parallel.sh; then
        echo "Model $model completed successfully"

        # Save results for this model back under the source tree.
        local DEST_DIR="$src_dir/$out_name/"
        mkdir -p "$DEST_DIR"
        cp -r "$work_dir/results/$MODEL_SAFE" "$DEST_DIR/"
        mkdir -p "$DEST_DIR/$MODEL_SAFE/snakemake_log/"
        # Best-effort copies: logs or stats may legitimately be absent.
        cp -r "$work_dir/.snakemake/log/"* "$DEST_DIR/$MODEL_SAFE/snakemake_log/" || true
        cp stats.csv "$DEST_DIR/$MODEL_SAFE/stats.csv" || true

        return 0
    else
        echo "ERROR: Model $model failed"
        return 1
    fi
}
# Exported so GNU parallel can invoke the function from its child shells.
export -f process_model

# Export variables for the function (parallel re-invokes bash per job, so
# only exported names are visible inside process_model).
export SRC_DIR
export WORKDIR

# Read models into an array, one per line. Fail early if the list is missing
# or empty — otherwise 'parallel' would run zero jobs and report success.
[[ -f "$MODEL_LIST" ]] || {
    echo "ERROR: model list '$MODEL_LIST' not found" >&2
    exit 1
}
mapfile -t MODELS < "$MODEL_LIST"
(( ${#MODELS[@]} > 0 )) || {
    echo "ERROR: model list '$MODEL_LIST' is empty" >&2
    exit 1
}

echo "Starting GNU parallel processing of ${#MODELS[@]} models..."
echo "Models to process: ${MODELS[*]}"
echo ""

# Use GNU parallel to process models
# --halt soon,fail=1: after the first failure, start no new jobs and let
#                     already-running jobs finish (use 'now,fail=1' to kill
#                     running jobs immediately instead)
# --jobs $MAX_JOBS:   limit concurrent models
# --line-buffer:      interleave output line-by-line as it arrives
# The exit status is checked on the command itself rather than via a
# separate '$?' test, which is fragile (any intervening command clobbers it).
if parallel --no-notice --halt soon,fail=1 --jobs "$MAX_JOBS" --line-buffer \
    process_model {} "$OUT_NAME" "$SRC_DIR" "$WORKDIR" ::: "${MODELS[@]}"; then
    echo ""
    echo "SUCCESS: All models processed successfully!"
    echo "Results saved to: $SRC_DIR/$OUT_NAME/"
else
    echo ""
    echo "ERROR: Some models failed to process"
    exit 1
fi

# Return to the original directory
cd "$ORIG_DIR"