File size: 23,298 Bytes

96abbd8

#!/bin/bash

set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
OPEN_ROOT="$(cd "${PROJECT_ROOT}/../.." && pwd)"
SRC_DIR="${PROJECT_ROOT}/src"
export PYTHONPATH="${SRC_DIR}:${PYTHONPATH:-}"

# Generate and Evaluate - Combined pipeline for generation + evaluation
# Usage: ./run_generate_and_evaluate.sh [model_name] [max_problems] [num_workers] [timeout] [tolerance] [dataset_name]
#
# Environment Variables:
#   REFRESH_DEBUG_MEMORY - Set to "false" to disable auto-backup and clearing of debug memory (default: true)
#   RUN_ALL_BENCHMARKS - Set to "true" to run all benchmarks in ./data/benchmarks/ (default: true)
#   USE_HF_OFFLINE - Set to "false" to allow downloading models from Hugging Face (default: true)
#   PARALLEL_BENCHMARKS - Set to "true" to run benchmarks in parallel (default: true)
#   MAX_PARALLEL_JOBS - Maximum number of parallel jobs (default: 4)
#   DATASET_NAME - Dataset to run when RUN_ALL_BENCHMARKS=false (default: IndustryOR)
#   EMBEDDING_MODEL - Optional embedding model name or local path passed to memory retrieval
#
# Example:
#   ./run_generate_and_evaluate.sh                    # Run with default settings (all benchmarks, offline mode, parallel)
#   RUN_ALL_BENCHMARKS=false ./run_generate_and_evaluate.sh  # Run single dataset
#   RUN_ALL_BENCHMARKS=false ./run_generate_and_evaluate.sh gpt-4o 100 64 90 0.05 OPT-Principled
#   USE_HF_OFFLINE=false ./run_generate_and_evaluate.sh  # Allow downloading models
#   REFRESH_DEBUG_MEMORY=false ./run_generate_and_evaluate.sh  # Run without refreshing debug memory
#   PARALLEL_BENCHMARKS=false ./run_generate_and_evaluate.sh  # Run sequentially
#   MAX_PARALLEL_JOBS=2 ./run_generate_and_evaluate.sh  # Limit to 2 parallel jobs

MODEL=${1:-"gpt-4o"}
MAX_PROBLEMS=${2:-1000}
NUM_WORKERS=${3:-100}
TIMEOUT=${4:-60}
TOLERANCE=${5:-0.05}

# Configuration: Auto-backup and clear debug memory before running
# Set to "false" to disable this feature
REFRESH_DEBUG_MEMORY=${REFRESH_DEBUG_MEMORY:-true}

# Configuration: Run all benchmarks or single dataset
RUN_ALL_BENCHMARKS=${RUN_ALL_BENCHMARKS:-true}

# Configuration: Use offline mode for Hugging Face (avoid network calls)
# Set to "false" if you need to download models for the first time
USE_HF_OFFLINE=${USE_HF_OFFLINE:-true}

# Configuration: Run benchmarks in parallel
# Set to "true" to enable concurrent datasets (default: sequential datasets)
PARALLEL_BENCHMARKS=${PARALLEL_BENCHMARKS:-false}

# Configuration: Maximum number of parallel jobs
# Adjust based on your system resources
MAX_PARALLEL_JOBS=${MAX_PARALLEL_JOBS:-4}

# Default single dataset
DEFAULT_DATASET=${DATASET_NAME:-${6:-"IndustryOR"}}
# DEFAULT_DATASET="ComplexOR"
TEMPERATURE=${TEMPERATURE:-0.01}
MEMORY_DIR="${PROJECT_ROOT}/memory_storage"
MEMORY_TOP_K=${MEMORY_TOP_K:-3}
PARALLEL=${PARALLEL:-128}
MAIN_TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
OUTPUT_DIR="${OPEN_ROOT}/results/Agora-Opt/generate_and_evaluate"
MAX_RETRIES=${MAX_RETRIES:-5}
BENCHMARKS_DIR="${PROJECT_ROOT}/../../data/benchmarks"
EMBEDDING_MODEL=${EMBEDDING_MODEL:-}

GENERATE_CLI="${PROJECT_ROOT}/scripts/generate_with_memory.py"
EXECUTE_CLI="${PROJECT_ROOT}/scripts/execute.py"

if [ -d "${BENCHMARKS_DIR}" ]; then
    BENCHMARKS_DIR="$(cd "${BENCHMARKS_DIR}" && pwd)"
elif [ -d "${PROJECT_ROOT}/clean_benchmarks" ]; then
    BENCHMARKS_DIR="$(cd "${PROJECT_ROOT}/clean_benchmarks" && pwd)"
elif [ -d "${PROJECT_ROOT}/../clean_benchmarks" ]; then
    BENCHMARKS_DIR="$(cd "${PROJECT_ROOT}/../clean_benchmarks" && pwd)"
fi

# Create output directory
mkdir -p "${OUTPUT_DIR}"

ensure_or_debate_env() {
    if [ "${CONDA_DEFAULT_ENV:-}" = "or-debate" ] && command -v python >/dev/null 2>&1; then
        return 0
    fi

    if ! command -v conda >/dev/null 2>&1; then
        echo "❌ conda command not found. Please install Conda or activate the or-debate environment manually."
        return 1
    fi

    local conda_bin
    local conda_base
    conda_bin="$(command -v conda)"
    conda_base="$(cd "$(dirname "${conda_bin}")/.." && pwd)"

    if [ -f "${conda_base}/etc/profile.d/conda.sh" ]; then
        # shellcheck disable=SC1090
        source "${conda_base}/etc/profile.d/conda.sh"
    else
        eval "$("${conda_bin}" shell.bash hook)"
    fi

    conda activate or-debate
}

# ============================================
# Function: Backup and Clear Debug Memory
# ============================================
backup_debug_memory() {
    if [ "${REFRESH_DEBUG_MEMORY}" = "true" ]; then
        DEBUG_MEMORY_FILE="${MEMORY_DIR}/debug_memory.jsonl"
        BACKUP_DIR="${MEMORY_DIR}/backups/${MAIN_TIMESTAMP}"

        if [ -f "${DEBUG_MEMORY_FILE}" ]; then
            echo "================================================"
            echo "🗂️  Backing up debug memory..."
            echo "================================================"
            
            # Create backup directory
            mkdir -p ${BACKUP_DIR}
            
            # Copy debug_memory.jsonl to backup
            cp "${DEBUG_MEMORY_FILE}" "${BACKUP_DIR}/debug_memory.jsonl"
            
            # Get file size and line count
            FILE_SIZE=$(du -h "${DEBUG_MEMORY_FILE}" | cut -f1)
            LINE_COUNT=$(wc -l < "${DEBUG_MEMORY_FILE}")
            
            echo "✅ Backed up debug memory:"
            echo "  Location: ${BACKUP_DIR}/debug_memory.jsonl"
            echo "  Size:     ${FILE_SIZE}"
            echo "  Lines:    ${LINE_COUNT}"
            
            # Clear the original file
            > "${DEBUG_MEMORY_FILE}"
            echo "✅ Cleared original debug memory file"
            echo ""
        else
            echo "ℹ️  No debug memory file found, skipping backup"
            echo ""
        fi
    else
        echo "ℹ️  Debug memory refresh is disabled (REFRESH_DEBUG_MEMORY=false)"
        echo ""
    fi
}

normalize_dataset_name() {
    local dataset_name="$1"
    dataset_name="${dataset_name%.jsonl}"
    case "${dataset_name}" in
        ComplexLP_clean) echo "ComplexLP" ;;
        EasyLP_clean) echo "EasyLP" ;;
        IndustryOR_clean|IndustryOR_v2|IndustryOR_fixedV2|IndustryOR_fixedV2_clean) echo "IndustryOR" ;;
        NL4Opt|NL4Opt_clean|NL4OPT_clean) echo "NL4OPT" ;;
        NLP4LP_clean) echo "NLP4LP" ;;
        ComplexOR_clean) echo "ComplexOR" ;;
        ReSocratic_clean) echo "ReSocratic" ;;
        combined|combined_dataset|OPT-Principled_clean) echo "OPT-Principled" ;;
        *) echo "${dataset_name}" ;;
    esac
}

DEFAULT_DATASET="$(normalize_dataset_name "${DEFAULT_DATASET}")"

# ============================================
# Function: Run single dataset (core logic)
# ============================================
process_dataset() {
    local DATASET_NAME
    DATASET_NAME="$(normalize_dataset_name "$1")"
    local TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
    local OUTPUT_FILE="${OUTPUT_DIR}/${MODEL}_${DATASET_NAME}_${TIMESTAMP}.jsonl"
    local EVAL_FILE="${OUTPUT_DIR}/${MODEL}_${DATASET_NAME}_eval_${TIMESTAMP}.jsonl"
    local EVAL_REPORT="${EVAL_FILE}/evaluation_report.json"
    
    echo ""
    echo "╔════════════════════════════════════════════════╗"
    echo "║  Processing Dataset: ${DATASET_NAME}"
    echo "╚════════════════════════════════════════════════╝"
    echo ""
    
    # ============================================
    # STEP 1: Generation
    # ============================================
    echo "================================================"
    echo "📝 STEP 1/2: Generating code with memory..."
    echo "================================================"
    echo "Dataset: ${DATASET_NAME}"
    echo ""
    
    local generate_args=(
        --dataset "${DATASET_NAME}"
        --model "${MODEL}"
        --temperature "${TEMPERATURE}"
        --max_problems "${MAX_PROBLEMS}"
        --memory_dir "${MEMORY_DIR}"
        --memory_top_k "${MEMORY_TOP_K}"
        --parallel "${PARALLEL}"
        --output "${OUTPUT_FILE}"
        --max_retries "${MAX_RETRIES}"
        --execution_timeout 60
    )

    if [ -n "${EMBEDDING_MODEL}" ]; then
        generate_args+=(--embedding_model "${EMBEDDING_MODEL}")
    fi

    python "${GENERATE_CLI}" "${generate_args[@]}"

    EXIT_CODE=$?
    
    if [ ${EXIT_CODE} -ne 0 ]; then
        echo ""
        echo "❌ Generation failed for ${DATASET_NAME} with exit code ${EXIT_CODE}"
        return 1
    fi
    
    echo ""
    echo "✅ Generation completed for ${DATASET_NAME}!"
    echo ""
    
    # Show generation summary
    if [ -f "${OUTPUT_FILE}" ]; then
        TOTAL=$(wc -l < ${OUTPUT_FILE})
        SUCCESS=$(grep -c '"status": "success"' "${OUTPUT_FILE}" 2>/dev/null || true)
        if [ -z "${SUCCESS}" ]; then
            SUCCESS=0
        fi
        echo "📊 Generation Summary:"
        echo "  Total problems: ${TOTAL}"
        echo "  Successful:     ${SUCCESS}"

        if [ "${SUCCESS}" -eq 0 ]; then
            echo ""
            echo "❌ Generation produced zero successful solutions for ${DATASET_NAME}"
            echo "   Refusing to continue with an incomplete run."
            return 1
        fi
    fi
    
    echo ""
    
    # ============================================
    # STEP 2: Evaluation
    # ============================================
    echo "================================================"
    echo "🔍 STEP 2/2: Executing and evaluating..."
    echo "================================================"
    echo ""
    
    local execute_args=(
        --input_file "${OUTPUT_FILE}"
        --output_dir "${EVAL_FILE}"
        --num_workers "${NUM_WORKERS}"
        --timeout "${TIMEOUT}"
        --tolerance "${TOLERANCE}"
        --use_relative_tolerance
    )

    if [ -n "${EMBEDDING_MODEL}" ]; then
        execute_args+=(--embedding_model "${EMBEDDING_MODEL}")
    fi

    python "${EXECUTE_CLI}" "${execute_args[@]}"
    EXIT_CODE=$?
    
    if [ ${EXIT_CODE} -ne 0 ]; then
        echo ""
        echo "❌ Evaluation failed for ${DATASET_NAME} with exit code ${EXIT_CODE}"
        return 1
    fi
    
    echo ""
    echo "✅ Evaluation completed for ${DATASET_NAME}!"
    echo ""
    
    # Show evaluation report if exists
    if [ -f "${EVAL_REPORT}" ]; then
        echo "📊 Evaluation Results for ${DATASET_NAME}:"
        cat "${EVAL_REPORT}" | jq '{
            accuracy: .accuracy,
            correct: .correct,
            total: .total_problems,
            status_counts: .status_counts
        }' 2>/dev/null || cat "${EVAL_REPORT}"
        echo ""
        
        # Store results for final summary (with lock for parallel execution)
        ACCURACY=$(cat "${EVAL_REPORT}" | jq -r '.accuracy' 2>/dev/null || echo "N/A")
        CORRECT=$(cat "${EVAL_REPORT}" | jq -r '.correct' 2>/dev/null || echo "N/A")
        TOTAL_PROBS=$(cat "${EVAL_REPORT}" | jq -r '.total_problems' 2>/dev/null || echo "N/A")
        
        # Use lock to safely append to results file (fallback to simple append if flock not available)
        RESULTS_LOCK="${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.lock"
        if command -v flock >/dev/null 2>&1; then
            (
                flock -x 200
                echo "${DATASET_NAME}|${ACCURACY}|${CORRECT}|${TOTAL_PROBS}|${EVAL_FILE}" >> "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
            ) 200>"${RESULTS_LOCK}"
        else
            # Fallback: use simple append (may have race condition but unlikely with small writes)
            echo "${DATASET_NAME}|${ACCURACY}|${CORRECT}|${TOTAL_PROBS}|${EVAL_FILE}" >> "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
        fi
    fi
    
    echo "================================================"
    echo ""
    
    if [ -f "${EVAL_REPORT}" ]; then
        return 0
    else
        return 1
    fi
}

# ============================================
# Function: Run single dataset (internal, supports logging)
# ============================================
run_single_dataset_internal() {
    local DATASET_NAME=$1
    local LOG_FILE=$2
    local STREAM_OUTPUT=${3:-false}
    
    if [ "${STREAM_OUTPUT}" = "true" ]; then
        process_dataset "${DATASET_NAME}" |& tee "${LOG_FILE}"
        local EXIT_CODE=${PIPESTATUS[0]}
        return ${EXIT_CODE}
    else
        process_dataset "${DATASET_NAME}" > "${LOG_FILE}" 2>&1
        return $?
    fi
}

# ============================================
# Function: Run single dataset (wrapper for sequential execution)
# ============================================
run_single_dataset() {
    local DATASET_NAME=$1
    local STREAM_OUTPUT=${2:-false}
    local LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
    
    run_single_dataset_internal "${DATASET_NAME}" "${LOG_FILE}" "${STREAM_OUTPUT}"
    local EXIT_CODE=$?
    
    # Display output only when we did not already stream it live
    if [ "${STREAM_OUTPUT}" != "true" ]; then
        cat "${LOG_FILE}"
    fi
    
    return ${EXIT_CODE}
}

# ============================================
# Main Execution
# ============================================

echo "================================================"
echo "🚀 Generate + Evaluate Pipeline"
echo "================================================"
echo "Model:        ${MODEL}"
echo "Max problems: ${MAX_PROBLEMS}"
echo "Temperature:  ${TEMPERATURE}"
echo "Memory dir:   ${MEMORY_DIR}"
echo "Memory Top-K: ${MEMORY_TOP_K}"
if [ -n "${EMBEDDING_MODEL}" ]; then
    echo "Embedding:    ${EMBEDDING_MODEL}"
else
    echo "Embedding:    MemoryBank default"
fi
echo "Parallel:     ${PARALLEL}"
echo "Refresh Memory: ${REFRESH_DEBUG_MEMORY}"
echo "Run All Benchmarks: ${RUN_ALL_BENCHMARKS}"
echo "HF Offline:   ${USE_HF_OFFLINE}"
echo "Parallel Benchmarks: ${PARALLEL_BENCHMARKS}"
if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
    echo "Max Parallel Jobs: ${MAX_PARALLEL_JOBS}"
fi
echo ""
echo "Eval Workers: ${NUM_WORKERS}"
echo "Eval Timeout: ${TIMEOUT}s"
echo "Tolerance:    ${TOLERANCE} (relative)"
echo ""
echo "Max retries:  ${MAX_RETRIES}"
echo "================================================"
echo ""

# Activate environment
ensure_or_debate_env || exit 1

# Set Hugging Face offline mode if enabled
if [ "${USE_HF_OFFLINE}" = "true" ]; then
    echo "ℹ️  Hugging Face offline mode enabled (using local cache)"
    export HF_HUB_OFFLINE=1
    export TRANSFORMERS_OFFLINE=1
    export HF_DATASETS_OFFLINE=1
else
    echo "ℹ️  Hugging Face online mode (may download models if needed)"
fi
echo ""

# Backup and clear debug memory (only once at the beginning)
backup_debug_memory

# ============================================
# Run benchmarks
# ============================================
if [ "${RUN_ALL_BENCHMARKS}" = "true" ]; then
    if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
        echo "================================================"
        echo "🔄 Running ALL benchmarks in PARALLEL"
        echo "================================================"
    else
        echo "================================================"
        echo "🔄 Running ALL benchmarks SEQUENTIALLY"
        echo "================================================"
    fi
    echo ""
    
    # Define benchmark dataset names in specified order (without .jsonl extension)
    # Modify this array to change the execution order
    BENCHMARK_NAMES=(
        "NL4OPT"
        "EasyLP"
        "ComplexLP"
        "NLP4LP"
        "ComplexOR"
        "IndustryOR"
        "ReSocratic"
        "OPT-Principled"
    )
    
    # Count total benchmarks
    TOTAL_BENCHMARKS=${#BENCHMARK_NAMES[@]}
    FAILED=0
    SKIPPED=0
    
    echo "Total benchmarks to process: ${TOTAL_BENCHMARKS}"
    echo ""
    echo "Execution order:"
    for i in "${!BENCHMARK_NAMES[@]}"; do
        echo "  $((i+1)). ${BENCHMARK_NAMES[$i]}"
    done
    echo ""
    
    # Initialize batch results file
    echo "Dataset|Accuracy|Correct|Total|Output" > "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
    
    # Create lock file for parallel execution
    RESULTS_LOCK="${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.lock"
    touch "${RESULTS_LOCK}"
    
    # Process benchmarks (parallel or sequential)
    if [ "${PARALLEL_BENCHMARKS}" = "true" ]; then
        # Parallel execution
        declare -a PIDS=()
        declare -a DATASET_NAMES=()
        CURRENT_JOBS=0
        
        for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
            BENCHMARK_FILE="${BENCHMARKS_DIR}/${DATASET_NAME}.jsonl"
            
            # Check if file exists
            if [ ! -f "${BENCHMARK_FILE}" ]; then
                echo "⚠️  File not found: ${BENCHMARK_FILE}"
                echo "   Skipping ${DATASET_NAME}..."
                SKIPPED=$((SKIPPED + 1))
                continue
            fi
            
            # Wait for available slot if max jobs reached
            while true; do
                # Count running jobs
                CURRENT_JOBS=0
                for PID in "${PIDS[@]}"; do
                    if kill -0 ${PID} 2>/dev/null; then
                        CURRENT_JOBS=$((CURRENT_JOBS + 1))
                    fi
                done
                
                # Break if we have available slots
                if [ ${CURRENT_JOBS} -lt ${MAX_PARALLEL_JOBS} ]; then
                    break
                fi
                
                # Wait a bit before checking again
                sleep 1
            done
            
            # Start job in background
            LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
            echo "🚀 Starting ${DATASET_NAME} (log: ${LOG_FILE})"
            
            (
                run_single_dataset_internal "${DATASET_NAME}" "${LOG_FILE}"
                EXIT_CODE=$?
                if [ ${EXIT_CODE} -ne 0 ]; then
                    echo "[${DATASET_NAME}] ❌ Failed with exit code ${EXIT_CODE}" >> "${OUTPUT_DIR}/failures_${MAIN_TIMESTAMP}.txt"
                else
                    echo "[${DATASET_NAME}] ✅ Completed successfully" >> "${OUTPUT_DIR}/success_${MAIN_TIMESTAMP}.txt"
                fi
            ) &
            
            PID=$!
            PIDS+=(${PID})
            DATASET_NAMES+=("${DATASET_NAME}")
        done
        
        # Wait for all jobs to complete
        echo ""
        echo "⏳ Waiting for all jobs to complete..."
        echo ""
        
        for i in "${!PIDS[@]}"; do
            PID=${PIDS[$i]}
            DATASET_NAME=${DATASET_NAMES[$i]}
            wait ${PID}
            EXIT_CODE=$?
            if [ ${EXIT_CODE} -ne 0 ]; then
                FAILED=$((FAILED + 1))
                echo "⚠️  ${DATASET_NAME} failed with exit code ${EXIT_CODE}"
            fi
        done
        
        # Clean up lock file
        rm -f "${RESULTS_LOCK}"
        
        echo ""
        echo "================================================"
        echo "📋 Individual Job Logs:"
        echo "================================================"
        for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
            LOG_FILE="${OUTPUT_DIR}/${DATASET_NAME}_${MAIN_TIMESTAMP}.log"
            if [ -f "${LOG_FILE}" ]; then
                echo ""
                echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
                echo "  ${DATASET_NAME} - Log File: ${LOG_FILE}"
                echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
                tail -20 "${LOG_FILE}"
            fi
        done
        echo ""
        
    else
        # Sequential execution
        CURRENT=0
        for DATASET_NAME in "${BENCHMARK_NAMES[@]}"; do
            CURRENT=$((CURRENT + 1))
            BENCHMARK_FILE="${BENCHMARKS_DIR}/${DATASET_NAME}.jsonl"
            
            echo ""
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
            echo "  Progress: ${CURRENT}/${TOTAL_BENCHMARKS}"
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
            
            # Check if file exists
            if [ ! -f "${BENCHMARK_FILE}" ]; then
                echo "⚠️  File not found: ${BENCHMARK_FILE}"
                echo "   Skipping..."
                SKIPPED=$((SKIPPED + 1))
                continue
            fi
            
            run_single_dataset "${DATASET_NAME}" true
            
            if [ $? -ne 0 ]; then
                FAILED=$((FAILED + 1))
                echo "⚠️  Failed to process ${DATASET_NAME}, continuing..."
            fi
            
            echo ""
        done
        
        # Clean up lock file
        rm -f "${RESULTS_LOCK}"
    fi
    
    # ============================================
    # Final Summary for All Benchmarks
    # ============================================
    echo ""
    echo "================================================"
    echo "🎉 All Benchmarks Complete!"
    echo "================================================"
    echo ""
    echo "Summary:"
    echo "  Total benchmarks: ${TOTAL_BENCHMARKS}"
    echo "  Successful:       $((TOTAL_BENCHMARKS - FAILED - SKIPPED))"
    echo "  Failed:           ${FAILED}"
    echo "  Skipped:          ${SKIPPED}"
    echo ""
    echo "📊 Detailed Results:"
    echo "================================================"
    
    # Display formatted results table
    if [ -f "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt" ]; then
        echo ""
        printf "%-35s | %-10s | %-10s | %-10s\n" "Dataset" "Accuracy" "Correct" "Total"
        echo "--------------------------------------------------------------------------------"
        tail -n +2 "${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt" | while IFS='|' read -r dataset accuracy correct total output; do
            printf "%-35s | %-10s | %-10s | %-10s\n" "${dataset}" "${accuracy}" "${correct}" "${total}"
        done
        echo ""
        echo "📁 Full results saved to: ${OUTPUT_DIR}/batch_results_${MAIN_TIMESTAMP}.txt"
    fi
    
    echo ""
    echo "================================================"
    
else
    # Run single dataset mode
    echo "================================================"
    echo "📝 Running single dataset: ${DEFAULT_DATASET}"
    echo "================================================"
    echo ""

    BENCHMARK_FILE="${BENCHMARKS_DIR}/${DEFAULT_DATASET}.jsonl"
    if [ ! -f "${BENCHMARK_FILE}" ]; then
        echo "❌ Dataset file not found: ${BENCHMARK_FILE}"
        exit 1
    fi
    
    run_single_dataset "${DEFAULT_DATASET}" true
    
    if [ $? -ne 0 ]; then
        echo ""
        echo "❌ Pipeline failed"
        exit 1
    fi
    
    echo ""
    echo "🎉 Pipeline Complete!"
fi

echo ""
echo "✨ All done! Check the results above."
echo ""