#!/bin/bash
#
# Evaluate GUI-Shift model on GUI benchmarks.
#
# Thin launcher: parses command-line options, prints the resolved
# configuration, and forwards every option to src/evaluation/eval_gui.py.
#
# Usage:
#   bash scripts/evaluate.sh --model_path PATH --dataset PATH \
#     [--benchmark NAME] [--output FILE] [--device DEVICE]
#
# Options (each is forwarded to eval_gui.py under the same flag name):
#   --model_path  required
#   --dataset     required
#   --benchmark   default: androidcontrol_low
#   --output      default: evaluation_results.json
#   --device      default: cuda
#   -h, --help    print usage and exit 0
#
# Exit status:
#   0  the python command succeeded
#   1  unknown option, option without a value, or missing required option
#   otherwise the non-zero status of the python command (via `set -e`)

set -euo pipefail

# Default values
MODEL_PATH=""
DATASET=""
BENCHMARK="androidcontrol_low"
OUTPUT="evaluation_results.json"
DEVICE="cuda"

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the one-line usage example to stdout.
usage() {
  echo "Usage: bash scripts/evaluate.sh --model_path ./checkpoints/gui-shift --dataset ./data/eval/androidcontrol_test.jsonl --benchmark androidcontrol_low"
}

# Abort with a clear message when an option is the last argument and thus
# has no value. Without this check `shift 2` fails and `set -e` terminates
# the script without printing anything.
#   $1 - option name
#   $2 - number of arguments remaining (including the option itself)
require_value() {
  if (( $2 < 2 )); then
    die "Error: option $1 requires a value"
  fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model_path)
      require_value "$1" "$#"
      MODEL_PATH="$2"
      shift 2
      ;;
    --dataset)
      require_value "$1" "$#"
      DATASET="$2"
      shift 2
      ;;
    --benchmark)
      require_value "$1" "$#"
      BENCHMARK="$2"
      shift 2
      ;;
    --output)
      require_value "$1" "$#"
      OUTPUT="$2"
      shift 2
      ;;
    --device)
      require_value "$1" "$#"
      DEVICE="$2"
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      die "Unknown option: $1"
      ;;
  esac
done

# Both required options must be present and non-empty.
if [[ -z "$MODEL_PATH" || -z "$DATASET" ]]; then
  {
    echo "Error: --model_path and --dataset are required"
    usage
  } >&2
  exit 1
fi

# Echo the resolved configuration so it is captured in run logs.
echo "=== GUI-Shift Evaluation ==="
printf '%s\n' "Model: $MODEL_PATH"
printf '%s\n' "Dataset: $DATASET"
printf '%s\n' "Benchmark: $BENCHMARK"
printf '%s\n' "Output: $OUTPUT"
printf '%s\n' "Device: $DEVICE"
echo ""

# The script path is relative to the current working directory, so this
# must be run from the directory that contains src/ (the usage example
# suggests the repository root - confirm against the repo layout).
python src/evaluation/eval_gui.py \
  --model_path "$MODEL_PATH" \
  --dataset "$DATASET" \
  --benchmark "$BENCHMARK" \
  --output "$OUTPUT" \
  --device "$DEVICE"

# Reached only if the python command succeeded (set -e). Writing the
# results file is presumably done by eval_gui.py; this script only
# reports the path it passed along.
echo ""
printf '%s\n' "Evaluation complete! Results saved to: $OUTPUT"