#!/usr/bin/env bash
# Evaluate GUI-Shift model on GUI benchmarks
#
# Usage:
#   bash scripts/evaluate.sh --model_path <path> --dataset <path> \
#     [--benchmark <name>] [--output <file>] [--device <device>]
#
# --model_path and --dataset are required; the other options fall back to
# the defaults assigned below.

# Strict mode: stop on the first failing command (-e), treat expansion of an
# unset variable as an error (-u), and let a pipeline fail when any stage
# fails (pipefail).
set -euo pipefail

# Default values
MODEL_PATH=""                    # required, supplied via --model_path
DATASET=""                       # required, supplied via --dataset
BENCHMARK="androidcontrol_low"   # overridden by --benchmark
OUTPUT="evaluation_results.json" # overridden by --output
DEVICE="cuda"                    # overridden by --device
# Parse arguments
#######################################
# Parse command-line options into the configuration variables.
# Globals:   MODEL_PATH, DATASET, BENCHMARK, OUTPUT, DEVICE (written)
# Arguments: the command-line arguments to parse
# Outputs:   an error message on stderr for invalid input
# Returns:   0 on success; exits the shell with status 1 on an unknown
#            option or an option given without a value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model_path | --dataset | --benchmark | --output | --device)
        # Every option takes exactly one value. Check up front: a bare
        # `shift 2` with only one argument left fails, and under `set -e`
        # the script would then stop without telling the user why.
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value" >&2
          exit 1
        fi
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac

    # $1 is a known option and $2 is guaranteed to exist at this point.
    case "$1" in
      --model_path) MODEL_PATH="$2" ;;
      --dataset) DATASET="$2" ;;
      --benchmark) BENCHMARK="$2" ;;
      --output) OUTPUT="$2" ;;
      --device) DEVICE="$2" ;;
    esac
    shift 2
  done
}

parse_args "$@"
# --model_path and --dataset have no usable default (both start out empty),
# so refuse to continue when either one is still empty.
if [[ -z "$MODEL_PATH" || -z "$DATASET" ]]; then
  # Diagnostics go to stderr so they never mix with output a caller captures.
  echo "Error: --model_path and --dataset are required" >&2
  echo "Usage: bash scripts/evaluate.sh --model_path ./checkpoints/gui-shift --dataset ./data/eval/androidcontrol_test.jsonl --benchmark androidcontrol_low" >&2
  exit 1
fi
# Show the effective configuration before starting; the trailing empty
# argument produces the blank separator line.
printf '%s\n' \
  "=== GUI-Shift Evaluation ===" \
  "Model: $MODEL_PATH" \
  "Dataset: $DATASET" \
  "Benchmark: $BENCHMARK" \
  "Output: $OUTPUT" \
  "Device: $DEVICE" \
  ""

# Assemble the evaluator command as an array so each value stays a single
# argument, then run it. The script path is relative to the current directory.
eval_cmd=(
  python src/evaluation/eval_gui.py
  --model_path "$MODEL_PATH"
  --dataset "$DATASET"
  --benchmark "$BENCHMARK"
  --output "$OUTPUT"
  --device "$DEVICE"
)
"${eval_cmd[@]}"

# Blank line, then the completion notice.
printf '\n%s\n' "Evaluation complete! Results saved to: $OUTPUT"