gui-shift / scripts /evaluate.sh
luanns's picture
Upload scripts/evaluate.sh
08be4b2 verified
#!/bin/bash
# Evaluate GUI-Shift model on GUI benchmarks
#
# Usage:
#   bash scripts/evaluate.sh --model_path <path> --dataset <path> \
#     [--benchmark <name>] [--output <file>] [--device <device>]
#
# --model_path and --dataset are required; the other options fall back to
# the default values assigned below.

# Strict mode:
#   -e          stop at the first command that fails
#   -u          treat expansion of an unset variable (e.g. a missing option
#               value) as an error instead of silently using ""
#   -o pipefail a pipeline fails if any of its stages fails
set -euo pipefail

# Default values
MODEL_PATH=""                     # required, set via --model_path
DATASET=""                        # required, set via --dataset
BENCHMARK="androidcontrol_low"    # overridden by --benchmark
OUTPUT="evaluation_results.json"  # overridden by --output
DEVICE="cuda"                     # overridden by --device
# Parse arguments
#######################################
# Parse command-line options into the configuration variables.
# Every recognised option takes exactly one value.
# Globals:   MODEL_PATH, DATASET, BENCHMARK, OUTPUT, DEVICE (written)
# Arguments: the command-line arguments of the script
# Outputs:   an error message on stderr for a bad command line
# Returns:   0 on success; exits the shell with status 1 on an unknown
#            option or an option that is missing its value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    # First pass: validate the option name and make sure a value follows.
    case "$1" in
      --model_path | --dataset | --benchmark | --output | --device)
        # A trailing option with no value used to make `shift 2` fail,
        # which ended the script without any explanation.
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value" >&2
          exit 1
        fi
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac

    # Second pass: store the value in the matching variable.
    case "$1" in
      --model_path) MODEL_PATH="$2" ;;
      --dataset) DATASET="$2" ;;
      --benchmark) BENCHMARK="$2" ;;
      --output) OUTPUT="$2" ;;
      --device) DEVICE="$2" ;;
    esac
    shift 2
  done
}

parse_args "$@"
# --model_path and --dataset have empty defaults, so the evaluation cannot
# start unless both were supplied on the command line.
if [[ -z "$MODEL_PATH" || -z "$DATASET" ]]; then
  # Diagnostics go to stderr so they never mix with captured stdout.
  {
    echo "Error: --model_path and --dataset are required"
    echo "Usage: bash scripts/evaluate.sh --model_path ./checkpoints/gui-shift --dataset ./data/eval/androidcontrol_test.jsonl --benchmark androidcontrol_low"
  } >&2
  exit 1
fi
# Show the effective configuration before the run starts.
printf '%s\n' \
  "=== GUI-Shift Evaluation ===" \
  "Model: $MODEL_PATH" \
  "Dataset: $DATASET" \
  "Benchmark: $BENCHMARK" \
  "Output: $OUTPUT" \
  "Device: $DEVICE" \
  ""

# Collect the evaluator's options in an array so that every value is passed
# as exactly one argument, whatever characters it contains.
eval_args=(
  --model_path "$MODEL_PATH"
  --dataset "$DATASET"
  --benchmark "$BENCHMARK"
  --output "$OUTPUT"
  --device "$DEVICE"
)

# The script path is relative to the current working directory.
python src/evaluation/eval_gui.py "${eval_args[@]}"

# Blank separator line followed by the completion notice.
printf '\n%s\n' "Evaluation complete! Results saved to: $OUTPUT"