File size: 1,436 Bytes
08be4b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/bin/bash
# Evaluate GUI-Shift model on GUI benchmarks
#
# Usage:
#   bash scripts/evaluate.sh --model_path PATH --dataset PATH \
#       [--benchmark NAME] [--output FILE] [--device DEVICE]

# Strict mode:
#   -e           abort on the first unhandled command failure
#   -u           treat expansion of an unset variable as an error, so a
#                misspelled name cannot silently become an empty string
#   -o pipefail  a pipeline fails when any of its stages fails
set -euo pipefail

# Default values
MODEL_PATH=""                     # required, set with --model_path
DATASET=""                        # required, set with --dataset
BENCHMARK="androidcontrol_low"    # overridden with --benchmark
OUTPUT="evaluation_results.json"  # overridden with --output
DEVICE="cuda"                     # overridden with --device

# Parse arguments
#
# parse_args: copy the values of the recognised flags into the globals
# MODEL_PATH, DATASET, BENCHMARK, OUTPUT and DEVICE.
# Arguments: the script's argument list ("$@").
# Outputs:   diagnostics on stderr.
# Returns:   0 on success; 1 for an unknown option or for a flag that is
#            not followed by a value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --model_path|--dataset|--benchmark|--output|--device)
                # Every flag takes exactly one value. Without this check a
                # trailing flag makes `shift 2` fail, and under `set -e` the
                # script would stop without printing any explanation.
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a value" >&2
                    return 1
                fi
                ;;
            *)
                echo "Unknown option: $1" >&2
                return 1
                ;;
        esac

        # $1 is now known to be a valid flag and $2 is known to exist.
        case "$1" in
            --model_path) MODEL_PATH="$2" ;;
            --dataset)    DATASET="$2" ;;
            --benchmark)  BENCHMARK="$2" ;;
            --output)     OUTPUT="$2" ;;
            --device)     DEVICE="$2" ;;
        esac
        shift 2
    done
}

parse_args "$@" || exit 1

# --model_path and --dataset have empty defaults, so an empty value here
# means the caller did not supply them (or supplied an empty string).
# The error and usage hint go to stderr to keep them out of captured stdout.
if [[ -z "$MODEL_PATH" || -z "$DATASET" ]]; then
    {
        echo "Error: --model_path and --dataset are required"
        echo "Usage: bash scripts/evaluate.sh --model_path ./checkpoints/gui-shift --dataset ./data/eval/androidcontrol_test.jsonl --benchmark androidcontrol_low"
    } >&2
    exit 1
fi

# Print the run configuration as a banner, followed by one blank line.
printf '%s\n' \
    "=== GUI-Shift Evaluation ===" \
    "Model: $MODEL_PATH" \
    "Dataset: $DATASET" \
    "Benchmark: $BENCHMARK" \
    "Output: $OUTPUT" \
    "Device: $DEVICE" \
    ""

# Collect the evaluator's flags in an array so each value is passed as a
# single argument, then hand them to the Python entry point.
eval_args=(
    --model_path "$MODEL_PATH"
    --dataset "$DATASET"
    --benchmark "$BENCHMARK"
    --output "$OUTPUT"
    --device "$DEVICE"
)
python src/evaluation/eval_gui.py "${eval_args[@]}"

# Blank separator line, then the completion notice.
printf '\n%s\n' "Evaluation complete! Results saved to: $OUTPUT"