luanns committed on
Commit
08be4b2
·
verified ·
1 Parent(s): 0ec5e1f

Upload scripts/evaluate.sh

Browse files
Files changed (1) hide show
  1. scripts/evaluate.sh +65 -0
scripts/evaluate.sh ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Evaluate GUI-Shift model on GUI benchmarks
#
# Usage:
#   bash scripts/evaluate.sh --model_path PATH --dataset PATH \
#     [--benchmark NAME] [--output FILE] [--device DEVICE]
#
# Options:
#   --model_path  (required) forwarded to the evaluator as --model_path
#   --dataset     (required) forwarded to the evaluator as --dataset
#   --benchmark   forwarded as --benchmark (default: androidcontrol_low)
#   --output      forwarded as --output    (default: evaluation_results.json)
#   --device      forwarded as --device    (default: cuda)
#   -h, --help    print the usage line and exit 0
#
# The evaluator is invoked as `python src/evaluation/eval_gui.py`, i.e. via a
# path relative to the current working directory, so run this script from the
# directory that contains src/.
#
# Exit status: 1 on a usage error; otherwise the script stops with a non-zero
# status as soon as the evaluator fails (set -e), and 0 on success.

set -euo pipefail

# Default values
MODEL_PATH=""
DATASET=""
BENCHMARK="androidcontrol_low"
OUTPUT="evaluation_results.json"
DEVICE="cuda"

# Print the one-line usage example to stdout (callers redirect as needed).
usage() {
  echo "Usage: bash scripts/evaluate.sh --model_path ./checkpoints/gui-shift --dataset ./data/eval/androidcontrol_test.jsonl --benchmark androidcontrol_low"
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model_path | --dataset | --benchmark | --output | --device)
      # Without this guard a trailing flag with no value makes `shift 2`
      # fail, and `set -e` then aborts the script without any message.
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        usage >&2
        exit 1
      fi
      case "$1" in
        --model_path) MODEL_PATH="$2" ;;
        --dataset) DATASET="$2" ;;
        --benchmark) BENCHMARK="$2" ;;
        --output) OUTPUT="$2" ;;
        --device) DEVICE="$2" ;;
      esac
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Both the model and the dataset must be supplied; there is no usable default.
if [[ -z "$MODEL_PATH" || -z "$DATASET" ]]; then
  echo "Error: --model_path and --dataset are required" >&2
  usage >&2
  exit 1
fi

echo "=== GUI-Shift Evaluation ==="
echo "Model: $MODEL_PATH"
echo "Dataset: $DATASET"
echo "Benchmark: $BENCHMARK"
echo "Output: $OUTPUT"
echo "Device: $DEVICE"
echo ""

# If the evaluator exits non-zero, `set -e` stops the script here, so the
# completion message below is only printed after a successful run.
python src/evaluation/eval_gui.py \
  --model_path "$MODEL_PATH" \
  --dataset "$DATASET" \
  --benchmark "$BENCHMARK" \
  --output "$OUTPUT" \
  --device "$DEVICE"

echo ""
echo "Evaluation complete! Results saved to: $OUTPUT"