XinBB committed
Commit cd06b59 · verified · 1 Parent(s): 22f6d8c

Delete eval with huggingface_hub

eval/logs/rec_results_android_studio_macos_8k_SFT_0.json DELETED
@@ -1,1605 +0,0 @@
- {
-   "accuracy": 15.0,
-   "results": [
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-52-28.png", "question": "go to previous page in virtual Android machine in Android studio", "resized_size": [1400, 2492], "ground_truth": [0.5294270833333333, 0.4152777777777778, 0.5348958333333333, 0.425], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [548, 503]}}\n</tool_call>", "extracted_answer": [0.21990369181380418, 0.35928571428571426], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-03-23.png", "question": "reverse the sorting by device name in AVD in android studio", "resized_size": [1400, 2492], "ground_truth": [0.12604166666666666, 0.12129629629629629, 0.1296875, 0.12685185185185185], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [337, 156]}}\n</tool_call>", "extracted_answer": [0.13523274478330657, 0.11142857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-30-52.png", "question": "open physical device panel of the device manager in android studio", "resized_size": [1400, 2492], "ground_truth": [0.12291666666666666, 0.09212962962962963, 0.13932291666666666, 0.10324074074074074], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [305, 156]}}\n</tool_call>", "extracted_answer": [0.1223916532905297, 0.11142857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-05_16-07-20.png", "question": "where is the stop button in android studio?", "resized_size": [1400, 2492], "ground_truth": [0.54609375, 0.07083333333333333, 0.5520833333333334, 0.08055555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 86]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.06142857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-59-02.png", "question": "open marscode ai logger in android studio", "resized_size": [1400, 2492], "ground_truth": [0.44244791666666666, 0.8569444444444444, 0.47942708333333334, 0.8680555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [875, 1183]}}\n</tool_call>", "extracted_answer": [0.351123595505618, 0.845], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-04-51.png", "question": "Increase the volume of the Android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5861979166666667, 0.3388888888888889, 0.5911458333333334, 0.34953703703703703], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1586, 473]}}\n</tool_call>", "extracted_answer": [0.6364365971107544, 0.33785714285714286], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-31-44.png", "question": "switch the android virtual machine to pixel xl api 34 in android studio", "resized_size": [1400, 2492], "ground_truth": [0.41041666666666665, 0.29953703703703705, 0.4401041666666667, 0.30972222222222223], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [608, 537]}}\n</tool_call>", "extracted_answer": [0.24398073836276082, 0.38357142857142856], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-19-16.png", "question": "login with Marscode in android studio", "resized_size": [1400, 2492], "ground_truth": [0.4942708333333333, 0.30925925925925923, 0.5182291666666666, 0.3351851851851852], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1273, 450]}}\n</tool_call>", "extracted_answer": [0.5108346709470305, 0.32142857142857145], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-28_14-43-59.png", "question": "turn off bluetooth of virtual Android machine in Android studio ", "resized_size": [1400, 2492], "ground_truth": [0.57421875, 0.3638888888888889, 0.6270833333333333, 0.39444444444444443], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1387, 526]}}\n</tool_call>", "extracted_answer": [0.5565810593900482, 0.3757142857142857], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-10-30.png", "question": "refresh virtual device of device manager in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.13802083333333334, 0.10694444444444444, 0.14635416666666667, 0.11805555555555555], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [327, 156]}}\n</tool_call>", "extracted_answer": [0.1312199036918138, 0.11142857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-50-55.png", "question": "open Recent Apps in virtual Android machine in Android studio", "resized_size": [1400, 2492], "ground_truth": [0.5372395833333333, 0.3347222222222222, 0.5427083333333333, 0.3453703703703704], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [375, 568]}}\n</tool_call>", "extracted_answer": [0.15048154093097912, 0.4057142857142857], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-18-32.png", "question": "record virtual android phone screen in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5106770833333333, 0.4759259259259259, 0.5174479166666667, 0.48564814814814816], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1385, 876]}}\n</tool_call>", "extracted_answer": [0.5557784911717496, 0.6257142857142857], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-05_16-04-38.png", "question": "search information of project in android studio", "resized_size": [1400, 2492], "ground_truth": [0.56328125, 0.0699074074074074, 0.5684895833333333, 0.08287037037037037], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 142]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.10142857142857142], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-28-57.png", "question": "search emoji whthin the virtual android studio in android studio", "resized_size": [1400, 2492], "ground_truth": [0.52265625, 0.5958333333333333, 0.57734375, 0.6111111111111112], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1375, 816]}}\n</tool_call>", "extracted_answer": [0.5517656500802568, 0.5828571428571429], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-39-12.png", "question": "undo the action of the picture within the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.57109375, 0.7337962962962963, 0.5809895833333333, 0.7523148148148148], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1437, 1012]}}\n</tool_call>", "extracted_answer": [0.576645264847512, 0.7228571428571429], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_14-57-01.png", "question": "close bookmark panel", "resized_size": [1400, 2492], "ground_truth": [0.25442708333333336, 0.33287037037037037, 0.2604166666666667, 0.3435185185185185], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [607, 473]}}\n</tool_call>", "extracted_answer": [0.24357945425361155, 0.33785714285714286], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-33-17.png", "question": "open settings of terminal in android studio", "resized_size": [1400, 2492], "ground_truth": [0.14921875, 0.612037037037037, 0.17786458333333333, 0.6236111111111111], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 816]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.5828571428571429], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-12-20.png", "question": "turn off notification dot of google in android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6489583333333333, 0.6916666666666667, 0.6614583333333334, 0.7032407407407407], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1635, 978]}}\n</tool_call>", "extracted_answer": [0.656099518459069, 0.6985714285714286], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-28_16-09-39.png", "question": "turn on dark theme of the virtual android macine", "resized_size": [1400, 2492], "ground_truth": [0.5903645833333333, 0.6708333333333333, 0.60234375, 0.6824074074074075], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1478, 950]}}\n</tool_call>", "extracted_answer": [0.5930979133226324, 0.6785714285714286], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-28_14-56-16.png", "question": "open contacts of the android machine", "resized_size": [1400, 2492], "ground_truth": [0.5348958333333333, 0.7754629629629629, 0.5575520833333333, 0.8050925925925926], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 1086]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.7757142857142857], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-07_14-49-07.png", "question": "add new service in android studio using the icon", "resized_size": [1400, 2492], "ground_truth": [0.14453125, 0.600925925925926, 0.15130208333333334, 0.6143518518518518], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [356, 822]}}\n</tool_call>", "extracted_answer": [0.14285714285714285, 0.5871428571428572], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-31-04.png", "question": "install missing platform and fix project in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5153645833333333, 0.5990740740740741, 0.5791666666666667, 0.6083333333333333], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 816]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.5828571428571429], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-06-40.png", "question": "share the photo in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.58046875, 0.6995370370370371, 0.5890625, 0.7180555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1473, 986]}}\n</tool_call>", "extracted_answer": [0.591091492776886, 0.7042857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-05_16-05-52.png", "question": "run the project in android studio", "resized_size": [1400, 2492], "ground_truth": [0.4984375, 0.07037037037037037, 0.5036458333333333, 0.08194444444444444], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 106]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.07571428571428572], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-51-23.png", "question": "like the video on youtube within the virtual android machine in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.5859375, 0.6314814814814815, 0.6005208333333333, 0.6430555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [2037, 806]}}\n</tool_call>", "extracted_answer": [0.8174157303370787, 0.5757142857142857], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-05_16-05-07.png", "question": "open device manager in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5549479166666667, 0.07083333333333333, 0.56015625, 0.0824074074074074], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 205]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.14642857142857144], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-24-57.png", "question": "change the pen color to green with in the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5018229166666667, 0.7347222222222223, 0.5104166666666666, 0.7472222222222222], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 1040]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.7428571428571429], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-17-15.png", "question": "open the os settings of android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6151041666666667, 0.5416666666666666, 0.625, 0.5606481481481481], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1518, 763]}}\n</tool_call>", "extracted_answer": [0.6091492776886035, 0.545], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_16-11-18.png", "question": "take a screenshot of the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.57109375, 0.3402777777777778, 0.5778645833333333, 0.35185185185185186], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 825]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.5892857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_18-00-07.png", "question": "open notifications panel on the right in android studio", "resized_size": [1400, 2492], "ground_truth": [0.80859375, 0.3, 0.8127604166666667, 0.3509259259259259], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [2378, 406]}}\n</tool_call>", "extracted_answer": [0.9542536115569823, 0.29], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-48-39.png", "question": "subscribe the adidas skateboarding on toutube with in the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6028645833333334, 0.6226851851851852, 0.63046875, 0.6416666666666667], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1536, 879]}}\n</tool_call>", "extracted_answer": [0.6163723916532905, 0.6278571428571429], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-58-25.png", "question": "open version control in android studio", "resized_size": [1400, 2492], "ground_truth": [0.24973958333333332, 0.8560185185185185, 0.2796875, 0.8671296296296296], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [657, 1183]}}\n</tool_call>", "extracted_answer": [0.26364365971107545, 0.845], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_14-47-06.png", "question": "open app inspection in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.29739583333333336, 0.8592592592592593, 0.3307291666666667, 0.8708333333333333], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [807, 1183]}}\n</tool_call>", "extracted_answer": [0.3238362760834671, 0.845], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-05_16-06-34.png", "question": "attach debugger to android process in android studio", "resized_size": [1400, 2492], "ground_truth": [0.53984375, 0.0699074074074074, 0.5447916666666667, 0.08148148148148149], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1387, 131]}}\n</tool_call>", "extracted_answer": [0.5565810593900482, 0.09357142857142857], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-03-59.png", "question": "turn on the flash of the camera in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.69921875, 0.6097222222222223, 0.7078125, 0.6236111111111111], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [657, 438]}}\n</tool_call>", "extracted_answer": [0.26364365971107545, 0.31285714285714283], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-07-21.png", "question": "clear all notification of the android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6408854166666667, 0.6712962962962963, 0.6565104166666667, 0.6847222222222222], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [637, 895]}}\n</tool_call>", "extracted_answer": [0.2556179775280899, 0.6392857142857142], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-06-51.png", "question": "collapse all bookmarks in android studio", "resized_size": [1400, 2492], "ground_truth": [0.23932291666666666, 0.33240740740740743, 0.24583333333333332, 0.3449074074074074], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [553, 472]}}\n</tool_call>", "extracted_answer": [0.22191011235955055, 0.33714285714285713], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-42-10.png", "question": "expand TODO items in android studio", "resized_size": [1400, 2492], "ground_truth": [0.10625, 0.649074074074074, 0.11197916666666667, 0.6606481481481481], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [305, 826]}}\n</tool_call>", "extracted_answer": [0.1223916532905297, 0.59], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_18-02-57.png", "question": "click Profiler to open services window in the bottom of the android studio", "resized_size": [1400, 2492], "ground_truth": [0.28020833333333334, 0.8564814814814815, 0.2981770833333333, 0.8671296296296296], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [678, 1179]}}\n</tool_call>", "extracted_answer": [0.27207062600321025, 0.8421428571428572], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-19-51.png", "question": "stop the bilibili download in android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.4661458333333333, 0.4666666666666667, 0.47630208333333335, 0.4847222222222222], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1538, 667]}}\n</tool_call>", "extracted_answer": [0.6171749598715891, 0.4764285714285714], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-40-13.png", "question": "clear all the modification of the photo with in the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5263020833333333, 0.7342592592592593, 0.5424479166666667, 0.7509259259259259], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1317, 985]}}\n</tool_call>", "extracted_answer": [0.5284911717495987, 0.7035714285714286], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-39-17.png", "question": "zoom in the android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5559895833333334, 0.6717592592592593, 0.5625, 0.6837962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1395, 947]}}\n</tool_call>", "extracted_answer": [0.5597913322632424, 0.6764285714285714], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-43-20.png", "question": "back to home in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.60078125, 0.3800925925925926, 0.6065104166666667, 0.3902777777777778], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [37, 825]}}\n</tool_call>", "extracted_answer": [0.014847512038523275, 0.5892857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-09-50.png", "question": "turn on the falshlight of the android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5861979166666667, 0.47638888888888886, 0.62265625, 0.5], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1537, 668]}}\n</tool_call>", "extracted_answer": [0.6167736757624398, 0.47714285714285715], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-31-55.png", "question": "click the third crying emoji with in the virtual android machine in android machine", "resized_size": [1400, 2492], "ground_truth": [0.553125, 0.5347222222222222, 0.5645833333333333, 0.5550925925925926], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1875, 693]}}\n</tool_call>", "extracted_answer": [0.7524077046548957, 0.495], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_16-15-46.png", "question": "change video to camera in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5247395833333334, 0.5532407407407407, 0.5546875, 0.5791666666666667], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 786]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.5614285714285714], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-10-02.png", "question": "close the detail of the photo in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6291666666666667, 0.6939814814814815, 0.6432291666666666, 0.7092592592592593], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1567, 980]}}\n</tool_call>", "extracted_answer": [0.6288121990369181, 0.7], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-15-42.png", "question": "modify the shadows of the picture witn in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5369791666666667, 0.6925925925925925, 0.55234375, 0.7240740740740741], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1367, 1035]}}\n</tool_call>", "extracted_answer": [0.5485553772070626, 0.7392857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-05-13.png", "question": "turn off the hdr of virtual android camera in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6895833333333333, 0.6106481481481482, 0.6984375, 0.625], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1730, 899]}}\n</tool_call>", "extracted_answer": [0.6942215088282504, 0.6421428571428571], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_14-58-52.png", "question": "take a picture of virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.51796875, 0.7638888888888888, 0.54296875, 0.7962962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1317, 856]}}\n</tool_call>", "extracted_answer": [0.5284911717495987, 0.6114285714285714], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-16-55.png", "question": "modify the highlights of the photo with in the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.46197916666666666, 0.7342592592592593, 0.5502604166666667, 0.7490740740740741], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1315, 982]}}\n</tool_call>", "extracted_answer": [0.5276886035313002, 0.7014285714285714], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_14-48-21.png", "question": "open setting of app inspection in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5557291666666667, 0.5884259259259259, 0.5604166666666667, 0.5981481481481481], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [367, 835]}}\n</tool_call>", "extracted_answer": [0.1472712680577849, 0.5964285714285714], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-02-38.png", "question": "open more items of virtual android machine in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.6875, 0.6402777777777777, 0.70234375, 0.6680555555555555], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 468]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.3342857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-38-30.png", "question": "open options of terminal in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5546875, 0.5884259259259259, 0.5611979166666666, 0.600462962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 806]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.5757142857142857], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-11-07.png", "question": "turn off the notification permission of google in android virtual machine", "resized_size": [1400, 2492], "ground_truth": [0.6505208333333333, 0.5648148148148148, 0.6635416666666667, 0.5828703703703704], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1617, 803]}}\n</tool_call>", "extracted_answer": [0.648876404494382, 0.5735714285714286], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-05_16-09-54.png", "question": "supress warnings in android studio", "resized_size": [1400, 2492], "ground_truth": [0.4434895833333333, 0.2722222222222222, 0.5223958333333333, 0.2810185185185185], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1308, 386]}}\n</tool_call>", "extracted_answer": [0.5248796147672552, 0.2757142857142857], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-26-41.png", "question": "open settings of the keyboard within the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.76015625, 0.5703703703703704, 0.7763020833333333, 0.5875], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1836, 795]}}\n</tool_call>", "extracted_answer": [0.7367576243980738, 0.5678571428571428], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_14-55-23.png", "question": "open recent call of Android machine in Android studio", "resized_size": [1400, 2492], "ground_truth": [0.578125, 0.6560185185185186, 0.59765625, 0.6875], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1463, 935]}}\n</tool_call>", "extracted_answer": [0.5870786516853933, 0.6678571428571428], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-23-32.png", "question": "shutdown the virtual android phone in android studio", "resized_size": [1400, 2492], "ground_truth": [0.45026041666666666, 0.47638888888888886, 0.4557291666666667, 0.487962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1387, 925]}}\n</tool_call>", "extracted_answer": [0.5565810593900482, 0.6607142857142857], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-55-00.png", "question": "make current file read-only in android studio", "resized_size": [1400, 2492], "ground_truth": [0.7734375, 0.8680555555555556, 0.7796875, 0.8759259259259259], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1753, 748]}}\n</tool_call>", "extracted_answer": [0.7034510433386838, 0.5342857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_18-01-01.png", "question": "open app quality insights window in the bootom of the android studio", "resized_size": [1400, 2492], "ground_truth": [0.32057291666666665, 0.8555555555555555, 0.3572916666666667, 0.8671296296296296], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [875, 1180]}}\n</tool_call>", "extracted_answer": [0.351123595505618, 0.8428571428571429], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-22-21.png", "question": "rename bookmark name in android studio", "resized_size": [1400, 2492], "ground_truth": [0.2234375, 0.3314814814814815, 0.23046875, 0.3453703703703704], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [553, 472]}}\n</tool_call>", "extracted_answer": [0.22191011235955055, 0.33714285714285713], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-26-41.png", "question": "log in to marscode AI in android studio", "resized_size": [1400, 2492], "ground_truth": [0.51640625, 0.5166666666666667, 0.5276041666666667, 0.5254629629629629], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 695]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.49642857142857144], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-25-07.png", "question": "clear all notifications in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5440104166666667, 0.2791666666666667, 0.56015625, 0.2916666666666667], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 360]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.2571428571428571], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-05_17-58-41.png", "question": "edit selected AVD in android studio", "resized_size": [1400, 2492], "ground_truth": [0.25677083333333334, 0.13935185185185187, 0.26171875, 0.15231481481481482], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [657, 203]}}\n</tool_call>", "extracted_answer": [0.26364365971107545, 0.145], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-14-25.png", "question": "crop the photo witnin the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.525, 0.7384259259259259, 0.5377604166666666, 0.75], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1367, 1041]}}\n</tool_call>", "extracted_answer": [0.5485553772070626, 0.7435714285714285], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-01-46.png", "question": "open settings within the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6526041666666667, 0.6694444444444444, 0.6622395833333333, 0.6861111111111111], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [637, 805]}}\n</tool_call>", "extracted_answer": [0.2556179775280899, 0.575], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_14-45-58.png", "question": "turn off internet of Android machine in Android studio", "resized_size": [1400, 2492], "ground_truth": [0.5200520833333333, 0.3638888888888889, 0.5723958333333333, 0.3958333333333333], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 528]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.37714285714285717], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-05_16-01-19.png", "question": "filter resources in android studio", "resized_size": [1400, 2492], "ground_truth": [0.2234375, 0.09814814814814815, 0.228125, 0.1087962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [473, 168]}}\n</tool_call>", "extracted_answer": [0.18980738362760835, 0.12], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-47-45.png", "question": "filter TODO item in android studio", "resized_size": [1400, 2492], "ground_truth": [0.10651041666666666, 0.625, 0.11223958333333334, 0.6361111111111111], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 816]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.5828571428571429], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_11-33-15.png", "question": "sign in google message in virtual android machine in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.54609375, 0.7574074074074074, 0.58203125, 0.774537037037037], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 1056]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.7542857142857143], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_18-03-58.png", "question": "open terminal windows in the bottom of the android studio", "resized_size": [1400, 2492], "ground_truth": [0.39817708333333335, 0.8560185185185185, 0.4171875, 0.8675925925925926], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1035, 1181]}}\n</tool_call>", "extracted_answer": [0.4153290529695024, 0.8435714285714285], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-05_16-08-03.png", "question": "sign in to android studio", "resized_size": [1400, 2492], "ground_truth": [0.5770833333333333, 0.07222222222222222, 0.5822916666666667, 0.08055555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 1125]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.8035714285714286], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_16-03-53.png", "question": "like the QwQ-32b-preview on huggingface within the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5341145833333333, 0.6101851851851852, 0.54765625, 0.6212962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1358, 867]}}\n</tool_call>", "extracted_answer": [0.5449438202247191, 0.6192857142857143], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-05_18-03-37.png", "question": "open the device in the device explorer in android studio", "resized_size": [1400, 2492], "ground_truth": [0.2484375, 0.1398148148148148, 0.2552083333333333, 0.1537037037037037], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [658, 230]}}\n</tool_call>", "extracted_answer": [0.2640449438202247, 0.16428571428571428], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-13-10.png", "question": "Modify the filters of a photo to warm within the virtual Android machine in Android Studio.", "resized_size": [1400, 2492], "ground_truth": [0.59453125, 0.638425925925926, 0.61796875, 0.6694444444444444], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1537, 916]}}\n</tool_call>", "extracted_answer": [0.6167736757624398, 0.6542857142857142], "correct": 1},
-     {"image": "android_studio_mac/screenshot_2024-11-28_17-05-33.png", "question": "decrease the volume of the android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5924479166666666, 0.3375, 0.5979166666666667, 0.3490740740740741], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1650, 1037]}}\n</tool_call>", "extracted_answer": [0.6621187800963082, 0.7407142857142858], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-28_15-01-29.png", "question": "open the picture taked in virtual android machine in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.5697916666666667, 0.6884259259259259, 0.5864583333333333, 0.7162037037037037], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 648]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.46285714285714286], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-42-48.png", "question": "collapse all TODO list in android studio", "resized_size": [1400, 2492], "ground_truth": [0.10572916666666667, 0.6606481481481481, 0.11197916666666667, 0.6722222222222223], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 816]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.5828571428571429], "correct": 0},
-     {"image": "android_studio_mac/screenshot_2024-11-07_15-31-49.png", "question": "add new terminal in android studio", "resized_size": [1400, 2492], "ground_truth": [0.14270833333333333, 0.5875, 0.14817708333333332, 0.5990740740740741], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [356, 807]}}\n</tool_call>", "extracted_answer": [0.14285714285714285, 0.5764285714285714], "correct": 0}
-   ]
- }
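
For readers scanning this log: "resized_size" is the post-resize [height, width] of the screenshot, "ground_truth" is a normalized [x1, y1, x2, y2] target box, and "extracted_answer" is the predicted click point normalized by width and height (for the first entry, 548 / 2492 ≈ 0.2199 and 503 / 1400 ≈ 0.3593). A minimal sketch of how "correct" and the top-level "accuracy" follow from these fields, mirroring the point_in_box rule in the deleted test script below (treating that as the exact scoring rule used here is an assumption):

def score_entry(entry):
    # Both the click point and the target box are in normalized coordinates.
    x, y = entry["extracted_answer"]
    x1, y1, x2, y2 = entry["ground_truth"]
    # Same half-open containment test as point_in_box in test_grounding_r1.py.
    return 1 if (x1 <= x < x2 and y1 <= y < y2) else 0

def accuracy(results):
    # Percentage of entries whose click lands inside the ground-truth box.
    return 100.0 * sum(score_entry(r) for r in results) / len(results)

Here 12 of the 80 entries are hits, giving "accuracy": 15.0.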
eval/test_grounding_r1.py DELETED
@@ -1,318 +0,0 @@
1
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
- from qwen_vl_utils import process_vision_info
3
- import torch
4
- import json
5
- from tqdm import tqdm
6
- import re
7
- import os
8
- from pprint import pprint
9
- import random
10
- from PIL import Image
11
- from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
12
- import torch.distributed as dist
13
- from torch.nn.parallel import DistributedDataParallel as DDP
14
- import argparse
15
-
16
- import warnings
17
-
18
- warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
-
20
- def setup_distributed():
21
- local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
- torch.cuda.set_device(local_rank)
23
-
24
- dist.init_process_group(backend="nccl")
25
-
26
- world_size = dist.get_world_size()
27
- rank = dist.get_rank()
28
-
29
- return local_rank, world_size, rank
30
-
31
- local_rank, world_size, rank = setup_distributed()
32
- device = f"cuda:{local_rank}"
33
- print(f"Process {rank} using {device}")
34
-
35
- steps = 4100
36
- if rank == 0:
37
- print("Steps: ", steps)
38
-
39
- RUN_NAME="Qwen2.5-VL-7B-GRPO-GUI-Grounding_showui_desktop_no_position_high_quality_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_think_4e"
40
- MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
41
- OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
42
-
43
- BSZ=32
44
- DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/annotations"
45
-
46
- TEST_DATASETS = ['android_studio_macos', 'autocad_windows', 'blender_windows','davinci_macos','eviews_windows','excel_macos','fruitloops_windows','illustrator_windows','inventor_windows','linux_common_linux','macos_common_macos','matlab_macos','origin_windows','photoshop_windows','powerpoint_windows','premiere_windows','pycharm_macos','quartus_windows','solidworks_windows','stata_windows','unreal_engine_windows','vivado_windows','vmware_macos','vscode_macos','windows_common_windows','word_macos']
47
- IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/images"
48
-
49
-
50
- # TEST_DATASETS = ['lisa_test']
51
- # IMAGE_ROOT = "/data10/shz/dataset/lisa"
52
-
53
-
54
- #We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
55
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
56
- MODEL_PATH,
57
- torch_dtype=torch.bfloat16,
58
- attn_implementation="flash_attention_2",
59
- device_map={"": local_rank},
60
- )
61
-
62
- # default processer
63
- processor = AutoProcessor.from_pretrained(MODEL_PATH)
64
- print(processor.image_processor.min_pixels)
65
- print(processor.image_processor.max_pixels)
66
- # def extract_point_answer(content):
67
- # # Try to find the bbox within <answer> tags, if can not find, return [0, 0, 0, 0]
68
- # answer_tag_pattern = r'<answer>(.*?)</answer>'
69
- # content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
70
- # if content_answer_match:
71
- # content_answer = content_answer_match.group(1).strip()
72
- # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
73
- # if tool_call_match:
74
- # tool_call_content = tool_call_match.group(1).strip()
75
- # # 解析 JSON
76
- # tool_call_json = json.loads(tool_call_content)
77
- # arguments = tool_call_json.get("arguments", {})
78
- # coordinate = arguments.get("coordinate", None)
79
- # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
80
- # x, y = coordinate
81
- # extracted_coordinate = [x, y]
82
- # return extracted_coordinate
83
- # return [0, 0]
84
-
85
-
86
- def extract_point_answer(content):
87
- # Look for content inside <answer> tags; return [0, 0] if nothing can be extracted
88
- answer_tag_pattern = r'<answer>(.*?)</answer>' # fixed the error in this regex
89
- content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
90
- if content_answer_match:
91
- content_answer = content_answer_match.group(1).strip()
92
- tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
93
- if tool_call_match:
94
- tool_call_content = tool_call_match.group(1).strip()
95
- # First, try to parse tool_call_content as JSON
96
- try:
97
- tool_call_json = json.loads(tool_call_content)
98
- arguments = tool_call_json.get("arguments", {})
99
- coordinate = arguments.get("coordinate", None)
100
- if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
101
- try:
102
- x = float(coordinate[0])
103
- y = float(coordinate[1])
104
- return [x, y]
105
- except (ValueError, TypeError):
106
- pass # if the conversion fails, fall back to regex extraction
107
- except json.JSONDecodeError:
108
- pass # if JSON parsing fails, fall back to regex extraction
109
- # Fall back to extracting the last two numbers with a regex
110
- numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
111
- if len(numbers) >= 2:
112
- x = float(numbers[-2])
113
- y = float(numbers[-1])
114
- return [x, y]
115
- return [0, 0]
116
-
117
- def point_in_box(point, box):
118
- x,y = point
119
- if box[0] <= x < box[2] and box[1] <= y < box[3]:
120
- return 1
121
- else:
122
- return 0
123
-
124
- num_samples = 2000
125
- num_all_sample = 0
126
- num_correct_sample = 0
127
- for ds in TEST_DATASETS:
128
- if rank == 0:
129
- print(f"Processing {ds}...")
130
- ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
131
- data = json.load(open(ds_path, "r"))
132
- random.seed(42)
133
- random.shuffle(data)
134
- data = data[:num_samples]
135
-
136
- # Split data for distributed evaluation
137
- per_rank_data = len(data) // world_size
138
- start_idx = rank * per_rank_data
139
- end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
140
- rank_data = data[start_idx:end_idx]
141
-
142
- messages = []
143
-
144
- for x in rank_data:
145
- image_path = os.path.join(IMAGE_ROOT, x['img_filename'])
146
- width,height = x['img_size'][0],x['img_size'][1]
147
- resized_height, resized_width = smart_resize(
148
- height,
149
- width,
150
- factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
151
- min_pixels = processor.image_processor.min_pixels,
152
- max_pixels = processor.image_processor.max_pixels,
153
- )
154
- system_content = """You are a helpful assistant.
155
- #Tools
156
-
157
- You may call one or more functions to assist with the user query.
158
-
159
- You are provided with function signatures within <tools></tools> XML tags:
160
- <tools>
161
- {"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
162
- </tools>
163
-
164
- For each function call, first provide a detailed step-by-step thought process within <think></think> XML tags to analyze the user query and determine the action location, then return a json object with function name and arguments within <tool_call></tool_call> XML tags inside <answer></answer> XML tags:
165
- <think>
166
- [Your detailed step-by-step thought process here]
167
- </think>
168
- <answer>
169
- <tool_call>
170
- {"name": <function-name>, "arguments": <args-json-object>}
171
- </tool_call>
172
- </answer>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
173
- message = [
174
- {
175
- "role": "system",
176
- "content": [
177
- {
178
- "type": "text",
179
- "text": system_content
180
- }
181
- ]
182
- },
183
- {
184
- "role": "user",
185
- "content": [
186
- {
187
- "type": "image",
188
- "image": f"file://{image_path}"
189
- },
190
- {
191
- "type": "text",
192
- "text": x['instruction']
193
- }
194
- ]
195
- },
196
- ]
197
- # print(message)
198
- messages.append(message)
199
-
200
-
201
- rank_outputs = [] # List to store answers for this rank
202
- all_outputs = [] # List to store all answers
203
-
204
- # Process data
205
- for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
206
- batch_messages = messages[i:i + BSZ]
207
-
208
- # Preparation for inference
209
- text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
210
-
211
- image_inputs, video_inputs = process_vision_info(batch_messages)
212
- inputs = processor(
213
- text=text,
214
- images=image_inputs,
215
- videos=video_inputs,
216
- padding=True,
217
- padding_side="left",
218
- return_tensors="pt",
219
- )
220
- inputs = inputs.to(device)
221
-
222
- # Inference: Generation of the output
223
- generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
224
-
225
- generated_ids_trimmed = [
226
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
227
- ]
228
- batch_output_text = processor.batch_decode(
229
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
230
- )
231
-
232
- rank_outputs.extend(batch_output_text)
233
-
234
- print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
235
-
236
- # Gather all outputs from all ranks
237
- all_outputs = [None] * len(data)
238
- rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
239
-
240
- gathered_results = [None] * world_size
241
- dist.all_gather_object(gathered_results, rank_results)
242
-
243
- assert gathered_results[-1][-1][0] == len(data) - 1
244
-
245
- # The main process will collect all results
246
- if rank == 0:
247
- for results in gathered_results:
248
- for idx, output in results:
249
- assert idx < len(all_outputs)
250
- all_outputs[idx] = output
251
- assert all_outputs[-1] is not None
252
-
253
- final_output = []
254
- correct_number = 0
255
-
256
- for input_example, model_output in zip(data, all_outputs):
257
- original_output = model_output
258
- ground_truth = input_example['bbox']
259
- ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], ground_truth[2] / input_example['img_size'][0], ground_truth[3] / input_example['img_size'][1]]
260
- model_answer = extract_point_answer(original_output)
261
- resized_height, resized_width = smart_resize(
262
- input_example['img_size'][1],
263
- input_example['img_size'][0],
264
- factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
265
- min_pixels = processor.image_processor.min_pixels,
266
- max_pixels = processor.image_processor.max_pixels,
267
- )
268
- model_answer = [model_answer[0]/resized_width,model_answer[1]/resized_height]
269
- # Count correct answers
270
- correct = 0
271
- if model_answer is not None:
272
- correct = point_in_box(model_answer, ground_truth)
273
- correct_number += correct
274
- num_all_sample +=1
275
- num_correct_sample += correct
276
-
277
- # Create a result dictionary for this example
278
- result = {
279
- 'image': input_example['img_filename'],
280
- 'question': input_example['instruction'],
281
- 'resized_size': [resized_height, resized_width],
282
- 'ground_truth': ground_truth,
283
- 'model_output': original_output,
284
- 'extracted_answer': model_answer,
285
- 'correct': correct
286
- }
287
- final_output.append(result)
288
-
289
- # Calculate and print accuracy
290
- accuracy = correct_number / len(data) * 100
291
- print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
292
-
293
- # Save results to a JSON file
294
- output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
295
- output_dir = os.path.dirname(output_path)
296
- if not os.path.exists(output_dir):
297
- os.makedirs(output_dir)
298
- with open(output_path, "w") as f:
299
- json.dump({
300
- 'accuracy': accuracy,
301
- 'results': final_output
302
- }, f, indent=2)
303
-
304
- print(f"Results saved to {output_path}")
305
- print("-"*100)
306
- # Final statistics and printing happen only on rank 0
307
- if rank == 0:
308
- accuracy = num_correct_sample / num_all_sample * 100
309
- print(f"\nnumber of correct samples: {num_correct_sample}")
310
- print(f"number of all samples: {num_all_sample}")
311
- print(f"Accuracy of all datasets: {accuracy:.2f}%")
312
-
313
- # Synchronize all processes
314
- dist.barrier()
315
-
316
-
317
-
318
-
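
Note on the extraction logic deleted above: the sketch below is a minimal, self-contained version of the same parse-normalize-score path, runnable outside the distributed harness. The sample response string, the resized dimensions, and the ground-truth box are illustrative values, not taken from the eval data.

import json
import re

def extract_point(content):
    # Pull an (x, y) click coordinate out of a <tool_call> block.
    match = re.search(r"<tool_call>(.*?)</tool_call>", content, re.DOTALL)
    if not match:
        return [0.0, 0.0]
    body = match.group(1).strip()
    try:
        # Preferred path: the tool call is valid JSON with a "coordinate" field.
        coord = json.loads(body).get("arguments", {}).get("coordinate")
        if isinstance(coord, list) and len(coord) == 2:
            return [float(coord[0]), float(coord[1])]
    except (ValueError, TypeError, AttributeError):
        pass
    # Fallback: take the last two numbers that appear anywhere in the tool call.
    nums = re.findall(r"\d+(?:\.\d+)?", body)
    return [float(nums[-2]), float(nums[-1])] if len(nums) >= 2 else [0.0, 0.0]

def point_in_box(point, box):
    x, y = point
    return int(box[0] <= x < box[2] and box[1] <= y < box[3])

response = ('<tool_call>\n{"name": "computer_use", "arguments": '
            '{"action": "left_click", "coordinate": [548, 503]}}\n</tool_call>')
resized_width, resized_height = 2492, 1400  # what smart_resize would return; illustrative
x, y = extract_point(response)
point = [x / resized_width, y / resized_height]  # normalize to [0, 1]
gt_box = [0.20, 0.33, 0.24, 0.38]  # normalized [x1, y1, x2, y2]; illustrative
print(point, point_in_box(point, gt_box))  # -> [0.2199..., 0.3592...] 1
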
eval/test_grounding_r1_nothink.py DELETED
@@ -1,331 +0,0 @@
1
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
- from qwen_vl_utils import process_vision_info
3
- import torch
4
- import json
5
- from tqdm import tqdm
6
- import re
7
- import os
8
- from pprint import pprint
9
- import random
10
- from PIL import Image
11
- from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
12
- import torch.distributed as dist
13
- from torch.nn.parallel import DistributedDataParallel as DDP
14
- import argparse
15
-
16
- import warnings
17
-
18
- warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
-
20
- def setup_distributed():
21
- local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
- torch.cuda.set_device(local_rank)
23
-
24
- dist.init_process_group(backend="nccl")
25
-
26
- world_size = dist.get_world_size()
27
- rank = dist.get_rank()
28
-
29
- return local_rank, world_size, rank
30
-
31
- local_rank, world_size, rank = setup_distributed()
32
- device = f"cuda:{local_rank}"
33
- print(f"Process {rank} using {device}")
34
-
35
- steps = 3860
36
- if rank == 0:
37
- print("Steps: ", steps)
38
-
39
- RUN_NAME = "Qwen2.5-VL-3B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_0.2_filtered_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e_max_pixel_4028160"
40
- MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
41
- #MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_acadmic/public_data/11178625/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
42
- OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
43
-
44
- BSZ=8
45
- DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/annotations"
46
-
47
- TEST_DATASETS = ['android_studio_macos', 'autocad_windows', 'blender_windows','davinci_macos','eviews_windows','excel_macos','fruitloops_windows','illustrator_windows','inventor_windows','linux_common_linux','macos_common_macos','matlab_macos','origin_windows','photoshop_windows','powerpoint_windows','premiere_windows','pycharm_macos','quartus_windows','solidworks_windows','stata_windows','unreal_engine_windows','vivado_windows','vmware_macos','vscode_macos','windows_common_windows','word_macos']
48
- IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/images"
49
-
50
-
51
- # TEST_DATASETS = ['lisa_test']
52
- # IMAGE_ROOT = "/data10/shz/dataset/lisa"
53
-
54
-
55
- # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
56
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
57
- MODEL_PATH,
58
- torch_dtype=torch.bfloat16,
59
- attn_implementation="flash_attention_2",
60
- device_map={"": local_rank},
61
- )
62
- # default processor
63
- processor = AutoProcessor.from_pretrained(MODEL_PATH,max_pixels=7049280,min_pixels=3136)
64
- # processor.image_processor.min_pixels=3136
65
- # processor.image_processor.max_pixels=2007040
66
- print(processor.image_processor.min_pixels)
67
- print(processor.image_processor.max_pixels)
68
- # def extract_point_answer(content):
69
- # # Try to find the bbox within <answer> tags; if it cannot be found, return [0, 0, 0, 0]
70
- # answer_tag_pattern = r'<answer>(.*?)</answer>'
71
- # content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
72
- # if content_answer_match:
73
- # content_answer = content_answer_match.group(1).strip()
74
- # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
75
- # if tool_call_match:
76
- # tool_call_content = tool_call_match.group(1).strip()
77
- # Parse the JSON
78
- # tool_call_json = json.loads(tool_call_content)
79
- # arguments = tool_call_json.get("arguments", {})
80
- # coordinate = arguments.get("coordinate", None)
81
- # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
82
- # x, y = coordinate
83
- # extracted_coordinate = [x, y]
84
- # return extracted_coordinate
85
- # return [0, 0]
86
-
87
-
88
- # def extract_point_answer(content):
89
- # # Look for a <tool_call> block; return [0, 0] if nothing can be extracted
90
- # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
91
- # if tool_call_match:
92
- # tool_call_content = tool_call_match.group(1).strip()
93
- # # First, try to parse tool_call_content as JSON
94
- # try:
95
- # tool_call_json = json.loads(tool_call_content)
96
- # print(tool_call_json)
97
- # arguments = tool_call_json.get("arguments", {})
98
- # coordinate = arguments.get("coordinate", None)
99
- # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
100
- # try:
101
- # x = float(coordinate[0])
102
- # y = float(coordinate[1])
103
- # return [x, y]
104
- # except (ValueError, TypeError):
105
- # pass # if the conversion fails, fall back to regex extraction
106
- # except json.JSONDecodeError:
107
- # pass # if JSON parsing fails, fall back to regex extraction
108
- # # Fall back to extracting the last two numbers with a regex
109
- # numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
110
- # if len(numbers) >= 2:
111
- # x = float(numbers[-2])
112
- # y = float(numbers[-1])
113
- # return [x, y]
114
- # return [0, 0]
115
-
116
-
117
-
118
- def extract_point_answer(content):
119
- # Look for a <tool_call> block; return [0, 0] if nothing can be extracted
120
- tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
121
- if tool_call_match:
122
- tool_call_content = tool_call_match.group(1).strip()
123
- # Extract the last two numbers from the tool call content with a regex
124
- try:
125
- numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
126
- if len(numbers) >= 2:
127
- x = float(numbers[-2])
128
- y = float(numbers[-1])
129
- return [x, y]
130
- except json.JSONDecodeError:
131
- pass # defensive; re.findall does not raise JSONDecodeError
132
- # Nothing extracted; fall back to [0, 0]
133
- return [0, 0]
134
-
135
- def point_in_box(point, box):
136
- x,y = point
137
- if box[0] <= x < box[2] and box[1] <= y < box[3]:
138
- return 1
139
- else:
140
- return 0
141
-
142
- num_samples = 2000
143
- num_all_sample = 0
144
- num_correct_sample = 0
145
- for ds in TEST_DATASETS:
146
- if rank == 0:
147
- print(f"Processing {ds}...")
148
- ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
149
- data = json.load(open(ds_path, "r"))
150
- random.seed(42)
151
- random.shuffle(data)
152
- data = data[:num_samples]
153
-
154
- # Split data for distributed evaluation
155
- per_rank_data = len(data) // world_size
156
- start_idx = rank * per_rank_data
157
- end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
158
- rank_data = data[start_idx:end_idx]
159
-
160
- messages = []
161
-
162
- for x in rank_data:
163
- image_path = os.path.join(IMAGE_ROOT, x['img_filename'])
164
- width,height = x['img_size'][0],x['img_size'][1]
165
- resized_height, resized_width = smart_resize(
166
- height,
167
- width,
168
- factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
169
- min_pixels = processor.image_processor.min_pixels,
170
- max_pixels = processor.image_processor.max_pixels,
171
- )
172
- system_content = """You are a helpful assistant.
173
- #Tools
174
-
175
- You may call one or more functions to assist with the user query.
176
-
177
- You are provided with function signatures within <tools></tools> XML tags:
178
- <tools>
179
- {"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
180
- </tools>
181
-
182
- For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
183
- <tool_call>
184
- {"name": <function-name>, "arguments": <args-json-object>}
185
- </tool_call>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
186
- message = [
187
- {
188
- "role": "system",
189
- "content": [
190
- {
191
- "type": "text",
192
- "text": system_content
193
- }
194
- ]
195
- },
196
- {
197
- "role": "user",
198
- "content": [
199
- {
200
- "type": "image",
201
- "image": f"file://{image_path}"
202
- },
203
- {
204
- "type": "text",
205
- "text": x['instruction']
206
- }
207
- ]
208
- },
209
- ]
210
- # print(message)
211
- messages.append(message)
212
-
213
-
214
- rank_outputs = [] # List to store answers for this rank
215
- all_outputs = [] # List to store all answers
216
-
217
- # Process data
218
- for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
219
- batch_messages = messages[i:i + BSZ]
220
-
221
- # Preparation for inference
222
- text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
223
-
224
- image_inputs, video_inputs = process_vision_info(batch_messages)
225
- inputs = processor(
226
- text=text,
227
- images=image_inputs,
228
- videos=video_inputs,
229
- padding=True,
230
- padding_side="left",
231
- return_tensors="pt",
232
- )
233
- inputs = inputs.to(device)
234
-
235
- # Inference: Generation of the output
236
- generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
237
-
238
- generated_ids_trimmed = [
239
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
240
- ]
241
- batch_output_text = processor.batch_decode(
242
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
243
- )
244
-
245
- rank_outputs.extend(batch_output_text)
246
-
247
- print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
248
-
249
- # Gather all outputs from all ranks
250
- all_outputs = [None] * len(data)
251
- rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
252
-
253
- gathered_results = [None] * world_size
254
- dist.all_gather_object(gathered_results, rank_results)
255
-
256
- assert gathered_results[-1][-1][0] == len(data) - 1
257
-
258
- # The main process will collect all results
259
- if rank == 0:
260
- for results in gathered_results:
261
- for idx, output in results:
262
- assert idx < len(all_outputs)
263
- all_outputs[idx] = output
264
- assert all_outputs[-1] is not None
265
-
266
- final_output = []
267
- correct_number = 0
268
-
269
- for input_example, model_output in zip(data, all_outputs):
270
- original_output = model_output
271
- ground_truth = input_example['bbox']
272
- ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], ground_truth[2] / input_example['img_size'][0], ground_truth[3] / input_example['img_size'][1]]
273
- model_answer = extract_point_answer(original_output)
274
- resized_height, resized_width = smart_resize(
275
- input_example['img_size'][1],
276
- input_example['img_size'][0],
277
- factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
278
- min_pixels = processor.image_processor.min_pixels,
279
- max_pixels = processor.image_processor.max_pixels,
280
- )
281
- model_answer = [model_answer[0]/resized_width,model_answer[1]/resized_height]
282
- # Count correct answers
283
- correct = 0
284
- if model_answer is not None:
285
- correct = point_in_box(model_answer, ground_truth)
286
- correct_number += correct
287
- num_all_sample +=1
288
- num_correct_sample += correct
289
-
290
- # Create a result dictionary for this example
291
- result = {
292
- 'image': input_example['img_filename'],
293
- 'question': input_example['instruction'],
294
- 'resized_size': [resized_height, resized_width],
295
- 'ground_truth': ground_truth,
296
- 'model_output': original_output,
297
- 'extracted_answer': model_answer,
298
- 'correct': correct
299
- }
300
- final_output.append(result)
301
-
302
- # Calculate and print accuracy
303
- accuracy = correct_number / len(data) * 100
304
- print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
305
-
306
- # Save results to a JSON file
307
- output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
308
- output_dir = os.path.dirname(output_path)
309
- if not os.path.exists(output_dir):
310
- os.makedirs(output_dir)
311
- with open(output_path, "w") as f:
312
- json.dump({
313
- 'accuracy': accuracy,
314
- 'results': final_output
315
- }, f, indent=2)
316
-
317
- print(f"Results saved to {output_path}")
318
- print("-"*100)
319
- # Final statistics and printing happen only on rank 0
320
- if rank == 0:
321
- accuracy = num_correct_sample / num_all_sample * 100
322
- print(f"\nnumber of correct samples: {num_correct_sample}")
323
- print(f"number of all samples: {num_all_sample}")
324
- print(f"Accuracy of all datasets: {accuracy:.2f}%")
325
-
326
- # Synchronize all processes
327
- dist.barrier()
328
-
329
-
330
-
331
-
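
The same sharding-and-gather pattern recurs in each of these scripts: slice the dataset contiguously across ranks, let the last rank absorb the remainder, run inference per rank, then reassemble everything by global index after dist.all_gather_object. Below is a pure-Python simulation of just that bookkeeping (no torch.distributed launch needed); the data and the per-rank "work" are placeholders.

def shard_bounds(n_items, world_size):
    # Contiguous slices; the last rank also takes the remainder.
    per_rank = n_items // world_size
    return [(r * per_rank, (r + 1) * per_rank if r < world_size - 1 else n_items)
            for r in range(world_size)]

data = [f"sample-{i}" for i in range(10)]
world_size = 3
gathered = []  # stands in for the list filled by dist.all_gather_object
for rank, (start, end) in enumerate(shard_bounds(len(data), world_size)):
    outputs = [s.upper() for s in data[start:end]]  # each rank's local results
    gathered.append([(start + i, out) for i, out in enumerate(outputs)])

assert gathered[-1][-1][0] == len(data) - 1  # same sanity check as the scripts
all_outputs = [None] * len(data)  # rank-0 reassembly by global index
for results in gathered:
    for idx, out in results:
        all_outputs[idx] = out
assert all(out is not None for out in all_outputs)
print(all_outputs)
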
eval/test_grounding_r1_nothink_ss.py DELETED
@@ -1,352 +0,0 @@
1
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
- from qwen_vl_utils import process_vision_info
3
- import torch
4
- import json
5
- from tqdm import tqdm
6
- import re
7
- import os
8
- from pprint import pprint
9
- import random
10
- from PIL import Image
11
- from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
12
- import torch.distributed as dist
13
- from torch.nn.parallel import DistributedDataParallel as DDP
14
- import argparse
15
-
16
- import warnings
17
-
18
- warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
-
20
- def setup_distributed():
21
- local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
- torch.cuda.set_device(local_rank)
23
-
24
- dist.init_process_group(backend="nccl")
25
-
26
- world_size = dist.get_world_size()
27
- rank = dist.get_rank()
28
-
29
- return local_rank, world_size, rank
30
-
31
- local_rank, world_size, rank = setup_distributed()
32
- device = f"cuda:{local_rank}"
33
- print(f"Process {rank} using {device}")
34
-
35
- steps = 3800
36
- if rank == 0:
37
- print("Steps: ", steps)
38
- #RUN_NAME = "base"
39
- RUN_NAME = "Qwen2.5-VL-7B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_filtered_only_one_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e"
40
- #MODEL_PATH="/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/Qwen2.5-VL-7B-Instruct"
41
- MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
42
- OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
43
-
44
- BSZ=32
45
- DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot/metadata"
46
-
47
- TEST_DATASETS = ['hf_test_full']
48
- IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot/images"
49
-
50
-
51
- # TEST_DATASETS = ['lisa_test']
52
- # IMAGE_ROOT = "/data10/shz/dataset/lisa"
53
-
54
-
55
- # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
56
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
57
- MODEL_PATH,
58
- torch_dtype=torch.bfloat16,
59
- attn_implementation="flash_attention_2",
60
- device_map={"": local_rank},
61
- )
62
- # default processor
63
- processor = AutoProcessor.from_pretrained(MODEL_PATH,max_pixels=2007040,min_pixels=3136)
64
- # processor.image_processor.min_pixels=3136
65
- # processor.image_processor.max_pixels=2007040
66
- print(processor.image_processor.min_pixels)
67
- print(processor.image_processor.max_pixels)
68
- # def extract_point_answer(content):
69
- # # Try to find the bbox within <answer> tags; if it cannot be found, return [0, 0, 0, 0]
70
- # answer_tag_pattern = r'<answer>(.*?)</answer>'
71
- # content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
72
- # if content_answer_match:
73
- # content_answer = content_answer_match.group(1).strip()
74
- # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
75
- # if tool_call_match:
76
- # tool_call_content = tool_call_match.group(1).strip()
77
- # Parse the JSON
78
- # tool_call_json = json.loads(tool_call_content)
79
- # arguments = tool_call_json.get("arguments", {})
80
- # coordinate = arguments.get("coordinate", None)
81
- # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
82
- # x, y = coordinate
83
- # extracted_coordinate = [x, y]
84
- # return extracted_coordinate
85
- # return [0, 0]
86
-
87
-
88
- # def extract_point_answer(content):
89
- # # Look for a <tool_call> block; return [0, 0] if nothing can be extracted
90
- # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
91
- # if tool_call_match:
92
- # tool_call_content = tool_call_match.group(1).strip()
93
- # # First, try to parse tool_call_content as JSON
94
- # try:
95
- # tool_call_json = json.loads(tool_call_content)
96
- # print(tool_call_json)
97
- # arguments = tool_call_json.get("arguments", {})
98
- # coordinate = arguments.get("coordinate", None)
99
- # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
100
- # try:
101
- # x = float(coordinate[0])
102
- # y = float(coordinate[1])
103
- # return [x, y]
104
- # except (ValueError, TypeError):
105
- # pass # if the conversion fails, fall back to regex extraction
106
- # except json.JSONDecodeError:
107
- # pass # if JSON parsing fails, fall back to regex extraction
108
- # # Fall back to extracting the last two numbers with a regex
109
- # numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
110
- # if len(numbers) >= 2:
111
- # x = float(numbers[-2])
112
- # y = float(numbers[-1])
113
- # return [x, y]
114
- # return [0, 0]
115
-
116
-
117
-
118
- def extract_point_answer(content):
119
- # Look for a <tool_call> block; return [0, 0] if nothing can be extracted
120
- tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
121
- if tool_call_match:
122
- tool_call_content = tool_call_match.group(1).strip()
123
- # Extract the last two numbers from the tool call content with a regex
124
- try:
125
- numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
126
- if len(numbers) >= 2:
127
- x = float(numbers[-2])
128
- y = float(numbers[-1])
129
- return [x, y]
130
- except json.JSONDecodeError:
131
- pass # 如果 JSON 解析失败,继续尝试正则提取
132
- # Nothing extracted; fall back to [0, 0]
133
- return [0, 0]
134
-
135
- def point_in_box(point, box):
136
- x,y = point
137
- if box[0] <= x < box[2] and box[1] <= y < box[3]:
138
- return 1
139
- else:
140
- return 0
141
-
142
- num_samples = 2000
143
- num_all_sample = 0
144
- num_desktop_sample = 0
145
- num_mobile_sample = 0
146
- num_web_sample = 0
147
- num_correct_sample = 0
148
- for ds in TEST_DATASETS:
149
- if rank == 0:
150
- print(f"Processing {ds}...")
151
- ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
152
- data = json.load(open(ds_path, "r"))
153
- random.seed(42)
154
- random.shuffle(data)
155
- data = data[:num_samples]
156
-
157
- # Split data for distributed evaluation
158
- per_rank_data = len(data) // world_size
159
- start_idx = rank * per_rank_data
160
- end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
161
- rank_data = data[start_idx:end_idx]
162
-
163
- messages = []
164
-
165
- for x in rank_data:
166
- image_path = os.path.join(IMAGE_ROOT, x['img_url'])
167
- width,height = x['img_size'][0],x['img_size'][1]
168
- resized_height, resized_width = smart_resize(
169
- height,
170
- width,
171
- factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
172
- min_pixels = processor.image_processor.min_pixels,
173
- max_pixels = processor.image_processor.max_pixels,
174
- )
175
- system_content = """You are a helpful assistant.
176
- #Tools
177
-
178
- You may call one or more functions to assist with the user query.
179
-
180
- You are provided with function signatures within <tools></tools> XML tags:
181
- <tools>
182
- {"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
183
- </tools>
184
-
185
- For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
186
- <tool_call>
187
- {"name": <function-name>, "arguments": <args-json-object>}
188
- </tool_call>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
189
- message = [
190
- {
191
- "role": "system",
192
- "content": [
193
- {
194
- "type": "text",
195
- "text": system_content
196
- }
197
- ]
198
- },
199
- {
200
- "role": "user",
201
- "content": [
202
- {
203
- "type": "image",
204
- "image": f"file://{image_path}"
205
- },
206
- {
207
- "type": "text",
208
- "text": x['task']
209
- }
210
- ]
211
- },
212
- ]
213
- # print(message)
214
- messages.append(message)
215
-
216
-
217
- rank_outputs = [] # List to store answers for this rank
218
- all_outputs = [] # List to store all answers
219
-
220
- # Process data
221
- for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
222
- batch_messages = messages[i:i + BSZ]
223
-
224
- # Preparation for inference
225
- text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
226
-
227
- image_inputs, video_inputs = process_vision_info(batch_messages)
228
- inputs = processor(
229
- text=text,
230
- images=image_inputs,
231
- videos=video_inputs,
232
- padding=True,
233
- padding_side="left",
234
- return_tensors="pt",
235
- )
236
- inputs = inputs.to(device)
237
-
238
- # Inference: Generation of the output
239
- generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
240
-
241
- generated_ids_trimmed = [
242
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
243
- ]
244
- batch_output_text = processor.batch_decode(
245
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
246
- )
247
-
248
- rank_outputs.extend(batch_output_text)
249
-
250
- print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
251
-
252
- # Gather all outputs from all ranks
253
- all_outputs = [None] * len(data)
254
- rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
255
-
256
- gathered_results = [None] * world_size
257
- dist.all_gather_object(gathered_results, rank_results)
258
-
259
- assert gathered_results[-1][-1][0] == len(data) - 1
260
-
261
- # The main process will collect all results
262
- if rank == 0:
263
- for results in gathered_results:
264
- for idx, output in results:
265
- assert idx < len(all_outputs)
266
- all_outputs[idx] = output
267
- assert all_outputs[-1] is not None
268
-
269
- final_output = []
270
- correct_number = 0
271
- correct_number_desktop = 0
272
- correct_number_mobile = 0
273
- correct_number_web = 0
274
-
275
- for input_example, model_output in zip(data, all_outputs):
276
- original_output = model_output
277
- ground_truth = input_example['bbox']
278
- split_class = input_example['split']
279
- ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], (ground_truth[0]+ground_truth[2]) / input_example['img_size'][0], (ground_truth[1]+ground_truth[3]) / input_example['img_size'][1]]
280
- model_answer = extract_point_answer(original_output)
281
- resized_height, resized_width = smart_resize(
282
- input_example['img_size'][1],
283
- input_example['img_size'][0],
284
- factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
285
- min_pixels = processor.image_processor.min_pixels,
286
- max_pixels = processor.image_processor.max_pixels,
287
- )
288
- model_answer = [model_answer[0]/resized_width,model_answer[1]/resized_height]
289
- # Count correct answers
290
- correct = 0
291
- if model_answer is not None:
292
- correct = point_in_box(model_answer, ground_truth)
293
- correct_number += correct
294
- num_all_sample +=1
295
- num_correct_sample += correct
296
- if split_class == "desktop":
297
- correct_number_desktop += correct
298
- num_desktop_sample += 1
299
- if split_class == "mobile":
300
- correct_number_mobile += correct
301
- num_mobile_sample += 1
302
- if split_class == "web":
303
- correct_number_web += correct
304
- num_web_sample += 1
305
- # Create a result dictionary for this example
306
- result = {
307
- 'image': input_example['img_url'],
308
- 'question': input_example['task'],
309
- 'resized_size': [resized_height, resized_width],
310
- 'ground_truth': ground_truth,
311
- 'model_output': original_output,
312
- 'extracted_answer': model_answer,
313
- 'correct': correct
314
- }
315
- final_output.append(result)
316
-
317
- # Calculate and print accuracy
318
- accuracy = correct_number / len(data) * 100
319
- accuracy_desktop = correct_number_desktop / num_desktop_sample * 100
320
- accuracy_mobile = correct_number_mobile / num_mobile_sample * 100
321
- accuracy_web = correct_number_web / num_web_sample * 100
322
- print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
323
- print(f"Accuracy of desktop: {accuracy_desktop:.2f}%")
324
- print(f"Accuracy of mobile: {accuracy_mobile:.2f}%")
325
- print(f"Accuracy of web: {accuracy_web:.2f}%")
326
-
327
- # Save results to a JSON file
328
- output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
329
- output_dir = os.path.dirname(output_path)
330
- if not os.path.exists(output_dir):
331
- os.makedirs(output_dir)
332
- with open(output_path, "w") as f:
333
- json.dump({
334
- 'accuracy': accuracy,
335
- 'results': final_output
336
- }, f, indent=2)
337
-
338
- print(f"Results saved to {output_path}")
339
- print("-"*100)
340
- # Final statistics and printing happen only on rank 0
341
- if rank == 0:
342
- accuracy = num_correct_sample / num_all_sample * 100
343
- print(f"\nnumber of correct samples: {num_correct_sample}")
344
- print(f"number of all samples: {num_all_sample}")
345
- print(f"Accuracy of all datasets: {accuracy:.2f}%")
346
-
347
- # Synchronize all processes
348
- dist.barrier()
349
-
350
-
351
-
352
-
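
One substantive difference between this ScreenSpot script and the ScreenSpot-Pro one above: here the annotation bbox is treated as [x, y, width, height] (hence the ground_truth[0] + ground_truth[2] terms), while the Pro script divides the third and fourth entries directly as x2 and y2. A small sketch of the two normalizations, using an illustrative 1920x1080 image:

def normalize_xyxy(bbox, img_w, img_h):
    # ScreenSpot-Pro convention: bbox is already [x1, y1, x2, y2].
    x1, y1, x2, y2 = bbox
    return [x1 / img_w, y1 / img_h, x2 / img_w, y2 / img_h]

def normalize_xywh(bbox, img_w, img_h):
    # ScreenSpot / ScreenSpot-v2 convention: bbox is [x, y, width, height].
    x, y, w, h = bbox
    return [x / img_w, y / img_h, (x + w) / img_w, (y + h) / img_h]

# The same on-screen box expressed in both conventions gives identical output:
print(normalize_xyxy([100, 50, 180, 90], 1920, 1080))
print(normalize_xywh([100, 50, 80, 40], 1920, 1080))
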
eval/test_grounding_r1_nothink_ssv2.py DELETED
@@ -1,335 +0,0 @@
1
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
- from qwen_vl_utils import process_vision_info
3
- import torch
4
- import json
5
- from tqdm import tqdm
6
- import re
7
- import os
8
- from pprint import pprint
9
- import random
10
- from PIL import Image
11
- from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
12
- import torch.distributed as dist
13
- from torch.nn.parallel import DistributedDataParallel as DDP
14
- import argparse
15
-
16
- import warnings
17
-
18
- warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
-
20
- def setup_distributed():
21
- local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
- torch.cuda.set_device(local_rank)
23
-
24
- dist.init_process_group(backend="nccl")
25
-
26
- world_size = dist.get_world_size()
27
- rank = dist.get_rank()
28
-
29
- return local_rank, world_size, rank
30
-
31
- local_rank, world_size, rank = setup_distributed()
32
- device = f"cuda:{local_rank}"
33
- print(f"Process {rank} using {device}")
34
-
35
- steps = 3800
36
- if rank == 0:
37
- print("Steps: ", steps)
38
- # #RUN_NAME = "base"
39
- # RUN_NAME = "Qwen2.5-VL-7B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_filtered_only_one_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e"
40
- # #MODEL_PATH="/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/Qwen2.5-VL-7B-Instruct"
41
- # MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
42
- # OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
43
- #RUN_NAME = "base"
44
-
45
- MODEL_PATH= "ByteDance-Seed/UI-TARS-2B-SFT"
46
- OUTPUT_PATH="./logs/rec_results_ui_tars_2B.json"
47
-
48
- BSZ=32
49
- DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-v2"
50
-
51
- TEST_DATASETS = ['screenspot_desktop_v2','screenspot_mobile_v2','screenspot_web_v2']
52
- IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-v2/screenspotv2_image"
53
-
54
-
55
- # TEST_DATASETS = ['lisa_test']
56
- # IMAGE_ROOT = "/data10/shz/dataset/lisa"
57
-
58
-
59
- # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
60
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
61
- MODEL_PATH,
62
- torch_dtype=torch.bfloat16,
63
- attn_implementation="flash_attention_2",
64
- device_map={"": local_rank},
65
- )
66
- # default processor
67
- processor = AutoProcessor.from_pretrained(MODEL_PATH,max_pixels=3512320,min_pixels=3136)
68
- # processor.image_processor.min_pixels=3136
69
- # processor.image_processor.max_pixels=2007040
70
- print(processor.image_processor.min_pixels)
71
- print(processor.image_processor.max_pixels)
72
- # def extract_point_answer(content):
73
- # # Try to find the bbox within <answer> tags; if it cannot be found, return [0, 0, 0, 0]
74
- # answer_tag_pattern = r'<answer>(.*?)</answer>'
75
- # content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
76
- # if content_answer_match:
77
- # content_answer = content_answer_match.group(1).strip()
78
- # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
79
- # if tool_call_match:
80
- # tool_call_content = tool_call_match.group(1).strip()
81
- # Parse the JSON
82
- # tool_call_json = json.loads(tool_call_content)
83
- # arguments = tool_call_json.get("arguments", {})
84
- # coordinate = arguments.get("coordinate", None)
85
- # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
86
- # x, y = coordinate
87
- # extracted_coordinate = [x, y]
88
- # return extracted_coordinate
89
- # return [0, 0]
90
-
91
-
92
- # def extract_point_answer(content):
93
- # # Look for a <tool_call> block; return [0, 0] if nothing can be extracted
94
- # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
95
- # if tool_call_match:
96
- # tool_call_content = tool_call_match.group(1).strip()
97
- # # First, try to parse tool_call_content as JSON
98
- # try:
99
- # tool_call_json = json.loads(tool_call_content)
100
- # print(tool_call_json)
101
- # arguments = tool_call_json.get("arguments", {})
102
- # coordinate = arguments.get("coordinate", None)
103
- # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
104
- # try:
105
- # x = float(coordinate[0])
106
- # y = float(coordinate[1])
107
- # return [x, y]
108
- # except (ValueError, TypeError):
109
- # pass # if the conversion fails, fall back to regex extraction
110
- # except json.JSONDecodeError:
111
- # pass # if JSON parsing fails, fall back to regex extraction
112
- # # Fall back to extracting the last two numbers with a regex
113
- # numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
114
- # if len(numbers) >= 2:
115
- # x = float(numbers[-2])
116
- # y = float(numbers[-1])
117
- # return [x, y]
118
- # return [0, 0]
119
-
120
-
121
-
122
- def extract_point_answer(content):
123
- # Look for a <tool_call> block; return [0, 0] if nothing can be extracted
124
- tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
125
- if tool_call_match:
126
- tool_call_content = tool_call_match.group(1).strip()
127
- # Extract the last two numbers from the tool call content with a regex
128
- try:
129
- numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
130
- if len(numbers) >= 2:
131
- x = float(numbers[-2])
132
- y = float(numbers[-1])
133
- return [x, y]
134
- except json.JSONDecodeError:
135
- pass # defensive; re.findall does not raise JSONDecodeError
136
- # Nothing extracted; fall back to [0, 0]
137
- return [0, 0]
138
-
139
- def point_in_box(point, box):
140
- x,y = point
141
- if box[0] <= x < box[2] and box[1] <= y < box[3]:
142
- return 1
143
- else:
144
- return 0
145
-
146
- num_samples = 2000
147
- num_all_sample = 0
148
- num_correct_sample = 0
149
- for ds in TEST_DATASETS:
150
- if rank == 0:
151
- print(f"Processing {ds}...")
152
- ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
153
- data = json.load(open(ds_path, "r"))
154
- random.seed(42)
155
- random.shuffle(data)
156
- data = data[:num_samples]
157
-
158
- # Split data for distributed evaluation
159
- per_rank_data = len(data) // world_size
160
- start_idx = rank * per_rank_data
161
- end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
162
- rank_data = data[start_idx:end_idx]
163
-
164
- messages = []
165
-
166
- for x in rank_data:
167
- image_path = os.path.join(IMAGE_ROOT, x['img_filename'])
168
- width,height = x['img_size'][0],x['img_size'][1]
169
- resized_height, resized_width = smart_resize(
170
- height,
171
- width,
172
- factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
173
- min_pixels = processor.image_processor.min_pixels,
174
- max_pixels = processor.image_processor.max_pixels,
175
- )
176
- system_content = """You are a helpful assistant.
177
- #Tools
178
-
179
- You may call one or more functions to assist with the user query.
180
-
181
- You are provided with function signatures within <tools></tools> XML tags:
182
- <tools>
183
- {"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
184
- </tools>
185
-
186
- For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
187
- <tool_call>
188
- {"name": <function-name>, "arguments": <args-json-object>}
189
- </tool_call>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
190
- message = [
191
- {
192
- "role": "system",
193
- "content": [
194
- {
195
- "type": "text",
196
- "text": system_content
197
- }
198
- ]
199
- },
200
- {
201
- "role": "user",
202
- "content": [
203
- {
204
- "type": "image",
205
- "image": f"file://{image_path}"
206
- },
207
- {
208
- "type": "text",
209
- "text": x['instruction']
210
- }
211
- ]
212
- },
213
- ]
214
- # print(message)
215
- messages.append(message)
216
-
217
-
218
- rank_outputs = [] # List to store answers for this rank
219
- all_outputs = [] # List to store all answers
220
-
221
- # Process data
222
- for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
223
- batch_messages = messages[i:i + BSZ]
224
-
225
- # Preparation for inference
226
- text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
227
-
228
- image_inputs, video_inputs = process_vision_info(batch_messages)
229
- inputs = processor(
230
- text=text,
231
- images=image_inputs,
232
- videos=video_inputs,
233
- padding=True,
234
- padding_side="left",
235
- return_tensors="pt",
236
- )
237
- inputs = inputs.to(device)
238
-
239
- # Inference: Generation of the output
240
- generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
241
-
242
- generated_ids_trimmed = [
243
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
244
- ]
245
- batch_output_text = processor.batch_decode(
246
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
247
- )
248
-
249
- rank_outputs.extend(batch_output_text)
250
-
251
- print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
252
-
253
- # Gather all outputs from all ranks
254
- all_outputs = [None] * len(data)
255
- rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
256
-
257
- gathered_results = [None] * world_size
258
- dist.all_gather_object(gathered_results, rank_results)
259
-
260
- assert gathered_results[-1][-1][0] == len(data) - 1
261
-
262
- # The main process will collect all results
263
- if rank == 0:
264
- for results in gathered_results:
265
- for idx, output in results:
266
- assert idx < len(all_outputs)
267
- all_outputs[idx] = output
268
- assert all_outputs[-1] is not None
269
-
270
- final_output = []
271
- correct_number = 0
272
-
273
- for input_example, model_output in zip(data, all_outputs):
274
- original_output = model_output
275
- ground_truth = input_example['bbox']
276
- ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], (ground_truth[0]+ground_truth[2]) / input_example['img_size'][0], (ground_truth[1]+ground_truth[3]) / input_example['img_size'][1]]
277
- model_answer = extract_point_answer(original_output)
278
- resized_height, resized_width = smart_resize(
279
- input_example['img_size'][1],
280
- input_example['img_size'][0],
281
- factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
282
- min_pixels = processor.image_processor.min_pixels,
283
- max_pixels = processor.image_processor.max_pixels,
284
- )
285
- model_answer = [model_answer[0] / resized_width, model_answer[1] / resized_height] if model_answer is not None else None  # guard against a failed extraction before normalizing
286
- # Count correct answers
287
- correct = 0
288
- if model_answer is not None:
289
- correct = point_in_box(model_answer, ground_truth)
290
- correct_number += correct
291
- num_all_sample +=1
292
- num_correct_sample += correct
293
-
294
- # Create a result dictionary for this example
295
- result = {
296
- 'image': input_example['img_filename'],
297
- 'question': input_example['instruction'],
298
- 'resized_size': [resized_height, resized_width],
299
- 'ground_truth': ground_truth,
300
- 'model_output': original_output,
301
- 'extracted_answer': model_answer,
302
- 'correct': correct
303
- }
304
- final_output.append(result)
305
-
306
- # Calculate and print accuracy
307
- accuracy = correct_number / len(data) * 100
308
- print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
309
-
310
- # Save results to a JSON file
311
- output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
312
- output_dir = os.path.dirname(output_path)
313
- if not os.path.exists(output_dir):
314
- os.makedirs(output_dir)
315
- with open(output_path, "w") as f:
316
- json.dump({
317
- 'accuracy': accuracy,
318
- 'results': final_output
319
- }, f, indent=2)
320
-
321
- print(f"Results saved to {output_path}")
322
- print("-"*100)
323
- # Move the final statistics and printing into the rank == 0 block
324
- if rank == 0:
325
- accuracy = num_correct_sample / num_all_sample * 100
326
- print(f"\nnumber of correct samples: {num_correct_sample}")
327
- print(f"number of all samples: {num_all_sample}")
328
- print(f"Accuracy of all datasets: {accuracy:.2f}%")
329
-
330
- # Synchronize all processes
331
- dist.barrier()
332
-
333
-
334
-
335
-
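Note on the scoring logic in the hunk above: extract_point_answer and point_in_box are defined earlier in this script, outside the lines shown here. The predicted pixel point is divided by the smart_resize dimensions (the resolution the model actually saw) so that it lands in the same normalized [0, 1] space as the ground-truth box, which is divided by the original image size. A minimal sketch of what the two helpers could look like; the regex and exact bodies are assumptions for illustration, not the file's actual definitions:

    import re

    def extract_point_answer(content):
        # Assumed sketch: pull the first "coordinate": [x, y] pair out of the
        # model's <tool_call> JSON output.
        match = re.search(r'"coordinate":\s*\[\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\]', content)
        if match:
            return [float(match.group(1)), float(match.group(2))]
        return None  # no parseable point in the output

    def point_in_box(point, box):
        # point: [x, y]; box: [x1, y1, x2, y2]; all values normalized to [0, 1].
        x, y = point
        x1, y1, x2, y2 = box
        return 1 if x1 <= x <= x2 and y1 <= y <= y2 else 0

This matches how the loop consumes the return values: a 1/0 containment flag is summed into correct_number, and a failed extraction is simply scored as incorrect.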
eval/test_od_r1.py DELETED
@@ -1,178 +0,0 @@
1
- import re
2
- import os
3
- import json
4
- import torch
5
- import random
6
-
7
- from tqdm import tqdm
8
- from pprint import pprint
9
- from qwen_vl_utils import process_vision_info
10
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
11
-
12
-
13
- def extract_bbox_answer(content):
14
- pattern = r'```json(.*?)```'
15
- json_match = re.search(pattern, content, re.DOTALL)
16
- bbox_json = json_match.group(1).strip() if json_match else None
17
-
18
- if bbox_json:
19
- try:
20
- bbox = json.loads(bbox_json)[0]['bbox_2d']
21
- return bbox, False
22
- except (json.JSONDecodeError, KeyError, IndexError, TypeError):
23
- return [0, 0, 0, 0], False
24
- else:
25
- return [0, 0, 0, 0], False
26
-
27
-
28
- def iou(box1, box2):
29
- inter_x1 = max(box1[0], box2[0])
30
- inter_y1 = max(box1[1], box2[1])
31
- inter_x2 = min(box1[2] - 1, box2[2] - 1)
32
- inter_y2 = min(box1[3] - 1, box2[3] - 1)
33
- if inter_x1 < inter_x2 and inter_y1 < inter_y2:
34
- inter = (inter_x2 - inter_x1 + 1) * (inter_y2 - inter_y1 + 1)
35
- else:
36
- inter = 0
37
- union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + (box2[2] - box2[0]) * (box2[3] - box2[1]) - inter
38
- return float(inter) / union
39
-
40
-
41
- def load_model(model_path, device_map):
42
- # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
43
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
44
- model_path,
45
- torch_dtype=torch.bfloat16,
46
- attn_implementation="flash_attention_2",
47
- device_map=device_map,
48
- )
49
-
50
- # default processor
51
- processor = AutoProcessor.from_pretrained(model_path)
52
-
53
- return model, processor
54
-
55
-
56
- def eval_od_r1(
57
- model_path, test_datasets, data_root, image_root, question_template, output_dir, batch_size=32, sample_num=500, seed=42, device_map="cuda:0"
58
- ):
59
- random.seed(seed)
60
- model, processor = load_model(model_path, device_map)
61
-
62
- for ds in test_datasets:
63
- print(f"Processing {ds}...")
64
-
65
- ds_path = os.path.join(data_root, f"{ds}.json")
66
- data = json.load(open(ds_path, "r"))
67
- random.shuffle(data)
68
- data = data[:sample_num]
69
- messages = []
70
-
71
- for x in data:
72
- image_path = os.path.join(image_root, x['image'])
73
- messages.append(
74
- [
75
- {
76
- "role":
77
- "user",
78
- "content":
79
- [
80
- {
81
- "type": "image",
82
- "image": f"file://{image_path}"
83
- }, {
84
- "type": "text",
85
- "text": question_template.format(Question=x['normal_caption'])
86
- }
87
- ]
88
- }
89
- ]
90
- )
91
-
92
- all_outputs = [] # List to store all answers
93
-
94
- # Process data
95
- for i in tqdm(range(0, len(messages), batch_size)):
96
- batch_messages = messages[i:i + batch_size]
97
-
98
- # Preparation for inference
99
- text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
100
-
101
- image_inputs, video_inputs = process_vision_info(batch_messages)
102
- inputs = processor(
103
- text=text,
104
- images=image_inputs,
105
- videos=video_inputs,
106
- padding=True,
107
- return_tensors="pt",
108
- )
109
- inputs = inputs.to(device_map)
110
-
111
- # Inference: Generation of the output
112
- generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
113
-
114
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
115
- batch_output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
116
- all_outputs.extend(batch_output_text)
117
-
118
- final_output = []
119
- correct_number = 0
120
-
121
- for input_example, model_output in zip(data, all_outputs):
122
- original_output = model_output
123
- ground_truth = input_example['solution']
124
- ground_truth_normalized = input_example['normalized_solution']
125
- model_answer, normalized = extract_bbox_answer(original_output)
126
-
127
- # Count correct answers
128
- correct = 0
129
- if model_answer is not None:
130
- iou_value = iou(model_answer, ground_truth_normalized if normalized else ground_truth)
131
- if iou_value > 0.5:
132
- correct = 1
133
- correct_number += correct
134
-
135
- # Create a result dictionary for this example
136
- result = {
137
- "question": question_template.format(Question=input_example['normal_caption']),
138
- "ground_truth": ground_truth if not normalized else ground_truth_normalized,
139
- "model_output": original_output,
140
- "extracted_answer": model_answer,
141
- "correct": correct,
142
- "iou": iou_value
143
- }
144
- final_output.append(result)
145
-
146
- # Calculate and print accuracy
147
- accuracy = correct_number / len(data) * 100
148
- print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
149
-
150
- # Save results to a JSON file
151
- result_path = os.path.join(output_dir, f"{os.path.basename(model_path)}", f"{ds}_od_r1.json")
152
- os.makedirs(os.path.dirname(result_path), exist_ok=True)
153
- with open(result_path, "w") as f:
154
- json.dump({"accuracy": accuracy, "results": final_output}, f, indent=2)
155
-
156
- print(f"Results saved to {result_path}")
157
- print('-' * 100)
158
-
159
-
160
- if __name__ == "__main__":
161
- model_path = '' # Add the path to the model
162
- data_root = '' # Add the data root
163
- test_datasets = ['refcoco_val', 'refcocop_val', 'refcocog_val'] # modify the datasets
164
- image_root = '' # Add the image root
165
- output_dir = 'logs' # Add the output directory, default is logs
166
- device_map = 'cuda:0' # select the device, default is cuda:0
167
-
168
- question_template = '{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format.' # modify the question template; it must contain {Question}, which will be replaced by the caption
169
-
170
- eval_od_r1(
171
- model_path=model_path,
172
- data_root=data_root,
173
- test_datasets=test_datasets,
174
- image_root=image_root,
175
- question_template=question_template,
176
- output_dir=output_dir,
177
- device_map=device_map
178
- )
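As a sanity check on the iou helper above (boxes are [x1, y1, x2, y2], and the +1/-1 arithmetic treats coordinates as inclusive pixel indices), here is a worked example of the IoU > 0.5 accuracy criterion with made-up boxes:

    pred = [50, 0, 150, 100]   # hypothetical predicted box
    gt = [0, 0, 100, 100]      # hypothetical ground-truth box
    # intersection = 50 * 100 = 5000, union = 10000 + 10000 - 5000 = 15000
    print(iou(pred, gt))       # ~0.33, so this prediction would be counted as incorrect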
eval/test_rec_baseline.py DELETED
@@ -1,225 +0,0 @@
1
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
- from qwen_vl_utils import process_vision_info
3
- import torch
4
- import json
5
- from tqdm import tqdm
6
- import re
7
- import os
8
- from pprint import pprint
9
- import random
10
-
11
-
12
- import torch.distributed as dist
13
- from torch.nn.parallel import DistributedDataParallel as DDP
14
- import argparse
15
-
16
- import warnings
17
-
18
- warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
-
20
- def setup_distributed():
21
- local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
- torch.cuda.set_device(local_rank)
23
-
24
- dist.init_process_group(backend="nccl")
25
-
26
- world_size = dist.get_world_size()
27
- rank = dist.get_rank()
28
-
29
- print(f"Process {rank}/{world_size} initialized on cuda:{local_rank}")
30
- return local_rank, world_size, rank
31
-
32
- local_rank, world_size, rank = setup_distributed()
33
- device = f"cuda:{local_rank}"
34
-
35
- steps = 100
36
- MODEL_PATH=f"/data10/shz/project/LLaMA-Factory/saves/qwen2_5_vl-3b/full/sft/checkpoint-{steps}"
37
- OUTPUT_PATH="./logs/rec_results_{DATASET}_qwen2_5vl_3b_instruct_sft_{STEPS}.json"
38
-
39
- # MODEL_PATH = "/data10/shz/ckpt/vlm-r1-related/Qwen2.5-VL-3B-Instruct"
40
- # OUTPUT_PATH = "./logs/rec_results_{DATASET}_qwen2_5vl_3b_instruct_baseline_{STEPS}.json"
41
-
42
- BSZ=4
43
- DATA_ROOT = "/data10/shz/dataset/rec/rec_jsons_processed"
44
-
45
- TEST_DATASETS = ['refcoco_val', 'refcocop_val', 'refcocog_val']
46
- IMAGE_ROOT = "/data10/shz/dataset/coco"
47
-
48
- # TEST_DATASETS = ['lisa_test']
49
- # IMAGE_ROOT = "/data10/shz/dataset/lisa"
50
-
51
- # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
52
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
53
- MODEL_PATH,
54
- torch_dtype=torch.bfloat16,
55
- attn_implementation="flash_attention_2",
56
- device_map={"": local_rank},
57
- )
58
-
59
- # default processor
60
- processor = AutoProcessor.from_pretrained(MODEL_PATH)
61
-
62
- def extract_bbox_answer(content):
63
- bbox_pattern = r'\[(\s*-?\d*\.?\d+\s*),\s*(\s*-?\d*\.?\d+\s*),\s*(\s*-?\d*\.?\d+\s*),\s*(\s*-?\d*\.?\d+\s*)\]'
64
- # bbox_pattern = r'\[(-?\d*\.?\d+),\s*(-?\d*\.?\d+),\s*(-?\d*\.?\d+),\s*(-?\d*\.?\d+)\]'
65
- bbox_match = re.search(bbox_pattern, content)
66
-
67
- if bbox_match:
68
- bbox = [float(bbox_match.group(1)), float(bbox_match.group(2)), float(bbox_match.group(3)), float(bbox_match.group(4))]
69
- return bbox
70
- return [0, 0, 0, 0]
71
-
72
- def iou(box1, box2):
73
- inter_x1 = max(box1[0], box2[0])
74
- inter_y1 = max(box1[1], box2[1])
75
- inter_x2 = min(box1[2]-1, box2[2]-1)
76
- inter_y2 = min(box1[3]-1, box2[3]-1)
77
- if inter_x1 < inter_x2 and inter_y1 < inter_y2:
78
- inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1)
79
- else:
80
- inter = 0
81
- union = (box1[2]-box1[0])*(box1[3]-box1[1]) + (box2[2]-box2[0])*(box2[3]-box2[1]) - inter
82
- return float(inter)/union
83
-
84
- num_samples = 2000
85
- for ds in TEST_DATASETS:
86
- if rank == 0:
87
- print(f"Processing {ds}...")
88
- ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
89
- data = json.load(open(ds_path, "r"))
90
- random.seed(42)
91
- random.shuffle(data)
92
- data = data[:num_samples]
93
- # QUESTION_TEMPLATE = "{Question}" if steps > 0 else "{Question} Please provide the bounding box coordinate in JSON format."
94
- QUESTION_TEMPLATE = "{Question} Please provide the bounding box coordinate in JSON format."
95
-
96
- # Split data for distributed evaluation
97
- per_rank_data = len(data) // world_size
98
- start_idx = rank * per_rank_data
99
- end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
100
- rank_data = data[start_idx:end_idx]
101
-
102
- messages = []
103
-
104
- for x in rank_data:
105
- image_path = os.path.join(IMAGE_ROOT, x['image'])
106
- message = [
107
- # {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
108
- {
109
- "role": "user",
110
- "content": [
111
- {
112
- "type": "image",
113
- "image": f"file://{image_path}"
114
- },
115
- {
116
- "type": "text",
117
- "text": QUESTION_TEMPLATE.format(Question=x['problem'])
118
- }
119
- ]
120
- }]
121
- messages.append(message)
122
-
123
- rank_outputs = [] # List to store answers for this rank
124
- all_outputs = [] # List to store all answers
125
-
126
- # Process data
127
- for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
128
- batch_messages = messages[i:i + BSZ]
129
-
130
- # Preparation for inference
131
- text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
132
-
133
- image_inputs, video_inputs = process_vision_info(batch_messages)
134
- inputs = processor(
135
- text=text,
136
- images=image_inputs,
137
- videos=video_inputs,
138
- padding=True,
139
- padding_side="left",
140
- return_tensors="pt",
141
- )
142
- inputs = inputs.to(device)
143
-
144
- # Inference: Generation of the output
145
- generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
146
-
147
- generated_ids_trimmed = [
148
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
149
- ]
150
- batch_output_text = processor.batch_decode(
151
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
152
- )
153
-
154
- rank_outputs.extend(batch_output_text)
155
-
156
- print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
157
-
158
- # Gather all outputs from all ranks
159
- all_outputs = [None] * len(data)
160
- rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
161
-
162
- gathered_results = [None] * world_size
163
- dist.all_gather_object(gathered_results, rank_results)
164
-
165
- assert gathered_results[-1][-1][0] == len(data) - 1
166
-
167
- # The main process will collect all results
168
- if rank == 0:
169
- for results in gathered_results:
170
- for idx, output in results:
171
- assert idx < len(all_outputs)
172
- all_outputs[idx] = output
173
- assert all_outputs[-1] is not None
174
-
175
- final_output = []
176
- correct_number = 0
177
-
178
- for input_example, model_output in zip(data, all_outputs):
179
- original_output = model_output
180
- ground_truth = input_example['solution']
181
- model_answer = extract_bbox_answer(original_output)
182
-
183
- # Count correct answers
184
- correct = 0
185
- if model_answer is not None:
186
- if iou(model_answer, ground_truth) > 0.5:
187
- correct = 1
188
- correct_number += correct
189
-
190
- # Create a result dictionary for this example
191
- result = {
192
- 'image': input_example['image'],
193
- 'question': input_example['problem'],
194
- 'ground_truth': ground_truth,
195
- 'model_output': original_output,
196
- 'extracted_answer': model_answer,
197
- 'correct': correct
198
- }
199
- final_output.append(result)
200
-
201
- # Calculate and print accuracy
202
- accuracy = correct_number / len(data) * 100
203
- print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
204
-
205
- # Save results to a JSON file
206
- output_path = OUTPUT_PATH.format(DATASET=ds, STEPS=steps)
207
- output_dir = os.path.dirname(output_path)
208
- if not os.path.exists(output_dir):
209
- os.makedirs(output_dir)
210
- with open(output_path, "w") as f:
211
- json.dump({
212
- 'accuracy': accuracy,
213
- 'results': final_output
214
- }, f, indent=2)
215
-
216
- print(f"Results saved to {output_path}")
217
- print("-"*100)
218
-
219
- # Synchronize all processes
220
- dist.barrier()
221
-
222
-
223
-
224
-
225
-
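The gather stage in this script (repeated in test_rec_r1.py below) is an order-preserving collection pattern for torch.distributed: each rank tags its outputs with their global dataset indices, dist.all_gather_object collects the tagged lists from every rank, and rank 0 writes each output back into its original slot. A stripped-down sketch of the same pattern, assuming the process group is already initialized; gather_in_order is a made-up name for illustration:

    import torch.distributed as dist

    def gather_in_order(rank_outputs, start_idx, total_len, world_size, rank):
        # Pair each local output with its global dataset index.
        rank_results = [(start_idx + i, out) for i, out in enumerate(rank_outputs)]
        gathered = [None] * world_size
        dist.all_gather_object(gathered, rank_results)
        if rank != 0:
            return None
        # Rank 0 reassembles the outputs in dataset order.
        all_outputs = [None] * total_len
        for results in gathered:
            for idx, out in results:
                all_outputs[idx] = out
        return all_outputs

Indexing by global position rather than concatenating in rank order keeps the results aligned with data even though the last rank may hold a larger remainder shard.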
eval/test_rec_r1.py DELETED
@@ -1,232 +0,0 @@
1
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
- from qwen_vl_utils import process_vision_info
3
- import torch
4
- import json
5
- from tqdm import tqdm
6
- import re
7
- import os
8
- from pprint import pprint
9
- import random
10
-
11
-
12
- import torch.distributed as dist
13
- from torch.nn.parallel import DistributedDataParallel as DDP
14
- import argparse
15
-
16
- import warnings
17
-
18
- warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
-
20
- def setup_distributed():
21
- local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
- torch.cuda.set_device(local_rank)
23
-
24
- dist.init_process_group(backend="nccl")
25
-
26
- world_size = dist.get_world_size()
27
- rank = dist.get_rank()
28
-
29
- return local_rank, world_size, rank
30
-
31
- local_rank, world_size, rank = setup_distributed()
32
- device = f"cuda:{local_rank}"
33
- print(f"Process {rank} using {device}")
34
-
35
- steps = 100
36
- if rank == 0:
37
- print("Steps: ", steps)
38
-
39
- RUN_NAME = "Qwen2.5-VL-7B-GRPO-GUI-Grounding_image_size"
40
-
41
- MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11179904/code/VLM_R1_CUSTOM/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
42
- OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
43
-
44
- BSZ=16
45
- DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data/rec_jsons_processed"
46
-
47
- TEST_DATASETS = ['refcoco_val', 'refcocop_val', 'refcocog_val']
48
- IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data"
49
-
50
-
51
- # TEST_DATASETS = ['lisa_test']
52
- # IMAGE_ROOT = "/data10/shz/dataset/lisa"
53
-
54
-
55
- # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
56
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
57
- MODEL_PATH,
58
- torch_dtype=torch.bfloat16,
59
- attn_implementation="flash_attention_2",
60
- device_map={"": local_rank},
61
- )
62
-
63
- # default processor
64
- processor = AutoProcessor.from_pretrained(MODEL_PATH)
65
-
66
- def extract_bbox_answer(content):
67
- # Try to find the bbox within <answer> tags, if can not find, return [0, 0, 0, 0]
68
- answer_tag_pattern = r'<answer>(.*?)</answer>'
69
- bbox_pattern = r'\{.*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)]\s*.*\}'
70
- content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
71
- if content_answer_match:
72
- content_answer = content_answer_match.group(1).strip()
73
- bbox_match = re.search(bbox_pattern, content_answer, re.DOTALL)
74
- if bbox_match:
75
- bbox = [int(bbox_match.group(1)), int(bbox_match.group(2)), int(bbox_match.group(3)), int(bbox_match.group(4))]
76
- return bbox
77
- return [0, 0, 0, 0]
78
-
79
- def iou(box1, box2):
80
- inter_x1 = max(box1[0], box2[0])
81
- inter_y1 = max(box1[1], box2[1])
82
- inter_x2 = min(box1[2]-1, box2[2]-1)
83
- inter_y2 = min(box1[3]-1, box2[3]-1)
84
- if inter_x1 < inter_x2 and inter_y1 < inter_y2:
85
- inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1)
86
- else:
87
- inter = 0
88
- union = (box1[2]-box1[0])*(box1[3]-box1[1]) + (box2[2]-box2[0])*(box2[3]-box2[1]) - inter
89
- return float(inter)/union
90
-
91
- num_samples = 2000
92
- for ds in TEST_DATASETS:
93
- if rank == 0:
94
- print(f"Processing {ds}...")
95
- ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
96
- data = json.load(open(ds_path, "r"))
97
- random.seed(42)
98
- random.shuffle(data)
99
- data = data[:num_samples]
100
-
101
- QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
102
-
103
- # Split data for distributed evaluation
104
- per_rank_data = len(data) // world_size
105
- start_idx = rank * per_rank_data
106
- end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
107
- rank_data = data[start_idx:end_idx]
108
-
109
- messages = []
110
-
111
- for x in rank_data:
112
- image_path = os.path.join(IMAGE_ROOT, x['image'])
113
- message = [
114
- # {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
115
- {
116
- "role": "user",
117
- "content": [
118
- {
119
- "type": "image",
120
- "image": f"file://{image_path}"
121
- },
122
- {
123
- "type": "text",
124
- "text": QUESTION_TEMPLATE.format(Question=x['problem'])
125
- }
126
- ]
127
- }]
128
- messages.append(message)
129
-
130
- rank_outputs = [] # List to store answers for this rank
131
- all_outputs = [] # List to store all answers
132
-
133
- # Process data
134
- for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
135
- batch_messages = messages[i:i + BSZ]
136
-
137
- # Preparation for inference
138
- text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
139
-
140
- image_inputs, video_inputs = process_vision_info(batch_messages)
141
- inputs = processor(
142
- text=text,
143
- images=image_inputs,
144
- videos=video_inputs,
145
- padding=True,
146
- padding_side="left",
147
- return_tensors="pt",
148
- )
149
- inputs = inputs.to(device)
150
-
151
- # Inference: Generation of the output
152
- generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
153
-
154
- generated_ids_trimmed = [
155
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
156
- ]
157
- batch_output_text = processor.batch_decode(
158
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
159
- )
160
-
161
- rank_outputs.extend(batch_output_text)
162
-
163
- print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
164
-
165
- # Gather all outputs from all ranks
166
- all_outputs = [None] * len(data)
167
- rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
168
-
169
- gathered_results = [None] * world_size
170
- dist.all_gather_object(gathered_results, rank_results)
171
-
172
- assert gathered_results[-1][-1][0] == len(data) - 1
173
-
174
- # The main process will collect all results
175
- if rank == 0:
176
- for results in gathered_results:
177
- for idx, output in results:
178
- assert idx < len(all_outputs)
179
- all_outputs[idx] = output
180
- assert all_outputs[-1] is not None
181
-
182
- final_output = []
183
- correct_number = 0
184
-
185
- for input_example, model_output in zip(data, all_outputs):
186
- original_output = model_output
187
- ground_truth = input_example['solution']
188
- model_answer = extract_bbox_answer(original_output)
189
-
190
- # Count correct answers
191
- correct = 0
192
- if model_answer is not None:
193
- if iou(model_answer, ground_truth) > 0.5:
194
- correct = 1
195
- correct_number += correct
196
-
197
- # Create a result dictionary for this example
198
- result = {
199
- 'image': input_example['image'],
200
- 'question': input_example['problem'],
201
- 'ground_truth': ground_truth,
202
- 'model_output': original_output,
203
- 'extracted_answer': model_answer,
204
- 'correct': correct
205
- }
206
- final_output.append(result)
207
-
208
- # Calculate and print accuracy
209
- accuracy = correct_number / len(data) * 100
210
- print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
211
-
212
- # Save results to a JSON file
213
- output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
214
- output_dir = os.path.dirname(output_path)
215
- if not os.path.exists(output_dir):
216
- os.makedirs(output_dir)
217
- with open(output_path, "w") as f:
218
- json.dump({
219
- 'accuracy': accuracy,
220
- 'results': final_output
221
- }, f, indent=2)
222
-
223
- print(f"Results saved to {output_path}")
224
- print("-"*100)
225
-
226
- # Synchronize all processes
227
- dist.barrier()
228
-
229
-
230
-
231
-
232
-
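For reference, extract_bbox_answer in this last script expects the GRPO-style output that QUESTION_TEMPLATE requests: reasoning inside <think> tags followed by a JSON answer inside <answer> tags. A quick illustration with a made-up model output:

    sample = (
        "<think>The dog sits on the left side of the image.</think>"
        '<answer>{"bbox_2d": [12, 34, 156, 210]}</answer>'
    )
    print(extract_bbox_answer(sample))           # [12, 34, 156, 210]
    print(extract_bbox_answer("no tags here"))   # falls back to [0, 0, 0, 0]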