Delete eval with huggingface_hub
- eval/logs/rec_results_android_studio_macos_8k_SFT_0.json +0 -1605
- eval/test_grounding_r1.py +0 -318
- eval/test_grounding_r1_nothink.py +0 -331
- eval/test_grounding_r1_nothink_ss.py +0 -352
- eval/test_grounding_r1_nothink_ssv2.py +0 -335
- eval/test_od_r1.py +0 -178
- eval/test_rec_baseline.py +0 -225
- eval/test_rec_r1.py +0 -232
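The commit title indicates the eval directory was removed with the huggingface_hub client rather than through the web UI. The following is a minimal sketch of how such a single-commit folder deletion can be made with HfApi; the repo_id and repo_type below are placeholders, not taken from this repository.

# A minimal sketch, assuming the deletion was made with HfApi.delete_folder;
# repo_id/repo_type are hypothetical stand-ins for the actual repository.
from huggingface_hub import HfApi

api = HfApi()
api.delete_folder(
    path_in_repo="eval",
    repo_id="user/repo",   # hypothetical; substitute the real repo id
    repo_type="model",     # assumption: a model repo
    commit_message="Delete eval with huggingface_hub",
)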
eval/logs/rec_results_android_studio_macos_8k_SFT_0.json
DELETED
|
@@ -1,1605 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"accuracy": 15.0,
|
| 3 |
-
"results": [
|
| 4 |
-
{
|
| 5 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-52-28.png",
|
| 6 |
-
"question": "go to previous page in virtual Android machine in Android studio",
|
| 7 |
-
"resized_size": [
|
| 8 |
-
1400,
|
| 9 |
-
2492
|
| 10 |
-
],
|
| 11 |
-
"ground_truth": [
|
| 12 |
-
0.5294270833333333,
|
| 13 |
-
0.4152777777777778,
|
| 14 |
-
0.5348958333333333,
|
| 15 |
-
0.425
|
| 16 |
-
],
|
| 17 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [548, 503]}}\n</tool_call>",
|
| 18 |
-
"extracted_answer": [
|
| 19 |
-
0.21990369181380418,
|
| 20 |
-
0.35928571428571426
|
| 21 |
-
],
|
| 22 |
-
"correct": 0
|
| 23 |
-
},
|
| 24 |
-
{
|
| 25 |
-
"image": "android_studio_mac/screenshot_2024-11-07_15-03-23.png",
|
| 26 |
-
"question": "reverse the sorting by device name in AVD in android studio",
|
| 27 |
-
"resized_size": [
|
| 28 |
-
1400,
|
| 29 |
-
2492
|
| 30 |
-
],
|
| 31 |
-
"ground_truth": [
|
| 32 |
-
0.12604166666666666,
|
| 33 |
-
0.12129629629629629,
|
| 34 |
-
0.1296875,
|
| 35 |
-
0.12685185185185185
|
| 36 |
-
],
|
| 37 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [337, 156]}}\n</tool_call>",
|
| 38 |
-
"extracted_answer": [
|
| 39 |
-
0.13523274478330657,
|
| 40 |
-
0.11142857142857143
|
| 41 |
-
],
|
| 42 |
-
"correct": 0
|
| 43 |
-
},
|
| 44 |
-
{
|
| 45 |
-
"image": "android_studio_mac/screenshot_2024-11-07_15-30-52.png",
|
| 46 |
-
"question": "open physical device panel of the device manager in android studio",
|
| 47 |
-
"resized_size": [
|
| 48 |
-
1400,
|
| 49 |
-
2492
|
| 50 |
-
],
|
| 51 |
-
"ground_truth": [
|
| 52 |
-
0.12291666666666666,
|
| 53 |
-
0.09212962962962963,
|
| 54 |
-
0.13932291666666666,
|
| 55 |
-
0.10324074074074074
|
| 56 |
-
],
|
| 57 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [305, 156]}}\n</tool_call>",
|
| 58 |
-
"extracted_answer": [
|
| 59 |
-
0.1223916532905297,
|
| 60 |
-
0.11142857142857143
|
| 61 |
-
],
|
| 62 |
-
"correct": 0
|
| 63 |
-
},
|
| 64 |
-
{
|
| 65 |
-
"image": "android_studio_mac/screenshot_2024-11-05_16-07-20.png",
|
| 66 |
-
"question": "where is the stop button in android studio?",
|
| 67 |
-
"resized_size": [
|
| 68 |
-
1400,
|
| 69 |
-
2492
|
| 70 |
-
],
|
| 71 |
-
"ground_truth": [
|
| 72 |
-
0.54609375,
|
| 73 |
-
0.07083333333333333,
|
| 74 |
-
0.5520833333333334,
|
| 75 |
-
0.08055555555555556
|
| 76 |
-
],
|
| 77 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 86]}}\n</tool_call>",
|
| 78 |
-
"extracted_answer": [
|
| 79 |
-
0.5445425361155698,
|
| 80 |
-
0.06142857142857143
|
| 81 |
-
],
|
| 82 |
-
"correct": 0
|
| 83 |
-
},
|
| 84 |
-
{
|
| 85 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-59-02.png",
|
| 86 |
-
"question": "open marscode ai logger in android studio",
|
| 87 |
-
"resized_size": [
|
| 88 |
-
1400,
|
| 89 |
-
2492
|
| 90 |
-
],
|
| 91 |
-
"ground_truth": [
|
| 92 |
-
0.44244791666666666,
|
| 93 |
-
0.8569444444444444,
|
| 94 |
-
0.47942708333333334,
|
| 95 |
-
0.8680555555555556
|
| 96 |
-
],
|
| 97 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [875, 1183]}}\n</tool_call>",
|
| 98 |
-
"extracted_answer": [
|
| 99 |
-
0.351123595505618,
|
| 100 |
-
0.845
|
| 101 |
-
],
|
| 102 |
-
"correct": 0
|
| 103 |
-
},
|
| 104 |
-
{
|
| 105 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-04-51.png",
|
| 106 |
-
"question": "Increase the volume of the Android virtual machine in android studio",
|
| 107 |
-
"resized_size": [
|
| 108 |
-
1400,
|
| 109 |
-
2492
|
| 110 |
-
],
|
| 111 |
-
"ground_truth": [
|
| 112 |
-
0.5861979166666667,
|
| 113 |
-
0.3388888888888889,
|
| 114 |
-
0.5911458333333334,
|
| 115 |
-
0.34953703703703703
|
| 116 |
-
],
|
| 117 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1586, 473]}}\n</tool_call>",
|
| 118 |
-
"extracted_answer": [
|
| 119 |
-
0.6364365971107544,
|
| 120 |
-
0.33785714285714286
|
| 121 |
-
],
|
| 122 |
-
"correct": 0
|
| 123 |
-
},
|
| 124 |
-
{
|
| 125 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-31-44.png",
|
| 126 |
-
"question": "switch the android virtual machine to pixel xl api 34 in android studio",
|
| 127 |
-
"resized_size": [
|
| 128 |
-
1400,
|
| 129 |
-
2492
|
| 130 |
-
],
|
| 131 |
-
"ground_truth": [
|
| 132 |
-
0.41041666666666665,
|
| 133 |
-
0.29953703703703705,
|
| 134 |
-
0.4401041666666667,
|
| 135 |
-
0.30972222222222223
|
| 136 |
-
],
|
| 137 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [608, 537]}}\n</tool_call>",
|
| 138 |
-
"extracted_answer": [
|
| 139 |
-
0.24398073836276082,
|
| 140 |
-
0.38357142857142856
|
| 141 |
-
],
|
| 142 |
-
"correct": 0
|
| 143 |
-
},
|
| 144 |
-
{
|
| 145 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-19-16.png",
|
| 146 |
-
"question": "login with Marscode in android studio",
|
| 147 |
-
"resized_size": [
|
| 148 |
-
1400,
|
| 149 |
-
2492
|
| 150 |
-
],
|
| 151 |
-
"ground_truth": [
|
| 152 |
-
0.4942708333333333,
|
| 153 |
-
0.30925925925925923,
|
| 154 |
-
0.5182291666666666,
|
| 155 |
-
0.3351851851851852
|
| 156 |
-
],
|
| 157 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1273, 450]}}\n</tool_call>",
|
| 158 |
-
"extracted_answer": [
|
| 159 |
-
0.5108346709470305,
|
| 160 |
-
0.32142857142857145
|
| 161 |
-
],
|
| 162 |
-
"correct": 1
|
| 163 |
-
},
|
| 164 |
-
{
|
| 165 |
-
"image": "android_studio_mac/screenshot_2024-11-28_14-43-59.png",
|
| 166 |
-
"question": "turn off bluetooth of virtual Android machine in Android studio ",
|
| 167 |
-
"resized_size": [
|
| 168 |
-
1400,
|
| 169 |
-
2492
|
| 170 |
-
],
|
| 171 |
-
"ground_truth": [
|
| 172 |
-
0.57421875,
|
| 173 |
-
0.3638888888888889,
|
| 174 |
-
0.6270833333333333,
|
| 175 |
-
0.39444444444444443
|
| 176 |
-
],
|
| 177 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1387, 526]}}\n</tool_call>",
|
| 178 |
-
"extracted_answer": [
|
| 179 |
-
0.5565810593900482,
|
| 180 |
-
0.3757142857142857
|
| 181 |
-
],
|
| 182 |
-
"correct": 0
|
| 183 |
-
},
|
| 184 |
-
{
|
| 185 |
-
"image": "android_studio_mac/screenshot_2024-11-07_15-10-30.png",
|
| 186 |
-
"question": "refresh virtual device of device manager in android studio ",
|
| 187 |
-
"resized_size": [
|
| 188 |
-
1400,
|
| 189 |
-
2492
|
| 190 |
-
],
|
| 191 |
-
"ground_truth": [
|
| 192 |
-
0.13802083333333334,
|
| 193 |
-
0.10694444444444444,
|
| 194 |
-
0.14635416666666667,
|
| 195 |
-
0.11805555555555555
|
| 196 |
-
],
|
| 197 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [327, 156]}}\n</tool_call>",
|
| 198 |
-
"extracted_answer": [
|
| 199 |
-
0.1312199036918138,
|
| 200 |
-
0.11142857142857143
|
| 201 |
-
],
|
| 202 |
-
"correct": 0
|
| 203 |
-
},
|
| 204 |
-
{
|
| 205 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-50-55.png",
|
| 206 |
-
"question": "open Recent Apps in virtual Android machine in Android studio",
|
| 207 |
-
"resized_size": [
|
| 208 |
-
1400,
|
| 209 |
-
2492
|
| 210 |
-
],
|
| 211 |
-
"ground_truth": [
|
| 212 |
-
0.5372395833333333,
|
| 213 |
-
0.3347222222222222,
|
| 214 |
-
0.5427083333333333,
|
| 215 |
-
0.3453703703703704
|
| 216 |
-
],
|
| 217 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [375, 568]}}\n</tool_call>",
|
| 218 |
-
"extracted_answer": [
|
| 219 |
-
0.15048154093097912,
|
| 220 |
-
0.4057142857142857
|
| 221 |
-
],
|
| 222 |
-
"correct": 0
|
| 223 |
-
},
|
| 224 |
-
{
|
| 225 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-18-32.png",
|
| 226 |
-
"question": "record virtual android phone screen in android studio",
|
| 227 |
-
"resized_size": [
|
| 228 |
-
1400,
|
| 229 |
-
2492
|
| 230 |
-
],
|
| 231 |
-
"ground_truth": [
|
| 232 |
-
0.5106770833333333,
|
| 233 |
-
0.4759259259259259,
|
| 234 |
-
0.5174479166666667,
|
| 235 |
-
0.48564814814814816
|
| 236 |
-
],
|
| 237 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1385, 876]}}\n</tool_call>",
|
| 238 |
-
"extracted_answer": [
|
| 239 |
-
0.5557784911717496,
|
| 240 |
-
0.6257142857142857
|
| 241 |
-
],
|
| 242 |
-
"correct": 0
|
| 243 |
-
},
|
| 244 |
-
{
|
| 245 |
-
"image": "android_studio_mac/screenshot_2024-11-05_16-04-38.png",
|
| 246 |
-
"question": "search information of project in android studio",
|
| 247 |
-
"resized_size": [
|
| 248 |
-
1400,
|
| 249 |
-
2492
|
| 250 |
-
],
|
| 251 |
-
"ground_truth": [
|
| 252 |
-
0.56328125,
|
| 253 |
-
0.0699074074074074,
|
| 254 |
-
0.5684895833333333,
|
| 255 |
-
0.08287037037037037
|
| 256 |
-
],
|
| 257 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 142]}}\n</tool_call>",
|
| 258 |
-
"extracted_answer": [
|
| 259 |
-
0.5445425361155698,
|
| 260 |
-
0.10142857142857142
|
| 261 |
-
],
|
| 262 |
-
"correct": 0
|
| 263 |
-
},
|
| 264 |
-
{
|
| 265 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-28-57.png",
|
| 266 |
-
"question": "search emoji whthin the virtual android studio in android studio",
|
| 267 |
-
"resized_size": [
|
| 268 |
-
1400,
|
| 269 |
-
2492
|
| 270 |
-
],
|
| 271 |
-
"ground_truth": [
|
| 272 |
-
0.52265625,
|
| 273 |
-
0.5958333333333333,
|
| 274 |
-
0.57734375,
|
| 275 |
-
0.6111111111111112
|
| 276 |
-
],
|
| 277 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1375, 816]}}\n</tool_call>",
|
| 278 |
-
"extracted_answer": [
|
| 279 |
-
0.5517656500802568,
|
| 280 |
-
0.5828571428571429
|
| 281 |
-
],
|
| 282 |
-
"correct": 0
|
| 283 |
-
},
|
| 284 |
-
{
|
| 285 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-39-12.png",
|
| 286 |
-
"question": "undo the action of the picture within the virtual android machine in android studio",
|
| 287 |
-
"resized_size": [
|
| 288 |
-
1400,
|
| 289 |
-
2492
|
| 290 |
-
],
|
| 291 |
-
"ground_truth": [
|
| 292 |
-
0.57109375,
|
| 293 |
-
0.7337962962962963,
|
| 294 |
-
0.5809895833333333,
|
| 295 |
-
0.7523148148148148
|
| 296 |
-
],
|
| 297 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1437, 1012]}}\n</tool_call>",
|
| 298 |
-
"extracted_answer": [
|
| 299 |
-
0.576645264847512,
|
| 300 |
-
0.7228571428571429
|
| 301 |
-
],
|
| 302 |
-
"correct": 0
|
| 303 |
-
},
|
| 304 |
-
{
|
| 305 |
-
"image": "android_studio_mac/screenshot_2024-11-07_14-57-01.png",
|
| 306 |
-
"question": "close bookmark panel",
|
| 307 |
-
"resized_size": [
|
| 308 |
-
1400,
|
| 309 |
-
2492
|
| 310 |
-
],
|
| 311 |
-
"ground_truth": [
|
| 312 |
-
0.25442708333333336,
|
| 313 |
-
0.33287037037037037,
|
| 314 |
-
0.2604166666666667,
|
| 315 |
-
0.3435185185185185
|
| 316 |
-
],
|
| 317 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [607, 473]}}\n</tool_call>",
|
| 318 |
-
"extracted_answer": [
|
| 319 |
-
0.24357945425361155,
|
| 320 |
-
0.33785714285714286
|
| 321 |
-
],
|
| 322 |
-
"correct": 0
|
| 323 |
-
},
|
| 324 |
-
{
|
| 325 |
-
"image": "android_studio_mac/screenshot_2024-11-07_15-33-17.png",
|
| 326 |
-
"question": "open settings of terminal in android studio",
|
| 327 |
-
"resized_size": [
|
| 328 |
-
1400,
|
| 329 |
-
2492
|
| 330 |
-
],
|
| 331 |
-
"ground_truth": [
|
| 332 |
-
0.14921875,
|
| 333 |
-
0.612037037037037,
|
| 334 |
-
0.17786458333333333,
|
| 335 |
-
0.6236111111111111
|
| 336 |
-
],
|
| 337 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 816]}}\n</tool_call>",
|
| 338 |
-
"extracted_answer": [
|
| 339 |
-
0.14325842696629212,
|
| 340 |
-
0.5828571428571429
|
| 341 |
-
],
|
| 342 |
-
"correct": 0
|
| 343 |
-
},
|
| 344 |
-
{
|
| 345 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-12-20.png",
|
| 346 |
-
"question": "turn off notification dot of google in android virtual machine in android studio",
|
| 347 |
-
"resized_size": [
|
| 348 |
-
1400,
|
| 349 |
-
2492
|
| 350 |
-
],
|
| 351 |
-
"ground_truth": [
|
| 352 |
-
0.6489583333333333,
|
| 353 |
-
0.6916666666666667,
|
| 354 |
-
0.6614583333333334,
|
| 355 |
-
0.7032407407407407
|
| 356 |
-
],
|
| 357 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1635, 978]}}\n</tool_call>",
|
| 358 |
-
"extracted_answer": [
|
| 359 |
-
0.656099518459069,
|
| 360 |
-
0.6985714285714286
|
| 361 |
-
],
|
| 362 |
-
"correct": 1
|
| 363 |
-
},
|
| 364 |
-
{
|
| 365 |
-
"image": "android_studio_mac/screenshot_2024-11-28_16-09-39.png",
|
| 366 |
-
"question": "turn on dark theme of the virtual android macine",
|
| 367 |
-
"resized_size": [
|
| 368 |
-
1400,
|
| 369 |
-
2492
|
| 370 |
-
],
|
| 371 |
-
"ground_truth": [
|
| 372 |
-
0.5903645833333333,
|
| 373 |
-
0.6708333333333333,
|
| 374 |
-
0.60234375,
|
| 375 |
-
0.6824074074074075
|
| 376 |
-
],
|
| 377 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1478, 950]}}\n</tool_call>",
|
| 378 |
-
"extracted_answer": [
|
| 379 |
-
0.5930979133226324,
|
| 380 |
-
0.6785714285714286
|
| 381 |
-
],
|
| 382 |
-
"correct": 1
|
| 383 |
-
},
|
| 384 |
-
{
|
| 385 |
-
"image": "android_studio_mac/screenshot_2024-11-28_14-56-16.png",
|
| 386 |
-
"question": "open contacts of the android machine",
|
| 387 |
-
"resized_size": [
|
| 388 |
-
1400,
|
| 389 |
-
2492
|
| 390 |
-
],
|
| 391 |
-
"ground_truth": [
|
| 392 |
-
0.5348958333333333,
|
| 393 |
-
0.7754629629629629,
|
| 394 |
-
0.5575520833333333,
|
| 395 |
-
0.8050925925925926
|
| 396 |
-
],
|
| 397 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 1086]}}\n</tool_call>",
|
| 398 |
-
"extracted_answer": [
|
| 399 |
-
0.5445425361155698,
|
| 400 |
-
0.7757142857142857
|
| 401 |
-
],
|
| 402 |
-
"correct": 1
|
| 403 |
-
},
|
| 404 |
-
{
|
| 405 |
-
"image": "android_studio_mac/screenshot_2024-11-07_14-49-07.png",
|
| 406 |
-
"question": "add new service in android studio using the icon",
|
| 407 |
-
"resized_size": [
|
| 408 |
-
1400,
|
| 409 |
-
2492
|
| 410 |
-
],
|
| 411 |
-
"ground_truth": [
|
| 412 |
-
0.14453125,
|
| 413 |
-
0.600925925925926,
|
| 414 |
-
0.15130208333333334,
|
| 415 |
-
0.6143518518518518
|
| 416 |
-
],
|
| 417 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [356, 822]}}\n</tool_call>",
|
| 418 |
-
"extracted_answer": [
|
| 419 |
-
0.14285714285714285,
|
| 420 |
-
0.5871428571428572
|
| 421 |
-
],
|
| 422 |
-
"correct": 0
|
| 423 |
-
},
|
| 424 |
-
{
|
| 425 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-31-04.png",
|
| 426 |
-
"question": "install missing platform and fix project in android studio",
|
| 427 |
-
"resized_size": [
|
| 428 |
-
1400,
|
| 429 |
-
2492
|
| 430 |
-
],
|
| 431 |
-
"ground_truth": [
|
| 432 |
-
0.5153645833333333,
|
| 433 |
-
0.5990740740740741,
|
| 434 |
-
0.5791666666666667,
|
| 435 |
-
0.6083333333333333
|
| 436 |
-
],
|
| 437 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 816]}}\n</tool_call>",
|
| 438 |
-
"extracted_answer": [
|
| 439 |
-
0.5445425361155698,
|
| 440 |
-
0.5828571428571429
|
| 441 |
-
],
|
| 442 |
-
"correct": 0
|
| 443 |
-
},
|
| 444 |
-
{
|
| 445 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-06-40.png",
|
| 446 |
-
"question": "share the photo in virtual android machine in android studio",
|
| 447 |
-
"resized_size": [
|
| 448 |
-
1400,
|
| 449 |
-
2492
|
| 450 |
-
],
|
| 451 |
-
"ground_truth": [
|
| 452 |
-
0.58046875,
|
| 453 |
-
0.6995370370370371,
|
| 454 |
-
0.5890625,
|
| 455 |
-
0.7180555555555556
|
| 456 |
-
],
|
| 457 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1473, 986]}}\n</tool_call>",
|
| 458 |
-
"extracted_answer": [
|
| 459 |
-
0.591091492776886,
|
| 460 |
-
0.7042857142857143
|
| 461 |
-
],
|
| 462 |
-
"correct": 0
|
| 463 |
-
},
|
| 464 |
-
{
|
| 465 |
-
"image": "android_studio_mac/screenshot_2024-11-05_16-05-52.png",
|
| 466 |
-
"question": "run the project in android studio",
|
| 467 |
-
"resized_size": [
|
| 468 |
-
1400,
|
| 469 |
-
2492
|
| 470 |
-
],
|
| 471 |
-
"ground_truth": [
|
| 472 |
-
0.4984375,
|
| 473 |
-
0.07037037037037037,
|
| 474 |
-
0.5036458333333333,
|
| 475 |
-
0.08194444444444444
|
| 476 |
-
],
|
| 477 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 106]}}\n</tool_call>",
|
| 478 |
-
"extracted_answer": [
|
| 479 |
-
0.5445425361155698,
|
| 480 |
-
0.07571428571428572
|
| 481 |
-
],
|
| 482 |
-
"correct": 0
|
| 483 |
-
},
|
| 484 |
-
{
|
| 485 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-51-23.png",
|
| 486 |
-
"question": "like the video on youtube within the virtual android machine in android studio ",
|
| 487 |
-
"resized_size": [
|
| 488 |
-
1400,
|
| 489 |
-
2492
|
| 490 |
-
],
|
| 491 |
-
"ground_truth": [
|
| 492 |
-
0.5859375,
|
| 493 |
-
0.6314814814814815,
|
| 494 |
-
0.6005208333333333,
|
| 495 |
-
0.6430555555555556
|
| 496 |
-
],
|
| 497 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [2037, 806]}}\n</tool_call>",
|
| 498 |
-
"extracted_answer": [
|
| 499 |
-
0.8174157303370787,
|
| 500 |
-
0.5757142857142857
|
| 501 |
-
],
|
| 502 |
-
"correct": 0
|
| 503 |
-
},
|
| 504 |
-
{
|
| 505 |
-
"image": "android_studio_mac/screenshot_2024-11-05_16-05-07.png",
|
| 506 |
-
"question": "open device manager in android studio",
|
| 507 |
-
"resized_size": [
|
| 508 |
-
1400,
|
| 509 |
-
2492
|
| 510 |
-
],
|
| 511 |
-
"ground_truth": [
|
| 512 |
-
0.5549479166666667,
|
| 513 |
-
0.07083333333333333,
|
| 514 |
-
0.56015625,
|
| 515 |
-
0.0824074074074074
|
| 516 |
-
],
|
| 517 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 205]}}\n</tool_call>",
|
| 518 |
-
"extracted_answer": [
|
| 519 |
-
0.5529695024077047,
|
| 520 |
-
0.14642857142857144
|
| 521 |
-
],
|
| 522 |
-
"correct": 0
|
| 523 |
-
},
|
| 524 |
-
{
|
| 525 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-24-57.png",
|
| 526 |
-
"question": "change the pen color to green with in the virtual android machine in android studio",
|
| 527 |
-
"resized_size": [
|
| 528 |
-
1400,
|
| 529 |
-
2492
|
| 530 |
-
],
|
| 531 |
-
"ground_truth": [
|
| 532 |
-
0.5018229166666667,
|
| 533 |
-
0.7347222222222223,
|
| 534 |
-
0.5104166666666666,
|
| 535 |
-
0.7472222222222222
|
| 536 |
-
],
|
| 537 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 1040]}}\n</tool_call>",
|
| 538 |
-
"extracted_answer": [
|
| 539 |
-
0.5445425361155698,
|
| 540 |
-
0.7428571428571429
|
| 541 |
-
],
|
| 542 |
-
"correct": 0
|
| 543 |
-
},
|
| 544 |
-
{
|
| 545 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-17-15.png",
|
| 546 |
-
"question": "open the os settings of android virtual machine in android studio",
|
| 547 |
-
"resized_size": [
|
| 548 |
-
1400,
|
| 549 |
-
2492
|
| 550 |
-
],
|
| 551 |
-
"ground_truth": [
|
| 552 |
-
0.6151041666666667,
|
| 553 |
-
0.5416666666666666,
|
| 554 |
-
0.625,
|
| 555 |
-
0.5606481481481481
|
| 556 |
-
],
|
| 557 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1518, 763]}}\n</tool_call>",
|
| 558 |
-
"extracted_answer": [
|
| 559 |
-
0.6091492776886035,
|
| 560 |
-
0.545
|
| 561 |
-
],
|
| 562 |
-
"correct": 0
|
| 563 |
-
},
|
| 564 |
-
{
|
| 565 |
-
"image": "android_studio_mac/screenshot_2024-11-28_16-11-18.png",
|
| 566 |
-
"question": "take a screenshot of the virtual android machine in android studio",
|
| 567 |
-
"resized_size": [
|
| 568 |
-
1400,
|
| 569 |
-
2492
|
| 570 |
-
],
|
| 571 |
-
"ground_truth": [
|
| 572 |
-
0.57109375,
|
| 573 |
-
0.3402777777777778,
|
| 574 |
-
0.5778645833333333,
|
| 575 |
-
0.35185185185185186
|
| 576 |
-
],
|
| 577 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 825]}}\n</tool_call>",
|
| 578 |
-
"extracted_answer": [
|
| 579 |
-
0.5529695024077047,
|
| 580 |
-
0.5892857142857143
|
| 581 |
-
],
|
| 582 |
-
"correct": 0
|
| 583 |
-
},
|
| 584 |
-
{
|
| 585 |
-
"image": "android_studio_mac/screenshot_2024-11-28_18-00-07.png",
|
| 586 |
-
"question": "open notifications panel on the right in android studio",
|
| 587 |
-
"resized_size": [
|
| 588 |
-
1400,
|
| 589 |
-
2492
|
| 590 |
-
],
|
| 591 |
-
"ground_truth": [
|
| 592 |
-
0.80859375,
|
| 593 |
-
0.3,
|
| 594 |
-
0.8127604166666667,
|
| 595 |
-
0.3509259259259259
|
| 596 |
-
],
|
| 597 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [2378, 406]}}\n</tool_call>",
|
| 598 |
-
"extracted_answer": [
|
| 599 |
-
0.9542536115569823,
|
| 600 |
-
0.29
|
| 601 |
-
],
|
| 602 |
-
"correct": 0
|
| 603 |
-
},
|
| 604 |
-
{
|
| 605 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-48-39.png",
|
| 606 |
-
"question": "subscribe the adidas skateboarding on toutube with in the virtual android machine in android studio",
|
| 607 |
-
"resized_size": [
|
| 608 |
-
1400,
|
| 609 |
-
2492
|
| 610 |
-
],
|
| 611 |
-
"ground_truth": [
|
| 612 |
-
0.6028645833333334,
|
| 613 |
-
0.6226851851851852,
|
| 614 |
-
0.63046875,
|
| 615 |
-
0.6416666666666667
|
| 616 |
-
],
|
| 617 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1536, 879]}}\n</tool_call>",
|
| 618 |
-
"extracted_answer": [
|
| 619 |
-
0.6163723916532905,
|
| 620 |
-
0.6278571428571429
|
| 621 |
-
],
|
| 622 |
-
"correct": 1
|
| 623 |
-
},
|
| 624 |
-
{
|
| 625 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-58-25.png",
|
| 626 |
-
"question": "open version control in android studio",
|
| 627 |
-
"resized_size": [
|
| 628 |
-
1400,
|
| 629 |
-
2492
|
| 630 |
-
],
|
| 631 |
-
"ground_truth": [
|
| 632 |
-
0.24973958333333332,
|
| 633 |
-
0.8560185185185185,
|
| 634 |
-
0.2796875,
|
| 635 |
-
0.8671296296296296
|
| 636 |
-
],
|
| 637 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [657, 1183]}}\n</tool_call>",
|
| 638 |
-
"extracted_answer": [
|
| 639 |
-
0.26364365971107545,
|
| 640 |
-
0.845
|
| 641 |
-
],
|
| 642 |
-
"correct": 0
|
| 643 |
-
},
|
| 644 |
-
{
|
| 645 |
-
"image": "android_studio_mac/screenshot_2024-11-07_14-47-06.png",
|
| 646 |
-
"question": "open app inspection in android studio ",
|
| 647 |
-
"resized_size": [
|
| 648 |
-
1400,
|
| 649 |
-
2492
|
| 650 |
-
],
|
| 651 |
-
"ground_truth": [
|
| 652 |
-
0.29739583333333336,
|
| 653 |
-
0.8592592592592593,
|
| 654 |
-
0.3307291666666667,
|
| 655 |
-
0.8708333333333333
|
| 656 |
-
],
|
| 657 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [807, 1183]}}\n</tool_call>",
|
| 658 |
-
"extracted_answer": [
|
| 659 |
-
0.3238362760834671,
|
| 660 |
-
0.845
|
| 661 |
-
],
|
| 662 |
-
"correct": 0
|
| 663 |
-
},
|
| 664 |
-
{
|
| 665 |
-
"image": "android_studio_mac/screenshot_2024-11-05_16-06-34.png",
|
| 666 |
-
"question": "attach debugger to android process in android studio",
|
| 667 |
-
"resized_size": [
|
| 668 |
-
1400,
|
| 669 |
-
2492
|
| 670 |
-
],
|
| 671 |
-
"ground_truth": [
|
| 672 |
-
0.53984375,
|
| 673 |
-
0.0699074074074074,
|
| 674 |
-
0.5447916666666667,
|
| 675 |
-
0.08148148148148149
|
| 676 |
-
],
|
| 677 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1387, 131]}}\n</tool_call>",
|
| 678 |
-
"extracted_answer": [
|
| 679 |
-
0.5565810593900482,
|
| 680 |
-
0.09357142857142857
|
| 681 |
-
],
|
| 682 |
-
"correct": 0
|
| 683 |
-
},
|
| 684 |
-
{
|
| 685 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-03-59.png",
|
| 686 |
-
"question": "turn on the flash of the camera in virtual android machine in android studio",
|
| 687 |
-
"resized_size": [
|
| 688 |
-
1400,
|
| 689 |
-
2492
|
| 690 |
-
],
|
| 691 |
-
"ground_truth": [
|
| 692 |
-
0.69921875,
|
| 693 |
-
0.6097222222222223,
|
| 694 |
-
0.7078125,
|
| 695 |
-
0.6236111111111111
|
| 696 |
-
],
|
| 697 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [657, 438]}}\n</tool_call>",
|
| 698 |
-
"extracted_answer": [
|
| 699 |
-
0.26364365971107545,
|
| 700 |
-
0.31285714285714283
|
| 701 |
-
],
|
| 702 |
-
"correct": 0
|
| 703 |
-
},
|
| 704 |
-
{
|
| 705 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-07-21.png",
|
| 706 |
-
"question": "clear all notification of the android virtual machine in android studio",
|
| 707 |
-
"resized_size": [
|
| 708 |
-
1400,
|
| 709 |
-
2492
|
| 710 |
-
],
|
| 711 |
-
"ground_truth": [
|
| 712 |
-
0.6408854166666667,
|
| 713 |
-
0.6712962962962963,
|
| 714 |
-
0.6565104166666667,
|
| 715 |
-
0.6847222222222222
|
| 716 |
-
],
|
| 717 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [637, 895]}}\n</tool_call>",
|
| 718 |
-
"extracted_answer": [
|
| 719 |
-
0.2556179775280899,
|
| 720 |
-
0.6392857142857142
|
| 721 |
-
],
|
| 722 |
-
"correct": 0
|
| 723 |
-
},
|
| 724 |
-
{
|
| 725 |
-
"image": "android_studio_mac/screenshot_2024-11-07_15-06-51.png",
|
| 726 |
-
"question": "collapse all bookmarks in android studio",
|
| 727 |
-
"resized_size": [
|
| 728 |
-
1400,
|
| 729 |
-
2492
|
| 730 |
-
],
|
| 731 |
-
"ground_truth": [
|
| 732 |
-
0.23932291666666666,
|
| 733 |
-
0.33240740740740743,
|
| 734 |
-
0.24583333333333332,
|
| 735 |
-
0.3449074074074074
|
| 736 |
-
],
|
| 737 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [553, 472]}}\n</tool_call>",
|
| 738 |
-
"extracted_answer": [
|
| 739 |
-
0.22191011235955055,
|
| 740 |
-
0.33714285714285713
|
| 741 |
-
],
|
| 742 |
-
"correct": 0
|
| 743 |
-
},
|
| 744 |
-
{
|
| 745 |
-
"image": "android_studio_mac/screenshot_2024-11-07_15-42-10.png",
|
| 746 |
-
"question": "expand TODO items in android studio",
|
| 747 |
-
"resized_size": [
|
| 748 |
-
1400,
|
| 749 |
-
2492
|
| 750 |
-
],
|
| 751 |
-
"ground_truth": [
|
| 752 |
-
0.10625,
|
| 753 |
-
0.649074074074074,
|
| 754 |
-
0.11197916666666667,
|
| 755 |
-
0.6606481481481481
|
| 756 |
-
],
|
| 757 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [305, 826]}}\n</tool_call>",
|
| 758 |
-
"extracted_answer": [
|
| 759 |
-
0.1223916532905297,
|
| 760 |
-
0.59
|
| 761 |
-
],
|
| 762 |
-
"correct": 0
|
| 763 |
-
},
|
| 764 |
-
{
|
| 765 |
-
"image": "android_studio_mac/screenshot_2024-11-28_18-02-57.png",
|
| 766 |
-
"question": "click Profiler to open services window in the bottom of the android studio",
|
| 767 |
-
"resized_size": [
|
| 768 |
-
1400,
|
| 769 |
-
2492
|
| 770 |
-
],
|
| 771 |
-
"ground_truth": [
|
| 772 |
-
0.28020833333333334,
|
| 773 |
-
0.8564814814814815,
|
| 774 |
-
0.2981770833333333,
|
| 775 |
-
0.8671296296296296
|
| 776 |
-
],
|
| 777 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [678, 1179]}}\n</tool_call>",
|
| 778 |
-
"extracted_answer": [
|
| 779 |
-
0.27207062600321025,
|
| 780 |
-
0.8421428571428572
|
| 781 |
-
],
|
| 782 |
-
"correct": 0
|
| 783 |
-
},
|
| 784 |
-
{
|
| 785 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-19-51.png",
|
| 786 |
-
"question": "stop the bilibili download in android virtual machine in android studio",
|
| 787 |
-
"resized_size": [
|
| 788 |
-
1400,
|
| 789 |
-
2492
|
| 790 |
-
],
|
| 791 |
-
"ground_truth": [
|
| 792 |
-
0.4661458333333333,
|
| 793 |
-
0.4666666666666667,
|
| 794 |
-
0.47630208333333335,
|
| 795 |
-
0.4847222222222222
|
| 796 |
-
],
|
| 797 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1538, 667]}}\n</tool_call>",
|
| 798 |
-
"extracted_answer": [
|
| 799 |
-
0.6171749598715891,
|
| 800 |
-
0.4764285714285714
|
| 801 |
-
],
|
| 802 |
-
"correct": 0
|
| 803 |
-
},
|
| 804 |
-
{
|
| 805 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-40-13.png",
|
| 806 |
-
"question": "clear all the modification of the photo with in the virtual android machine in android studio",
|
| 807 |
-
"resized_size": [
|
| 808 |
-
1400,
|
| 809 |
-
2492
|
| 810 |
-
],
|
| 811 |
-
"ground_truth": [
|
| 812 |
-
0.5263020833333333,
|
| 813 |
-
0.7342592592592593,
|
| 814 |
-
0.5424479166666667,
|
| 815 |
-
0.7509259259259259
|
| 816 |
-
],
|
| 817 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1317, 985]}}\n</tool_call>",
|
| 818 |
-
"extracted_answer": [
|
| 819 |
-
0.5284911717495987,
|
| 820 |
-
0.7035714285714286
|
| 821 |
-
],
|
| 822 |
-
"correct": 0
|
| 823 |
-
},
|
| 824 |
-
{
|
| 825 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-39-17.png",
|
| 826 |
-
"question": "zoom in the android virtual machine in android studio",
|
| 827 |
-
"resized_size": [
|
| 828 |
-
1400,
|
| 829 |
-
2492
|
| 830 |
-
],
|
| 831 |
-
"ground_truth": [
|
| 832 |
-
0.5559895833333334,
|
| 833 |
-
0.6717592592592593,
|
| 834 |
-
0.5625,
|
| 835 |
-
0.6837962962962963
|
| 836 |
-
],
|
| 837 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1395, 947]}}\n</tool_call>",
|
| 838 |
-
"extracted_answer": [
|
| 839 |
-
0.5597913322632424,
|
| 840 |
-
0.6764285714285714
|
| 841 |
-
],
|
| 842 |
-
"correct": 1
|
| 843 |
-
},
|
| 844 |
-
{
|
| 845 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-43-20.png",
|
| 846 |
-
"question": "back to home in virtual android machine in android studio",
|
| 847 |
-
"resized_size": [
|
| 848 |
-
1400,
|
| 849 |
-
2492
|
| 850 |
-
],
|
| 851 |
-
"ground_truth": [
|
| 852 |
-
0.60078125,
|
| 853 |
-
0.3800925925925926,
|
| 854 |
-
0.6065104166666667,
|
| 855 |
-
0.3902777777777778
|
| 856 |
-
],
|
| 857 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [37, 825]}}\n</tool_call>",
|
| 858 |
-
"extracted_answer": [
|
| 859 |
-
0.014847512038523275,
|
| 860 |
-
0.5892857142857143
|
| 861 |
-
],
|
| 862 |
-
"correct": 0
|
| 863 |
-
},
|
| 864 |
-
{
|
| 865 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-09-50.png",
|
| 866 |
-
"question": "turn on the falshlight of the android virtual machine in android studio",
|
| 867 |
-
"resized_size": [
|
| 868 |
-
1400,
|
| 869 |
-
2492
|
| 870 |
-
],
|
| 871 |
-
"ground_truth": [
|
| 872 |
-
0.5861979166666667,
|
| 873 |
-
0.47638888888888886,
|
| 874 |
-
0.62265625,
|
| 875 |
-
0.5
|
| 876 |
-
],
|
| 877 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1537, 668]}}\n</tool_call>",
|
| 878 |
-
"extracted_answer": [
|
| 879 |
-
0.6167736757624398,
|
| 880 |
-
0.47714285714285715
|
| 881 |
-
],
|
| 882 |
-
"correct": 1
|
| 883 |
-
},
|
| 884 |
-
{
|
| 885 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-31-55.png",
|
| 886 |
-
"question": "click the third crying emoji with in the virtual android machine in android machine",
|
| 887 |
-
"resized_size": [
|
| 888 |
-
1400,
|
| 889 |
-
2492
|
| 890 |
-
],
|
| 891 |
-
"ground_truth": [
|
| 892 |
-
0.553125,
|
| 893 |
-
0.5347222222222222,
|
| 894 |
-
0.5645833333333333,
|
| 895 |
-
0.5550925925925926
|
| 896 |
-
],
|
| 897 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1875, 693]}}\n</tool_call>",
|
| 898 |
-
"extracted_answer": [
|
| 899 |
-
0.7524077046548957,
|
| 900 |
-
0.495
|
| 901 |
-
],
|
| 902 |
-
"correct": 0
|
| 903 |
-
},
|
| 904 |
-
{
|
| 905 |
-
"image": "android_studio_mac/screenshot_2024-11-28_16-15-46.png",
|
| 906 |
-
"question": "change video to camera in virtual android machine in android studio",
|
| 907 |
-
"resized_size": [
|
| 908 |
-
1400,
|
| 909 |
-
2492
|
| 910 |
-
],
|
| 911 |
-
"ground_truth": [
|
| 912 |
-
0.5247395833333334,
|
| 913 |
-
0.5532407407407407,
|
| 914 |
-
0.5546875,
|
| 915 |
-
0.5791666666666667
|
| 916 |
-
],
|
| 917 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 786]}}\n</tool_call>",
|
| 918 |
-
"extracted_answer": [
|
| 919 |
-
0.5445425361155698,
|
| 920 |
-
0.5614285714285714
|
| 921 |
-
],
|
| 922 |
-
"correct": 1
|
| 923 |
-
},
|
| 924 |
-
{
|
| 925 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-10-02.png",
|
| 926 |
-
"question": "close the detail of the photo in virtual android machine in android studio",
|
| 927 |
-
"resized_size": [
|
| 928 |
-
1400,
|
| 929 |
-
2492
|
| 930 |
-
],
|
| 931 |
-
"ground_truth": [
|
| 932 |
-
0.6291666666666667,
|
| 933 |
-
0.6939814814814815,
|
| 934 |
-
0.6432291666666666,
|
| 935 |
-
0.7092592592592593
|
| 936 |
-
],
|
| 937 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1567, 980]}}\n</tool_call>",
|
| 938 |
-
"extracted_answer": [
|
| 939 |
-
0.6288121990369181,
|
| 940 |
-
0.7
|
| 941 |
-
],
|
| 942 |
-
"correct": 0
|
| 943 |
-
},
|
| 944 |
-
{
|
| 945 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-15-42.png",
|
| 946 |
-
"question": "modify the shadows of the picture witn in virtual android machine in android studio",
|
| 947 |
-
"resized_size": [
|
| 948 |
-
1400,
|
| 949 |
-
2492
|
| 950 |
-
],
|
| 951 |
-
"ground_truth": [
|
| 952 |
-
0.5369791666666667,
|
| 953 |
-
0.6925925925925925,
|
| 954 |
-
0.55234375,
|
| 955 |
-
0.7240740740740741
|
| 956 |
-
],
|
| 957 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1367, 1035]}}\n</tool_call>",
|
| 958 |
-
"extracted_answer": [
|
| 959 |
-
0.5485553772070626,
|
| 960 |
-
0.7392857142857143
|
| 961 |
-
],
|
| 962 |
-
"correct": 0
|
| 963 |
-
},
|
| 964 |
-
{
|
| 965 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-05-13.png",
|
| 966 |
-
"question": "turn off the hdr of virtual android camera in android studio",
|
| 967 |
-
"resized_size": [
|
| 968 |
-
1400,
|
| 969 |
-
2492
|
| 970 |
-
],
|
| 971 |
-
"ground_truth": [
|
| 972 |
-
0.6895833333333333,
|
| 973 |
-
0.6106481481481482,
|
| 974 |
-
0.6984375,
|
| 975 |
-
0.625
|
| 976 |
-
],
|
| 977 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1730, 899]}}\n</tool_call>",
|
| 978 |
-
"extracted_answer": [
|
| 979 |
-
0.6942215088282504,
|
| 980 |
-
0.6421428571428571
|
| 981 |
-
],
|
| 982 |
-
"correct": 0
|
| 983 |
-
},
|
| 984 |
-
{
|
| 985 |
-
"image": "android_studio_mac/screenshot_2024-11-28_14-58-52.png",
|
| 986 |
-
"question": "take a picture of virtual android machine in android studio",
|
| 987 |
-
"resized_size": [
|
| 988 |
-
1400,
|
| 989 |
-
2492
|
| 990 |
-
],
|
| 991 |
-
"ground_truth": [
|
| 992 |
-
0.51796875,
|
| 993 |
-
0.7638888888888888,
|
| 994 |
-
0.54296875,
|
| 995 |
-
0.7962962962962963
|
| 996 |
-
],
|
| 997 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1317, 856]}}\n</tool_call>",
|
| 998 |
-
"extracted_answer": [
|
| 999 |
-
0.5284911717495987,
|
| 1000 |
-
0.6114285714285714
|
| 1001 |
-
],
|
| 1002 |
-
"correct": 0
|
| 1003 |
-
},
|
| 1004 |
-
{
|
| 1005 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-16-55.png",
|
| 1006 |
-
"question": "modify the highlights of the photo with in the virtual android machine in android studio",
|
| 1007 |
-
"resized_size": [
|
| 1008 |
-
1400,
|
| 1009 |
-
2492
|
| 1010 |
-
],
|
| 1011 |
-
"ground_truth": [
|
| 1012 |
-
0.46197916666666666,
|
| 1013 |
-
0.7342592592592593,
|
| 1014 |
-
0.5502604166666667,
|
| 1015 |
-
0.7490740740740741
|
| 1016 |
-
],
|
| 1017 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1315, 982]}}\n</tool_call>",
|
| 1018 |
-
"extracted_answer": [
|
| 1019 |
-
0.5276886035313002,
|
| 1020 |
-
0.7014285714285714
|
| 1021 |
-
],
|
| 1022 |
-
"correct": 0
|
| 1023 |
-
},
|
| 1024 |
-
{
|
| 1025 |
-
"image": "android_studio_mac/screenshot_2024-11-07_14-48-21.png",
|
| 1026 |
-
"question": "open setting of app inspection in android studio",
|
| 1027 |
-
"resized_size": [
|
| 1028 |
-
1400,
|
| 1029 |
-
2492
|
| 1030 |
-
],
|
| 1031 |
-
"ground_truth": [
|
| 1032 |
-
0.5557291666666667,
|
| 1033 |
-
0.5884259259259259,
|
| 1034 |
-
0.5604166666666667,
|
| 1035 |
-
0.5981481481481481
|
| 1036 |
-
],
|
| 1037 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [367, 835]}}\n</tool_call>",
|
| 1038 |
-
"extracted_answer": [
|
| 1039 |
-
0.1472712680577849,
|
| 1040 |
-
0.5964285714285714
|
| 1041 |
-
],
|
| 1042 |
-
"correct": 0
|
| 1043 |
-
},
|
| 1044 |
-
{
|
| 1045 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-02-38.png",
|
| 1046 |
-
"question": "open more items of virtual android machine in android studio ",
|
| 1047 |
-
"resized_size": [
|
| 1048 |
-
1400,
|
| 1049 |
-
2492
|
| 1050 |
-
],
|
| 1051 |
-
"ground_truth": [
|
| 1052 |
-
0.6875,
|
| 1053 |
-
0.6402777777777777,
|
| 1054 |
-
0.70234375,
|
| 1055 |
-
0.6680555555555555
|
| 1056 |
-
],
|
| 1057 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 468]}}\n</tool_call>",
|
| 1058 |
-
"extracted_answer": [
|
| 1059 |
-
0.14325842696629212,
|
| 1060 |
-
0.3342857142857143
|
| 1061 |
-
],
|
| 1062 |
-
"correct": 0
|
| 1063 |
-
},
|
| 1064 |
-
{
|
| 1065 |
-
"image": "android_studio_mac/screenshot_2024-11-07_15-38-30.png",
|
| 1066 |
-
"question": "open options of terminal in android studio",
|
| 1067 |
-
"resized_size": [
|
| 1068 |
-
1400,
|
| 1069 |
-
2492
|
| 1070 |
-
],
|
| 1071 |
-
"ground_truth": [
|
| 1072 |
-
0.5546875,
|
| 1073 |
-
0.5884259259259259,
|
| 1074 |
-
0.5611979166666666,
|
| 1075 |
-
0.600462962962963
|
| 1076 |
-
],
|
| 1077 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 806]}}\n</tool_call>",
|
| 1078 |
-
"extracted_answer": [
|
| 1079 |
-
0.14325842696629212,
|
| 1080 |
-
0.5757142857142857
|
| 1081 |
-
],
|
| 1082 |
-
"correct": 0
|
| 1083 |
-
},
|
| 1084 |
-
{
|
| 1085 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-11-07.png",
|
| 1086 |
-
"question": "turn off the notification permission of google in android virtual machine",
|
| 1087 |
-
"resized_size": [
|
| 1088 |
-
1400,
|
| 1089 |
-
2492
|
| 1090 |
-
],
|
| 1091 |
-
"ground_truth": [
|
| 1092 |
-
0.6505208333333333,
|
| 1093 |
-
0.5648148148148148,
|
| 1094 |
-
0.6635416666666667,
|
| 1095 |
-
0.5828703703703704
|
| 1096 |
-
],
|
| 1097 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1617, 803]}}\n</tool_call>",
|
| 1098 |
-
"extracted_answer": [
|
| 1099 |
-
0.648876404494382,
|
| 1100 |
-
0.5735714285714286
|
| 1101 |
-
],
|
| 1102 |
-
"correct": 0
|
| 1103 |
-
},
|
| 1104 |
-
{
|
| 1105 |
-
"image": "android_studio_mac/screenshot_2024-11-05_16-09-54.png",
|
| 1106 |
-
"question": "supress warnings in android studio",
|
| 1107 |
-
"resized_size": [
|
| 1108 |
-
1400,
|
| 1109 |
-
2492
|
| 1110 |
-
],
|
| 1111 |
-
"ground_truth": [
|
| 1112 |
-
0.4434895833333333,
|
| 1113 |
-
0.2722222222222222,
|
| 1114 |
-
0.5223958333333333,
|
| 1115 |
-
0.2810185185185185
|
| 1116 |
-
],
|
| 1117 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1308, 386]}}\n</tool_call>",
|
| 1118 |
-
"extracted_answer": [
|
| 1119 |
-
0.5248796147672552,
|
| 1120 |
-
0.2757142857142857
|
| 1121 |
-
],
|
| 1122 |
-
"correct": 0
|
| 1123 |
-
},
|
| 1124 |
-
{
|
| 1125 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-26-41.png",
|
| 1126 |
-
"question": "open settings of the keyboard within the virtual android machine in android studio",
|
| 1127 |
-
"resized_size": [
|
| 1128 |
-
1400,
|
| 1129 |
-
2492
|
| 1130 |
-
],
|
| 1131 |
-
"ground_truth": [
|
| 1132 |
-
0.76015625,
|
| 1133 |
-
0.5703703703703704,
|
| 1134 |
-
0.7763020833333333,
|
| 1135 |
-
0.5875
|
| 1136 |
-
],
|
| 1137 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1836, 795]}}\n</tool_call>",
|
| 1138 |
-
"extracted_answer": [
|
| 1139 |
-
0.7367576243980738,
|
| 1140 |
-
0.5678571428571428
|
| 1141 |
-
],
|
| 1142 |
-
"correct": 0
|
| 1143 |
-
},
|
| 1144 |
-
{
|
| 1145 |
-
"image": "android_studio_mac/screenshot_2024-11-28_14-55-23.png",
|
| 1146 |
-
"question": "open recent call of Android machine in Android studio",
|
| 1147 |
-
"resized_size": [
|
| 1148 |
-
1400,
|
| 1149 |
-
2492
|
| 1150 |
-
],
|
| 1151 |
-
"ground_truth": [
|
| 1152 |
-
0.578125,
|
| 1153 |
-
0.6560185185185186,
|
| 1154 |
-
0.59765625,
|
| 1155 |
-
0.6875
|
| 1156 |
-
],
|
| 1157 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1463, 935]}}\n</tool_call>",
|
| 1158 |
-
"extracted_answer": [
|
| 1159 |
-
0.5870786516853933,
|
| 1160 |
-
0.6678571428571428
|
| 1161 |
-
],
|
| 1162 |
-
"correct": 1
|
| 1163 |
-
},
|
| 1164 |
-
{
|
| 1165 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-23-32.png",
|
| 1166 |
-
"question": "shutdown the virtual android phone in android studio",
|
| 1167 |
-
"resized_size": [
|
| 1168 |
-
1400,
|
| 1169 |
-
2492
|
| 1170 |
-
],
|
| 1171 |
-
"ground_truth": [
|
| 1172 |
-
0.45026041666666666,
|
| 1173 |
-
0.47638888888888886,
|
| 1174 |
-
0.4557291666666667,
|
| 1175 |
-
0.487962962962963
|
| 1176 |
-
],
|
| 1177 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1387, 925]}}\n</tool_call>",
|
| 1178 |
-
"extracted_answer": [
|
| 1179 |
-
0.5565810593900482,
|
| 1180 |
-
0.6607142857142857
|
| 1181 |
-
],
|
| 1182 |
-
"correct": 0
|
| 1183 |
-
},
|
| 1184 |
-
{
|
| 1185 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-55-00.png",
|
| 1186 |
-
"question": "make current file read-only in android studio",
|
| 1187 |
-
"resized_size": [
|
| 1188 |
-
1400,
|
| 1189 |
-
2492
|
| 1190 |
-
],
|
| 1191 |
-
"ground_truth": [
|
| 1192 |
-
0.7734375,
|
| 1193 |
-
0.8680555555555556,
|
| 1194 |
-
0.7796875,
|
| 1195 |
-
0.8759259259259259
|
| 1196 |
-
],
|
| 1197 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1753, 748]}}\n</tool_call>",
|
| 1198 |
-
"extracted_answer": [
|
| 1199 |
-
0.7034510433386838,
|
| 1200 |
-
0.5342857142857143
|
| 1201 |
-
],
|
| 1202 |
-
"correct": 0
|
| 1203 |
-
},
|
| 1204 |
-
{
|
| 1205 |
-
"image": "android_studio_mac/screenshot_2024-11-28_18-01-01.png",
|
| 1206 |
-
"question": "open app quality insights window in the bootom of the android studio",
|
| 1207 |
-
"resized_size": [
|
| 1208 |
-
1400,
|
| 1209 |
-
2492
|
| 1210 |
-
],
|
| 1211 |
-
"ground_truth": [
|
| 1212 |
-
0.32057291666666665,
|
| 1213 |
-
0.8555555555555555,
|
| 1214 |
-
0.3572916666666667,
|
| 1215 |
-
0.8671296296296296
|
| 1216 |
-
],
|
| 1217 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [875, 1180]}}\n</tool_call>",
|
| 1218 |
-
"extracted_answer": [
|
| 1219 |
-
0.351123595505618,
|
| 1220 |
-
0.8428571428571429
|
| 1221 |
-
],
|
| 1222 |
-
"correct": 0
|
| 1223 |
-
},
|
| 1224 |
-
{
|
| 1225 |
-
"image": "android_studio_mac/screenshot_2024-11-07_15-22-21.png",
|
| 1226 |
-
"question": "rename bookmark name in android studio",
|
| 1227 |
-
"resized_size": [
|
| 1228 |
-
1400,
|
| 1229 |
-
2492
|
| 1230 |
-
],
|
| 1231 |
-
"ground_truth": [
|
| 1232 |
-
0.2234375,
|
| 1233 |
-
0.3314814814814815,
|
| 1234 |
-
0.23046875,
|
| 1235 |
-
0.3453703703703704
|
| 1236 |
-
],
|
| 1237 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [553, 472]}}\n</tool_call>",
|
| 1238 |
-
"extracted_answer": [
|
| 1239 |
-
0.22191011235955055,
|
| 1240 |
-
0.33714285714285713
|
| 1241 |
-
],
|
| 1242 |
-
"correct": 0
|
| 1243 |
-
},
|
| 1244 |
-
{
|
| 1245 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-26-41.png",
|
| 1246 |
-
"question": "log in to marscode AI in android studio",
|
| 1247 |
-
"resized_size": [
|
| 1248 |
-
1400,
|
| 1249 |
-
2492
|
| 1250 |
-
],
|
| 1251 |
-
"ground_truth": [
|
| 1252 |
-
0.51640625,
|
| 1253 |
-
0.5166666666666667,
|
| 1254 |
-
0.5276041666666667,
|
| 1255 |
-
0.5254629629629629
|
| 1256 |
-
],
|
| 1257 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 695]}}\n</tool_call>",
|
| 1258 |
-
"extracted_answer": [
|
| 1259 |
-
0.5529695024077047,
|
| 1260 |
-
0.49642857142857144
|
| 1261 |
-
],
|
| 1262 |
-
"correct": 0
|
| 1263 |
-
},
|
| 1264 |
-
{
|
| 1265 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-25-07.png",
|
| 1266 |
-
"question": "clear all notifications in android studio",
|
| 1267 |
-
"resized_size": [
|
| 1268 |
-
1400,
|
| 1269 |
-
2492
|
| 1270 |
-
],
|
| 1271 |
-
"ground_truth": [
|
| 1272 |
-
0.5440104166666667,
|
| 1273 |
-
0.2791666666666667,
|
| 1274 |
-
0.56015625,
|
| 1275 |
-
0.2916666666666667
|
| 1276 |
-
],
|
| 1277 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 360]}}\n</tool_call>",
|
| 1278 |
-
"extracted_answer": [
|
| 1279 |
-
0.5445425361155698,
|
| 1280 |
-
0.2571428571428571
|
| 1281 |
-
],
|
| 1282 |
-
"correct": 0
|
| 1283 |
-
},
|
| 1284 |
-
{
|
| 1285 |
-
"image": "android_studio_mac/screenshot_2024-11-05_17-58-41.png",
|
| 1286 |
-
"question": "edit selected AVD in android studio",
|
| 1287 |
-
"resized_size": [
|
| 1288 |
-
1400,
|
| 1289 |
-
2492
|
| 1290 |
-
],
|
| 1291 |
-
"ground_truth": [
|
| 1292 |
-
0.25677083333333334,
|
| 1293 |
-
0.13935185185185187,
|
| 1294 |
-
0.26171875,
|
| 1295 |
-
0.15231481481481482
|
| 1296 |
-
],
|
| 1297 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [657, 203]}}\n</tool_call>",
|
| 1298 |
-
"extracted_answer": [
|
| 1299 |
-
0.26364365971107545,
|
| 1300 |
-
0.145
|
| 1301 |
-
],
|
| 1302 |
-
"correct": 0
|
| 1303 |
-
},
|
| 1304 |
-
{
|
| 1305 |
-
"image": "android_studio_mac/screenshot_2024-11-28_15-14-25.png",
|
| 1306 |
-
"question": "crop the photo witnin the virtual android machine in android studio",
|
| 1307 |
-
"resized_size": [
|
| 1308 |
-
1400,
|
| 1309 |
-
2492
|
| 1310 |
-
],
|
| 1311 |
-
"ground_truth": [
|
| 1312 |
-
0.525,
|
| 1313 |
-
0.7384259259259259,
|
| 1314 |
-
0.5377604166666666,
|
| 1315 |
-
0.75
|
| 1316 |
-
],
|
| 1317 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1367, 1041]}}\n</tool_call>",
|
| 1318 |
-
"extracted_answer": [
|
| 1319 |
-
0.5485553772070626,
|
| 1320 |
-
0.7435714285714285
|
| 1321 |
-
],
|
| 1322 |
-
"correct": 0
|
| 1323 |
-
},
|
| 1324 |
-
{
|
| 1325 |
-
"image": "android_studio_mac/screenshot_2024-11-28_17-01-46.png",
|
| 1326 |
-
"question": "open settings within the virtual android machine in android studio",
|
| 1327 |
-
"resized_size": [
|
| 1328 |
-
1400,
|
| 1329 |
-
2492
|
| 1330 |
-
],
|
| 1331 |
-
"ground_truth": [
|
| 1332 |
-
0.6526041666666667,
|
| 1333 |
-
0.6694444444444444,
|
| 1334 |
-
0.6622395833333333,
|
| 1335 |
-
0.6861111111111111
|
| 1336 |
-
],
|
| 1337 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [637, 805]}}\n</tool_call>",
|
| 1338 |
-
"extracted_answer": [
|
| 1339 |
-
0.2556179775280899,
|
| 1340 |
-
0.575
|
| 1341 |
-
],
|
| 1342 |
-
"correct": 0
|
| 1343 |
-
},
|
| 1344 |
-
{
|
| 1345 |
-
"image": "android_studio_mac/screenshot_2024-11-28_14-45-58.png",
|
| 1346 |
-
"question": "turn off internet of Android machine in Android studio",
|
| 1347 |
-
"resized_size": [
|
| 1348 |
-
1400,
|
| 1349 |
-
2492
|
| 1350 |
-
],
|
| 1351 |
-
"ground_truth": [
|
| 1352 |
-
0.5200520833333333,
|
| 1353 |
-
0.3638888888888889,
|
| 1354 |
-
0.5723958333333333,
|
| 1355 |
-
0.3958333333333333
|
| 1356 |
-
],
|
| 1357 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 528]}}\n</tool_call>",
|
| 1358 |
-
"extracted_answer": [
|
| 1359 |
-
0.5445425361155698,
|
| 1360 |
-
0.37714285714285717
|
| 1361 |
-
],
|
| 1362 |
-
"correct": 1
|
| 1363 |
-
},
|
| 1364 |
-
{
|
| 1365 |
-
"image": "android_studio_mac/screenshot_2024-11-05_16-01-19.png",
|
| 1366 |
-
"question": "filter resources in android studio",
|
| 1367 |
-
"resized_size": [
|
| 1368 |
-
1400,
|
| 1369 |
-
2492
|
| 1370 |
-
],
|
| 1371 |
-
"ground_truth": [
|
| 1372 |
-
0.2234375,
|
| 1373 |
-
0.09814814814814815,
|
| 1374 |
-
0.228125,
|
| 1375 |
-
0.1087962962962963
|
| 1376 |
-
],
|
| 1377 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [473, 168]}}\n</tool_call>",
|
| 1378 |
-
"extracted_answer": [
|
| 1379 |
-
0.18980738362760835,
|
| 1380 |
-
0.12
|
| 1381 |
-
],
|
| 1382 |
-
"correct": 0
|
| 1383 |
-
},
|
| 1384 |
-
{
|
| 1385 |
-
"image": "android_studio_mac/screenshot_2024-11-07_15-47-45.png",
|
| 1386 |
-
"question": "filter TODO item in android studio",
|
| 1387 |
-
"resized_size": [
|
| 1388 |
-
1400,
|
| 1389 |
-
2492
|
| 1390 |
-
],
|
| 1391 |
-
"ground_truth": [
|
| 1392 |
-
0.10651041666666666,
|
| 1393 |
-
0.625,
|
| 1394 |
-
0.11223958333333334,
|
| 1395 |
-
0.6361111111111111
|
| 1396 |
-
],
|
| 1397 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 816]}}\n</tool_call>",
|
| 1398 |
-
"extracted_answer": [
|
| 1399 |
-
0.14325842696629212,
|
| 1400 |
-
0.5828571428571429
|
| 1401 |
-
],
|
| 1402 |
-
"correct": 0
|
| 1403 |
-
},
|
| 1404 |
-
{
|
| 1405 |
-
"image": "android_studio_mac/screenshot_2024-11-28_11-33-15.png",
|
| 1406 |
-
"question": "sign in google message in virtual android machine in android studio ",
|
| 1407 |
-
"resized_size": [
|
| 1408 |
-
1400,
|
| 1409 |
-
2492
|
| 1410 |
-
],
|
| 1411 |
-
"ground_truth": [
|
| 1412 |
-
0.54609375,
|
| 1413 |
-
0.7574074074074074,
|
| 1414 |
-
0.58203125,
|
| 1415 |
-
0.774537037037037
|
| 1416 |
-
],
|
| 1417 |
-
"model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 1056]}}\n</tool_call>",
|
| 1418 |
-
"extracted_answer": [
|
| 1419 |
-
0.5529695024077047,
|
| 1420 |
-
0.7542857142857143
|
| 1421 |
-
],
|
| 1422 |
-
"correct": 0
|
| 1423 |
-
},
|
| 1424 |
-
{
|
| 1425 |
-
"image": "android_studio_mac/screenshot_2024-11-28_18-03-58.png",
|
| 1426 |
-
"question": "open terminal windows in the bottom of the android studio",
|
| 1427 |
-
"resized_size": [
|
| 1428 |
-
1400,
|
| 1429 |
-
2492
|
| 1430 |
-
],
|
| 1431 |
-
"ground_truth": [
|
| 1432 |
-
0.39817708333333335,
|
| 1433 |
-
0.8560185185185185,
|
| 1434 |
-
0.4171875,
|
| 1435 |
-
0.8675925925925926
|
| 1436 |
-
],
|
-      "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1035, 1181]}}\n</tool_call>",
-      "extracted_answer": [
-        0.4153290529695024,
-        0.8435714285714285
-      ],
-      "correct": 0
-    },
-    {
-      "image": "android_studio_mac/screenshot_2024-11-05_16-08-03.png",
-      "question": "sign in to android studio",
-      "resized_size": [
-        1400,
-        2492
-      ],
-      "ground_truth": [
-        0.5770833333333333,
-        0.07222222222222222,
-        0.5822916666666667,
-        0.08055555555555556
-      ],
-      "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 1125]}}\n</tool_call>",
-      "extracted_answer": [
-        0.5529695024077047,
-        0.8035714285714286
-      ],
-      "correct": 0
-    },
-    {
-      "image": "android_studio_mac/screenshot_2024-11-28_16-03-53.png",
-      "question": "like the QwQ-32b-preview on huggingface within the virtual android machine in android studio",
-      "resized_size": [
-        1400,
-        2492
-      ],
-      "ground_truth": [
-        0.5341145833333333,
-        0.6101851851851852,
-        0.54765625,
-        0.6212962962962963
-      ],
-      "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1358, 867]}}\n</tool_call>",
-      "extracted_answer": [
-        0.5449438202247191,
-        0.6192857142857143
-      ],
-      "correct": 1
-    },
-    {
-      "image": "android_studio_mac/screenshot_2024-11-05_18-03-37.png",
-      "question": "open the device in the device explorer in android studio",
-      "resized_size": [
-        1400,
-        2492
-      ],
-      "ground_truth": [
-        0.2484375,
-        0.1398148148148148,
-        0.2552083333333333,
-        0.1537037037037037
-      ],
-      "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [658, 230]}}\n</tool_call>",
-      "extracted_answer": [
-        0.2640449438202247,
-        0.16428571428571428
-      ],
-      "correct": 0
-    },
-    {
-      "image": "android_studio_mac/screenshot_2024-11-28_15-13-10.png",
-      "question": "Modify the filters of a photo to warm within the virtual Android machine in Android Studio.",
-      "resized_size": [
-        1400,
-        2492
-      ],
-      "ground_truth": [
-        0.59453125,
-        0.638425925925926,
-        0.61796875,
-        0.6694444444444444
-      ],
-      "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1537, 916]}}\n</tool_call>",
-      "extracted_answer": [
-        0.6167736757624398,
-        0.6542857142857142
-      ],
-      "correct": 1
-    },
-    {
-      "image": "android_studio_mac/screenshot_2024-11-28_17-05-33.png",
-      "question": "decrease the volume of the android virtual machine in android studio",
-      "resized_size": [
-        1400,
-        2492
-      ],
-      "ground_truth": [
-        0.5924479166666666,
-        0.3375,
-        0.5979166666666667,
-        0.3490740740740741
-      ],
-      "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1650, 1037]}}\n</tool_call>",
-      "extracted_answer": [
-        0.6621187800963082,
-        0.7407142857142858
-      ],
-      "correct": 0
-    },
-    {
-      "image": "android_studio_mac/screenshot_2024-11-28_15-01-29.png",
-      "question": "open the picture taked in virtual android machine in android studio ",
-      "resized_size": [
-        1400,
-        2492
-      ],
-      "ground_truth": [
-        0.5697916666666667,
-        0.6884259259259259,
-        0.5864583333333333,
-        0.7162037037037037
-      ],
-      "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 648]}}\n</tool_call>",
-      "extracted_answer": [
-        0.14325842696629212,
-        0.46285714285714286
-      ],
-      "correct": 0
-    },
-    {
-      "image": "android_studio_mac/screenshot_2024-11-07_15-42-48.png",
-      "question": "collapse all TODO list in android studio",
-      "resized_size": [
-        1400,
-        2492
-      ],
-      "ground_truth": [
-        0.10572916666666667,
-        0.6606481481481481,
-        0.11197916666666667,
-        0.6722222222222223
-      ],
-      "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 816]}}\n</tool_call>",
-      "extracted_answer": [
-        0.14325842696629212,
-        0.5828571428571429
-      ],
-      "correct": 0
-    },
-    {
-      "image": "android_studio_mac/screenshot_2024-11-07_15-31-49.png",
-      "question": "add new terminal in android studio",
-      "resized_size": [
-        1400,
-        2492
-      ],
-      "ground_truth": [
-        0.14270833333333333,
-        0.5875,
-        0.14817708333333332,
-        0.5990740740740741
-      ],
-      "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [356, 807]}}\n</tool_call>",
-      "extracted_answer": [
-        0.14285714285714285,
-        0.5764285714285714
-      ],
-      "correct": 0
-    }
-  ]
-}
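
As a worked check on how these records are scored: the model's pixel click is divided by the resized width/height and then tested against the normalized ground-truth box, mirroring the `point_in_box` helper in the eval scripts deleted below. Reproducing the final record above ("add new terminal in android studio"), where `resized_size` is stored as `[height, width]`:

```python
# Values copied from the last record of the log above.
resized_height, resized_width = 1400, 2492
x, y = 356 / resized_width, 807 / resized_height   # -> extracted_answer
box = [0.14270833333333333, 0.5875, 0.14817708333333332, 0.5990740740740741]

# Same test as point_in_box in the deleted eval scripts below.
correct = 1 if box[0] <= x < box[2] and box[1] <= y < box[3] else 0
print([x, y], correct)  # [0.14285714285714285, 0.5764285714285714] 0
```

The click's x lands inside the box, but its y (0.5764) sits just above the box top (0.5875) in image coordinates, so the record is scored 0.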
eval/test_grounding_r1.py
DELETED
@@ -1,318 +0,0 @@
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-from qwen_vl_utils import process_vision_info
-import torch
-import json
-from tqdm import tqdm
-import re
-import os
-from pprint import pprint
-import random
-from PIL import Image
-from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
-import torch.distributed as dist
-from torch.nn.parallel import DistributedDataParallel as DDP
-import argparse
-
-import warnings
-
-warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
-
-def setup_distributed():
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    torch.cuda.set_device(local_rank)
-
-    dist.init_process_group(backend="nccl")
-
-    world_size = dist.get_world_size()
-    rank = dist.get_rank()
-
-    return local_rank, world_size, rank
-
-local_rank, world_size, rank = setup_distributed()
-device = f"cuda:{local_rank}"
-print(f"Process {rank} using {device}")
-
-steps = 4100
-if rank == 0:
-    print("Steps: ", steps)
-
-RUN_NAME="Qwen2.5-VL-7B-GRPO-GUI-Grounding_showui_desktop_no_position_high_quality_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_think_4e"
-MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
-OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
-
-BSZ=32
-DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/annotations"
-
-TEST_DATASETS = ['android_studio_macos', 'autocad_windows', 'blender_windows','davinci_macos','eviews_windows','excel_macos','fruitloops_windows','illustrator_windows','inventor_windows','linux_common_linux','macos_common_macos','matlab_macos','origin_windows','photoshop_windows','powerpoint_windows','premiere_windows','pycharm_macos','quartus_windows','solidworks_windows','stata_windows','unreal_engine_windows','vivado_windows','vmware_macos','vscode_macos','windows_common_windows','word_macos']
-IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/images"
-
-
-# TEST_DATASETS = ['lisa_test']
-# IMAGE_ROOT = "/data10/shz/dataset/lisa"
-
-
-# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    device_map={"": local_rank},
-)
-
-# default processor
-processor = AutoProcessor.from_pretrained(MODEL_PATH)
-print(processor.image_processor.min_pixels)
-print(processor.image_processor.max_pixels)
-# def extract_point_answer(content):
-#     # Try to find the bbox within <answer> tags, if can not find, return [0, 0, 0, 0]
-#     answer_tag_pattern = r'<answer>(.*?)</answer>'
-#     content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
-#     if content_answer_match:
-#         content_answer = content_answer_match.group(1).strip()
-#         tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
-#         if tool_call_match:
-#             tool_call_content = tool_call_match.group(1).strip()
-#             # Parse the JSON
-#             tool_call_json = json.loads(tool_call_content)
-#             arguments = tool_call_json.get("arguments", {})
-#             coordinate = arguments.get("coordinate", None)
-#             if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
-#                 x, y = coordinate
-#                 extracted_coordinate = [x, y]
-#                 return extracted_coordinate
-#     return [0, 0]
-
-
-def extract_point_answer(content):
-    # Try to find the content inside <answer> tags; if none is found, return [0, 0]
-    answer_tag_pattern = r'<answer>(.*?)</answer>'  # fixed the error in the regex
-    content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
-    if content_answer_match:
-        content_answer = content_answer_match.group(1).strip()
-        tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
-        if tool_call_match:
-            tool_call_content = tool_call_match.group(1).strip()
-            # First try to parse tool_call_content as JSON
-            try:
-                tool_call_json = json.loads(tool_call_content)
-                arguments = tool_call_json.get("arguments", {})
-                coordinate = arguments.get("coordinate", None)
-                if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
-                    try:
-                        x = float(coordinate[0])
-                        y = float(coordinate[1])
-                        return [x, y]
-                    except (ValueError, TypeError):
-                        pass  # if the conversion fails, fall back to regex extraction
-            except json.JSONDecodeError:
-                pass  # if JSON parsing fails, fall back to regex extraction
-            # Fall back to extracting two numbers with a regular expression
-            numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
-            if len(numbers) >= 2:
-                x = float(numbers[-2])
-                y = float(numbers[-1])
-                return [x, y]
-    return [0, 0]
-
-def point_in_box(point, box):
-    x,y = point
-    if box[0] <= x < box[2] and box[1] <= y < box[3]:
-        return 1
-    else:
-        return 0
-
-num_samples = 2000
-num_all_sample = 0
-num_correct_sample = 0
-for ds in TEST_DATASETS:
-    if rank == 0:
-        print(f"Processing {ds}...")
-    ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
-    data = json.load(open(ds_path, "r"))
-    random.seed(42)
-    random.shuffle(data)
-    data = data[:num_samples]
-
-    # Split data for distributed evaluation
-    per_rank_data = len(data) // world_size
-    start_idx = rank * per_rank_data
-    end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
-    rank_data = data[start_idx:end_idx]
-
-    messages = []
-
-    for x in rank_data:
-        image_path = os.path.join(IMAGE_ROOT, x['img_filename'])
-        width,height = x['img_size'][0],x['img_size'][1]
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
-            min_pixels = processor.image_processor.min_pixels,
-            max_pixels = processor.image_processor.max_pixels,
-        )
-        system_content = """You are a helpful assistant.
-#Tools
-
-You may call one or more functions to assist with the user query.
-
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
-</tools>
-
-For each function call, first provide a detailed step-by-step thought process within <think></think> XML tags to analyze the user query and determine the action location, then return a json object with function name and arguments within <tool_call></tool_call> XML tags inside <answer></answer> XML tags:
-<think>
-[Your detailed step-by-step thought process here]
-</think>
-<answer>
-<tool_call>
-{"name": <function-name>, "arguments": <args-json-object>}
-</tool_call>
-</answer>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
-        message = [
-            {
-                "role": "system",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": system_content
-                    }
-                ]
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": f"file://{image_path}"
-                    },
-                    {
-                        "type": "text",
-                        "text": x['instruction']
-                    }
-                ]
-            },
-        ]
-        # print(message)
-        messages.append(message)
-
-
-    rank_outputs = [] # List to store answers for this rank
-    all_outputs = [] # List to store all answers
-
-    # Process data
-    for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
-        batch_messages = messages[i:i + BSZ]
-
-        # Preparation for inference
-        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
-
-        image_inputs, video_inputs = process_vision_info(batch_messages)
-        inputs = processor(
-            text=text,
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            padding_side="left",
-            return_tensors="pt",
-        )
-        inputs = inputs.to(device)
-
-        # Inference: Generation of the output
-        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
-
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        batch_output_text = processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-
-        rank_outputs.extend(batch_output_text)
-
-    print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
-
-    # Gather all outputs from all ranks
-    all_outputs = [None] * len(data)
-    rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
-
-    gathered_results = [None] * world_size
-    dist.all_gather_object(gathered_results, rank_results)
-
-    assert gathered_results[-1][-1][0] == len(data) - 1
-
-    # The main process will collect all results
-    if rank == 0:
-        for results in gathered_results:
-            for idx, output in results:
-                assert idx < len(all_outputs)
-                all_outputs[idx] = output
-        assert all_outputs[-1] is not None
-
-        final_output = []
-        correct_number = 0
-
-        for input_example, model_output in zip(data, all_outputs):
-            original_output = model_output
-            ground_truth = input_example['bbox']
-            ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], ground_truth[2] / input_example['img_size'][0], ground_truth[3] / input_example['img_size'][1]]
-            model_answer = extract_point_answer(original_output)
-            resized_height, resized_width = smart_resize(
-                input_example['img_size'][1],
-                input_example['img_size'][0],
-                factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
-                min_pixels = processor.image_processor.min_pixels,
-                max_pixels = processor.image_processor.max_pixels,
-            )
-            model_answer = [model_answer[0]/resized_width,model_answer[1]/resized_height]
-            # Count correct answers
-            correct = 0
-            if model_answer is not None:
-                correct = point_in_box(model_answer, ground_truth)
-            correct_number += correct
-            num_all_sample +=1
-            num_correct_sample += correct
-
-            # Create a result dictionary for this example
-            result = {
-                'image': input_example['img_filename'],
-                'question': input_example['instruction'],
-                'resized_size': [resized_height, resized_width],
-                'ground_truth': ground_truth,
-                'model_output': original_output,
-                'extracted_answer': model_answer,
-                'correct': correct
-            }
-            final_output.append(result)
-
-        # Calculate and print accuracy
-        accuracy = correct_number / len(data) * 100
-        print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
-
-        # Save results to a JSON file
-        output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
-        output_dir = os.path.dirname(output_path)
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-        with open(output_path, "w") as f:
-            json.dump({
-                'accuracy': accuracy,
-                'results': final_output
-            }, f, indent=2)
-
-        print(f"Results saved to {output_path}")
-        print("-"*100)
-    # Moved the final statistics and printing into the rank == 0 block
-    if rank == 0:
-        accuracy = num_correct_sample / num_all_sample * 100
-        print(f"\nnumber of correct samples: {num_correct_sample}")
-        print(f"number of all samples: {num_all_sample}")
-        print(f"Accuracy of all datasets: {accuracy:.2f}%")
-
-    # Synchronize all processes
-    dist.barrier()
-
-
-
-
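
One non-obvious detail in the sharding logic above: `per_rank_data = len(data) // world_size` floors the shard size, and the last rank's `end_idx` is set to `len(data)`, so any remainder goes to the final rank; this is why the script gathers `(index, output)` pairs instead of assuming equal shards. A small sketch of the boundary arithmetic, using hypothetical sizes:

```python
# Shard boundaries as computed in the deleted script, for a hypothetical
# dataset of 10 items across 4 ranks.
def shard_bounds(n, world_size):
    per_rank = n // world_size
    bounds = []
    for rank in range(world_size):
        start = rank * per_rank
        end = start + per_rank if rank < world_size - 1 else n
        bounds.append((start, end))
    return bounds

print(shard_bounds(10, 4))  # [(0, 2), (2, 4), (4, 6), (6, 10)] -- last rank takes the remainder
```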
eval/test_grounding_r1_nothink.py
DELETED
@@ -1,331 +0,0 @@
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-from qwen_vl_utils import process_vision_info
-import torch
-import json
-from tqdm import tqdm
-import re
-import os
-from pprint import pprint
-import random
-from PIL import Image
-from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
-import torch.distributed as dist
-from torch.nn.parallel import DistributedDataParallel as DDP
-import argparse
-
-import warnings
-
-warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
-
-def setup_distributed():
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    torch.cuda.set_device(local_rank)
-
-    dist.init_process_group(backend="nccl")
-
-    world_size = dist.get_world_size()
-    rank = dist.get_rank()
-
-    return local_rank, world_size, rank
-
-local_rank, world_size, rank = setup_distributed()
-device = f"cuda:{local_rank}"
-print(f"Process {rank} using {device}")
-
-steps = 3860
-if rank == 0:
-    print("Steps: ", steps)
-
-RUN_NAME = "Qwen2.5-VL-3B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_0.2_filtered_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e_max_pixel_4028160"
-MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
-#MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_acadmic/public_data/11178625/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
-OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
-
-BSZ=8
-DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/annotations"
-
-TEST_DATASETS = ['android_studio_macos', 'autocad_windows', 'blender_windows','davinci_macos','eviews_windows','excel_macos','fruitloops_windows','illustrator_windows','inventor_windows','linux_common_linux','macos_common_macos','matlab_macos','origin_windows','photoshop_windows','powerpoint_windows','premiere_windows','pycharm_macos','quartus_windows','solidworks_windows','stata_windows','unreal_engine_windows','vivado_windows','vmware_macos','vscode_macos','windows_common_windows','word_macos']
-IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/images"
-
-
-# TEST_DATASETS = ['lisa_test']
-# IMAGE_ROOT = "/data10/shz/dataset/lisa"
-
-
-# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    device_map={"": local_rank},
-)
-# default processor
-processor = AutoProcessor.from_pretrained(MODEL_PATH,max_pixels=7049280,min_pixels=3136)
-# processor.image_processor.min_pixels=3136
-# processor.image_processor.max_pixels=2007040
-print(processor.image_processor.min_pixels)
-print(processor.image_processor.max_pixels)
-# def extract_point_answer(content):
-#     # Try to find the bbox within <answer> tags, if can not find, return [0, 0, 0, 0]
-#     answer_tag_pattern = r'<answer>(.*?)</answer>'
-#     content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
-#     if content_answer_match:
-#         content_answer = content_answer_match.group(1).strip()
-#         tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
-#         if tool_call_match:
-#             tool_call_content = tool_call_match.group(1).strip()
-#             # Parse the JSON
-#             tool_call_json = json.loads(tool_call_content)
-#             arguments = tool_call_json.get("arguments", {})
-#             coordinate = arguments.get("coordinate", None)
-#             if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
-#                 x, y = coordinate
-#                 extracted_coordinate = [x, y]
-#                 return extracted_coordinate
-#     return [0, 0]
-
-
-# def extract_point_answer(content):
-#     # Try to find the content inside <answer> tags; if none is found, return [0, 0]
-#     tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
-#     if tool_call_match:
-#         tool_call_content = tool_call_match.group(1).strip()
-#         # First try to parse tool_call_content as JSON
-#         try:
-#             tool_call_json = json.loads(tool_call_content)
-#             print(tool_call_json)
-#             arguments = tool_call_json.get("arguments", {})
-#             coordinate = arguments.get("coordinate", None)
-#             if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
-#                 try:
-#                     x = float(coordinate[0])
-#                     y = float(coordinate[1])
-#                     return [x, y]
-#                 except (ValueError, TypeError):
-#                     pass  # if the conversion fails, fall back to regex extraction
-#         except json.JSONDecodeError:
-#             pass  # if JSON parsing fails, fall back to regex extraction
-#         # Fall back to extracting two numbers with a regular expression
-#         numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
-#         if len(numbers) >= 2:
-#             x = float(numbers[-2])
-#             y = float(numbers[-1])
-#             return [x, y]
-#     return [0, 0]
-
-
-
-def extract_point_answer(content):
-    # Try to find the content inside <answer> tags; if none is found, return [0, 0]
-    tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
-    if tool_call_match:
-        tool_call_content = tool_call_match.group(1).strip()
-        # First try to parse tool_call_content as JSON
-        try:
-            numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
-            if len(numbers) >= 2:
-                x = float(numbers[-2])
-                y = float(numbers[-1])
-                return [x, y]
-        except json.JSONDecodeError:
-            pass  # if JSON parsing fails, fall back to regex extraction
-        # Fall back to extracting two numbers with a regular expression
-    return [0, 0]
-
-def point_in_box(point, box):
-    x,y = point
-    if box[0] <= x < box[2] and box[1] <= y < box[3]:
-        return 1
-    else:
-        return 0
-
-num_samples = 2000
-num_all_sample = 0
-num_correct_sample = 0
-for ds in TEST_DATASETS:
-    if rank == 0:
-        print(f"Processing {ds}...")
-    ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
-    data = json.load(open(ds_path, "r"))
-    random.seed(42)
-    random.shuffle(data)
-    data = data[:num_samples]
-
-    # Split data for distributed evaluation
-    per_rank_data = len(data) // world_size
-    start_idx = rank * per_rank_data
-    end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
-    rank_data = data[start_idx:end_idx]
-
-    messages = []
-
-    for x in rank_data:
-        image_path = os.path.join(IMAGE_ROOT, x['img_filename'])
-        width,height = x['img_size'][0],x['img_size'][1]
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
-            min_pixels = processor.image_processor.min_pixels,
-            max_pixels = processor.image_processor.max_pixels,
-        )
-        system_content = """You are a helpful assistant.
-#Tools
-
-You may call one or more functions to assist with the user query.
-
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
-</tools>
-
-For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
-<tool_call>
-{"name": <function-name>, "arguments": <args-json-object>}
-</tool_call>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
-        message = [
-            {
-                "role": "system",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": system_content
-                    }
-                ]
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": f"file://{image_path}"
-                    },
-                    {
-                        "type": "text",
-                        "text": x['instruction']
-                    }
-                ]
-            },
-        ]
-        # print(message)
-        messages.append(message)
-
-
-    rank_outputs = [] # List to store answers for this rank
-    all_outputs = [] # List to store all answers
-
-    # Process data
-    for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
-        batch_messages = messages[i:i + BSZ]
-
-        # Preparation for inference
-        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
-
-        image_inputs, video_inputs = process_vision_info(batch_messages)
-        inputs = processor(
-            text=text,
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            padding_side="left",
-            return_tensors="pt",
-        )
-        inputs = inputs.to(device)
-
-        # Inference: Generation of the output
-        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
-
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        batch_output_text = processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-
-        rank_outputs.extend(batch_output_text)
-
-    print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
-
-    # Gather all outputs from all ranks
-    all_outputs = [None] * len(data)
-    rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
-
-    gathered_results = [None] * world_size
-    dist.all_gather_object(gathered_results, rank_results)
-
-    assert gathered_results[-1][-1][0] == len(data) - 1
-
-    # The main process will collect all results
-    if rank == 0:
-        for results in gathered_results:
-            for idx, output in results:
-                assert idx < len(all_outputs)
-                all_outputs[idx] = output
-        assert all_outputs[-1] is not None
-
-        final_output = []
-        correct_number = 0
-
-        for input_example, model_output in zip(data, all_outputs):
-            original_output = model_output
-            ground_truth = input_example['bbox']
-            ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], ground_truth[2] / input_example['img_size'][0], ground_truth[3] / input_example['img_size'][1]]
-            model_answer = extract_point_answer(original_output)
-            resized_height, resized_width = smart_resize(
-                input_example['img_size'][1],
-                input_example['img_size'][0],
-                factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
-                min_pixels = processor.image_processor.min_pixels,
-                max_pixels = processor.image_processor.max_pixels,
-            )
-            model_answer = [model_answer[0]/resized_width,model_answer[1]/resized_height]
-            # Count correct answers
-            correct = 0
-            if model_answer is not None:
-                correct = point_in_box(model_answer, ground_truth)
-            correct_number += correct
-            num_all_sample +=1
-            num_correct_sample += correct
-
-            # Create a result dictionary for this example
-            result = {
-                'image': input_example['img_filename'],
-                'question': input_example['instruction'],
-                'resized_size': [resized_height, resized_width],
-                'ground_truth': ground_truth,
-                'model_output': original_output,
-                'extracted_answer': model_answer,
-                'correct': correct
-            }
-            final_output.append(result)
-
-        # Calculate and print accuracy
-        accuracy = correct_number / len(data) * 100
-        print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
-
-        # Save results to a JSON file
-        output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
-        output_dir = os.path.dirname(output_path)
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-        with open(output_path, "w") as f:
-            json.dump({
-                'accuracy': accuracy,
-                'results': final_output
-            }, f, indent=2)
-
-        print(f"Results saved to {output_path}")
-        print("-"*100)
-    # Moved the final statistics and printing into the rank == 0 block
-    if rank == 0:
-        accuracy = num_correct_sample / num_all_sample * 100
-        print(f"\nnumber of correct samples: {num_correct_sample}")
-        print(f"number of all samples: {num_all_sample}")
-        print(f"Accuracy of all datasets: {accuracy:.2f}%")
-
-    # Synchronize all processes
-    dist.barrier()
-
-
-
-
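
The two variants above differ mainly in their system prompt and parser: the think version asks for a `<think>…</think>` block followed by the tool call wrapped in `<answer>` tags, while the nothink version expects a bare `<tool_call>` block. Both ultimately reduce to the same regex fallback over the tool-call content. A minimal sketch of that shared fallback (the sample outputs below are illustrative, not taken from the logs):

```python
import re

# Illustrative model outputs: the think variant wraps the tool call in
# <answer> tags after a <think> block; the nothink variant emits the
# <tool_call> block directly.
think_out = ("<think>The terminal tab sits in the lower tool window bar.</think>\n"
             "<answer>\n<tool_call>\n"
             '{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [356, 807]}}\n'
             "</tool_call>\n</answer>")
nothink_out = ("<tool_call>\n"
               '{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [356, 807]}}\n'
               "</tool_call>")

def regex_fallback(text):
    # Shared fallback: take the last two numbers inside the <tool_call> block.
    m = re.search(r'<tool_call>(.*?)</tool_call>', text, re.DOTALL)
    if not m:
        return [0, 0]
    nums = re.findall(r'\d+(?:\.\d+)?', m.group(1))
    return [float(nums[-2]), float(nums[-1])] if len(nums) >= 2 else [0, 0]

print(regex_fallback(think_out), regex_fallback(nothink_out))  # [356.0, 807.0] [356.0, 807.0]
```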
eval/test_grounding_r1_nothink_ss.py
DELETED
@@ -1,352 +0,0 @@
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-from qwen_vl_utils import process_vision_info
-import torch
-import json
-from tqdm import tqdm
-import re
-import os
-from pprint import pprint
-import random
-from PIL import Image
-from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
-import torch.distributed as dist
-from torch.nn.parallel import DistributedDataParallel as DDP
-import argparse
-
-import warnings
-
-warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
-
-def setup_distributed():
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    torch.cuda.set_device(local_rank)
-
-    dist.init_process_group(backend="nccl")
-
-    world_size = dist.get_world_size()
-    rank = dist.get_rank()
-
-    return local_rank, world_size, rank
-
-local_rank, world_size, rank = setup_distributed()
-device = f"cuda:{local_rank}"
-print(f"Process {rank} using {device}")
-
-steps = 3800
-if rank == 0:
-    print("Steps: ", steps)
-#RUN_NAME = "base"
-RUN_NAME = "Qwen2.5-VL-7B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_filtered_only_one_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e"
-#MODEL_PATH="/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/Qwen2.5-VL-7B-Instruct"
-MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
-OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
-
-BSZ=32
-DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot/metadata"
-
-TEST_DATASETS = ['hf_test_full']
-IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot/images"
-
-
-# TEST_DATASETS = ['lisa_test']
-# IMAGE_ROOT = "/data10/shz/dataset/lisa"
-
-
-# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    device_map={"": local_rank},
-)
-# default processor
-processor = AutoProcessor.from_pretrained(MODEL_PATH,max_pixels=2007040,min_pixels=3136)
-# processor.image_processor.min_pixels=3136
-# processor.image_processor.max_pixels=2007040
-print(processor.image_processor.min_pixels)
-print(processor.image_processor.max_pixels)
-# def extract_point_answer(content):
-#     # Try to find the bbox within <answer> tags, if can not find, return [0, 0, 0, 0]
-#     answer_tag_pattern = r'<answer>(.*?)</answer>'
-#     content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
-#     if content_answer_match:
-#         content_answer = content_answer_match.group(1).strip()
-#         tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
-#         if tool_call_match:
-#             tool_call_content = tool_call_match.group(1).strip()
-#             # Parse the JSON
-#             tool_call_json = json.loads(tool_call_content)
-#             arguments = tool_call_json.get("arguments", {})
-#             coordinate = arguments.get("coordinate", None)
-#             if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
-#                 x, y = coordinate
-#                 extracted_coordinate = [x, y]
-#                 return extracted_coordinate
-#     return [0, 0]
-
-
-# def extract_point_answer(content):
-#     # Try to find the content inside <answer> tags; if none is found, return [0, 0]
-#     tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
-#     if tool_call_match:
-#         tool_call_content = tool_call_match.group(1).strip()
-#         # First try to parse tool_call_content as JSON
-#         try:
-#             tool_call_json = json.loads(tool_call_content)
-#             print(tool_call_json)
-#             arguments = tool_call_json.get("arguments", {})
-#             coordinate = arguments.get("coordinate", None)
-#             if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
-#                 try:
-#                     x = float(coordinate[0])
-#                     y = float(coordinate[1])
-#                     return [x, y]
-#                 except (ValueError, TypeError):
-#                     pass  # if the conversion fails, fall back to regex extraction
-#         except json.JSONDecodeError:
-#             pass  # if JSON parsing fails, fall back to regex extraction
-#         # Fall back to extracting two numbers with a regular expression
-#         numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
-#         if len(numbers) >= 2:
-#             x = float(numbers[-2])
-#             y = float(numbers[-1])
-#             return [x, y]
-#     return [0, 0]
-
-
-
-def extract_point_answer(content):
-    # Try to find the content inside <answer> tags; if none is found, return [0, 0]
-    tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
-    if tool_call_match:
-        tool_call_content = tool_call_match.group(1).strip()
-        # First try to parse tool_call_content as JSON
-        try:
-            numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
-            if len(numbers) >= 2:
-                x = float(numbers[-2])
-                y = float(numbers[-1])
-                return [x, y]
-        except json.JSONDecodeError:
-            pass  # if JSON parsing fails, fall back to regex extraction
-        # Fall back to extracting two numbers with a regular expression
-    return [0, 0]
-
-def point_in_box(point, box):
-    x,y = point
-    if box[0] <= x < box[2] and box[1] <= y < box[3]:
-        return 1
-    else:
-        return 0
-
-num_samples = 2000
-num_all_sample = 0
-num_desktop_sample = 0
-num_mobile_sample = 0
-num_web_sample = 0
-num_correct_sample = 0
-for ds in TEST_DATASETS:
-    if rank == 0:
-        print(f"Processing {ds}...")
-    ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
-    data = json.load(open(ds_path, "r"))
-    random.seed(42)
-    random.shuffle(data)
-    data = data[:num_samples]
-
-    # Split data for distributed evaluation
-    per_rank_data = len(data) // world_size
-    start_idx = rank * per_rank_data
-    end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
-    rank_data = data[start_idx:end_idx]
-
-    messages = []
-
-    for x in rank_data:
-        image_path = os.path.join(IMAGE_ROOT, x['img_url'])
-        width,height = x['img_size'][0],x['img_size'][1]
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
-            min_pixels = processor.image_processor.min_pixels,
-            max_pixels = processor.image_processor.max_pixels,
-        )
-        system_content = """You are a helpful assistant.
-#Tools
-
-You may call one or more functions to assist with the user query.
-
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
|
| 183 |
-
</tools>
|
| 184 |
-
|
| 185 |
-
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
| 186 |
-
<tool_call>
|
| 187 |
-
{"name": <function-name>, "arguments": <args-json-object>}
|
| 188 |
-
</tool_call>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
|
| 189 |
-
message = [
|
| 190 |
-
{
|
| 191 |
-
"role": "system",
|
| 192 |
-
"content": [
|
| 193 |
-
{
|
| 194 |
-
"type": "text",
|
| 195 |
-
"text": system_content
|
| 196 |
-
}
|
| 197 |
-
]
|
| 198 |
-
},
|
| 199 |
-
{
|
| 200 |
-
"role": "user",
|
| 201 |
-
"content": [
|
| 202 |
-
{
|
| 203 |
-
"type": "image",
|
| 204 |
-
"image": f"file://{image_path}"
|
| 205 |
-
},
|
| 206 |
-
{
|
| 207 |
-
"type": "text",
|
| 208 |
-
"text": x['task']
|
| 209 |
-
}
|
| 210 |
-
]
|
| 211 |
-
},
|
| 212 |
-
]
|
| 213 |
-
# print(message)
|
| 214 |
-
messages.append(message)
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
rank_outputs = [] # List to store answers for this rank
|
| 218 |
-
all_outputs = [] # List to store all answers
|
| 219 |
-
|
| 220 |
-
# Process data
|
| 221 |
-
for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
|
| 222 |
-
batch_messages = messages[i:i + BSZ]
|
| 223 |
-
|
| 224 |
-
# Preparation for inference
|
| 225 |
-
text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
|
| 226 |
-
|
| 227 |
-
image_inputs, video_inputs = process_vision_info(batch_messages)
|
| 228 |
-
inputs = processor(
|
| 229 |
-
text=text,
|
| 230 |
-
images=image_inputs,
|
| 231 |
-
videos=video_inputs,
|
| 232 |
-
padding=True,
|
| 233 |
-
padding_side="left",
|
| 234 |
-
return_tensors="pt",
|
| 235 |
-
)
|
| 236 |
-
inputs = inputs.to(device)
|
| 237 |
-
|
| 238 |
-
# Inference: Generation of the output
|
| 239 |
-
generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
|
| 240 |
-
|
| 241 |
-
generated_ids_trimmed = [
|
| 242 |
-
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
| 243 |
-
]
|
| 244 |
-
batch_output_text = processor.batch_decode(
|
| 245 |
-
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
| 246 |
-
)
|
| 247 |
-
|
| 248 |
-
rank_outputs.extend(batch_output_text)
|
| 249 |
-
|
| 250 |
-
print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
|
| 251 |
-
|
| 252 |
-
# Gather all outputs from all ranks
|
| 253 |
-
all_outputs = [None] * len(data)
|
| 254 |
-
rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
|
| 255 |
-
|
| 256 |
-
gathered_results = [None] * world_size
|
| 257 |
-
dist.all_gather_object(gathered_results, rank_results)
|
| 258 |
-
|
| 259 |
-
assert gathered_results[-1][-1][0] == len(data) - 1
|
| 260 |
-
|
| 261 |
-
# The main process will collect all results
|
| 262 |
-
if rank == 0:
|
| 263 |
-
for results in gathered_results:
|
| 264 |
-
for idx, output in results:
|
| 265 |
-
assert idx < len(all_outputs)
|
| 266 |
-
all_outputs[idx] = output
|
| 267 |
-
assert all_outputs[-1] is not None
|
| 268 |
-
|
| 269 |
-
final_output = []
|
| 270 |
-
correct_number = 0
|
| 271 |
-
correct_number_desktop = 0
|
| 272 |
-
correct_number_mobile = 0
|
| 273 |
-
correct_number_web = 0
|
| 274 |
-
|
| 275 |
-
for input_example, model_output in zip(data, all_outputs):
|
| 276 |
-
original_output = model_output
|
| 277 |
-
ground_truth = input_example['bbox']
|
| 278 |
-
split_class = input_example['split']
|
| 279 |
-
ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], (ground_truth[0]+ground_truth[2]) / input_example['img_size'][0], (ground_truth[1]+ground_truth[3]) / input_example['img_size'][1]]
|
| 280 |
-
model_answer = extract_point_answer(original_output)
|
| 281 |
-
resized_height, resized_width = smart_resize(
|
| 282 |
-
input_example['img_size'][1],
|
| 283 |
-
input_example['img_size'][0],
|
| 284 |
-
factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
|
| 285 |
-
min_pixels = processor.image_processor.min_pixels,
|
| 286 |
-
max_pixels = processor.image_processor.max_pixels,
|
| 287 |
-
)
|
| 288 |
-
model_answer = [model_answer[0]/resized_width,model_answer[1]/resized_height]
|
| 289 |
-
# Count correct answers
|
| 290 |
-
correct = 0
|
| 291 |
-
if model_answer is not None:
|
| 292 |
-
correct = point_in_box(model_answer, ground_truth)
|
| 293 |
-
correct_number += correct
|
| 294 |
-
num_all_sample +=1
|
| 295 |
-
num_correct_sample += correct
|
| 296 |
-
if split_class == "desktop":
|
| 297 |
-
correct_number_desktop += correct
|
| 298 |
-
num_desktop_sample += 1
|
| 299 |
-
if split_class == "mobile":
|
| 300 |
-
correct_number_mobile += correct
|
| 301 |
-
num_mobile_sample += 1
|
| 302 |
-
if split_class == "web":
|
| 303 |
-
correct_number_web += correct
|
| 304 |
-
num_web_sample += 1
|
| 305 |
-
# Create a result dictionary for this example
|
| 306 |
-
result = {
|
| 307 |
-
'image': input_example['img_url'],
|
| 308 |
-
'question': input_example['task'],
|
| 309 |
-
'resized_size': [resized_height, resized_width],
|
| 310 |
-
'ground_truth': ground_truth,
|
| 311 |
-
'model_output': original_output,
|
| 312 |
-
'extracted_answer': model_answer,
|
| 313 |
-
'correct': correct
|
| 314 |
-
}
|
| 315 |
-
final_output.append(result)
|
| 316 |
-
|
| 317 |
-
# Calculate and print accuracy
|
| 318 |
-
accuracy = correct_number / len(data) * 100
|
| 319 |
-
accuracy_desktop = correct_number_desktop / num_desktop_sample * 100
|
| 320 |
-
accuracy_mobile = correct_number_mobile / num_mobile_sample * 100
|
| 321 |
-
accuracy_web = correct_number_web / num_web_sample * 100
|
| 322 |
-
print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
|
| 323 |
-
print(f"Accuracy of desktop: {accuracy_desktop:.2f}%")
|
| 324 |
-
print(f"Accuracy of mobile: {accuracy_mobile:.2f}%")
|
| 325 |
-
print(f"Accuracy of web: {accuracy_web:.2f}%")
|
| 326 |
-
|
| 327 |
-
# Save results to a JSON file
|
| 328 |
-
output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
|
| 329 |
-
output_dir = os.path.dirname(output_path)
|
| 330 |
-
if not os.path.exists(output_dir):
|
| 331 |
-
os.makedirs(output_dir)
|
| 332 |
-
with open(output_path, "w") as f:
|
| 333 |
-
json.dump({
|
| 334 |
-
'accuracy': accuracy,
|
| 335 |
-
'results': final_output
|
| 336 |
-
}, f, indent=2)
|
| 337 |
-
|
| 338 |
-
print(f"Results saved to {output_path}")
|
| 339 |
-
print("-"*100)
|
| 340 |
-
# 将最后的统计和打印移到rank==0的条件块内
|
| 341 |
-
if rank == 0:
|
| 342 |
-
accuracy = num_correct_sample / num_all_sample * 100
|
| 343 |
-
print(f"\nnumber of correct samples: {num_correct_sample}")
|
| 344 |
-
print(f"number of all samples: {num_all_sample}")
|
| 345 |
-
print(f"Accuracy of all datasets: {accuracy:.2f}%")
|
| 346 |
-
|
| 347 |
-
# Synchronize all processes
|
| 348 |
-
dist.barrier()
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
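The grounding scripts above all score a prediction the same way: pull an (x, y) click out of the model's <tool_call> block, divide by the smart_resize dimensions, and test the normalized point against the normalized ground-truth box. A minimal, standard-library sketch of that path; the sample output, resized dimensions, and box values below are invented for illustration:

import re

def extract_point(content):
    # Same fallback used by extract_point_answer above: take the last two numbers
    # inside the <tool_call> block as (x, y); return [0, 0] when nothing matches.
    match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
    if match:
        numbers = re.findall(r'\d+(?:\.\d+)?', match.group(1))
        if len(numbers) >= 2:
            return [float(numbers[-2]), float(numbers[-1])]
    return [0, 0]

# Invented model output in the format the system prompt above requests.
sample_output = (
    '<tool_call>\n'
    '{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [1246, 630]}}\n'
    '</tool_call>'
)

# Invented smart_resize output and a normalized ground-truth box [x1, y1, x2, y2].
resized_width, resized_height = 2492, 1400
gt_box = [0.48, 0.40, 0.55, 0.47]

x, y = extract_point(sample_output)
point = [x / resized_width, y / resized_height]  # pixels -> [0, 1] coordinates
hit = gt_box[0] <= point[0] < gt_box[2] and gt_box[1] <= point[1] < gt_box[3]
print(point, hit)  # [0.5, 0.45] True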
eval/test_grounding_r1_nothink_ssv2.py
DELETED
@@ -1,335 +0,0 @@
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import json
from tqdm import tqdm
import re
import os
from pprint import pprint
import random
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import argparse

import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="transformers")

def setup_distributed():
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)

    dist.init_process_group(backend="nccl")

    world_size = dist.get_world_size()
    rank = dist.get_rank()

    return local_rank, world_size, rank

local_rank, world_size, rank = setup_distributed()
device = f"cuda:{local_rank}"
print(f"Process {rank} using {device}")

steps = 3800
if rank == 0:
    print("Steps: ", steps)
# #RUN_NAME = "base"
# RUN_NAME = "Qwen2.5-VL-7B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_filtered_only_one_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e"
# #MODEL_PATH="/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/Qwen2.5-VL-7B-Instruct"
# MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
# OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
# RUN_NAME = "base"

MODEL_PATH = "ByteDance-Seed/UI-TARS-2B-SFT"
OUTPUT_PATH = "./logs/rec_results_ui_tras_2B.json"

BSZ = 32
DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-v2"

TEST_DATASETS = ['screenspot_desktop_v2', 'screenspot_mobile_v2', 'screenspot_web_v2']
IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-v2/screenspotv2_image"

# TEST_DATASETS = ['lisa_test']
# IMAGE_ROOT = "/data10/shz/dataset/lisa"

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map={"": local_rank},
)
# default processor
processor = AutoProcessor.from_pretrained(MODEL_PATH, max_pixels=3512320, min_pixels=3136)
# processor.image_processor.min_pixels = 3136
# processor.image_processor.max_pixels = 2007040
print(processor.image_processor.min_pixels)
print(processor.image_processor.max_pixels)

# def extract_point_answer(content):
#     # Try to find the bbox within <answer> tags; if it cannot be found, return [0, 0, 0, 0]
#     answer_tag_pattern = r'<answer>(.*?)</answer>'
#     content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
#     if content_answer_match:
#         content_answer = content_answer_match.group(1).strip()
#         tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
#         if tool_call_match:
#             tool_call_content = tool_call_match.group(1).strip()
#             # Parse the JSON
#             tool_call_json = json.loads(tool_call_content)
#             arguments = tool_call_json.get("arguments", {})
#             coordinate = arguments.get("coordinate", None)
#             if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
#                 x, y = coordinate
#                 extracted_coordinate = [x, y]
#                 return extracted_coordinate
#     return [0, 0]


# def extract_point_answer(content):
#     # Look for content inside <tool_call> tags; if none is found, return [0, 0]
#     tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
#     if tool_call_match:
#         tool_call_content = tool_call_match.group(1).strip()
#         # First try to parse tool_call_content as JSON
#         try:
#             tool_call_json = json.loads(tool_call_content)
#             print(tool_call_json)
#             arguments = tool_call_json.get("arguments", {})
#             coordinate = arguments.get("coordinate", None)
#             if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
#                 try:
#                     x = float(coordinate[0])
#                     y = float(coordinate[1])
#                     return [x, y]
#                 except (ValueError, TypeError):
#                     pass  # If the conversion fails, fall back to regex extraction
#         except json.JSONDecodeError:
#             pass  # If JSON parsing fails, fall back to regex extraction
#         # Fall back to extracting two numbers with a regular expression
#         numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
#         if len(numbers) >= 2:
#             x = float(numbers[-2])
#             y = float(numbers[-1])
#             return [x, y]
#     return [0, 0]


def extract_point_answer(content):
    # Look for coordinates inside <tool_call> tags; if none are found, return [0, 0].
    tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
    if tool_call_match:
        tool_call_content = tool_call_match.group(1).strip()
        # Extract the last two numbers (the x, y coordinates) with a regular expression.
        numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
        if len(numbers) >= 2:
            x = float(numbers[-2])
            y = float(numbers[-1])
            return [x, y]
    return [0, 0]

def point_in_box(point, box):
    x, y = point
    if box[0] <= x < box[2] and box[1] <= y < box[3]:
        return 1
    else:
        return 0

num_samples = 2000
num_all_sample = 0
num_correct_sample = 0
for ds in TEST_DATASETS:
    if rank == 0:
        print(f"Processing {ds}...")
    ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
    data = json.load(open(ds_path, "r"))
    random.seed(42)
    random.shuffle(data)
    data = data[:num_samples]

    # Split data for distributed evaluation
    per_rank_data = len(data) // world_size
    start_idx = rank * per_rank_data
    end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
    rank_data = data[start_idx:end_idx]

    messages = []

    for x in rank_data:
        image_path = os.path.join(IMAGE_ROOT, x['img_filename'])
        width, height = x['img_size'][0], x['img_size'][1]
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=processor.image_processor.patch_size * processor.image_processor.merge_size,
            min_pixels=processor.image_processor.min_pixels,
            max_pixels=processor.image_processor.max_pixels,
        )
        system_content = """You are a helpful assistant.
#Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
        message = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": system_content
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"file://{image_path}"
                    },
                    {
                        "type": "text",
                        "text": x['instruction']
                    }
                ]
            },
        ]
        # print(message)
        messages.append(message)

    rank_outputs = []  # List to store answers for this rank
    all_outputs = []  # List to store all answers

    # Process data
    for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
        batch_messages = messages[i:i + BSZ]

        # Preparation for inference
        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]

        image_inputs, video_inputs = process_vision_info(batch_messages)
        inputs = processor(
            text=text,
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            padding_side="left",
            return_tensors="pt",
        )
        inputs = inputs.to(device)

        # Inference: Generation of the output
        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        batch_output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        rank_outputs.extend(batch_output_text)

    print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")

    # Gather all outputs from all ranks
    all_outputs = [None] * len(data)
    rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]

    gathered_results = [None] * world_size
    dist.all_gather_object(gathered_results, rank_results)

    assert gathered_results[-1][-1][0] == len(data) - 1

    # The main process will collect all results
    if rank == 0:
        for results in gathered_results:
            for idx, output in results:
                assert idx < len(all_outputs)
                all_outputs[idx] = output
        assert all_outputs[-1] is not None

        final_output = []
        correct_number = 0

        for input_example, model_output in zip(data, all_outputs):
            original_output = model_output
            ground_truth = input_example['bbox']
            ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], (ground_truth[0] + ground_truth[2]) / input_example['img_size'][0], (ground_truth[1] + ground_truth[3]) / input_example['img_size'][1]]
            model_answer = extract_point_answer(original_output)
            resized_height, resized_width = smart_resize(
                input_example['img_size'][1],
                input_example['img_size'][0],
                factor=processor.image_processor.patch_size * processor.image_processor.merge_size,
                min_pixels=processor.image_processor.min_pixels,
                max_pixels=processor.image_processor.max_pixels,
            )
            model_answer = [model_answer[0] / resized_width, model_answer[1] / resized_height]
            # Count correct answers
            correct = 0
            if model_answer is not None:
                correct = point_in_box(model_answer, ground_truth)
            correct_number += correct
            num_all_sample += 1
            num_correct_sample += correct

            # Create a result dictionary for this example
            result = {
                'image': input_example['img_filename'],
                'question': input_example['instruction'],
                'resized_size': [resized_height, resized_width],
                'ground_truth': ground_truth,
                'model_output': original_output,
                'extracted_answer': model_answer,
                'correct': correct
            }
            final_output.append(result)

        # Calculate and print accuracy
        accuracy = correct_number / len(data) * 100
        print(f"\nAccuracy of {ds}: {accuracy:.2f}%")

        # Save results to a JSON file. RUN_NAME is commented out above, so it is
        # omitted from the format call; OUTPUT_PATH has no placeholders, which means
        # every dataset writes to the same file.
        output_path = OUTPUT_PATH.format(DATASET=ds, STEPS=steps)
        output_dir = os.path.dirname(output_path)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        with open(output_path, "w") as f:
            json.dump({
                'accuracy': accuracy,
                'results': final_output
            }, f, indent=2)

        print(f"Results saved to {output_path}")
        print("-" * 100)

# Keep the final statistics and printing inside a rank == 0 block
if rank == 0:
    accuracy = num_correct_sample / num_all_sample * 100
    print(f"\nnumber of correct samples: {num_correct_sample}")
    print(f"number of all samples: {num_all_sample}")
    print(f"Accuracy of all datasets: {accuracy:.2f}%")

# Synchronize all processes
dist.barrier()
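Both grounding scripts normalize predicted pixels by the dimensions returned by smart_resize, so it helps to know roughly what that helper computes. The sketch below is an approximation of its rounding logic inferred from the call sites above, not the transformers implementation itself; the default factor, min_pixels, and max_pixels mirror the values these scripts pass in:

import math

def approx_smart_resize(height, width, factor=28, min_pixels=3136, max_pixels=2007040):
    # Round both sides to multiples of `factor` (patch_size * merge_size for
    # Qwen2.5-VL), then rescale so the total pixel count stays within
    # [min_pixels, max_pixels] while roughly preserving the aspect ratio.
    h_bar = max(factor, round(height / factor) * factor)
    w_bar = max(factor, round(width / factor) * factor)
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar

# e.g. a 2880x1800 screenshot is shrunk until it fits under max_pixels
print(approx_smart_resize(1800, 2880))  # (1120, 1764); 1120 * 1764 <= 2007040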
eval/test_od_r1.py
DELETED
@@ -1,178 +0,0 @@
import re
import os
import json
import torch
import random

from tqdm import tqdm
from pprint import pprint
from qwen_vl_utils import process_vision_info
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor


def extract_bbox_answer(content):
    pattern = r'```json(.*?)```'
    json_match = re.search(pattern, content, re.DOTALL)
    bbox_json = json_match.group(1).strip() if json_match else None

    if bbox_json:
        try:
            bbox = json.loads(bbox_json)[0]['bbox_2d']
            return bbox, False
        except Exception:  # malformed JSON or missing 'bbox_2d' key
            return [0, 0, 0, 0], False
    else:
        return [0, 0, 0, 0], False


def iou(box1, box2):
    inter_x1 = max(box1[0], box2[0])
    inter_y1 = max(box1[1], box2[1])
    inter_x2 = min(box1[2] - 1, box2[2] - 1)
    inter_y2 = min(box1[3] - 1, box2[3] - 1)
    if inter_x1 < inter_x2 and inter_y1 < inter_y2:
        inter = (inter_x2 - inter_x1 + 1) * (inter_y2 - inter_y1 + 1)
    else:
        inter = 0
    union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + (box2[2] - box2[0]) * (box2[3] - box2[1]) - inter
    return float(inter) / union


def load_model(model_path, device_map):
    # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map=device_map,
    )

    # default processor
    processor = AutoProcessor.from_pretrained(model_path)

    return model, processor


def eval_od_r1(
    model_path, test_datasets, data_root, image_root, question_template, output_dir, batch_size=32, sample_num=500, seed=42, device_map="cuda:0"
):
    random.seed(seed)
    model, processor = load_model(model_path, device_map)

    for ds in test_datasets:
        print(f"Processing {ds}...")

        ds_path = os.path.join(data_root, f"{ds}.json")
        data = json.load(open(ds_path, "r"))
        random.shuffle(data)
        data = data[:sample_num]
        messages = []

        for x in data:
            image_path = os.path.join(image_root, x['image'])
            messages.append(
                [
                    {
                        "role":
                        "user",
                        "content":
                        [
                            {
                                "type": "image",
                                "image": f"file://{image_path}"
                            }, {
                                "type": "text",
                                "text": question_template.format(Question=x['normal_caption'])
                            }
                        ]
                    }
                ]
            )

        all_outputs = []  # List to store all answers

        # Process data
        for i in tqdm(range(0, len(messages), batch_size)):
            batch_messages = messages[i:i + batch_size]

            # Preparation for inference
            text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]

            image_inputs, video_inputs = process_vision_info(batch_messages)
            inputs = processor(
                text=text,
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(device_map)

            # Inference: Generation of the output
            generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)

            generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
            batch_output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            all_outputs.extend(batch_output_text)

        final_output = []
        correct_number = 0

        for input_example, model_output in zip(data, all_outputs):
            original_output = model_output
            ground_truth = input_example['solution']
            ground_truth_normalized = input_example['normalized_solution']
            model_answer, normalized = extract_bbox_answer(original_output)

            # Count correct answers
            correct = 0
            if model_answer is not None:
                iou_value = iou(model_answer, ground_truth_normalized if normalized else ground_truth)
                if iou_value > 0.5:
                    correct = 1
            correct_number += correct

            # Create a result dictionary for this example
            result = {
                "question": question_template.format(Question=input_example['normal_caption']),
                "ground_truth": ground_truth if not normalized else ground_truth_normalized,
                "model_output": original_output,
                "extracted_answer": model_answer,
                "correct": correct,
                "iou": iou_value
            }
            final_output.append(result)

        # Calculate and print accuracy
        accuracy = correct_number / len(data) * 100
        print(f"\nAccuracy of {ds}: {accuracy:.2f}%")

        # Save results to a JSON file
        result_path = os.path.join(output_dir, f"{os.path.basename(model_path)}", f"{ds}_od_r1.json")
        os.makedirs(os.path.dirname(result_path), exist_ok=True)
        with open(result_path, "w") as f:
            json.dump({"accuracy": accuracy, "results": final_output}, f, indent=2)

        print(f"Results saved to {result_path}")
        print('-' * 100)


if __name__ == "__main__":
    model_path = ''  # Add the path to the model
    data_root = ''  # Add the data root
    test_datasets = ['refcoco_val', 'refcocop_val', 'refcocog_val']  # modify the datasets
    image_root = ''  # Add the image root
    output_dir = 'logs'  # Add the output directory, default is logs
    device_map = 'cuda:0'  # select the device, default is cuda:0

    question_template = '{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format.'  # modify the question template, which must contain {Question}; {Question} will be replaced by the caption

    eval_od_r1(
        model_path=model_path,
        data_root=data_root,
        test_datasets=test_datasets,
        image_root=image_root,
        question_template=question_template,
        output_dir=output_dir,
        device_map=device_map
    )
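The detection scripts above and below both declare a prediction correct when IoU exceeds 0.5, computed with inclusive pixel coordinates (hence the -1/+1 arithmetic). A small self-contained worked example with invented boxes:

def iou(box1, box2):
    # Intersection-over-union with inclusive pixel coordinates, matching the
    # convention used in the deleted scripts.
    inter_x1 = max(box1[0], box2[0])
    inter_y1 = max(box1[1], box2[1])
    inter_x2 = min(box1[2] - 1, box2[2] - 1)
    inter_y2 = min(box1[3] - 1, box2[3] - 1)
    if inter_x1 < inter_x2 and inter_y1 < inter_y2:
        inter = (inter_x2 - inter_x1 + 1) * (inter_y2 - inter_y1 + 1)
    else:
        inter = 0
    union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + (box2[2] - box2[0]) * (box2[3] - box2[1]) - inter
    return float(inter) / union

# A prediction shifted 10 px from a 100x100 ground-truth box:
pred = [10, 10, 110, 110]
gt = [20, 20, 120, 120]
print(iou(pred, gt))  # ~0.68, which clears the 0.5 correctness threshold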
eval/test_rec_baseline.py
DELETED
@@ -1,225 +0,0 @@
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import json
from tqdm import tqdm
import re
import os
from pprint import pprint
import random


import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import argparse

import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="transformers")

def setup_distributed():
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)

    dist.init_process_group(backend="nccl")

    world_size = dist.get_world_size()
    rank = dist.get_rank()

    print(f"Process {rank}/{world_size} initialized on cuda:{local_rank}")
    return local_rank, world_size, rank

local_rank, world_size, rank = setup_distributed()
device = f"cuda:{local_rank}"

steps = 100
MODEL_PATH = f"/data10/shz/project/LLaMA-Factory/saves/qwen2_5_vl-3b/full/sft/checkpoint-{steps}"
OUTPUT_PATH = "./logs/rec_results_{DATASET}_qwen2_5vl_3b_instruct_sft_{STEPS}.json"

# MODEL_PATH = "/data10/shz/ckpt/vlm-r1-related/Qwen2.5-VL-3B-Instruct"
# OUTPUT_PATH = "./logs/rec_results_{DATASET}_qwen2_5vl_3b_instruct_baseline_{STEPS}.json"

BSZ = 4
DATA_ROOT = "/data10/shz/dataset/rec/rec_jsons_processed"

TEST_DATASETS = ['refcoco_val', 'refcocop_val', 'refcocog_val']
IMAGE_ROOT = "/data10/shz/dataset/coco"

# TEST_DATASETS = ['lisa_test']
# IMAGE_ROOT = "/data10/shz/dataset/lisa"

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map={"": local_rank},
)

# default processor
processor = AutoProcessor.from_pretrained(MODEL_PATH)

def extract_bbox_answer(content):
    bbox_pattern = r'\[(\s*-?\d*\.?\d+\s*),\s*(\s*-?\d*\.?\d+\s*),\s*(\s*-?\d*\.?\d+\s*),\s*(\s*-?\d*\.?\d+\s*)\]'
    # bbox_pattern = r'\[(-?\d*\.?\d+),\s*(-?\d*\.?\d+),\s*(-?\d*\.?\d+),\s*(-?\d*\.?\d+)\]'
    bbox_match = re.search(bbox_pattern, content)

    if bbox_match:
        bbox = [float(bbox_match.group(1)), float(bbox_match.group(2)), float(bbox_match.group(3)), float(bbox_match.group(4))]
        return bbox
    return [0, 0, 0, 0]

def iou(box1, box2):
    inter_x1 = max(box1[0], box2[0])
    inter_y1 = max(box1[1], box2[1])
    inter_x2 = min(box1[2] - 1, box2[2] - 1)
    inter_y2 = min(box1[3] - 1, box2[3] - 1)
    if inter_x1 < inter_x2 and inter_y1 < inter_y2:
        inter = (inter_x2 - inter_x1 + 1) * (inter_y2 - inter_y1 + 1)
    else:
        inter = 0
    union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + (box2[2] - box2[0]) * (box2[3] - box2[1]) - inter
    return float(inter) / union

num_samples = 2000
for ds in TEST_DATASETS:
    if rank == 0:
        print(f"Processing {ds}...")
    ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
    data = json.load(open(ds_path, "r"))
    random.seed(42)
    random.shuffle(data)
    data = data[:num_samples]
    # QUESTION_TEMPLATE = "{Question}" if steps > 0 else "{Question} Please provide the bounding box coordinate in JSON format."
    QUESTION_TEMPLATE = "{Question} Please provide the bounding box coordinate in JSON format."

    # Split data for distributed evaluation
    per_rank_data = len(data) // world_size
    start_idx = rank * per_rank_data
    end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
    rank_data = data[start_idx:end_idx]

    messages = []

    for x in rank_data:
        image_path = os.path.join(IMAGE_ROOT, x['image'])
        message = [
            # {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"file://{image_path}"
                    },
                    {
                        "type": "text",
                        "text": QUESTION_TEMPLATE.format(Question=x['problem'])
                    }
                ]
            }]
        messages.append(message)

    rank_outputs = []  # List to store answers for this rank
    all_outputs = []  # List to store all answers

    # Process data
    for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
        batch_messages = messages[i:i + BSZ]

        # Preparation for inference
        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]

        image_inputs, video_inputs = process_vision_info(batch_messages)
        inputs = processor(
            text=text,
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            padding_side="left",
            return_tensors="pt",
        )
        inputs = inputs.to(device)

        # Inference: Generation of the output
        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        batch_output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        rank_outputs.extend(batch_output_text)

    print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")

    # Gather all outputs from all ranks
    all_outputs = [None] * len(data)
    rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]

    gathered_results = [None] * world_size
    dist.all_gather_object(gathered_results, rank_results)

    assert gathered_results[-1][-1][0] == len(data) - 1

    # The main process will collect all results
    if rank == 0:
        for results in gathered_results:
            for idx, output in results:
                assert idx < len(all_outputs)
                all_outputs[idx] = output
        assert all_outputs[-1] is not None

        final_output = []
        correct_number = 0

        for input_example, model_output in zip(data, all_outputs):
            original_output = model_output
            ground_truth = input_example['solution']
            model_answer = extract_bbox_answer(original_output)

            # Count correct answers
            correct = 0
            if model_answer is not None:
                if iou(model_answer, ground_truth) > 0.5:
                    correct = 1
            correct_number += correct

            # Create a result dictionary for this example
            result = {
                'image': input_example['image'],
                'question': input_example['problem'],
                'ground_truth': ground_truth,
                'model_output': original_output,
                'extracted_answer': model_answer,
                'correct': correct
            }
            final_output.append(result)

        # Calculate and print accuracy
        accuracy = correct_number / len(data) * 100
        print(f"\nAccuracy of {ds}: {accuracy:.2f}%")

        # Save results to a JSON file
        output_path = OUTPUT_PATH.format(DATASET=ds, STEPS=steps)
        output_dir = os.path.dirname(output_path)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        with open(output_path, "w") as f:
            json.dump({
                'accuracy': accuracy,
                'results': final_output
            }, f, indent=2)

        print(f"Results saved to {output_path}")
        print("-" * 100)

    # Synchronize all processes
    dist.barrier()
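The distributed eval scripts shard each dataset contiguously by rank and reassemble outputs through (global_index, output) pairs gathered with all_gather_object. A torch-free simulation of that bookkeeping (ranks faked in a loop; the sample strings are illustrative) shows why the reassembly preserves dataset order even though the last rank takes the remainder:

# Simulate the rank-sharded evaluation bookkeeping used above, without torch.
data = [f"sample_{i}" for i in range(10)]
world_size = 3

gathered_results = []
for rank in range(world_size):
    per_rank = len(data) // world_size
    start = rank * per_rank
    end = start + per_rank if rank < world_size - 1 else len(data)
    # Each "rank" tags its outputs with the global index, exactly like
    # rank_results = [(start_idx + i, output) ...] in the scripts.
    outputs = [s.upper() for s in data[start:end]]
    gathered_results.append([(start + i, out) for i, out in enumerate(outputs)])

# Rank 0 scatters everything back into a single ordered list.
all_outputs = [None] * len(data)
for results in gathered_results:
    for idx, out in results:
        all_outputs[idx] = out

assert all(o is not None for o in all_outputs)
print(all_outputs)  # ['SAMPLE_0', ..., 'SAMPLE_9'] in original order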
eval/test_rec_r1.py
DELETED
|
@@ -1,232 +0,0 @@
|
|
| 1 |
-
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
|
| 2 |
-
from qwen_vl_utils import process_vision_info
|
| 3 |
-
import torch
|
| 4 |
-
import json
|
| 5 |
-
from tqdm import tqdm
|
| 6 |
-
import re
|
| 7 |
-
import os
|
| 8 |
-
from pprint import pprint
|
| 9 |
-
import random
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
import torch.distributed as dist
|
| 13 |
-
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 14 |
-
import argparse
|
| 15 |
-
|
| 16 |
-
import warnings
|
| 17 |
-
|
| 18 |
-
warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
|
| 19 |
-
|
| 20 |
-
def setup_distributed():
|
| 21 |
-
local_rank = int(os.environ.get("LOCAL_RANK", 0))
|
| 22 |
-
torch.cuda.set_device(local_rank)
|
| 23 |
-
|
| 24 |
-
dist.init_process_group(backend="nccl")
|
| 25 |
-
|
| 26 |
-
world_size = dist.get_world_size()
|
| 27 |
-
rank = dist.get_rank()
|
| 28 |
-
|
| 29 |
-
return local_rank, world_size, rank
|
| 30 |
-
|
| 31 |
-
local_rank, world_size, rank = setup_distributed()
|
| 32 |
-
device = f"cuda:{local_rank}"
|
| 33 |
-
print(f"Process {rank} using {device}")
|
| 34 |
-
|
| 35 |
-
steps = 100
|
| 36 |
-
if rank == 0:
|
| 37 |
-
print("Steps: ", steps)
|
| 38 |
-
|
| 39 |
-
RUN_NAME = "Qwen2.5-VL-7B-GRPO-GUI-Grounding_image_size"
|
| 40 |
-
|
| 41 |
-
MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11179904/code/VLM_R1_CUSTOM/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
|
| 42 |
-
OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
|
| 43 |
-
|
| 44 |
-
BSZ=16
|
| 45 |
-
DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data/rec_jsons_processed"
|
| 46 |
-
|
| 47 |
-
TEST_DATASETS = ['refcoco_val', 'refcocop_val', 'refcocog_val']
|
| 48 |
-
IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data"
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
# TEST_DATASETS = ['lisa_test']
|
| 52 |
-
# IMAGE_ROOT = "/data10/shz/dataset/lisa"
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
#We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
|
| 56 |
-
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 57 |
-
MODEL_PATH,
|
| 58 |
-
torch_dtype=torch.bfloat16,
|
| 59 |
-
attn_implementation="flash_attention_2",
|
| 60 |
-
device_map={"": local_rank},
|
| 61 |
-
)
|
| 62 |
-
|
| 63 |
-
# default processer
|
| 64 |
-
processor = AutoProcessor.from_pretrained(MODEL_PATH)
|
| 65 |
-
|
| 66 |
-
def extract_bbox_answer(content):
|
| 67 |
-
# Try to find the bbox within <answer> tags, if can not find, return [0, 0, 0, 0]
|
| 68 |
-
answer_tag_pattern = r'<answer>(.*?)</answer>'
|
| 69 |
-
bbox_pattern = r'\{.*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)]\s*.*\}'
|
| 70 |
-
content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
|
| 71 |
-
if content_answer_match:
|
| 72 |
-
content_answer = content_answer_match.group(1).strip()
|
| 73 |
-
bbox_match = re.search(bbox_pattern, content_answer, re.DOTALL)
|
| 74 |
-
if bbox_match:
|
| 75 |
-
bbox = [int(bbox_match.group(1)), int(bbox_match.group(2)), int(bbox_match.group(3)), int(bbox_match.group(4))]
|
| 76 |
-
return bbox
|
| 77 |
-
return [0, 0, 0, 0]
|
| 78 |
-
def iou(box1, box2):
    # Intersection over union of two [x1, y1, x2, y2] boxes.
    inter_x1 = max(box1[0], box2[0])
    inter_y1 = max(box1[1], box2[1])
    inter_x2 = min(box1[2]-1, box2[2]-1)
    inter_y2 = min(box1[3]-1, box2[3]-1)
    if inter_x1 < inter_x2 and inter_y1 < inter_y2:
        inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1)
    else:
        inter = 0
    union = (box1[2]-box1[0])*(box1[3]-box1[1]) + (box2[2]-box2[0])*(box2[3]-box2[1]) - inter
    return float(inter)/union

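# Worked example (illustrative comment, not part of the original script):
# iou([0, 0, 10, 10], [5, 5, 15, 15]) gives an intersection of
# (9-5+1) * (9-5+1) = 25 and a union of 100 + 100 - 25 = 175, so the IoU is
# 25/175 ≈ 0.143, well below the 0.5 threshold used for scoring below.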
num_samples = 2000
for ds in TEST_DATASETS:
    if rank == 0:
        print(f"Processing {ds}...")
    ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
    data = json.load(open(ds_path, "r"))
    random.seed(42)
    random.shuffle(data)
    data = data[:num_samples]

    QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."

    # Split data for distributed evaluation
    per_rank_data = len(data) // world_size
    start_idx = rank * per_rank_data
    end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
    rank_data = data[start_idx:end_idx]

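    # Sharding example (illustrative comment, not part of the original script):
    # with num_samples = 2000 and world_size = 8, ranks 0-6 each take 250
    # contiguous examples, and the last rank also absorbs the remainder when
    # len(data) is not evenly divisible by world_size.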
    messages = []

    for x in rank_data:
        image_path = os.path.join(IMAGE_ROOT, x['image'])
        message = [
            # {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"file://{image_path}"
                    },
                    {
                        "type": "text",
                        "text": QUESTION_TEMPLATE.format(Question=x['problem'])
                    }
                ]
            }]
        messages.append(message)

    rank_outputs = []  # List to store answers for this rank
    all_outputs = []   # List to store all answers

    # Process data in batches of BSZ
    for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
        batch_messages = messages[i:i + BSZ]

        # Preparation for inference
        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]

        image_inputs, video_inputs = process_vision_info(batch_messages)
        inputs = processor(
            text=text,
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            padding_side="left",
            return_tensors="pt",
        )
        inputs = inputs.to(device)

        # Inference: generation of the output (greedy decoding)
        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        batch_output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        rank_outputs.extend(batch_output_text)

    print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")

    # Gather all outputs from all ranks
    all_outputs = [None] * len(data)
    rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]

    gathered_results = [None] * world_size
    dist.all_gather_object(gathered_results, rank_results)

    assert gathered_results[-1][-1][0] == len(data) - 1

    # The main process collects all results
    if rank == 0:
        for results in gathered_results:
            for idx, output in results:
                assert idx < len(all_outputs)
                all_outputs[idx] = output
        assert all_outputs[-1] is not None

        final_output = []
        correct_number = 0

        for input_example, model_output in zip(data, all_outputs):
            original_output = model_output
            ground_truth = input_example['solution']
            model_answer = extract_bbox_answer(original_output)

            # Count correct answers (IoU > 0.5 counts as a hit)
            correct = 0
            if model_answer is not None:
                if iou(model_answer, ground_truth) > 0.5:
                    correct = 1
            correct_number += correct

            # Create a result dictionary for this example
            result = {
                'image': input_example['image'],
                'question': input_example['problem'],
                'ground_truth': ground_truth,
                'model_output': original_output,
                'extracted_answer': model_answer,
                'correct': correct
            }
            final_output.append(result)

        # Calculate and print accuracy
        accuracy = correct_number / len(data) * 100
        print(f"\nAccuracy of {ds}: {accuracy:.2f}%")

        # Save results to a JSON file
        output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
        output_dir = os.path.dirname(output_path)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        with open(output_path, "w") as f:
            json.dump({
                'accuracy': accuracy,
                'results': final_output
            }, f, indent=2)

        print(f"Results saved to {output_path}")
        print("-" * 100)

    # Synchronize all processes before moving to the next dataset
    dist.barrier()
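
Since this script reads LOCAL_RANK from the environment and initializes an NCCL process group itself, it is meant to be run under a multi-process launcher such as torchrun, which sets that variable per process. A minimal launch sketch for the 232-line eval/test_rec_r1.py deleted in this commit (the GPU count is an assumption, not something the commit specifies):

torchrun --nproc_per_node=8 eval/test_rec_r1.py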