# Manual Examination Script for OSWorld Tasks
# This script is used to manually verify and examine specific benchmark tasks.
#
# Example task IDs for different domains:
#   libreoffice_impress
#     - 358aa0a7-6677-453f-ae35-e440f004c31e
#     - a669ef01-ded5-4099-9ea9-25e99b569840
#   multi_apps
#     - 9219480b-3aed-47fc-8bac-d2cffc5849f7
#   chrome
#     - bb5e4c0d-f964-439c-97b6-bdb9747de3f4
#     - 2ad9387a-65d8-4e33-ad5b-7580065a27ca (needs to be improved)
#     - 2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3 (needs to be improved)
#     - 7a5a7856-f1b6-42a4-ade9-1ca81ca0f263 (needs to be improved)
#     - e1e75309-3ddb-4d09-92ec-de869c928143
#     - b4f95342-463e-4179-8c3f-193cd7241fb2
#     - b7895e80-f4d1-4648-bee0-4eb45a6f1fa8
#   libreoffice_calc
#     - 8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14
#     - 4e6fcf72-daf3-439f-a232-c434ce416af6
#     - 357ef137-7eeb-4c80-a3bb-0951f26a8aff
#   chrome
#     - c1fa57f3-c3db-4596-8f09-020701085416
#     - 06fe7178-4491-4589-810f-2e2bc9502122 (slow setup)
#   gimp
#     - 06ca5602-62ca-47f6-ad4f-da151cde54cc
#   multi_apps
#     - 02ce9a50-7af2-47ed-8596-af0c230501f8
#
# Example usage:
#   The script and metadata paths below are relative, so they resolve against
#   the current working directory (presumably the repository root -- confirm).
#   The --example_id used here is the second libreoffice_impress ID listed
#   above; swap --domain / --example_id together to examine another task.
#   Each trailing backslash must be the last character on its line, otherwise
#   the following flags are not passed to the command.
python scripts/python/manual_examine.py \
    --headless \
    --observation_type screenshot \
    --result_dir ./results_human_examine \
    --test_all_meta_path evaluation_examples/test_all.json \
    --domain libreoffice_impress \
    --example_id a669ef01-ded5-4099-9ea9-25e99b569840 \
    --max_steps 3