# OSWorld / scripts /bash /run_manual_examine.sh
# AbdulElahGwaith's picture
# Upload folder using huggingface_hub
# 2d483c2 verified
# Manual Examination Script for OSWorld Tasks
# This script is used to manually verify and examine specific benchmark tasks
# Example task IDs for different domains:
# libreoffice_impress
# - 358aa0a7-6677-453f-ae35-e440f004c31e
# - a669ef01-ded5-4099-9ea9-25e99b569840
# multi_apps
# - 9219480b-3aed-47fc-8bac-d2cffc5849f7
# chrome
# - bb5e4c0d-f964-439c-97b6-bdb9747de3f4
# - 2ad9387a-65d8-4e33-ad5b-7580065a27ca (needs to be improved)
# - 2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3 (needs to be improved)
# - 7a5a7856-f1b6-42a4-ade9-1ca81ca0f263 (needs to be improved)
# - e1e75309-3ddb-4d09-92ec-de869c928143
# - b4f95342-463e-4179-8c3f-193cd7241fb2
# - b7895e80-f4d1-4648-bee0-4eb45a6f1fa8
# libreoffice_calc
# - 8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14
# - 4e6fcf72-daf3-439f-a232-c434ce416af6
# - 357ef137-7eeb-4c80-a3bb-0951f26a8aff
# chrome
# - c1fa57f3-c3db-4596-8f09-020701085416
# - 06fe7178-4491-4589-810f-2e2bc9502122 (slow setup)
# gimp
# - 06ca5602-62ca-47f6-ad4f-da151cde54cc
# multi_apps
# - 02ce9a50-7af2-47ed-8596-af0c230501f8
# Example usage (the command below is what this script executes when run):
# Launch manual_examine.py for a single benchmark task.
#
# Usage: run_manual_examine.sh [DOMAIN [EXAMPLE_ID]]
#   DOMAIN      value passed to --domain      (default: libreoffice_impress)
#   EXAMPLE_ID  value passed to --example_id  (default: a669ef01-ded5-4099-9ea9-25e99b569840)
#
# With no arguments the command line is identical to the previous hard-coded
# one. The script and metadata paths are relative, so they are resolved against
# the current working directory (presumably the repository root -- confirm).
domain="${1:-libreoffice_impress}"
example_id="${2:-a669ef01-ded5-4099-9ea9-25e99b569840}"

python scripts/python/manual_examine.py \
  --headless \
  --observation_type screenshot \
  --result_dir ./results_human_examine \
  --test_all_meta_path evaluation_examples/test_all.json \
  --domain "$domain" \
  --example_id "$example_id" \
  --max_steps 3