| import argparse
|
| import os
|
| import random
|
|
|
| import gymnasium as gym
|
| import pandas as pd
|
|
|
|
|
| from verl.utils.hdfs_io import copy, makedirs
|
|
|
| import rllm
|
|
|
def make_map_fn(split):
    """Return a function that builds one dataset record for ``split``.

    Args:
        split: Label stored in every record under ``extra_info["split"]``.

    Returns:
        A callable ``process_fn(env_id, idx)`` that returns the record dict
        for a single environment id.
    """

    def process_fn(env_id, idx):
        """Build the record for ``env_id`` at position ``idx`` within its split."""
        return {
            "data_source": "webarena",
            # Prompt text and ground truth are written as empty strings here.
            # NOTE(review): presumably the task text comes from the
            # environment at rollout time -- confirm against the consumer.
            "prompt": [
                {
                    "role": "user",
                    "content": "",
                }
            ],
            "ability": "web",
            "reward_model": {"style": "rule", "ground_truth": ""},
            "extra_info": {
                "split": split,
                "index": idx,
                "env_id": env_id,
            },
        }

    return process_fn


def _build_arg_parser(default_local_dir):
    """Create the CLI parser; ``default_local_dir`` is the ``--local_dir`` default."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=default_local_dir)
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument(
        "--train_ratio",
        type=float,
        default=0.8,
        # argparse %-formats help strings, so a literal percent sign must be
        # written as "%%"; a bare "%" makes --help raise ValueError.
        help="Ratio of data to use for training (default: 80%%)",
    )
    return parser


def _split_env_ids(env_ids, train_ratio, seed=42):
    """Shuffle ``env_ids`` deterministically and split them into (train, test).

    The input sequence is not modified. The first
    ``int(train_ratio * len(env_ids))`` shuffled ids form the train split and
    the remainder the test split.
    """
    shuffled = list(env_ids)
    train_size = int(train_ratio * len(shuffled))
    # A private generator gives the same order as seeding the module-level
    # RNG with ``seed`` but leaves the global random state untouched.
    random.Random(seed).shuffle(shuffled)
    return shuffled[:train_size], shuffled[train_size:]


def main():
    """Write train/test parquet files listing the registered WebArena env ids.

    Collects every gymnasium registry id starting with
    ``"browsergym_async/webarena"``, splits the ids according to
    ``--train_ratio`` (clamped to [0, 1]), writes ``train.parquet`` and
    ``test.parquet`` into ``--local_dir``, and copies that directory to
    ``--hdfs_dir`` when one is given.
    """
    # Imported here rather than at module top so that merely importing this
    # file does not pull in browsergym.
    import importlib

    import browsergym.async_webarena

    # NOTE(review): the reload presumably re-runs the module's environment
    # registration -- confirm why a plain import is not sufficient.
    importlib.reload(browsergym.async_webarena)

    rllm_dir = os.path.dirname(os.path.dirname(os.path.abspath(rllm.__file__)))
    # NOTE(review): the default directory is named "rllm-miniwob" although the
    # records are tagged "webarena" -- kept as-is for compatibility; confirm.
    parser = _build_arg_parser(os.path.join(rllm_dir, "data/rllm-miniwob"))
    args = parser.parse_args()

    # Expand "~" once so that directory creation, the parquet writes and the
    # HDFS copy all refer to the same path.
    local_dir = os.path.expanduser(args.local_dir)
    os.makedirs(local_dir, exist_ok=True)

    hdfs_dir = args.hdfs_dir
    train_ratio = max(0.0, min(1.0, args.train_ratio))

    env_ids = [env_id for env_id in gym.envs.registry.keys() if env_id.startswith("browsergym_async/webarena")]
    train_envs, test_envs = _split_env_ids(env_ids, train_ratio)

    # Build each record factory once instead of once per element.
    to_train_record = make_map_fn("train")
    to_test_record = make_map_fn("test")
    train_data = [to_train_record(env_id, idx) for idx, env_id in enumerate(train_envs)]
    test_data = [to_test_record(env_id, idx) for idx, env_id in enumerate(test_envs)]

    print("Train data size:", len(train_data))
    print("Test data size:", len(test_data))

    train_path = os.path.join(local_dir, "train.parquet")
    test_path = os.path.join(local_dir, "test.parquet")
    print("Saving train data to", train_path)
    pd.DataFrame(train_data).to_parquet(train_path)
    pd.DataFrame(test_data).to_parquet(test_path)

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_dir, dst=hdfs_dir)


if __name__ == "__main__":
    main()
|
|
|