import json
import os
import random
import shutil

from datasets import load_dataset
from tqdm import tqdm
|
|
# Fix the shuffle order so the validation subset sampled below is reproducible.
random.seed(1234)
# Maximum number of examples kept per validation split (see create_val_dataset).
VAL_NUM = 5000
|
|
|
|
def create_r1_train_dataset(
    valid_pair_json,
    data_dir,
    img_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/trainA/",
):
    """Build an ``imagefolder`` training dataset from R1-filtered QA pairs.

    Reads JSONL pairs, extracts the ``<think>...</think>`` segment of each R1
    response, copies the referenced image into ``data_dir`` under a unique
    per-example name, writes ``metadata.jsonl``, and loads the directory with
    ``datasets.load_dataset``.

    Args:
        valid_pair_json: Path to a JSONL file whose lines contain the keys
            ``img_filename``, ``r1_response``, ``q`` and ``a``.
        data_dir: Output directory for copied images and ``metadata.jsonl``
            (created if missing).
        img_dir: Source directory holding the original training images.

    Returns:
        The ``datasets.Dataset`` loaded from ``data_dir`` (``train`` split).
    """
    os.makedirs(data_dir, exist_ok=True)
    # Context manager instead of a bare open() so the handle is closed.
    with open(valid_pair_json, "r") as f:
        pairs = [json.loads(line) for line in f]

    mapped_pairs = []
    for idx, pair in enumerate(tqdm(pairs)):
        img_filename = pair["img_filename"]
        try:
            thinking = (
                pair["r1_response"]
                .split("<think>")[1]
                .split("</think>")[0]
                .replace("scene description", "image")
            )
        except (KeyError, IndexError, AttributeError):
            # Malformed response (missing key or missing <think> tags):
            # skip the pair, same best-effort behavior as before.
            print("Error processing pair response: ", pair.get("r1_response"))
            continue

        # Unique per-example file name so repeated source images do not
        # collide inside data_dir. splitext handles multi-dot names safely.
        stem, ext = os.path.splitext(img_filename)
        dataset_filename = f"{stem}_{idx}{ext}"
        dst = os.path.join(data_dir, dataset_filename)
        # Bug fix: the old code checked data_dir/img_filename but copied to
        # data_dir/dataset_filename, so the guard never matched the file it
        # created. Check the actual destination.
        if not os.path.exists(dst):
            # shutil.copy instead of os.system("cp ..."): portable and not
            # vulnerable to shell injection via file names.
            shutil.copy(os.path.join(img_dir, img_filename), dst)

        mapped_pairs.append(
            {
                "thinking": "<think>" + thinking + "</think>",
                "problem": pair["q"],
                "solution": f"<answer> {pair['a']} </answer>",
                "file_name": dataset_filename,
            }
        )

    with open(f"{data_dir}/metadata.jsonl", "w") as f:
        for mapped in mapped_pairs:
            f.write(json.dumps(mapped) + "\n")

    train_dataset = load_dataset(
        "imagefolder",
        data_dir=data_dir,
        split="train",
    )
    return train_dataset
|
|
|
|
def create_val_dataset(
    json_file,
    data_dir,
    val_num=VAL_NUM,
    image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valB",
):
    """Build an ``imagefolder`` validation dataset from a QA JSON file.

    Samples up to ``val_num`` pairs (order randomized by the module-level
    seed), copies each referenced image into ``data_dir`` under a unique
    name, writes ``metadata.jsonl``, and loads the directory with
    ``datasets.load_dataset``.

    Args:
        json_file: Path to a JSON list of dicts with keys ``q``, ``a`` and
            ``img_filename``.
        data_dir: Output directory for copied images and ``metadata.jsonl``
            (created if missing).
        val_num: Maximum number of validation pairs to keep.
        image_dir: Source directory holding the original validation images.

    Returns:
        The ``datasets.Dataset`` loaded from ``data_dir`` (``train`` split).
    """
    os.makedirs(data_dir, exist_ok=True)
    # Context manager instead of json.load(open(...)) so the handle closes.
    with open(json_file) as f:
        val = json.load(f)
    random.shuffle(val)
    val = val[:val_num]

    val_pairs = []
    for idx, pair in enumerate(tqdm(val)):
        img_filename = pair["img_filename"]
        # Unique per-example file name; splitext handles multi-dot names.
        stem, ext = os.path.splitext(img_filename)
        val_filename = f"{stem}_{idx}{ext}"
        dst = os.path.join(data_dir, val_filename)
        # Bug fix: the old guard checked data_dir/img_filename but copied to
        # data_dir/val_filename; check the actual destination instead.
        if not os.path.exists(dst):
            # shutil.copy instead of os.system("cp ..."): portable, no shell.
            shutil.copy(os.path.join(image_dir, img_filename), dst)

        val_pairs.append(
            {
                "problem": pair["q"],
                "solution": f"<answer> {pair['a']} </answer>",
                "file_name": val_filename,
            }
        )

    with open(f"{data_dir}/metadata.jsonl", "w") as f:
        for val_pair in val_pairs:
            f.write(json.dumps(val_pair) + "\n")

    val_dataset = load_dataset("imagefolder", data_dir=data_dir, split="train")
    return val_dataset
|
|
|
|
| |
| VALA_DATA_DIR = "data/Clevr_CoGenT_ValA" |
| VALB_DATA_DIR = "data/Clevr_CoGenT_ValB" |
| valA_json = ( |
| "/home/lilei/Visual-R1/data/clever_counting_problems_clevr_cogent_v1.0_valA.json" |
| ) |
| valB_json = ( |
| "/home/lilei/Visual-R1/data/clever_counting_problems_clevr_cogent_v1.0_valB.json" |
| ) |
| TRAIN_DATADIR = "data/Clevr_CoGenT_TrainA_R1" |
| train_dataset = create_r1_train_dataset( |
| "/home/lilei/Visual-R1/filter_results_v2/valid_pairs.jsonl", |
| TRAIN_DATADIR, |
| ) |
|
|
| |
| valA_dataset = create_val_dataset( |
| valA_json, |
| VALA_DATA_DIR, |
| image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valA", |
| ) |
| valB_dataset = create_val_dataset( |
| valB_json, |
| VALB_DATA_DIR, |
| image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valB", |
| ) |
| valA_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_ValA") |
| valB_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_ValB") |
| train_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_TrainA_R1") |
|
|