File size: 3,916 Bytes
7ed0fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import json
import os
import random
import shutil

from datasets import load_dataset
from tqdm import tqdm

random.seed(1234)
VAL_NUM = 5000


def create_r1_train_dataset(
    valid_pair_json,
    data_dir,
    img_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/trainA/",
):
    """Assemble an ``imagefolder`` training dataset from filtered R1 pairs.

    Reads JSONL records from *valid_pair_json* (each needs keys
    ``img_filename``, ``r1_response``, ``q``, ``a``), copies every referenced
    image from *img_dir* into *data_dir* under an index-suffixed name, writes
    ``metadata.jsonl`` describing the examples, and loads the directory with
    ``datasets.load_dataset("imagefolder")``.

    Args:
        valid_pair_json: Path to the JSONL file of filtered Q/A/response pairs.
        data_dir: Output directory for copied images plus ``metadata.jsonl``.
        img_dir: Source directory holding the original CLEVR images.

    Returns:
        The ``train`` split of the assembled imagefolder dataset.
    """
    os.makedirs(data_dir, exist_ok=True)
    # Context manager closes the handle (the original left it open).
    with open(valid_pair_json, "r") as src:
        pairs = [json.loads(line) for line in src]
    mapped_pairs = []

    for idx, pair in tqdm(enumerate(pairs)):
        img_filename = pair["img_filename"]
        new_pair = {}
        try:
            # Pull the chain-of-thought out of <think>...</think>; a missing
            # tag makes split() return too few parts and raises IndexError.
            new_pair["thinking"] = (
                pair["r1_response"]
                .split("<think>")[1]
                .split("</think>")[0]
                .replace("scene description", "image")
            )
        except IndexError:
            print("Error processing pair response: ", pair["r1_response"])
            continue  # skip malformed responses
        # Suffix with idx so the same source image used by several pairs
        # yields distinct files in the dataset directory.
        stem, ext = img_filename.rsplit(".", 1)
        dataset_filename = f"{stem}_{idx}.{ext}"
        # Bug fix: test the *destination* path. The original checked
        # data_dir/img_filename, which never exists, so it always re-copied.
        if not os.path.exists(f"{data_dir}/{dataset_filename}"):
            # shutil.copy over os.system("cp ..."): no shell interpolation,
            # and failures raise instead of being silently ignored.
            shutil.copy(
                f"{img_dir}/{img_filename}", f"{data_dir}/{dataset_filename}"
            )
        q, a = pair["q"], pair["a"]
        new_pair["problem"] = q
        new_pair["thinking"] = "<think>" + new_pair["thinking"] + "</think>"
        new_pair["solution"] = f"<answer> {a} </answer>"
        # "file_name" is the column imagefolder joins against the image files.
        new_pair["file_name"] = dataset_filename
        mapped_pairs.append(new_pair)
    with open(f"{data_dir}/metadata.jsonl", "w") as f:
        for pair in mapped_pairs:
            f.write(json.dumps(pair) + "\n")

    train_dataset = load_dataset(
        "imagefolder",
        data_dir=data_dir,
        split="train",
    )
    return train_dataset


def create_val_dataset(
    json_file,
    data_dir,
    val_num=VAL_NUM,
    image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valB",
):
    """Assemble an ``imagefolder`` validation dataset from a JSON Q/A file.

    Loads the list of pairs from *json_file*, shuffles it (module-level seed
    makes this reproducible), keeps the first *val_num* entries, copies each
    referenced image from *image_dir* into *data_dir* under an index-suffixed
    name, writes ``metadata.jsonl``, and loads the directory with
    ``datasets.load_dataset("imagefolder")``.

    Args:
        json_file: Path to a JSON array of pairs with ``q``, ``a``,
            ``img_filename`` keys.
        data_dir: Output directory for copied images plus ``metadata.jsonl``.
        val_num: Maximum number of validation examples to keep.
        image_dir: Source directory holding the original images.

    Returns:
        The assembled imagefolder dataset (loaded as its ``train`` split).
    """
    os.makedirs(data_dir, exist_ok=True)
    # Context manager closes the handle (the original left it open).
    with open(json_file) as src:
        val = json.load(src)
    random.shuffle(val)
    val = val[:val_num]
    val_pairs = []
    for idx, pair in tqdm(enumerate(val)):
        q, a = pair["q"], pair["a"]
        img_filename = pair["img_filename"]
        # Suffix with idx so duplicate source images get distinct files.
        stem, ext = img_filename.rsplit(".", 1)
        val_filename = f"{stem}_{idx}.{ext}"
        # Bug fix: test the *destination* path. The original checked
        # data_dir/img_filename, which never exists, so it always re-copied.
        if not os.path.exists(f"{data_dir}/{val_filename}"):
            # shutil.copy over os.system("cp ..."): no shell interpolation,
            # and failures raise instead of being silently ignored.
            shutil.copy(
                f"{image_dir}/{img_filename}", f"{data_dir}/{val_filename}"
            )
        new_pair = {}
        new_pair["problem"] = q
        new_pair["solution"] = f"<answer> {a} </answer>"
        # "file_name" is the column imagefolder joins against the image files.
        new_pair["file_name"] = val_filename
        val_pairs.append(new_pair)
    with open(f"{data_dir}/metadata.jsonl", "w") as f:
        for pair in val_pairs:
            f.write(json.dumps(pair) + "\n")
    val_dataset = load_dataset("imagefolder", data_dir=data_dir, split="train")
    return val_dataset


# --- Script body: build the train / valA / valB datasets and publish them. ---
# NOTE(review): this runs at import time with network side effects
# (push_to_hub); consider an `if __name__ == "__main__":` guard.

# Output directories for the two CoGenT validation splits.
VALA_DATA_DIR = "data/Clevr_CoGenT_ValA"
VALB_DATA_DIR = "data/Clevr_CoGenT_ValB"
# Pre-generated counting Q/A files, one per validation condition.
valA_json = (
    "/home/lilei/Visual-R1/data/clever_counting_problems_clevr_cogent_v1.0_valA.json"
)
valB_json = (
    "/home/lilei/Visual-R1/data/clever_counting_problems_clevr_cogent_v1.0_valB.json"
)
# Output directory for the R1-distilled training split.
TRAIN_DATADIR = "data/Clevr_CoGenT_TrainA_R1"
train_dataset = create_r1_train_dataset(
    "/home/lilei/Visual-R1/filter_results_v2/valid_pairs.jsonl",
    TRAIN_DATADIR,
)

# print(train_dataset)
valA_dataset = create_val_dataset(
    valA_json,
    VALA_DATA_DIR,
    image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valA",
)
valB_dataset = create_val_dataset(
    valB_json,
    VALB_DATA_DIR,
    image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valB",
)
# Upload to the Hub; requires authenticated HF credentials with write access.
valA_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_ValA")
valB_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_ValB")
train_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_TrainA_R1")