|
|
|
|
|
""" |
|
|
Convert math_vision data_all.json to mm_math format and split into train/test sets |
|
|
""" |
|
|
import json |
|
|
import random |
|
|
from pathlib import Path |
|
|
|
|
|
def convert_to_mm_math_format(item):
    r"""Convert one math_vision record into an mm_math-style record.

    Example math_vision input::

        {
            "id": "1",
            "question": "Which number should be written in place of the question mark?\n<image1>",
            "options": [],
            "image": "images/1.jpg",
            "answer": "60",
            "solution": null,
            "level": 2,
            "subject": "arithmetic"
        }

    Args:
        item: Mapping for a single math_vision sample. Only the ``answer``,
            ``question``, ``options`` and ``image`` keys are read; missing
            keys fall back to an empty string / empty list.

    Returns:
        A dict with four string values:

        * ``solution``   -- the answer wrapped as ``\boxed{<answer>}``.
        * ``prompt``     -- the fixed instruction followed by the question
          and, when options are present, the options joined with ``", "``.
        * ``completion`` -- same value as ``solution``.
        * ``image_path`` -- ``dataset/math_vision/<image>``, or ``""`` when
          the item has no image.

    Note:
        The section separators in ``prompt`` are the two literal characters
        backslash + ``n``, not a newline character.
    """
    answer = item.get('answer', '')
    # solution and completion carry the same boxed answer; build it once.
    boxed_answer = f"\\boxed{{{answer}}}"

    question = item.get('question', '')
    prompt = f"Solve the problem and output the answer in the format of \\boxed{{your answer}}.\\n Question: {question}"

    options = item.get('options', [])
    if options:
        # Coerce each option to str so non-string choices (e.g. numbers)
        # do not make str.join raise TypeError.
        options_str = ", ".join(str(option) for option in options)
        prompt += f"\\n Options: {options_str}"

    image = item.get('image', '')
    image_path = f"dataset/math_vision/{image}" if image else ""

    return {
        "solution": boxed_answer,
        "prompt": prompt,
        "completion": boxed_answer,
        "image_path": image_path,
    }
|
|
|
|
|
def main():
    """Shuffle math_vision data, split it 80/20, convert it and save the result.

    Reads ``data_all.json`` from the hard-coded math_vision data directory,
    shuffles it with a fixed seed, converts every item with
    ``convert_to_mm_math_format`` and writes ``train.json`` and ``test.json``
    into the same directory. Progress messages and one converted sample are
    printed to stdout.
    """
    # Fixed seed so the shuffle (and therefore the split) is reproducible.
    random.seed(42)

    input_file = Path("/root/CVPR/MemGen/data/math_vision/data_all.json")
    print(f"Reading {input_file}...")

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Total samples: {len(data)}")

    # NOTE(review): shuffling in place assumes the top-level JSON value is a
    # list of sample dicts -- confirm against the data file.
    random.shuffle(data)

    split_ratio = 0.8
    split_index = int(len(data) * split_ratio)

    train_data = data[:split_index]
    test_data = data[split_index:]

    print(f"Train samples: {len(train_data)}")
    print(f"Test samples: {len(test_data)}")

    print("\nConverting train data to mm_math format...")
    train_converted = [convert_to_mm_math_format(item) for item in train_data]

    print("Converting test data to mm_math format...")
    test_converted = [convert_to_mm_math_format(item) for item in test_data]

    output_dir = Path("/root/CVPR/MemGen/data/math_vision")

    train_output = output_dir / "train.json"
    test_output = output_dir / "test.json"

    print(f"\nSaving train data to {train_output}...")
    with open(train_output, 'w', encoding='utf-8') as f:
        json.dump(train_converted, f, ensure_ascii=False, indent=2)

    print(f"Saving test data to {test_output}...")
    with open(test_output, 'w', encoding='utf-8') as f:
        json.dump(test_converted, f, ensure_ascii=False, indent=2)

    print("\nConversion complete!")
    print(f"Train set: {len(train_converted)} samples -> {train_output}")
    print(f"Test set: {len(test_converted)} samples -> {test_output}")

    print("\n" + "="*80)
    print("Sample converted train data:")
    print("="*80)
    # With 0 or 1 input samples the train split is empty; avoid an IndexError
    # after the output files have already been written.
    if train_converted:
        print(json.dumps(train_converted[0], ensure_ascii=False, indent=2))
    else:
        print("(train set is empty)")
|
|
|
|
|
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|