File size: 3,927 Bytes
e34b94f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
#!/usr/bin/env python3
"""
Convert math_vision data_all.json to mm_math format and split into train/test sets
"""
import json
import random
from pathlib import Path
def convert_to_mm_math_format(item):
    """
    Convert one math_vision record to the mm_math record layout.

    Math vision format:
    {
        "id": "1",
        "question": "Which number should be written in place of the question mark?\\n<image1>",
        "options": [],
        "image": "images/1.jpg",
        "answer": "60",
        "solution": null,
        "level": 2,
        "subject": "arithmetic"
    }
    MM Math format:
    {
        "solution": "\\boxed{28^\\circ}",
        "prompt": "Solve the problem and output the answer in the format of \\boxed{your answer}.\\n Question: ... \\n Options: ...",
        "completion": "answer",
        "image_path": "dataset/mm_math/images/MM_Math/52076087.png"
    }

    Fields that are missing or JSON ``null`` are treated as empty, so a
    null answer yields an empty box instead of the text "None".

    Args:
        item: Mapping holding one math_vision record. Only the keys
            'answer', 'question', 'options' and 'image' are read.

    Returns:
        A dict with the keys 'solution', 'prompt', 'completion' and
        'image_path'. 'solution' and 'completion' carry the same boxed
        answer string.
    """
    # dict.get(key, default) only applies the default when the key is absent;
    # a key present with a null value comes back as None, so normalise it here.
    answer = item.get('answer')
    if answer is None:
        answer = ''
    question = item.get('question')
    if question is None:
        question = ''
    options = item.get('options') or []
    image = item.get('image')

    # The same boxed answer is used for both 'solution' and 'completion'.
    boxed_answer = f"\\boxed{{{answer}}}"

    # NOTE: "\\n" below is a literal backslash followed by 'n', not a newline.
    # This appears to mirror the mm_math sample prompt in the docstring;
    # confirm with the consumer of these files before changing it.
    prompt = (
        "Solve the problem and output the answer in the format of "
        f"\\boxed{{your answer}}.\\n Question: {question}"
    )

    # Add options if they exist; str() keeps numeric options from breaking join.
    if options:
        options_str = ", ".join(str(option) for option in options)
        prompt += f"\\n Options: {options_str}"

    # Convert from "images/1.jpg" to "dataset/math_vision/images/1.jpg"
    image_path = f"dataset/math_vision/{image}" if image else ""

    return {
        "solution": boxed_answer,
        "prompt": prompt,
        "completion": boxed_answer,
        "image_path": image_path,
    }
def main(
    input_file="/root/CVPR/MemGen/data/math_vision/data_all.json",
    output_dir="/root/CVPR/MemGen/data/math_vision",
    split_ratio=0.8,
    seed=42,
):
    """
    Shuffle the math_vision records, split them, convert and save them.

    The input JSON is loaded, shuffled with a seeded generator, split into a
    train part and a test part, converted record by record with
    convert_to_mm_math_format, and written to ``train.json`` and
    ``test.json`` inside ``output_dir``.

    Args:
        input_file: Path of the JSON file to read. It is expected to hold a
            list of records, since it is shuffled and sliced.
        output_dir: Directory that receives ``train.json`` and ``test.json``.
            It is created if it does not exist.
        split_ratio: Fraction of the records that goes to the train set;
            the remainder goes to the test set.
        seed: Seed for the shuffle, so repeated runs give the same split.

    Raises:
        ValueError: If ``split_ratio`` is not between 0 and 1.
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

    input_file = Path(input_file)
    output_dir = Path(output_dir)

    # Read data_all.json
    print(f"Reading {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Total samples: {len(data)}")

    # Shuffle with a private generator: same permutation as seeding the
    # module-level generator with this seed, without touching global state.
    rng = random.Random(seed)
    rng.shuffle(data)

    # Split into train and test (80% / 20% with the default ratio)
    split_index = int(len(data) * split_ratio)
    train_data = data[:split_index]
    test_data = data[split_index:]
    print(f"Train samples: {len(train_data)}")
    print(f"Test samples: {len(test_data)}")

    # Convert to mm_math format
    print("\nConverting train data to mm_math format...")
    train_converted = [convert_to_mm_math_format(item) for item in train_data]
    print("Converting test data to mm_math format...")
    test_converted = [convert_to_mm_math_format(item) for item in test_data]

    # Save the converted data
    output_dir.mkdir(parents=True, exist_ok=True)
    train_output = output_dir / "train.json"
    test_output = output_dir / "test.json"
    print(f"\nSaving train data to {train_output}...")
    with open(train_output, 'w', encoding='utf-8') as f:
        json.dump(train_converted, f, ensure_ascii=False, indent=2)
    print(f"Saving test data to {test_output}...")
    with open(test_output, 'w', encoding='utf-8') as f:
        json.dump(test_converted, f, ensure_ascii=False, indent=2)

    print("\nConversion complete!")
    print(f"Train set: {len(train_converted)} samples -> {train_output}")
    print(f"Test set: {len(test_converted)} samples -> {test_output}")

    # Show sample converted data; a tiny input can leave the train set empty,
    # in which case there is nothing to index.
    print("\n" + "="*80)
    print("Sample converted train data:")
    print("="*80)
    if train_converted:
        print(json.dumps(train_converted[0], ensure_ascii=False, indent=2))
    else:
        print("(train set is empty)")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|