#!/usr/bin/env python3
"""
Convert math_vision data_all.json to mm_math format and split into train/test sets
"""

import json
import random
from pathlib import Path

# Default locations used when main() is called without arguments.
DEFAULT_INPUT_FILE = Path("/root/CVPR/MemGen/data/math_vision/data_all.json")
DEFAULT_OUTPUT_DIR = Path("/root/CVPR/MemGen/data/math_vision")


def convert_to_mm_math_format(item):
    """
    Convert one math_vision record into an mm_math record.

    Math vision format (input):
    {
        "id": "1",
        "question": "Which number should be written in place of the question mark?",
        "options": [],
        "image": "images/1.jpg",
        "answer": "60",
        "solution": null,
        "level": 2,
        "subject": "arithmetic"
    }

    MM Math format (output):
    {
        "solution": "\\boxed{60}",
        "prompt": "Solve the problem and output the answer in the format of
                   \\boxed{your answer}.<sep> Question: ... <sep> Options: ...",
        "completion": "\\boxed{60}",
        "image_path": "dataset/math_vision/images/1.jpg"
    }

    ``<sep>`` above stands for the two-character sequence backslash + "n"
    that this function emits between prompt sections (it is not a real
    newline character).

    Args:
        item: Mapping holding the math_vision fields. Only ``answer``,
            ``question``, ``options`` and ``image`` are read; missing keys
            fall back to empty values.

    Returns:
        A dict with the keys ``solution``, ``prompt``, ``completion`` and
        ``image_path``. ``solution`` and ``completion`` are the same boxed
        answer string; ``image_path`` is empty when the item has no image.
    """
    # The boxed answer is used for both "solution" and "completion".
    answer = item.get('answer', '')
    boxed_answer = f"\\boxed{{{answer}}}"

    question = item.get('question', '')
    prompt = f"Solve the problem and output the answer in the format of \\boxed{{your answer}}.\\n Question: {question}"

    # Options are appended only when present. Each one is stringified so
    # non-string choices (e.g. numbers) do not break str.join.
    options = item.get('options', [])
    if options:
        options_str = ", ".join(str(option) for option in options)
        prompt += f"\\n Options: {options_str}"

    # "images/1.jpg" -> "dataset/math_vision/images/1.jpg"
    image = item.get('image', '')
    image_path = f"dataset/math_vision/{image}" if image else ""

    return {
        "solution": boxed_answer,
        "prompt": prompt,
        "completion": boxed_answer,
        "image_path": image_path,
    }


def main(input_file=DEFAULT_INPUT_FILE, output_dir=DEFAULT_OUTPUT_DIR,
         split_ratio=0.8, seed=42):
    """
    Read the math_vision dataset, shuffle it, split it and write both parts.

    Args:
        input_file: Path of the JSON file containing the list of
            math_vision records.
        output_dir: Directory that receives ``train.json`` and
            ``test.json``; it is created when missing.
        split_ratio: Fraction of the shuffled samples that goes into the
            train set; the remainder forms the test set.
        seed: Seed for the shuffle, so the split is reproducible.
    """
    input_file = Path(input_file)
    output_dir = Path(output_dir)

    # A private generator gives the same shuffle as seeding the module-level
    # one, without altering global random state for other code.
    rng = random.Random(seed)

    print(f"Reading {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Total samples: {len(data)}")

    rng.shuffle(data)

    # Split into train and test according to split_ratio (80% / 20% by default)
    split_index = int(len(data) * split_ratio)
    train_data = data[:split_index]
    test_data = data[split_index:]

    print(f"Train samples: {len(train_data)}")
    print(f"Test samples: {len(test_data)}")

    print("\nConverting train data to mm_math format...")
    train_converted = [convert_to_mm_math_format(item) for item in train_data]

    print("Converting test data to mm_math format...")
    test_converted = [convert_to_mm_math_format(item) for item in test_data]

    output_dir.mkdir(parents=True, exist_ok=True)
    train_output = output_dir / "train.json"
    test_output = output_dir / "test.json"

    print(f"\nSaving train data to {train_output}...")
    with open(train_output, 'w', encoding='utf-8') as f:
        json.dump(train_converted, f, ensure_ascii=False, indent=2)

    print(f"Saving test data to {test_output}...")
    with open(test_output, 'w', encoding='utf-8') as f:
        json.dump(test_converted, f, ensure_ascii=False, indent=2)

    print("\nConversion complete!")
    print(f"Train set: {len(train_converted)} samples -> {train_output}")
    print(f"Test set: {len(test_converted)} samples -> {test_output}")

    # Show a sample only when one exists: with fewer than two input samples
    # the default split leaves the train set empty, and indexing it would
    # raise IndexError.
    if train_converted:
        print("\n" + "="*80)
        print("Sample converted train data:")
        print("="*80)
        print(json.dumps(train_converted[0], ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()