File size: 3,927 Bytes

e34b94f

#!/usr/bin/env python3
"""
Convert math_vision data_all.json to mm_math format and split into train/test sets
"""
import json
import random
from pathlib import Path

def convert_to_mm_math_format(item):
    r"""
    Convert one math_vision record into the mm_math record layout.

    Math vision format (JSON):
    {
        "id": "1",
        "question": "Which number should be written in place of the question mark?\n<image1>",
        "options": [],
        "image": "images/1.jpg",
        "answer": "60",
        "solution": null,
        "level": 2,
        "subject": "arithmetic"
    }

    MM Math format (JSON):
    {
        "solution": "\\boxed{28^\\circ}",
        "prompt": "Solve the problem and output the answer in the format of \\boxed{your answer}.\\n Question: ... \\n Options: ...",
        "completion": "answer",
        "image_path": "dataset/mm_math/images/MM_Math/52076087.png"
    }

    Args:
        item: dict holding one math_vision sample. Only the keys "answer",
            "question", "options" and "image" are read; a missing key and an
            explicit null are both treated as empty.

    Returns:
        dict with the keys "solution", "prompt", "completion" and
        "image_path". "solution" and "completion" both hold the answer
        wrapped in \boxed{...}; "image_path" is the source image path
        prefixed with "dataset/math_vision/", or "" when there is no image.
    """
    def _text(key):
        # JSON null arrives as None; render it as empty text instead of "None".
        value = item.get(key)
        return "" if value is None else value

    # The boxed answer is used for both the solution and the completion.
    boxed_answer = f"\\boxed{{{_text('answer')}}}"

    # NOTE(review): "\\n" emits a literal backslash followed by "n", not a
    # newline. Kept as-is because it matches the prompt example above --
    # confirm this is what downstream consumers expect.
    prompt = (
        "Solve the problem and output the answer in the format of "
        f"\\boxed{{your answer}}.\\n Question: {_text('question')}"
    )

    # Append the choices only when there are any; str() keeps non-string
    # options (e.g. numeric choices) from breaking the join.
    options = item.get('options')
    if options:
        options_str = ", ".join(str(option) for option in options)
        prompt += f"\\n Options: {options_str}"

    # Convert from "images/1.jpg" to "dataset/math_vision/images/1.jpg".
    image = item.get('image')
    image_path = f"dataset/math_vision/{image}" if image else ""

    return {
        "solution": boxed_answer,
        "prompt": prompt,
        "completion": boxed_answer,
        "image_path": image_path,
    }

def _write_json(path, records):
    """Write records to path as indented UTF-8 JSON without ASCII-escaping."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)


def main(input_file="/root/CVPR/MemGen/data/math_vision/data_all.json",
         output_dir="/root/CVPR/MemGen/data/math_vision",
         split_ratio=0.8,
         seed=42):
    """
    Shuffle the math_vision samples, split them, and save both parts in mm_math format.

    Args:
        input_file: path of the JSON file holding all math_vision samples.
        output_dir: directory that receives "train.json" and "test.json";
            created if it does not exist.
        split_ratio: fraction of the shuffled samples that goes to the train
            set; the remainder becomes the test set.
        seed: seed for the shuffle, so the split is reproducible.

    Raises:
        ValueError: if split_ratio is outside the range [0, 1].
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

    input_file = Path(input_file)
    output_dir = Path(output_dir)

    # Read the full dataset
    print(f"Reading {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Total samples: {len(data)}")

    # A dedicated generator yields the same order as seeding the module-level
    # one with this seed, without altering the process-wide random state.
    random.Random(seed).shuffle(data)

    # Split into train and test according to split_ratio
    split_index = int(len(data) * split_ratio)
    train_data = data[:split_index]
    test_data = data[split_index:]

    print(f"Train samples: {len(train_data)}")
    print(f"Test samples: {len(test_data)}")

    # Convert to mm_math format
    print("\nConverting train data to mm_math format...")
    train_converted = [convert_to_mm_math_format(item) for item in train_data]

    print("Converting test data to mm_math format...")
    test_converted = [convert_to_mm_math_format(item) for item in test_data]

    # Save the converted data
    output_dir.mkdir(parents=True, exist_ok=True)
    train_output = output_dir / "train.json"
    test_output = output_dir / "test.json"

    print(f"\nSaving train data to {train_output}...")
    _write_json(train_output, train_converted)

    print(f"Saving test data to {test_output}...")
    _write_json(test_output, test_converted)

    print("\nConversion complete!")
    print(f"Train set: {len(train_converted)} samples -> {train_output}")
    print(f"Test set: {len(test_converted)} samples -> {test_output}")

    # Show a sample only when the train split is non-empty; a very small
    # input can leave it empty, and indexing [0] would raise IndexError.
    if train_converted:
        print("\n" + "="*80)
        print("Sample converted train data:")
        print("="*80)
        print(json.dumps(train_converted[0], ensure_ascii=False, indent=2))

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()