# model111 / convert_mathvision.py
# Hosting-page header captured with the file: avatar of LCZZZZ;
# commit "Upload MemGen code and data" (e34b94f, verified).
import pyarrow.parquet as pq
import json
import os
from pathlib import Path
from PIL import Image
import io
# ---------------------------------------------------------------------------
# Locations
# ---------------------------------------------------------------------------
# Input: a single parquet file. Output: one folder that receives data.json
# and an images/ subfolder next to it.
parquet_path = "/root/CVPR/MemGen/data/mathvision/data/testmini-00000-of-00001-f8ff70fcb2f29b1d.parquet"
output_dir = "/root/CVPR/MemGen/data/mathvision/processed"
json_path = os.path.join(output_dir, "data.json")
images_dir = os.path.join(output_dir, "images")

# images_dir lives inside output_dir, so this one call creates both levels;
# exist_ok makes re-running the script harmless.
os.makedirs(images_dir, exist_ok=True)

# ---------------------------------------------------------------------------
# Load
# ---------------------------------------------------------------------------
print("Reading parquet file...")

# The whole file is read into one in-memory table. The read must happen
# while the underlying handle is still open, hence inside the `with`.
with open(parquet_path, 'rb') as f:
    parquet_file = pq.ParquetFile(f)
    table = parquet_file.read()

# Quick sanity report of what was loaded.
print(f"Total rows: {table.num_rows}")
print(f"Columns: {table.column_names}\n")
# Turn every table row into a JSON-ready dict and, when the row carries image
# bytes, write them to images_dir as "<id>.png".
#
# Each entry keeps id/question/options/answer/solution/level/subject from the
# row and gains an "image" key: the path relative to the output folder
# ("images/<id>.png") on success, or None when the row has no image bytes or
# the image could not be decoded/saved.
data_list = []

# Walk the table batch by batch and let pyarrow convert each batch to plain
# Python dicts in one call, instead of indexing every cell individually.
for batch in table.to_batches():
    for row in batch.to_pylist():
        data_entry = {
            "id": row["id"],
            "question": row["question"],
            # A missing/empty options value is normalised to an empty list.
            "options": row["options"] or [],
            "answer": row["answer"],
            "solution": row["solution"],
            "level": row["level"],
            "subject": row["subject"],
        }

        # Default to "no image"; only a successful save overrides this.
        data_entry["image"] = None

        decoded_image = row["decoded_image"]
        image_bytes = decoded_image["bytes"] if decoded_image else None

        if image_bytes:
            image_filename = f"{row['id']}.png"
            image_path = os.path.join(images_dir, image_filename)
            try:
                # The context manager releases the decoder/buffer as soon as
                # the file has been written.
                with Image.open(io.BytesIO(image_bytes)) as image:
                    image.save(image_path)
            except Exception as e:
                # Deliberately best-effort: one undecodable or unsavable
                # image must not abort the conversion of the whole dataset.
                print(f"Error saving image for id {row['id']}: {e}")
            else:
                data_entry["image"] = f"images/{image_filename}"
                print(f"Saved image: {image_filename}")

        data_list.append(data_entry)
# Persist every collected entry as one pretty-printed UTF-8 JSON document
# (non-ASCII characters are written as-is rather than \u-escaped).
print(f"\nSaving JSON to: {json_path}")
with open(json_path, 'w', encoding='utf-8') as out:
    json.dump(data_list, out, ensure_ascii=False, indent=2)

# Final report. An entry counts as "image saved" when its "image" value is
# truthy, i.e. a relative path rather than None.
images_saved = sum(1 for entry in data_list if entry['image'])
summary = [
    "\n✓ Conversion complete!",
    f" - JSON file: {json_path}",
    f" - Images directory: {images_dir}",
    f" - Total items: {len(data_list)}",
    f" - Total images saved: {images_saved}",
]
print("\n".join(summary))