"""Convert a MathVision parquet split into a JSON file plus extracted images.

Every row of the parquet file becomes one JSON record.  When a row carries
embedded image bytes (``decoded_image.bytes``), the image is written to
``<output_dir>/images/<id>.png`` and the record's ``image`` field holds the
path relative to ``output_dir``; otherwise ``image`` is ``None``.
"""

import io
import json
import os
from pathlib import Path  # noqa: F401  (unused here; kept from the original import list)

from PIL import Image
import pyarrow.parquet as pq


# Input parquet file and output locations.
parquet_path = "/root/CVPR/MemGen/data/mathvision/data/testmini-00000-of-00001-f8ff70fcb2f29b1d.parquet"
output_dir = "/root/CVPR/MemGen/data/mathvision/processed"
images_dir = os.path.join(output_dir, "images")
json_path = os.path.join(output_dir, "data.json")


def _load_table(path):
    """Read the whole parquet file at *path* and return it as a pyarrow Table."""
    with open(path, "rb") as f:
        # The read must happen while the handle is still open: ParquetFile
        # pulls data from the underlying file object on demand.
        return pq.ParquetFile(f).read()


def _iter_rows(table):
    """Yield each row of *table* as a ``{column_name: python_value}`` dict."""
    # Convert batch by batch so only one batch of Python objects (including
    # the image bytes) is materialised at a time.
    for batch in table.to_batches():
        yield from batch.to_pylist()


def _save_image(image_bytes, row_id):
    """Decode *image_bytes* and write it to ``images_dir`` as ``<row_id>.png``.

    Returns the image path relative to ``output_dir`` on success, or ``None``
    if the image could not be decoded or written.
    """
    image_filename = f"{row_id}.png"
    image_path = os.path.join(images_dir, image_filename)

    try:
        # Context manager releases the decoder's resources once saved.
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.save(image_path)
    except Exception as e:
        # Deliberately broad: one undecodable or unwritable image must not
        # abort the whole conversion; the record is kept without an image.
        print(f"Error saving image for id {row_id}: {e}")
        return None

    print(f"Saved image: {image_filename}")
    return f"images/{image_filename}"


def _build_entry(row):
    """Build the JSON record for one parquet *row*, saving its image if any."""
    data_entry = {
        "id": row["id"],
        "question": row["question"],
        # Normalise a missing/empty options value to an empty list.
        "options": row["options"] or [],
        "answer": row["answer"],
        "solution": row["solution"],
        "level": row["level"],
        "subject": row["subject"],
    }

    # The image column is a struct that may be absent, null, or have empty
    # bytes; all of those mean "no image for this record".
    image_info = row.get("decoded_image")
    image_bytes = image_info.get("bytes") if image_info else None
    data_entry["image"] = _save_image(image_bytes, row["id"]) if image_bytes else None
    return data_entry


def main():
    """Run the parquet -> JSON + images conversion and print a summary."""
    os.makedirs(images_dir, exist_ok=True)

    print("Reading parquet file...")
    table = _load_table(parquet_path)

    print(f"Total rows: {len(table)}")
    print(f"Columns: {table.column_names}\n")

    data_list = [_build_entry(row) for row in _iter_rows(table)]

    print(f"\nSaving JSON to: {json_path}")
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(data_list, json_file, ensure_ascii=False, indent=2)

    images_saved = sum(1 for d in data_list if d["image"])

    print("\n✓ Conversion complete!")
    print(f" - JSON file: {json_path}")
    print(f" - Images directory: {images_dir}")
    print(f" - Total items: {len(data_list)}")
    print(f" - Total images saved: {images_saved}")


if __name__ == "__main__":
    main()