File size: 2,501 Bytes
e34b94f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import pyarrow.parquet as pq
import json
import os
from pathlib import Path
from PIL import Image
import io

# Configuration: input parquet shard and the locations the converted data is
# written to (one JSON file plus an images/ directory next to it).
parquet_path = "/root/CVPR/MemGen/data/mathvision/data/testmini-00000-of-00001-f8ff70fcb2f29b1d.parquet"
output_dir = "/root/CVPR/MemGen/data/mathvision/processed"
images_dir = os.path.join(output_dir, "images")
json_path = os.path.join(output_dir, "data.json")

# Create output directories (makedirs on images_dir also creates output_dir).
os.makedirs(images_dir, exist_ok=True)

print("Reading parquet file...")
# Hold the input handle only while the table is being read; the resulting
# table lives in memory, so the rest of the script does not need the file.
with open(parquet_path, 'rb') as f:
    parquet_file = pq.ParquetFile(f)
    table = parquet_file.read()

print(f"Total rows: {len(table)}")
print(f"Columns: {table.column_names}\n")

# Convert one record batch at a time to plain Python dicts
# ({column_name: python_value}) instead of indexing every cell individually.
rows = (row for batch in table.to_batches() for row in batch.to_pylist())

# Process each row
data_list = []
for row in rows:
    # Prepare data entry; "image" stays None unless an image is written below.
    data_entry = {
        "id": row["id"],
        "question": row["question"],
        "options": row["options"] or [],
        "answer": row["answer"],
        "solution": row["solution"],
        "level": row["level"],
        "subject": row["subject"],
        "image": None,
    }

    # Handle image: rows without embedded image bytes keep image=None.
    decoded_image = row["decoded_image"]
    if decoded_image and decoded_image["bytes"]:
        image_filename = f"{row['id']}.png"
        image_path = os.path.join(images_dir, image_filename)

        try:
            # The context manager releases the decoder's resources per row.
            # The output format (PNG) is chosen by Pillow from the extension.
            with Image.open(io.BytesIO(decoded_image["bytes"])) as image:
                image.save(image_path)
        except Exception as e:
            # Deliberately best-effort: one undecodable image must not abort
            # the whole conversion; the entry is kept with image=None.
            print(f"Error saving image for id {row['id']}: {e}")
        else:
            # Path is recorded relative to output_dir, where data.json lives.
            data_entry["image"] = f"images/{image_filename}"
            print(f"Saved image: {image_filename}")

    data_list.append(data_entry)

# Save JSON
print(f"\nSaving JSON to: {json_path}")
with open(json_path, 'w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, ensure_ascii=False, indent=2)

print("\n✓ Conversion complete!")
print(f"  - JSON file: {json_path}")
print(f"  - Images directory: {images_dir}")
print(f"  - Total items: {len(data_list)}")
print(f"  - Total images saved: {sum(1 for d in data_list if d['image'])}")