File size: 1,280 Bytes
be305fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
import json
import pandas as pd
from datasets import load_dataset
from PIL import Image
import shutil
from tqdm import tqdm

def _read_field(sample, name, legacy_name, default):
    """Return ``sample[name]``, else ``sample[legacy_name]``, else ``default``."""
    value = sample.get(name)
    if value is None:
        value = sample.get(legacy_name, default)
    return value


def load_and_process():
    """Export the first 1000 DiffusionDB training samples to ``processed/``.

    Each sample's image (when present) is saved as a zero-padded PNG under
    ``processed/images`` and its generation metadata is collected into a
    plain dict.

    Returns:
        A list of dicts, one per sample, with the keys ``id``,
        ``image_file``, ``prompt``, ``seed``, ``cfg_scale``, ``steps`` and
        ``sampler``.
    """
    dataset = load_dataset("poloclub/diffusiondb", split="train[:1000]")

    os.makedirs("processed/images", exist_ok=True)
    processed_data = []

    for idx, sample in enumerate(tqdm(dataset)):
        image_id = f"{idx:06d}.png"

        image = sample.get('image')
        if image is not None:
            image.save(f"processed/images/{image_id}")

        # NOTE(review): per the DiffusionDB dataset card, the Hugging Face
        # loader exposes metadata as 'prompt'/'seed'/'cfg'/'step'/'sampler';
        # the abbreviated keys ('p', 'se', 'c', 'st', 'sa') appear to be the
        # raw per-part JSON keys. Reading only the abbreviated keys would
        # leave every field at its default, so the full name is tried first
        # and the abbreviated key is kept as a fallback.
        data_entry = {
            "id": idx,
            "image_file": image_id,
            "prompt": _read_field(sample, 'prompt', 'p', ''),
            "seed": _read_field(sample, 'seed', 'se', 0),
            "cfg_scale": _read_field(sample, 'cfg', 'c', 0.0),
            "steps": _read_field(sample, 'step', 'st', 0),
            "sampler": _read_field(sample, 'sampler', 'sa', ''),
        }
        processed_data.append(data_entry)

    return processed_data

def save_data(data):
    """Write ``data`` to ``processed/`` as JSON, CSV and Parquet.

    Args:
        data: Records accepted by both ``json.dump`` and ``pd.DataFrame``
            (e.g. a list of flat dicts).

    The three outputs are ``processed/data.json``, ``processed/data.csv``
    and ``processed/data.parquet``. Writing Parquet relies on whichever
    Parquet engine pandas finds installed.
    """
    # Ensure the output directory exists so this function does not depend on
    # an earlier step having created it.
    os.makedirs("processed", exist_ok=True)

    with open("processed/data.json", "w") as f:
        json.dump(data, f)

    df = pd.DataFrame(data)
    df.to_csv("processed/data.csv", index=False)
    df.to_parquet("processed/data.parquet", index=False)

def main():
    """Run the pipeline: process the samples, save them, and print the count."""
    samples = load_and_process()
    save_data(samples)

    sample_count = len(samples)
    print(f"Processed {sample_count} samples")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()