File size: 1,581 Bytes
8d55a0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
from datasets import load_dataset

# 1. Load the dataset and shuffle it with a fixed seed, so the chosen subset
#    and the resulting split are the same on every run.
dataset = load_dataset("Hemg/AI-Generated-vs-Real-Images-Datasets", split="train")
dataset = dataset.shuffle(seed=42)
# Never request more rows than the dataset actually holds; selecting
# out-of-range indices would otherwise abort the script.
subset_size = min(3000, len(dataset))
small_subset = dataset.select(range(subset_size))

# 2. Split ratios: train and validation fractions; whatever remains after
#    both becomes the test split.
train_float, val_float = 0.8, 0.1
# Exclusive end indices into the shuffled subset
# (2400 and 2700 when the subset holds 3000 images).
train_end = int(subset_size * train_float)
val_end = train_end + int(subset_size * val_float)

# 3. Build the folder layout: <base_dir>/<split>/<category>/
base_dir = "data"
splits = ['train', 'val', 'test']
categories = ['ai', 'real']

# List every (split, category) leaf folder first, then create them in order.
# exist_ok=True makes re-running the script safe when the folders already exist.
leaf_dirs = [f"{base_dir}/{s}/{cat}" for s in splits for cat in categories]
for leaf_dir in leaf_dirs:
    os.makedirs(leaf_dir, exist_ok=True)

# 4. Walk the shuffled subset and save every image as
#    <base_dir>/<split>/<category>/img_<index>.png

# Image modes Pillow's PNG writer accepts. Any other mode (e.g. CMYK) is
# converted to RGB before saving, so a single odd image does not abort the
# whole export part-way through.
# Assumes item['image'] is a PIL image (it is written with .save below).
png_modes = {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}

for i, item in enumerate(small_subset):
    image = item['image']
    label = item['label']

    # The position in the shuffled order decides the split
    if i < train_end:
        current_split = 'train'
    elif i < val_end:
        current_split = 'val'
    else:
        current_split = 'test'

    # Category: label 1 is stored as "ai", every other label as "real".
    # NOTE(review): the original author states 1 = Fake/AI and 0 = Real for
    # this dataset; confirm against dataset.features['label'].names before
    # training on the exported folders.
    category = "ai" if label == 1 else "real"

    if image.mode not in png_modes:
        image = image.convert("RGB")

    # Save the image; the index keeps file names unique across all splits
    file_path = f"{base_dir}/{current_split}/{category}/img_{i}.png"
    image.save(file_path)

# Final summary of where the data was written and how many images each split got.
# NOTE(review): the message text below looks mis-encoded (possibly Hebrew read
# with the wrong codec); confirm the file is stored as UTF-8.
print(f"讛住转讬讬诐! 讛诪讘谞讛 谞讜爪专 讘转讬拽讬讬转 '{base_dir}':")
# Derive the counts from the actual split boundaries instead of hard-coding
# them, so the report stays correct when the subset size or ratios change.
print(f"- Train: {train_end} images")
print(f"- Val: {val_end - train_end} images")
print(f"- Test: {len(small_subset) - val_end} images")