imagePrediction / create_data.py
sahar-yaccov's picture
Upload 2 files
8d55a0c verified
import os
from datasets import load_dataset
# 1. Load the dataset and shuffle it with a fixed seed, so the sampled subset
#    (and therefore which image lands in which split) is the same on every run.
dataset = load_dataset("Hemg/AI-Generated-vs-Real-Images-Datasets", split="train")
dataset = dataset.shuffle(seed=42)
# Cap the subset at the dataset's size: select() fails when asked for row
# indices past the end, so never request more rows than actually exist.
subset_size = min(3000, len(dataset))
small_subset = dataset.select(range(subset_size))

# 2. Split ratios: 80% train, 10% validation; whatever remains is the test set.
train_float, val_float = 0.8, 0.1
# Both values are exclusive end indices into the shuffled subset.
train_end = int(subset_size * train_float)  # 2400 when subset_size is 3000
val_end = train_end + int(subset_size * val_float)  # 2700 when subset_size is 3000
# 3. Create the output folder tree: one "<base_dir>/<split>/<category>"
#    directory per split/category pair. exist_ok=True keeps the script
#    re-runnable when the folders are already there.
base_dir = "data"
splits = ['train', 'val', 'test']
categories = ['ai', 'real']
for s in splits:
    for cat in categories:
        os.makedirs(f"{base_dir}/{s}/{cat}", exist_ok=True)
# 4. Walk the shuffled subset and save every image as a PNG inside the folder
#    matching its split and category.
for i, item in enumerate(small_subset):
    image = item['image']  # presumably a PIL image (it is saved via .save below)
    label = item['label']

    # The split is decided purely by position in the shuffled subset:
    # [0, train_end) -> train, [train_end, val_end) -> val, the rest -> test.
    if i < train_end:
        current_split = 'train'
    elif i < val_end:
        current_split = 'val'
    else:
        current_split = 'test'

    # NOTE(review): this mapping assumes label 1 = Fake/AI and 0 = Real.
    # Confirm it against the dataset's label names
    # (dataset.features["label"].names); if the order is the other way round,
    # every image is written to the wrong category folder.
    category = "ai" if label == 1 else "real"

    # The PNG writer rejects CMYK pixel data, and a single such image would
    # abort the whole export part-way through; convert those to RGB first.
    if getattr(image, "mode", None) == "CMYK":
        image = image.convert("RGB")

    # The subset index keeps file names unique across all folders.
    file_path = f"{base_dir}/{current_split}/{category}/img_{i}.png"
    image.save(file_path)
# Final summary of what was written.
# NOTE(review): the text of the first message looks mis-decoded (probably
# Hebrew read with the wrong charset). It is kept byte-for-byte here; verify it
# against the original file before changing it.
print(f"讛住转讬讬诐! 讛诪讘谞讛 谞讜爪专 讘转讬拽讬讬转 '{base_dir}':")
# Report the real split sizes, derived from the same boundaries the save loop
# uses, instead of hard-coded numbers that drift when subset_size or the
# ratios change.
print(f"- Train: {train_end} images")
print(f"- Val: {val_end - train_end} images")
print(f"- Test: {subset_size - val_end} images")