garbage-classification / split_dataset.py
hutiger's picture
Upload folder using huggingface_hub
bf5b4d8 verified
Raw
History Blame Contribute Delete
2.92 kB
"""
Dataset splitting script.

Splits dataset/trashnet/ into train / val / test subsets by ratio.

Resulting layout:
    dataset/
    ├── trashnet/   (original data, left untouched)
    ├── train/
    │   ├── cardboard/
    │   └── ...
    ├── val/
    │   ├── cardboard/
    │   └── ...
    └── test/
        ├── cardboard/
        └── ...
"""
import argparse
import random
import shutil
from pathlib import Path
def split_dataset(data_dir, output_dir, train_ratio=0.7, val_ratio=0.15, seed=42):
    """Copy a class-per-folder dataset into train / val / test folders.

    Every immediate sub-directory of ``data_dir`` is treated as one class.
    The files of each class are sorted, shuffled with a seeded RNG and copied
    to ``output_dir/<split>/<class>/``. The source files are only read.

    Args:
        data_dir: Directory holding one sub-directory per class.
        output_dir: Directory in which ``train``/``val``/``test`` are created.
        train_ratio: Fraction of each class assigned to the training split.
        val_ratio: Fraction of each class assigned to the validation split.
            Whatever is left after train and val goes to the test split.
        seed: Seed for the shuffle, so that a run is reproducible.

    Returns:
        None. Progress and a summary are printed to stdout. If ``data_dir``
        is not a directory, a message is printed and nothing is copied.

    Raises:
        ValueError: If a ratio is negative (or NaN) or
            ``train_ratio + val_ratio`` exceeds 1.
    """
    # Reject ratios that would otherwise silently yield a negative test share.
    # Written as "not (valid)" so that NaN is rejected too; the small tolerance
    # absorbs floating-point noise in sums that are meant to equal 1.
    if not (train_ratio >= 0 and val_ratio >= 0
            and train_ratio + val_ratio <= 1 + 1e-9):
        raise ValueError(
            f"无效的划分比例: train_ratio={train_ratio}, val_ratio={val_ratio} "
            "(两者必须非负且之和不超过 1)"
        )

    data_dir = Path(data_dir)
    output_dir = Path(output_dir)

    # is_dir() also covers the "path exists but is a regular file" case, which
    # would otherwise fail later inside iterdir().
    if not data_dir.is_dir():
        print(f"✗ 数据集路径不存在或不是目录: {data_dir}")
        return

    # A private generator gives the same sequence as seeding the module-level
    # RNG, without disturbing the random state of the calling program.
    rng = random.Random(seed)

    # Collect all classes (one sub-directory per class).
    classes = sorted(d.name for d in data_dir.iterdir() if d.is_dir())
    print(f"发现 {len(classes)} 个类别: {classes}")

    # Rounded and clamped so the printed value has no float noise such as
    # 0.15000000000000002.
    test_ratio = max(0.0, round(1 - train_ratio - val_ratio, 10))
    splits = {"train": train_ratio, "val": val_ratio, "test": test_ratio}
    print(f"\n划分比例: {splits}")

    # Number of files copied per split during this run.
    totals = dict.fromkeys(splits, 0)

    for cls in classes:
        src_dir = data_dir / cls
        # Sort before shuffling so the result does not depend on the order in
        # which the filesystem lists entries. Dot-files (e.g. .DS_Store) are
        # not samples and are skipped.
        images = sorted(
            f for f in src_dir.iterdir()
            if f.is_file() and not f.name.startswith(".")
        )
        rng.shuffle(images)

        n = len(images)
        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)
        split_files = {
            "train": images[:n_train],
            "val": images[n_train:n_train + n_val],
            "test": images[n_train + n_val:],
        }

        for split_name, files in split_files.items():
            dest_dir = output_dir / split_name / cls
            dest_dir.mkdir(parents=True, exist_ok=True)

            # Files left over from an earlier run (other seed or ratios) would
            # sit next to the new split and may duplicate samples across
            # splits. They are reported, not deleted.
            expected = {f.name for f in files}
            leftovers = [
                p for p in dest_dir.iterdir()
                if p.is_file() and p.name not in expected
            ]
            if leftovers:
                print(f" ⚠ {dest_dir} 中存在 {len(leftovers)} 个旧文件, 可能造成数据泄漏")

            for f in files:
                shutil.copy2(f, dest_dir / f.name)
            totals[split_name] += len(files)

        print(f" {cls:12s}: train={len(split_files['train']):4d} "
              f"val={len(split_files['val']):4d} "
              f"test={len(split_files['test']):4d}")

    print("\n✓ 划分完成!")
    print(f" 输出目录: {output_dir.resolve()}")
    print(" 结构: ")
    print(f" {output_dir.name}/")
    for split_name, total in totals.items():
        print(f" ├── {split_name}/ ({total} 张)")
if __name__ == "__main__":
    # Command-line entry point: every option mirrors one parameter of
    # split_dataset and carries the same default value.
    cli = argparse.ArgumentParser(description="划分训练集/验证集/测试集")
    option_table = (
        ("--data-dir", {"default": "dataset/trashnet", "help": "原始数据集路径"}),
        ("--output-dir", {"default": "dataset", "help": "输出目录 (将在其中创建 train/val/test)"}),
        ("--train-ratio", {"type": float, "default": 0.7, "help": "训练集比例"}),
        ("--val-ratio", {"type": float, "default": 0.15, "help": "验证集比例"}),
        ("--seed", {"type": int, "default": 42, "help": "随机种子"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()
    split_dataset(
        data_dir=opts.data_dir,
        output_dir=opts.output_dir,
        train_ratio=opts.train_ratio,
        val_ratio=opts.val_ratio,
        seed=opts.seed,
    )