""" 数据集划分脚本 将 dataset/trashnet/ 按比例拆分为 train / val / test 生成结构: dataset/ ├── trashnet/ (原始数据, 不动) ├── train/ │ ├── cardboard/ │ └── ... ├── val/ │ ├── cardboard/ │ └── ... └── test/ ├── cardboard/ └── ... """ import argparse import random import shutil from pathlib import Path def split_dataset(data_dir, output_dir, train_ratio=0.7, val_ratio=0.15, seed=42): data_dir = Path(data_dir) output_dir = Path(output_dir) if not data_dir.exists(): print(f"✗ 数据集路径不存在: {data_dir}") return random.seed(seed) # 收集所有类别 classes = sorted([d.name for d in data_dir.iterdir() if d.is_dir()]) print(f"发现 {len(classes)} 个类别: {classes}") splits = {"train": train_ratio, "val": val_ratio, "test": 1 - train_ratio - val_ratio} print(f"\n划分比例: {splits}") for cls in classes: src_dir = data_dir / cls images = sorted([f for f in src_dir.iterdir() if f.is_file()]) random.shuffle(images) n = len(images) n_train = int(n * train_ratio) n_val = int(n * val_ratio) split_files = { "train": images[:n_train], "val": images[n_train:n_train + n_val], "test": images[n_train + n_val:], } for split_name, files in split_files.items(): dest_dir = output_dir / split_name / cls dest_dir.mkdir(parents=True, exist_ok=True) for f in files: shutil.copy2(f, dest_dir / f.name) print(f" {cls:12s}: train={len(split_files['train']):4d} " f"val={len(split_files['val']):4d} " f"test={len(split_files['test']):4d}") print(f"\n✓ 划分完成!") print(f" 输出目录: {output_dir.resolve()}") print(f" 结构: ") print(f" {output_dir.name}/") for split_name in ["train", "val", "test"]: total = sum(len(list((output_dir / split_name / cls).iterdir())) for cls in classes) print(f" ├── {split_name}/ ({total} 张)") if __name__ == "__main__": parser = argparse.ArgumentParser(description="划分训练集/验证集/测试集") parser.add_argument("--data-dir", default="dataset/trashnet", help="原始数据集路径") parser.add_argument("--output-dir", default="dataset", help="输出目录 (将在其中创建 train/val/test)") parser.add_argument("--train-ratio", type=float, default=0.7, help="训练集比例") parser.add_argument("--val-ratio", type=float, default=0.15, help="验证集比例") parser.add_argument("--seed", type=int, default=42, help="随机种子") args = parser.parse_args() split_dataset(args.data_dir, args.output_dir, args.train_ratio, args.val_ratio, args.seed)