Spaces:
Sleeping
Sleeping
"""
Dataset splitting script.

Splits dataset/trashnet/ into train / val / test subsets by ratio.

Resulting layout:
    dataset/
    ├── trashnet/     (original data, left untouched)
    ├── train/
    │   ├── cardboard/
    │   └── ...
    ├── val/
    │   ├── cardboard/
    │   └── ...
    └── test/
        ├── cardboard/
        └── ...
"""
| import argparse | |
| import random | |
| import shutil | |
| from pathlib import Path | |
def split_dataset(data_dir, output_dir, train_ratio=0.7, val_ratio=0.15, seed=42):
    """Copy a class-per-folder dataset into train / val / test subsets.

    Every immediate subdirectory of ``data_dir`` is treated as one class. The
    files of each class are sorted, shuffled with a generator seeded by
    ``seed``, and copied (the originals stay in place) into
    ``output_dir/<split>/<class>/``.

    Args:
        data_dir: Directory containing one subdirectory per class.
        output_dir: Directory in which ``train/``, ``val/`` and ``test/`` are
            created.
        train_ratio: Fraction of each class copied to ``train``.
        val_ratio: Fraction of each class copied to ``val``; whatever is left
            of the class goes to ``test``.
        seed: Seed for the shuffle, so repeated runs produce the same split.

    Returns:
        None. Progress and a summary are printed. If ``data_dir`` is not a
        directory or the ratios are invalid, an error line is printed and
        nothing is copied.
    """
    data_dir = Path(data_dir)
    output_dir = Path(output_dir)

    # is_dir() also rejects a path that exists but is a regular file, which
    # would otherwise crash in iterdir() further down.
    if not data_dir.is_dir():
        print(f"✗ 数据集路径不存在: {data_dir}")
        return

    # Reject ratios that cannot describe a split. The small tolerance keeps
    # pairs such as 0.8 + 0.2 valid despite binary floating-point rounding.
    tolerance = 1e-9
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + tolerance:
        print(f"✗ 划分比例无效: train={train_ratio}, val={val_ratio} "
              f"(需满足 train >= 0, val >= 0, train + val <= 1)")
        return

    # A private generator yields the same shuffle order as seeding the global
    # one, without changing random state for the rest of the process.
    rng = random.Random(seed)

    # Collect all classes (one subdirectory per class).
    classes = sorted(d.name for d in data_dir.iterdir() if d.is_dir())
    print(f"发现 {len(classes)} 个类别: {classes}")

    # Rounded and clamped so float noise (0.15000000000000002, -5.5e-17) is
    # not shown to the user.
    test_ratio = max(0.0, round(1 - train_ratio - val_ratio, 10))
    splits = {"train": train_ratio, "val": val_ratio, "test": test_ratio}
    print(f"\n划分比例: {splits}")

    for cls in classes:
        src_dir = data_dir / cls
        # Sort before shuffling so the result does not depend on the order in
        # which the filesystem happens to list the files.
        images = sorted(f for f in src_dir.iterdir() if f.is_file())
        rng.shuffle(images)

        n = len(images)
        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)
        split_files = {
            "train": images[:n_train],
            "val": images[n_train:n_train + n_val],
            "test": images[n_train + n_val:],
        }

        for split_name, files in split_files.items():
            dest_dir = output_dir / split_name / cls
            dest_dir.mkdir(parents=True, exist_ok=True)

            # Files left over from an earlier run with another seed or ratio
            # can put the same image into two splits; warn, never delete.
            wanted = {f.name for f in files}
            stale = [p for p in dest_dir.iterdir()
                     if p.is_file() and p.name not in wanted]
            if stale:
                print(f"⚠ {dest_dir} 中已有 {len(stale)} 个不属于本次划分的旧文件, "
                      f"可能造成数据泄漏")

            for f in files:
                shutil.copy2(f, dest_dir / f.name)

        print(f" {cls:12s}: train={len(split_files['train']):4d} "
              f"val={len(split_files['val']):4d} "
              f"test={len(split_files['test']):4d}")

    print("\n✓ 划分完成!")
    print(f" 输出目录: {output_dir.resolve()}")
    print(" 结构: ")
    print(f" {output_dir.name}/")
    for split_name in ["train", "val", "test"]:
        # Counts what is on disk, so leftovers from earlier runs are included.
        total = sum(
            sum(1 for _ in (output_dir / split_name / cls).iterdir())
            for cls in classes
        )
        print(f" ├── {split_name}/ ({total} 张)")
if __name__ == "__main__":
    # Command-line entry point: every option maps onto one parameter of
    # split_dataset() and carries the same default value.
    cli = argparse.ArgumentParser(description="划分训练集/验证集/测试集")
    option_table = (
        ("--data-dir", {"default": "dataset/trashnet", "help": "原始数据集路径"}),
        ("--output-dir", {"default": "dataset", "help": "输出目录 (将在其中创建 train/val/test)"}),
        ("--train-ratio", {"type": float, "default": 0.7, "help": "训练集比例"}),
        ("--val-ratio", {"type": float, "default": 0.15, "help": "验证集比例"}),
        ("--seed", {"type": int, "default": 42, "help": "随机种子"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()
    split_dataset(
        data_dir=options.data_dir,
        output_dir=options.output_dir,
        train_ratio=options.train_ratio,
        val_ratio=options.val_ratio,
        seed=options.seed,
    )