# NOTE: non-code page residue removed (Hugging Face Spaces snapshot c8df794, 8,441 bytes)
"""
Dataset preprocessing script for crop disease detection
Organizes raw dataset into train/val/test splits
"""
import os
import shutil
import random
from pathlib import Path
from collections import defaultdict
import json
class DatasetPreprocessor:
    """Preprocesses a raw crop disease dataset into train/val/test splits.

    Expects the raw dataset laid out as one subdirectory per class, each
    containing image files. Produces ``output/{train,val,test}/<class>/``
    directories with copied images plus a ``dataset_info.json`` summary.
    """

    # Recognized image file extensions (suffixes compared case-insensitively).
    # Shared by counting and splitting so the two can never disagree.
    IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}

    def __init__(self, raw_data_path, output_path, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
        """
        Args:
            raw_data_path: Path to raw dataset (one subdirectory per class)
            output_path: Path where processed dataset will be saved
            train_ratio: Proportion of data for training
            val_ratio: Proportion of data for validation
            test_ratio: Proportion of data for testing

        Raises:
            ValueError: If the three ratios do not sum to 1 (within 0.001).
        """
        self.raw_data_path = Path(raw_data_path)
        self.output_path = Path(output_path)
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.test_ratio = test_ratio
        # Validate with an explicit exception rather than `assert`, which is
        # silently stripped when Python runs with -O.
        if abs(train_ratio + val_ratio + test_ratio - 1.0) >= 0.001:
            raise ValueError("Ratios must sum to 1")
        # Split directories (created later by create_output_structure).
        self.train_dir = self.output_path / "train"
        self.val_dir = self.output_path / "val"
        self.test_dir = self.output_path / "test"

    def _list_image_files(self, class_dir):
        """Return the image files directly inside *class_dir* (non-recursive)."""
        return [f for f in class_dir.iterdir()
                if f.suffix.lower() in self.IMAGE_EXTENSIONS]

    def get_class_directories(self):
        """Return sorted class directories from raw data (hidden dirs skipped)."""
        return sorted(
            item for item in self.raw_data_path.iterdir()
            if item.is_dir() and not item.name.startswith('.')
        )

    def count_images_per_class(self):
        """Return a mapping of class name -> number of image files."""
        return {
            class_dir.name: len(self._list_image_files(class_dir))
            for class_dir in self.get_class_directories()
        }

    def create_output_structure(self):
        """Create a fresh output directory tree.

        Any existing output directory is deleted first, so reruns start clean.
        """
        if self.output_path.exists():
            shutil.rmtree(self.output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)
        class_names = [d.name for d in self.get_class_directories()]
        for split_dir in (self.train_dir, self.val_dir, self.test_dir):
            split_dir.mkdir(exist_ok=True)
            for class_name in class_names:
                (split_dir / class_name).mkdir(exist_ok=True)

    def split_and_copy_data(self):
        """Split each class into train/val/test and copy the files.

        Returns:
            Nested mapping: split name -> class name -> number of images.
        """
        split_stats = defaultdict(lambda: defaultdict(int))
        for class_dir in self.get_class_directories():
            class_name = class_dir.name
            print(f"Processing class: {class_name}")
            image_files = self._list_image_files(class_dir)
            # Shuffle before slicing so the split is random; the caller seeds
            # the global RNG for reproducibility.
            random.shuffle(image_files)
            total_images = len(image_files)
            train_end = int(total_images * self.train_ratio)
            val_end = train_end + int(total_images * self.val_ratio)
            # Test takes the remainder, so int() truncation never loses images.
            for files, target_dir, split_name in [
                (image_files[:train_end], self.train_dir, 'train'),
                (image_files[train_end:val_end], self.val_dir, 'val'),
                (image_files[val_end:], self.test_dir, 'test')
            ]:
                target_class_dir = target_dir / class_name
                for img_file in files:
                    # copy2 preserves file metadata (timestamps, permissions).
                    shutil.copy2(img_file, target_class_dir / img_file.name)
                split_stats[split_name][class_name] = len(files)
                print(f" {split_name}: {len(files)} images")
        return split_stats

    def generate_dataset_info(self, split_stats):
        """Write dataset_info.json summarizing the split and return the dict.

        Args:
            split_stats: Nested mapping produced by split_and_copy_data().

        Returns:
            The dataset info dictionary (also saved to disk as JSON).
        """
        class_names = sorted(d.name for d in self.get_class_directories())
        class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}
        total_stats = {
            split: sum(split_stats[split].values())
            for split in ('train', 'val', 'test')
        }
        dataset_info = {
            'dataset_name': 'Crop Disease Detection - Retrained',
            'num_classes': len(class_names),
            'class_names': class_names,
            'class_to_idx': class_to_idx,
            'split_ratios': {
                'train': self.train_ratio,
                'val': self.val_ratio,
                'test': self.test_ratio
            },
            # Convert inner defaultdicts to plain dicts so the returned value
            # (and the JSON on disk) contains ordinary mappings.
            'split_stats': {split: dict(counts)
                            for split, counts in split_stats.items()},
            'total_images': {
                'train': total_stats['train'],
                'val': total_stats['val'],
                'test': total_stats['test'],
                'total': sum(total_stats.values())
            }
        }
        info_file = self.output_path / 'dataset_info.json'
        with open(info_file, 'w') as f:
            json.dump(dataset_info, f, indent=2)
        return dataset_info

    def preprocess(self, seed=42):
        """Run the full preprocessing pipeline.

        Args:
            seed: Random seed for reproducible shuffling/splitting.

        Returns:
            The dataset info dictionary (also saved as dataset_info.json).
        """
        print("Starting dataset preprocessing...")
        print(f"Raw data path: {self.raw_data_path}")
        print(f"Output path: {self.output_path}")
        print(f"Split ratios - Train: {self.train_ratio}, Val: {self.val_ratio}, Test: {self.test_ratio}")
        # Seed before any shuffling so the split is reproducible.
        random.seed(seed)
        class_counts = self.count_images_per_class()
        print("\nImages per class in raw dataset:")
        for class_name, count in class_counts.items():
            print(f" {class_name}: {count}")
        total_images = sum(class_counts.values())
        print(f"\nTotal images: {total_images}")
        print("\nCreating output directory structure...")
        self.create_output_structure()
        print("\nSplitting and copying data...")
        split_stats = self.split_and_copy_data()
        print("\nGenerating dataset information...")
        dataset_info = self.generate_dataset_info(split_stats)
        print("\nDataset preprocessing completed!")
        print(f"Train images: {dataset_info['total_images']['train']}")
        print(f"Val images: {dataset_info['total_images']['val']}")
        print(f"Test images: {dataset_info['total_images']['test']}")
        print(f"Total processed: {dataset_info['total_images']['total']}")
        return dataset_info
def main():
    """Command-line entry point: build the processed dataset splits."""
    # All pipeline settings in one place.
    config = {
        "raw_data_path": "data/raw",
        "output_path": "data/processed",
        "train_ratio": 0.7,
        "val_ratio": 0.15,
        "test_ratio": 0.15,
    }

    preprocessor = DatasetPreprocessor(**config)
    dataset_info = preprocessor.preprocess()

    # Summarize results for the operator.
    print(f"\nDataset info saved to: {config['output_path']}/dataset_info.json")
    print(f"Classes found: {dataset_info['num_classes']}")
    print("Class names:")
    for idx, name in enumerate(dataset_info['class_names']):
        print(f" {idx}: {name}")


if __name__ == "__main__":
    main()