File size: 8,441 Bytes
c8df794
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""
Dataset preprocessing script for crop disease detection
Organizes raw dataset into train/val/test splits
"""

import os
import shutil
import random
from pathlib import Path
from collections import defaultdict
import json

class DatasetPreprocessor:
    """Preprocesses a raw crop disease dataset into train/val/test splits.

    The raw dataset is expected to contain one subdirectory per class, each
    holding image files. The processed output mirrors that class layout under
    ``train/``, ``val/`` and ``test/`` subdirectories of ``output_path``, and
    a ``dataset_info.json`` summary is written alongside them.
    """

    # Recognized image file suffixes (compared case-insensitively).
    # Single source of truth — previously duplicated in two methods.
    IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}

    def __init__(self, raw_data_path, output_path, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
        """
        Args:
            raw_data_path: Path to raw dataset (one subdirectory per class)
            output_path: Path where processed dataset will be saved
            train_ratio: Proportion of data for training
            val_ratio: Proportion of data for validation
            test_ratio: Proportion of data for testing

        Raises:
            ValueError: If the three ratios do not sum to 1 (within 1e-3).
        """
        self.raw_data_path = Path(raw_data_path)
        self.output_path = Path(output_path)
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.test_ratio = test_ratio

        # Validate with an explicit exception rather than `assert`:
        # asserts are stripped when Python runs with -O.
        if abs(train_ratio + val_ratio + test_ratio - 1.0) >= 0.001:
            raise ValueError("Ratios must sum to 1")

        # Output split directories (created later by create_output_structure).
        self.train_dir = self.output_path / "train"
        self.val_dir = self.output_path / "val"
        self.test_dir = self.output_path / "test"

    def _iter_image_files(self, class_dir):
        """Yield image files directly inside *class_dir*, filtered by suffix."""
        for entry in class_dir.iterdir():
            if entry.suffix.lower() in self.IMAGE_EXTENSIONS:
                yield entry

    def get_class_directories(self):
        """Return sorted class directories, skipping hidden entries."""
        return sorted(
            item for item in self.raw_data_path.iterdir()
            if item.is_dir() and not item.name.startswith('.')
        )

    def count_images_per_class(self):
        """Return a mapping of class name -> number of image files."""
        return {
            class_dir.name: sum(1 for _ in self._iter_image_files(class_dir))
            for class_dir in self.get_class_directories()
        }

    def create_output_structure(self):
        """Create the output directory tree, replacing any existing output."""
        # Remove existing output so stale files from a prior run can't leak in.
        if self.output_path.exists():
            shutil.rmtree(self.output_path)

        # Create base directories
        self.output_path.mkdir(parents=True, exist_ok=True)
        self.train_dir.mkdir(exist_ok=True)
        self.val_dir.mkdir(exist_ok=True)
        self.test_dir.mkdir(exist_ok=True)

        # One subdirectory per class inside each split
        for class_dir in self.get_class_directories():
            class_name = class_dir.name
            (self.train_dir / class_name).mkdir(exist_ok=True)
            (self.val_dir / class_name).mkdir(exist_ok=True)
            (self.test_dir / class_name).mkdir(exist_ok=True)

    def split_and_copy_data(self):
        """Split each class into train/val/test and copy the files.

        Returns:
            Nested mapping ``split name -> class name -> image count``.
            Uses the module-level `random` state; seed it for reproducibility.
        """
        split_stats = defaultdict(lambda: defaultdict(int))

        for class_dir in self.get_class_directories():
            class_name = class_dir.name
            print(f"Processing class: {class_name}")

            image_files = list(self._iter_image_files(class_dir))

            # Shuffle files for random split
            random.shuffle(image_files)

            # Train takes the first `train_ratio` share; val the next
            # `val_ratio` share; test absorbs the remainder (including any
            # images lost to int truncation).
            total_images = len(image_files)
            train_end = int(total_images * self.train_ratio)
            val_end = train_end + int(total_images * self.val_ratio)

            train_files = image_files[:train_end]
            val_files = image_files[train_end:val_end]
            test_files = image_files[val_end:]

            # Copy files to respective directories
            for files, target_dir, split_name in [
                (train_files, self.train_dir, 'train'),
                (val_files, self.val_dir, 'val'),
                (test_files, self.test_dir, 'test')
            ]:
                target_class_dir = target_dir / class_name
                for img_file in files:
                    # copy2 preserves file metadata (timestamps etc.)
                    shutil.copy2(img_file, target_class_dir / img_file.name)

                split_stats[split_name][class_name] = len(files)
                print(f"  {split_name}: {len(files)} images")

        return split_stats

    def generate_dataset_info(self, split_stats):
        """Write ``dataset_info.json`` and return its contents as a dict.

        Args:
            split_stats: Mapping of split name -> class name -> image count,
                as returned by :meth:`split_and_copy_data`.
        """
        class_names = sorted(d.name for d in self.get_class_directories())
        class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}

        # Per-split totals across all classes
        total_stats = {
            split: sum(split_stats[split].values())
            for split in ['train', 'val', 'test']
        }

        dataset_info = {
            'dataset_name': 'Crop Disease Detection - Retrained',
            'num_classes': len(class_names),
            'class_names': class_names,
            'class_to_idx': class_to_idx,
            'split_ratios': {
                'train': self.train_ratio,
                'val': self.val_ratio,
                'test': self.test_ratio
            },
            # Convert nested defaultdicts to plain dicts for clean JSON output
            'split_stats': {split: dict(counts) for split, counts in split_stats.items()},
            'total_images': {
                'train': total_stats['train'],
                'val': total_stats['val'],
                'test': total_stats['test'],
                'total': sum(total_stats.values())
            }
        }

        # Save dataset info
        info_file = self.output_path / 'dataset_info.json'
        with open(info_file, 'w') as f:
            json.dump(dataset_info, f, indent=2)

        return dataset_info

    def preprocess(self, seed=42):
        """Run the full pipeline: count, structure, split/copy, summarize.

        Args:
            seed: Random seed applied before shuffling, for reproducible splits.

        Returns:
            The dataset-info dict written to ``dataset_info.json``.
        """
        print("Starting dataset preprocessing...")
        print(f"Raw data path: {self.raw_data_path}")
        print(f"Output path: {self.output_path}")
        print(f"Split ratios - Train: {self.train_ratio}, Val: {self.val_ratio}, Test: {self.test_ratio}")

        # Set random seed for reproducibility
        random.seed(seed)

        # Count images per class
        class_counts = self.count_images_per_class()
        print("\nImages per class in raw dataset:")
        for class_name, count in class_counts.items():
            print(f"  {class_name}: {count}")

        total_images = sum(class_counts.values())
        print(f"\nTotal images: {total_images}")

        # Create output structure
        print("\nCreating output directory structure...")
        self.create_output_structure()

        # Split and copy data
        print("\nSplitting and copying data...")
        split_stats = self.split_and_copy_data()

        # Generate dataset info
        print("\nGenerating dataset information...")
        dataset_info = self.generate_dataset_info(split_stats)

        print("\nDataset preprocessing completed!")
        print(f"Train images: {dataset_info['total_images']['train']}")
        print(f"Val images: {dataset_info['total_images']['val']}")
        print(f"Test images: {dataset_info['total_images']['test']}")
        print(f"Total processed: {dataset_info['total_images']['total']}")

        return dataset_info

def main():
    """Entry point: split the raw dataset 70/15/15 and report the result."""
    # Hard-coded project-relative paths for the raw and processed datasets.
    raw_data_path = "data/raw"
    output_path = "data/processed"

    preprocessor = DatasetPreprocessor(
        raw_data_path,
        output_path,
        train_ratio=0.7,
        val_ratio=0.15,
        test_ratio=0.15,
    )

    # Run the full pipeline and capture the summary it produces.
    dataset_info = preprocessor.preprocess()

    print(f"\nDataset info saved to: {output_path}/dataset_info.json")
    print(f"Classes found: {dataset_info['num_classes']}")
    print("Class names:")
    for i, class_name in enumerate(dataset_info['class_names']):
        print(f"  {i}: {class_name}")

if __name__ == "__main__":
    main()