# crop_ai_diseases / uselessfiles / preprocess_raw_data.py
# vivek12coder's picture
# Upload 20960 files
# c8df794 verified
"""
Dataset preprocessing script for crop disease detection
Organizes raw dataset into train/val/test splits
"""
import os
import shutil
import random
from pathlib import Path
from collections import defaultdict
import json
class DatasetPreprocessor:
    """Preprocesses raw crop disease dataset into train/val/test splits.

    Every non-hidden sub-directory of ``raw_data_path`` is treated as one
    class, and the image files directly inside it are distributed over
    ``<output_path>/train``, ``<output_path>/val`` and ``<output_path>/test``
    (one sub-directory per class in each split).
    """

    # Lower-case file suffixes that are treated as images.
    IMAGE_EXTENSIONS = frozenset({'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'})
    # Split names, in the order in which the shuffled files are handed out.
    SPLIT_NAMES = ('train', 'val', 'test')

    def __init__(self, raw_data_path, output_path, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
        """
        Args:
            raw_data_path: Path to raw dataset (one sub-directory per class)
            output_path: Path where processed dataset will be saved
            train_ratio: Proportion of data for training
            val_ratio: Proportion of data for validation
            test_ratio: Proportion of data for testing

        Raises:
            ValueError: If a ratio is negative or the ratios do not sum to 1
                (within a tolerance of 0.001).
        """
        # Explicit raises instead of ``assert`` so the validation survives ``python -O``.
        if train_ratio < 0 or val_ratio < 0 or test_ratio < 0:
            raise ValueError("Ratios must be non-negative")
        # Written as ``not (... < tol)`` so that NaN ratios are rejected as well.
        if not abs(train_ratio + val_ratio + test_ratio - 1.0) < 0.001:
            raise ValueError("Ratios must sum to 1")

        self.raw_data_path = Path(raw_data_path)
        self.output_path = Path(output_path)
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.test_ratio = test_ratio

        # Output directories (created later by create_output_structure)
        self.train_dir = self.output_path / "train"
        self.val_dir = self.output_path / "val"
        self.test_dir = self.output_path / "test"

    def _list_images(self, class_dir):
        """Return the image files directly inside ``class_dir``, sorted by path.

        Only regular files are returned, so a directory whose name happens to
        end in an image suffix is ignored. The result is sorted because
        ``iterdir()`` yields entries in arbitrary order; a stable starting
        order is what makes a seeded shuffle reproducible.
        """
        return sorted(
            entry
            for entry in class_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() in self.IMAGE_EXTENSIONS
        )

    def get_class_directories(self):
        """Get all class directories from raw data, sorted by path.

        Hidden directories (name starting with '.') are skipped, and so is the
        output directory itself when it is located inside the raw data
        directory, so that it is never mistaken for a class.
        """
        output_root = self.output_path.resolve()
        return sorted(
            item
            for item in self.raw_data_path.iterdir()
            if item.is_dir()
            and not item.name.startswith('.')
            and item.resolve() != output_root
        )

    def count_images_per_class(self):
        """Count number of images per class.

        Returns:
            dict mapping class directory name to its number of image files.
        """
        return {
            class_dir.name: len(self._list_images(class_dir))
            for class_dir in self.get_class_directories()
        }

    def create_output_structure(self):
        """Create output directory structure, replacing any existing output.

        Raises:
            ValueError: If the output path is the raw data path or one of its
                parent directories, because wiping it would delete the raw data.
        """
        raw_root = self.raw_data_path.resolve()
        output_root = self.output_path.resolve()
        if output_root == raw_root or output_root in raw_root.parents:
            raise ValueError(
                f"Refusing to delete output path {self.output_path}: "
                f"it contains the raw data path {self.raw_data_path}"
            )

        # Read the class list first so that an unreadable raw dataset fails
        # before the previous output is deleted.
        class_names = [class_dir.name for class_dir in self.get_class_directories()]

        # Remove existing output if it exists
        if self.output_path.exists():
            shutil.rmtree(self.output_path)

        # Create base directories and one sub-directory per class in each split
        self.output_path.mkdir(parents=True, exist_ok=True)
        for split_dir in (self.train_dir, self.val_dir, self.test_dir):
            split_dir.mkdir(exist_ok=True)
            for class_name in class_names:
                (split_dir / class_name).mkdir(exist_ok=True)

    def split_and_copy_data(self):
        """Split data into train/val/test and copy files.

        The files of each class are shuffled with the module-level ``random``
        generator and cut at ``int(total * train_ratio)`` and
        ``int(total * val_ratio)``; whatever is left after truncation goes to
        the test split.

        Returns:
            Nested mapping ``split name -> class name -> number of copied files``.
        """
        split_stats = defaultdict(lambda: defaultdict(int))
        split_dirs = {
            'train': self.train_dir,
            'val': self.val_dir,
            'test': self.test_dir,
        }

        for class_dir in self.get_class_directories():
            class_name = class_dir.name
            print(f"Processing class: {class_name}")

            # Sorted listing + seeded shuffle => the same split on every platform
            image_files = self._list_images(class_dir)
            random.shuffle(image_files)

            # Calculate split indices
            total_images = len(image_files)
            train_end = int(total_images * self.train_ratio)
            val_end = train_end + int(total_images * self.val_ratio)

            files_by_split = {
                'train': image_files[:train_end],
                'val': image_files[train_end:val_end],
                'test': image_files[val_end:],
            }

            # Copy files to respective directories
            for split_name in self.SPLIT_NAMES:
                files = files_by_split[split_name]
                target_class_dir = split_dirs[split_name] / class_name
                # Harmless after create_output_structure(); lets this method
                # also work when the class directory does not exist yet.
                target_class_dir.mkdir(parents=True, exist_ok=True)
                for img_file in files:
                    shutil.copy2(img_file, target_class_dir / img_file.name)
                split_stats[split_name][class_name] = len(files)
                print(f" {split_name}: {len(files)} images")

        return split_stats

    def generate_dataset_info(self, split_stats):
        """Generate dataset information and save it as ``dataset_info.json``.

        Args:
            split_stats: Mapping ``split name -> class name -> image count``,
                as returned by split_and_copy_data().

        Returns:
            The dataset information dict that was written to
            ``<output_path>/dataset_info.json``.
        """
        # Get class names
        class_names = sorted(d.name for d in self.get_class_directories())

        # Create class to index mapping
        class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}

        # Plain-dict copy of the statistics; every split is present even if empty
        stats = {split: dict(per_class) for split, per_class in split_stats.items()}
        for split in self.SPLIT_NAMES:
            stats.setdefault(split, {})

        # Calculate totals
        total_images = {split: sum(stats[split].values()) for split in self.SPLIT_NAMES}
        total_images['total'] = sum(total_images.values())

        dataset_info = {
            'dataset_name': 'Crop Disease Detection - Retrained',
            'num_classes': len(class_names),
            'class_names': class_names,
            'class_to_idx': class_to_idx,
            'split_ratios': {
                'train': self.train_ratio,
                'val': self.val_ratio,
                'test': self.test_ratio,
            },
            'split_stats': stats,
            'total_images': total_images,
        }

        # Save dataset info
        info_file = self.output_path / 'dataset_info.json'
        with open(info_file, 'w', encoding='utf-8') as f:
            json.dump(dataset_info, f, indent=2)

        return dataset_info

    def preprocess(self, seed=42):
        """Main preprocessing function.

        Counts the raw images, recreates the output directory, splits and
        copies the files and writes ``dataset_info.json``.

        Args:
            seed: Seed for the module-level ``random`` generator used for
                shuffling (note: this reseeds the global generator).

        Returns:
            The dataset information dict produced by generate_dataset_info().
        """
        print("Starting dataset preprocessing...")
        print(f"Raw data path: {self.raw_data_path}")
        print(f"Output path: {self.output_path}")
        print(f"Split ratios - Train: {self.train_ratio}, Val: {self.val_ratio}, Test: {self.test_ratio}")

        # Set random seed for reproducibility
        random.seed(seed)

        # Count images per class
        class_counts = self.count_images_per_class()
        print("\nImages per class in raw dataset:")
        for class_name, count in class_counts.items():
            print(f" {class_name}: {count}")
        total_images = sum(class_counts.values())
        print(f"\nTotal images: {total_images}")

        # Create output structure
        print("\nCreating output directory structure...")
        self.create_output_structure()

        # Split and copy data
        print("\nSplitting and copying data...")
        split_stats = self.split_and_copy_data()

        # Generate dataset info
        print("\nGenerating dataset information...")
        dataset_info = self.generate_dataset_info(split_stats)

        print("\nDataset preprocessing completed!")
        print(f"Train images: {dataset_info['total_images']['train']}")
        print(f"Val images: {dataset_info['total_images']['val']}")
        print(f"Test images: {dataset_info['total_images']['test']}")
        print(f"Total processed: {dataset_info['total_images']['total']}")
        return dataset_info
def main():
    """Run the preprocessing pipeline on the default project paths.

    Reads the raw dataset from ``data/raw``, writes the 70/15/15 split to
    ``data/processed`` and prints the discovered classes with their indices.
    """
    source_root = "data/raw"
    destination_root = "data/processed"

    # Build the preprocessor and run it in one go; only the summary is needed here.
    summary = DatasetPreprocessor(
        raw_data_path=source_root,
        output_path=destination_root,
        train_ratio=0.7,
        val_ratio=0.15,
        test_ratio=0.15,
    ).preprocess()

    print(f"\nDataset info saved to: {destination_root}/dataset_info.json")
    print(f"Classes found: {summary['num_classes']}")
    print("Class names:")
    labels = summary['class_names']
    for index, label in enumerate(labels):
        print(f" {index}: {label}")
# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()