File size: 3,600 Bytes
c3da629
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import hashlib
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import io

class DataProcessor:
    """Utilities for preparing EuroSAT-style image data.

    Covers single-image preprocessing for inference, dataset sanity
    checks (dimensions, duplicates), and Keras data generators.
    """

    def __init__(self):
        # Model expects 64x64 RGB inputs (EuroSAT native resolution).
        self.input_shape = (64, 64, 3)

    def preprocess_for_inference(self, image):
        """Preprocess raw image bytes into a model-ready tensor.

        Args:
            image: Raw image file content as ``bytes``.

        Returns:
            A float32 tensor of shape ``(1, H, W, 3)`` with values in
            ``[0, 1]``, resized to ``self.input_shape``.

        Raises:
            ValueError: If ``image`` is not a bytes object.
        """
        if not isinstance(image, bytes):
            raise ValueError("Input must be bytes (image file content)")

        pil_image = Image.open(io.BytesIO(image))

        # Normalize ANY mode (RGBA, grayscale 'L', palette 'P', CMYK, ...)
        # to 3-channel RGB. The previous code only handled RGBA, so
        # grayscale/palette inputs produced arrays with the wrong channel
        # count and broke downstream resize/model input.
        if pil_image.mode != 'RGB':
            pil_image = pil_image.convert('RGB')

        # Convert to float32 and scale pixel values into [0, 1].
        array = np.array(pil_image).astype(np.float32) / 255.0

        # Resize to the model's expected spatial dimensions.
        tensor = tf.image.resize(array, (self.input_shape[0], self.input_shape[1]))

        # Add batch dimension -> (1, H, W, 3).
        return tf.expand_dims(tensor, 0)

    @staticmethod
    def check_image_size(image_path):
        """Return the (width, height) of a single image file."""
        with Image.open(image_path) as img:
            return img.size

    @staticmethod
    def _iter_image_paths(dataset_path):
        """Yield the path of every file under each class subfolder.

        Assumes a ``dataset_path/<class_name>/<image>`` layout; non-directory
        entries at the top level are skipped.
        """
        for folder in os.listdir(dataset_path):
            class_path = os.path.join(dataset_path, folder)
            if os.path.isdir(class_path):
                for image_name in os.listdir(class_path):
                    yield os.path.join(class_path, image_name)

    @staticmethod
    def check_image_dimensions(dataset_path):
        """Return the set of distinct (width, height) pairs in the dataset."""
        return {
            DataProcessor.check_image_size(path)
            for path in DataProcessor._iter_image_paths(dataset_path)
        }

    @staticmethod
    def get_data_generators():
        """Return (train, test) ImageDataGenerators.

        The training generator applies heavy augmentation (rotation,
        shifts, shear, zoom, flips); the test generator only rescales.
        """
        train_gen = ImageDataGenerator(
            rescale=1./255,
            rotation_range=60,
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True,
            vertical_flip=True
        )

        test_gen = ImageDataGenerator(rescale=1./255)

        return train_gen, test_gen

    @staticmethod
    def get_image_hash(image_path):
        """Return the MD5 hex digest of a file's raw bytes.

        Used only for duplicate detection, not for security.
        """
        with open(image_path, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()

    @staticmethod
    def check_duplicates(dataset_path):
        """Return paths of images whose content duplicates an earlier file.

        The first occurrence of each distinct content hash is NOT reported;
        only subsequent copies are, so deleting the returned paths leaves
        one copy of each image.
        """
        seen_hashes = set()
        duplicates = []
        for image_path in DataProcessor._iter_image_paths(dataset_path):
            img_hash = DataProcessor.get_image_hash(image_path)
            if img_hash in seen_hashes:
                duplicates.append(image_path)
            else:
                seen_hashes.add(img_hash)
        return duplicates