| | import os |
| | import shutil |
| | import uuid |
| | from dataclasses import dataclass |
| | from typing import Optional |
| |
|
| | import pandas as pd |
| | from datasets import load_dataset |
| | from loguru import logger |
| | from sklearn.model_selection import train_test_split |
| |
|
| |
|
# File-name suffixes accepted as images. The tuple is passed straight to
# ``str.endswith``, so matching is case-sensitive and only these exact
# spellings are recognised.
ALLOWED_EXTENSIONS = (
    "jpeg",
    "png",
    "jpg",
    "JPG",
    "JPEG",
    "PNG",
)
| |
|
| |
|
@dataclass
class ImageClassificationPreprocessor:
    """Validate a folder-per-class image dataset and push it to the Hub.

    The training directory must contain one subfolder per class. Validation
    of the directory layout happens eagerly in ``__post_init__``; the actual
    copy / split / upload work happens in ``prepare``.

    Attributes are plain dataclass fields:

    * ``train_data`` - path to the training directory.
    * ``username`` / ``project_name`` - combined into the target repo id
      ``"{username}/autotrain-data-{project_name}"``.
    * ``token`` - passed to ``push_to_hub``.
    * ``valid_data`` - optional path to a validation directory. When it is
      falsy, a validation split is carved out of ``train_data``.
    * ``test_size`` / ``seed`` - forwarded to ``train_test_split`` as
      ``test_size`` and ``random_state``.
    """

    train_data: str
    username: str
    project_name: str
    token: str
    valid_data: Optional[str] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42

    def __post_init__(self):
        """Check the layout of ``train_data`` and, if given, ``valid_data``.

        Raises:
            ValueError: if a directory does not exist, has fewer than two
                subfolders, if the validation subfolder names differ from
                the training ones, or if any class subfolder has fewer than
                five image files, contains non-image entries, or contains
                nested subfolders.
        """
        if not os.path.exists(self.train_data):
            raise ValueError(f"{self.train_data} does not exist.")

        train_folders = self._list_class_folders(self.train_data)
        logger.info(f"🚀 Subfolders: {train_folders}")
        self._check_class_folders(self.train_data, train_folders)

        if self.valid_data:
            if not os.path.exists(self.valid_data):
                raise ValueError(f"{self.valid_data} does not exist.")

            valid_folders = self._list_class_folders(self.valid_data)

            # Both splits must expose exactly the same set of class names.
            train_names = {os.path.basename(path) for path in train_folders}
            valid_names = {os.path.basename(path) for path in valid_folders}
            if train_names != valid_names:
                raise ValueError(f"{self.valid_data} should have the same subfolders as {self.train_data}.")

            self._check_class_folders(self.valid_data, valid_folders)

    @staticmethod
    def _list_class_folders(data_dir):
        """Return the paths of the immediate subdirectories of ``data_dir``."""
        return [entry.path for entry in os.scandir(data_dir) if entry.is_dir()]

    @staticmethod
    def _check_class_folders(data_dir, class_folders):
        """Raise ``ValueError`` unless every class folder holds only images.

        Args:
            data_dir: the parent directory, used only in the error message.
            class_folders: paths of the class subfolders found in ``data_dir``.
        """
        if len(class_folders) < 2:
            raise ValueError(f"{data_dir} should contain at least 2 subfolders.")

        for folder in class_folders:
            # List the folder once and reuse it for both count checks.
            entries = os.listdir(folder)
            image_files = [name for name in entries if name.endswith(ALLOWED_EXTENSIONS)]
            if len(image_files) < 5:
                raise ValueError(f"{folder} should contain at least 5 jpeg, png or jpg files.")

            if len(image_files) != len(entries):
                raise ValueError(f"{folder} should not contain any other files except image files.")

            # A directory whose name ends in an image extension slips past
            # the check above, so nested directories are rejected explicitly.
            with os.scandir(folder) as scanned:
                has_nested_dir = any(entry.is_dir() for entry in scanned)
            if has_nested_dir:
                raise ValueError(f"{folder} should not contain any subfolders.")

    def split(self, df):
        """Split ``df`` into train and validation frames.

        The split is stratified on the ``"subfolder"`` column and uses
        ``self.test_size`` and ``self.seed``.

        Returns:
            A ``(train_df, valid_df)`` tuple, each with a fresh 0-based index.
        """
        train_df, valid_df = train_test_split(
            df,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=df["subfolder"],
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df

    def _copy_split(self, split_df, data_dir, split_name):
        """Copy the images listed in ``split_df`` into ``data_dir/split_name``.

        Files are read from ``self.train_data/<subfolder>/<image_filename>``
        and written under the same ``<subfolder>`` name in the target split.
        """
        for row in split_df.itertuples():
            target_dir = os.path.join(data_dir, split_name, row.subfolder)
            os.makedirs(target_dir, exist_ok=True)
            shutil.copy(
                os.path.join(self.train_data, row.subfolder, row.image_filename),
                os.path.join(target_dir, row.image_filename),
            )

    def prepare(self):
        """Stage the dataset in a fresh cache directory and push it to the Hub.

        The staging directory is ``<cache>/autotrain/<uuid4>``, where
        ``<cache>`` is ``$HF_HOME`` if set and non-empty, otherwise
        ``~/.cache/huggingface``. It receives a ``train`` and a
        ``validation`` subdirectory:

        * with ``valid_data``: both directories are copied as-is;
        * without it: the images in ``train_data`` are split via ``split``
          and copied file by file.

        The staged directory is then loaded with the ``"imagefolder"``
        loader and pushed as a private dataset to
        ``"{username}/autotrain-data-{project_name}"``.
        """
        cache_dir = os.environ.get("HF_HOME")
        if not cache_dir:
            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
        data_dir = os.path.join(cache_dir, "autotrain", str(uuid.uuid4()))

        if self.valid_data:
            shutil.copytree(self.train_data, os.path.join(data_dir, "train"))
            shutil.copytree(self.valid_data, os.path.join(data_dir, "validation"))
        else:
            # Use the same extension list as the validation in __post_init__,
            # so files it accepted (e.g. "*.JPG") are not silently dropped.
            rows = [
                (filename, os.path.basename(folder))
                for folder in self._list_class_folders(self.train_data)
                for filename in os.listdir(folder)
                if filename.endswith(ALLOWED_EXTENSIONS)
            ]
            df = pd.DataFrame(rows, columns=["image_filename", "subfolder"])
            train_df, valid_df = self.split(df)

            self._copy_split(train_df, data_dir, "train")
            self._copy_split(valid_df, data_dir, "validation")

        dataset = load_dataset("imagefolder", data_dir=data_dir)
        dataset.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            private=True,
            token=self.token,
        )
| |
|