# File size: 6,549 Bytes (snapshot c0551d3)
import os
import shutil
import uuid
from dataclasses import dataclass
from typing import Optional
import pandas as pd
from datasets import load_dataset
from loguru import logger
from sklearn.model_selection import train_test_split
ALLOWED_EXTENSIONS = ("jpeg", "png", "jpg", "JPG", "JPEG", "PNG")


@dataclass
class ImageClassificationPreprocessor:
    """Validate a folder-per-class image dataset and push it to the Hub.

    ``train_data`` (and ``valid_data`` when given) must be a directory that
    holds at least two class subfolders. Every class subfolder must hold at
    least five files whose names end with one of ``ALLOWED_EXTENSIONS``, and
    nothing else. Validation runs on construction and raises ``ValueError``
    for the first rule that is violated.
    """

    train_data: str  # directory with one subfolder per class
    username: str  # owner part of the pushed dataset repo id
    project_name: str  # used as "autotrain-data-<project_name>" in the repo id
    token: str  # forwarded to ``push_to_hub``
    valid_data: Optional[str] = None  # optional validation directory, same layout
    test_size: Optional[float] = 0.2  # ``test_size`` of the split when no valid_data
    seed: Optional[int] = 42  # ``random_state`` of the split

    def __post_init__(self):
        """Check the directory layout of the training (and validation) data.

        Raises:
            ValueError: if a path is missing, has fewer than two subfolders,
                a subfolder has fewer than five images, contains non-image
                entries or nested folders, or the validation subfolder names
                differ from the training ones.
        """
        train_subfolders = self._class_dirs(self.train_data)
        logger.info(f"🚀 Subfolders: {train_subfolders}")
        self._validate_class_dirs(self.train_data, train_subfolders)

        if self.valid_data:
            valid_subfolders = self._class_dirs(self.valid_data)
            # Both splits must expose exactly the same class names.
            train_names = {os.path.basename(path) for path in train_subfolders}
            valid_names = {os.path.basename(path) for path in valid_subfolders}
            if train_names != valid_names:
                raise ValueError(f"{self.valid_data} should have the same subfolders as {self.train_data}.")
            self._validate_class_dirs(self.valid_data, valid_subfolders)

    @staticmethod
    def _class_dirs(root):
        """Return the paths of the direct subfolders of *root*.

        Raises:
            ValueError: if *root* does not exist.
        """
        if not os.path.exists(root):
            raise ValueError(f"{root} does not exist.")
        with os.scandir(root) as entries:
            return [entry.path for entry in entries if entry.is_dir()]

    @staticmethod
    def _validate_class_dirs(root, subfolders):
        """Apply the per-split layout rules to *subfolders* found under *root*.

        Raises:
            ValueError: on the first violated rule (see ``__post_init__``).
        """
        if len(subfolders) < 2:
            raise ValueError(f"{root} should contain at least 2 subfolders.")

        for subfolder in subfolders:
            # One directory scan feeds all three checks below.
            with os.scandir(subfolder) as scan:
                entries = list(scan)
            image_files = [entry for entry in entries if entry.name.endswith(ALLOWED_EXTENSIONS)]
            if len(image_files) < 5:
                raise ValueError(f"{subfolder} should contain at least 5 jpeg, png or jpg files.")
            if len(image_files) != len(entries):
                raise ValueError(f"{subfolder} should not contain any other files except image files.")
            if any(entry.is_dir() for entry in entries):
                raise ValueError(f"{subfolder} should not contain any subfolders.")

    def split(self, df):
        """Split *df* into train and validation frames, stratified by class.

        Args:
            df: frame with a ``subfolder`` column used for stratification.

        Returns:
            ``(train_df, valid_df)`` with their indexes reset.
        """
        train_df, valid_df = train_test_split(
            df,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=df["subfolder"],
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df

    def _image_table(self):
        """Return a DataFrame listing every image file under ``train_data``.

        Columns are ``image_filename`` and ``subfolder`` (the class folder
        name). The same ``ALLOWED_EXTENSIONS`` filter as the validation is
        used, so files that passed validation are never silently dropped.
        Entries are sorted so that the seeded split does not depend on the
        filesystem's listing order.
        """
        with os.scandir(self.train_data) as entries:
            class_names = sorted(entry.name for entry in entries if entry.is_dir())

        image_filenames = []
        subfolder_names = []
        for class_name in class_names:
            for filename in sorted(os.listdir(os.path.join(self.train_data, class_name))):
                if filename.endswith(ALLOWED_EXTENSIONS):
                    image_filenames.append(filename)
                    subfolder_names.append(class_name)
        return pd.DataFrame({"image_filename": image_filenames, "subfolder": subfolder_names})

    def _copy_split(self, df, target_dir):
        """Copy the images listed in *df* from ``train_data`` into *target_dir*.

        The class subfolder structure is recreated below *target_dir*.
        """
        for row in df.itertuples():
            class_dir = os.path.join(target_dir, row.subfolder)
            os.makedirs(class_dir, exist_ok=True)
            shutil.copy(
                os.path.join(self.train_data, row.subfolder, row.image_filename),
                os.path.join(class_dir, row.image_filename),
            )

    def prepare(self):
        """Stage the data in a fresh cache folder and push it to the Hub.

        The staging folder is ``<HF_HOME or ~/.cache/huggingface>/autotrain/<uuid4>``
        with ``train`` and ``validation`` subfolders. When ``valid_data`` is
        set both directories are copied as they are; otherwise ``train_data``
        is split with :meth:`split`. The staged folder is loaded with the
        ``imagefolder`` builder and pushed as a private dataset named
        ``<username>/autotrain-data-<project_name>``.
        """
        cache_dir = os.environ.get("HF_HOME") or os.path.join(
            os.path.expanduser("~"), ".cache", "huggingface"
        )
        data_dir = os.path.join(cache_dir, "autotrain", str(uuid.uuid4()))
        train_dir = os.path.join(data_dir, "train")
        valid_dir = os.path.join(data_dir, "validation")

        if self.valid_data:
            shutil.copytree(self.train_data, train_dir)
            shutil.copytree(self.valid_data, valid_dir)
        else:
            train_df, valid_df = self.split(self._image_table())
            self._copy_split(train_df, train_dir)
            self._copy_split(valid_df, valid_dir)

        dataset = load_dataset("imagefolder", data_dir=data_dir)
        dataset.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            private=True,
            token=self.token,
        )
|