Hemanth-thunder's picture
End of training
c0551d3
import os
import shutil
import uuid
from dataclasses import dataclass
from typing import Optional
import pandas as pd
from datasets import load_dataset
from loguru import logger
from sklearn.model_selection import train_test_split
# File-name suffixes treated as image files. They are matched with
# str.endswith (case-sensitive, no leading dot required), which is why the
# lower- and upper-case spellings are both listed explicitly.
ALLOWED_EXTENSIONS = ("jpeg", "png", "jpg", "JPG", "JPEG", "PNG")
@dataclass
class ImageClassificationPreprocessor:
    """Validate an image-folder dataset, split it if needed, and push it to the Hub.

    On construction the training directory (and the validation directory, when
    given) is checked: it must exist, contain at least two subfolders, and each
    subfolder must hold at least five files whose names end with one of
    ``ALLOWED_EXTENSIONS`` and nothing else (no other files, no nested folders).

    Attributes:
        train_data: Path to the training directory.
        username: First component of the pushed dataset repo id.
        project_name: Used to build the repo id
            ``{username}/autotrain-data-{project_name}``.
        token: Passed as ``token`` to ``push_to_hub``.
        valid_data: Optional path to a validation directory. It must have the
            same subfolder names as ``train_data``. When falsy, a validation
            split is carved out of the training data instead.
        test_size: ``test_size`` forwarded to ``train_test_split``.
        seed: ``random_state`` forwarded to ``train_test_split``.

    Raises:
        ValueError: From ``__post_init__`` when any of the checks above fails.
    """

    train_data: str
    username: str
    project_name: str
    token: str
    valid_data: Optional[str] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42

    @staticmethod
    def _list_subfolders(root):
        """Return the paths of the immediate subdirectories of ``root``."""
        return [entry.path for entry in os.scandir(root) if entry.is_dir()]

    def _validate_subfolders(self, root, subfolders):
        """Raise ``ValueError`` unless ``subfolders`` of ``root`` form a valid image dataset."""
        if len(subfolders) < 2:
            raise ValueError(f"{root} should contain at least 2 subfolders.")

        for subfolder in subfolders:
            entries = os.listdir(subfolder)
            image_files = [name for name in entries if name.endswith(ALLOWED_EXTENSIONS)]

            # At least 5 image files in jpeg, png or jpg format.
            if len(image_files) < 5:
                raise ValueError(f"{subfolder} should contain at least 5 jpeg, png or jpg files.")

            # Nothing but image files is allowed in the subfolder.
            if len(image_files) != len(entries):
                raise ValueError(f"{subfolder} should not contain any other files except image files.")

            # A directory whose name ends with an image suffix passes the
            # check above, so nested folders are rejected explicitly.
            if self._list_subfolders(subfolder):
                raise ValueError(f"{subfolder} should not contain any subfolders.")

    def _copy_split(self, split_df, destination):
        """Copy every image listed in ``split_df`` from ``train_data`` into ``destination``.

        The subfolder layout is kept: ``destination/<subfolder>/<image_filename>``.
        """
        for row in split_df.itertuples():
            target_dir = os.path.join(destination, row.subfolder)
            os.makedirs(target_dir, exist_ok=True)
            shutil.copy(
                os.path.join(self.train_data, row.subfolder, row.image_filename),
                os.path.join(target_dir, row.image_filename),
            )

    def __post_init__(self) -> None:
        """Validate the training directory and, when provided, the validation directory.

        Raises:
            ValueError: If a path does not exist, the validation subfolder
                names differ from the training ones, or a directory fails the
                structural checks described on the class.
        """
        if not os.path.exists(self.train_data):
            raise ValueError(f"{self.train_data} does not exist.")

        train_subfolders = self._list_subfolders(self.train_data)
        logger.info(f"🚀 Subfolders: {train_subfolders}")
        self._validate_subfolders(self.train_data, train_subfolders)

        if self.valid_data:
            if not os.path.exists(self.valid_data):
                raise ValueError(f"{self.valid_data} does not exist.")

            valid_subfolders = self._list_subfolders(self.valid_data)

            # Both directories must expose exactly the same subfolder names.
            train_names = {os.path.basename(path) for path in train_subfolders}
            valid_names = {os.path.basename(path) for path in valid_subfolders}
            if train_names != valid_names:
                raise ValueError(f"{self.valid_data} should have the same subfolders as {self.train_data}.")

            self._validate_subfolders(self.valid_data, valid_subfolders)

    def split(self, df: "pd.DataFrame"):
        """Split ``df`` into train and validation frames, stratified on ``subfolder``.

        Args:
            df: Frame with a ``subfolder`` column used as the stratification key.

        Returns:
            ``(train_df, valid_df)``, both with a fresh 0-based index.
        """
        train_df, valid_df = train_test_split(
            df,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=df["subfolder"],
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare(self) -> None:
        """Stage the data under the Hugging Face cache directory and push it to the Hub.

        The staging directory is ``<HF_HOME or ~/.cache/huggingface>/autotrain/<uuid4>``
        with ``train`` and ``validation`` subdirectories. When ``valid_data``
        is set both directories are copied as-is; otherwise the training
        images are split with :meth:`split`. The staged directory is loaded
        with the ``imagefolder`` loader and pushed as a private dataset to
        ``{username}/autotrain-data-{project_name}``.
        """
        cache_dir = os.environ.get("HF_HOME")
        if not cache_dir:
            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
        data_dir = os.path.join(cache_dir, "autotrain", str(uuid.uuid4()))

        if self.valid_data:
            shutil.copytree(self.train_data, os.path.join(data_dir, "train"))
            shutil.copytree(self.valid_data, os.path.join(data_dir, "validation"))
        else:
            image_filenames = []
            subfolder_names = []
            for subfolder in self._list_subfolders(self.train_data):
                subfolder_name = os.path.basename(subfolder)
                for filename in os.listdir(subfolder):
                    # Use the same suffix list as validation so files such as
                    # "x.JPG", which __post_init__ accepts, are not silently
                    # dropped from the dataset.
                    if filename.endswith(ALLOWED_EXTENSIONS):
                        image_filenames.append(filename)
                        subfolder_names.append(subfolder_name)

            df = pd.DataFrame({"image_filename": image_filenames, "subfolder": subfolder_names})
            train_df, valid_df = self.split(df)
            self._copy_split(train_df, os.path.join(data_dir, "train"))
            self._copy_split(valid_df, os.path.join(data_dir, "validation"))

        dataset = load_dataset("imagefolder", data_dir=data_dir)
        dataset.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            private=True,
            token=self.token,
        )