ALYYAN's picture
Backend + Frontend done
eacd6a2
raw
history blame
2.13 kB
from datasets import load_from_disk
import pandas as pd
from cnnClassifier import logger
from cnnClassifier.entity.config_entity import DataPreparationConfig
from pathlib import Path
from PIL import Image # <<< ADD THIS IMPORT
import io # <<< ADD THIS IMPORT
class DataPreparation:
    """Turn the raw on-disk dataset into a metadata CSV plus a folder of JPEG files.

    The 'train' and 'validation' splits are concatenated, every row is
    assigned a deterministic image path (``<row index>.jpg``), the metadata
    (everything except the 'image' column) is written to
    ``config.cleaned_data_path`` and each image is written to its path.
    """

    # Modes Pillow's JPEG encoder can write directly; any other mode
    # (e.g. RGBA, P, LA) is converted to RGB before saving.
    _JPEG_WRITABLE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})

    def __init__(self, config: DataPreparationConfig):
        """Store the configuration.

        Args:
            config: Provides ``raw_data_path`` (dataset to load) and
                ``cleaned_data_path`` (CSV destination).
        """
        self.config = config

    def create_cleaned_dataframe(self):
        """Write the cleaned metadata CSV and save all images as JPEG files.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            logger.info("Loading raw dataset to create cleaned CSV...")
            raw_dataset = load_from_disk(self.config.raw_data_path)
            df_train = raw_dataset['train'].to_pandas()
            df_val = raw_dataset['validation'].to_pandas()
            combined_df = pd.concat([df_train, df_val], ignore_index=True)

            image_dir = Path("artifacts/data_preparation/images")
            image_dir.mkdir(parents=True, exist_ok=True)

            # File names follow the row position in the combined frame, so
            # the same input always yields the same path for each row.
            combined_df['image_file_path'] = [
                str(image_dir / f"{i}.jpg") for i in range(len(combined_df))
            ]

            # The CSV only needs the file path, so the bulky 'image' column
            # is dropped before writing.
            final_df_for_csv = combined_df.drop(columns=['image'])
            logger.info(f"Saving cleaned metadata to {self.config.cleaned_data_path}")
            final_df_for_csv.to_csv(self.config.cleaned_data_path, index=False)

            logger.info(f"Deterministically saving images to {image_dir}...")
            # Walk just the two needed columns; iterrows() would build a
            # Series per row only to read these two values.
            for image_path, image_dict in zip(
                combined_df['image_file_path'], combined_df['image']
            ):
                self._save_image(image_dict, Path(image_path))
        except Exception as e:
            logger.error(f"Failed during data preparation. Error: {e}")
            raise

    @classmethod
    def _save_image(cls, image_dict, image_path):
        """Decode one image dictionary and write it to ``image_path``.

        Args:
            image_dict: Mapping with the encoded image under 'bytes'.
                NOTE(review): when 'bytes' is None the 'path' entry is used
                instead; this assumes the bytes/path layout produced by the
                Hugging Face ``Image`` feature -- confirm against the dataset.
            image_path: Destination file path (``.jpg``).

        Raises:
            ValueError: If the dictionary carries neither bytes nor a path.
        """
        raw_bytes = image_dict.get('bytes')
        if raw_bytes is not None:
            source = io.BytesIO(raw_bytes)
        else:
            source = image_dict.get('path')
            if source is None:
                raise ValueError(
                    f"Image entry for {image_path} has neither 'bytes' nor 'path'"
                )

        # The context manager releases the decoder's file handle once the
        # image has been written.
        with Image.open(source) as opened_image:
            writable_image = opened_image
            # JPEG cannot store alpha or palette data; saving such an image
            # unconverted raises OSError and would abort the whole export.
            if opened_image.mode not in cls._JPEG_WRITABLE_MODES:
                writable_image = opened_image.convert("RGB")
            writable_image.save(image_path)