Spaces:
Runtime error
Runtime error
| from datasets import load_from_disk | |
| import pandas as pd | |
| from cnnClassifier import logger | |
| from cnnClassifier.entity.config_entity import DataPreparationConfig | |
| from pathlib import Path | |
| from PIL import Image # <<< ADD THIS IMPORT | |
| import io # <<< ADD THIS IMPORT | |
class DataPreparation:
    """Build a metadata CSV and a folder of JPEG files from the raw dataset.

    The raw dataset is read from ``config.raw_data_path``; its ``train`` and
    ``validation`` splits are concatenated, every row's image is written to
    ``artifacts/data_preparation/images/<row index>.jpg`` and the remaining
    columns (plus the new ``image_file_path`` column) are written to
    ``config.cleaned_data_path``.
    """

    # Pillow modes the JPEG writer can encode directly. Anything else
    # (e.g. RGBA, LA, P) is converted to RGB first, because saving such an
    # image to a ``.jpg`` path raises ``OSError: cannot write mode ...``.
    _JPEG_WRITABLE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    def __init__(self, config: DataPreparationConfig):
        """Keep the configuration that provides the input and output paths."""
        self.config = config

    def create_cleaned_dataframe(self):
        """Write one JPEG per dataset row, then the metadata CSV.

        Raises:
            Exception: any error raised while loading, converting or writing
                is logged and re-raised unchanged.
        """
        try:
            logger.info("Loading raw dataset to create cleaned CSV...")
            raw_dataset = load_from_disk(self.config.raw_data_path)
            df_train = raw_dataset['train'].to_pandas()
            df_val = raw_dataset['validation'].to_pandas()
            combined_df = pd.concat([df_train, df_val], ignore_index=True)

            image_dir = Path("artifacts/data_preparation/images")
            image_dir.mkdir(parents=True, exist_ok=True)
            # File names follow the row position in the combined frame, so
            # the same input always produces the same path for the same row.
            combined_df['image_file_path'] = [
                str(image_dir / f"{i}.jpg") for i in range(len(combined_df))
            ]

            # Images are written before the CSV so that a failure part-way
            # through never leaves metadata pointing at files that were not
            # created.
            logger.info(f"Deterministically saving images to {image_dir}...")
            for image_path, image_dict in zip(
                combined_df['image_file_path'], combined_df['image']
            ):
                self._save_image_as_jpeg(image_dict, Path(image_path))

            # Only the file path is needed in the CSV, so the bulky 'image'
            # column is dropped before writing.
            final_df_for_csv = combined_df.drop(columns=['image'])
            logger.info(f"Saving cleaned metadata to {self.config.cleaned_data_path}")
            final_df_for_csv.to_csv(self.config.cleaned_data_path, index=False)
        except Exception as e:
            logger.error(f"Failed during data preparation. Error: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    @classmethod
    def _save_image_as_jpeg(cls, image_dict, destination):
        """Decode one image record and save it at ``destination``.

        Args:
            image_dict: mapping with a ``'bytes'`` entry holding the encoded
                image. When that entry is ``None`` the ``'path'`` entry is
                used instead (assumes the datasets ``Image`` storage layout
                of ``{'bytes', 'path'}`` -- TODO confirm for this dataset).
            destination: ``Path`` of the ``.jpg`` file to write.

        Raises:
            ValueError: if the record has neither bytes nor a path.
        """
        raw_bytes = image_dict.get('bytes')
        if raw_bytes is not None:
            source = io.BytesIO(raw_bytes)
        else:
            source = image_dict.get('path')
        if source is None:
            raise ValueError(
                f"Image record for {destination} has neither 'bytes' nor 'path'"
            )

        # The context manager releases the decoder and buffer once saved.
        with Image.open(source) as pil_image:
            if pil_image.mode in cls._JPEG_WRITABLE_MODES:
                writable = pil_image
            else:
                writable = pil_image.convert("RGB")
            writable.save(destination)