File size: 2,126 Bytes
eacd6a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from datasets import load_from_disk
import pandas as pd
from cnnClassifier import logger
from cnnClassifier.entity.config_entity import DataPreparationConfig
from pathlib import Path
from PIL import Image # <<< ADD THIS IMPORT
import io              # <<< ADD THIS IMPORT

class DataPreparation:
    """Build a metadata CSV and a folder of JPEG files from the raw dataset.

    The raw dataset is loaded from ``config.raw_data_path``; its ``train`` and
    ``validation`` splits are concatenated, every row is assigned an image
    file path, the metadata (without the ``image`` column) is written to
    ``config.cleaned_data_path`` and each image is written to its assigned
    path.
    """

    # NOTE(review): the image folder is hard-coded rather than read from the
    # config object; kept as-is so existing consumers of the CSV still find
    # the files at the same location.
    _IMAGE_DIR = Path("artifacts/data_preparation/images")

    # Image modes Pillow's JPEG writer accepts directly. Anything else
    # (e.g. RGBA, P, LA) makes ``Image.save`` raise ``OSError`` for a
    # ``.jpg`` target, so such images are converted to RGB first.
    _JPEG_WRITABLE_MODES = frozenset(
        {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}
    )

    def __init__(self, config: DataPreparationConfig):
        """Store the configuration that supplies the input/output paths."""
        self.config = config

    @classmethod
    def _save_as_jpeg(cls, image_record, destination) -> None:
        """Decode one ``image`` cell and write it to *destination*.

        ``image_record`` is the per-row value of the ``image`` column: a
        mapping with a ``bytes`` entry holding the encoded image. When
        ``bytes`` is None and the record carries a non-empty ``path``, the
        image is opened from that path instead (presumably how images stored
        by reference are represented -- verify against the dataset).
        """
        encoded = image_record["bytes"]
        if encoded is None and image_record.get("path"):
            source = image_record["path"]
        else:
            source = io.BytesIO(encoded)

        # The context manager releases the decoder and any file handle as
        # soon as the image has been written.
        with Image.open(source) as pil_image:
            if pil_image.mode in cls._JPEG_WRITABLE_MODES:
                pil_image.save(destination)
            else:
                # JPEG has no alpha/palette support; normalise to RGB.
                pil_image.convert("RGB").save(destination)

    def create_cleaned_dataframe(self) -> None:
        """Write the cleaned metadata CSV and save every image to disk.

        Image files are named ``<row position>.jpg`` inside the image folder,
        where the row position refers to the concatenated train + validation
        frame, so the CSV's ``image_file_path`` column and the files on disk
        line up.

        Raises:
            Exception: any error raised while loading, writing the CSV or
                saving images is logged with its traceback and re-raised
                unchanged.
        """
        try:
            logger.info("Loading raw dataset to create cleaned CSV...")
            raw_dataset = load_from_disk(self.config.raw_data_path)

            df_train = raw_dataset['train'].to_pandas()
            df_val = raw_dataset['validation'].to_pandas()
            combined_df = pd.concat([df_train, df_val], ignore_index=True)

            image_dir = self._IMAGE_DIR
            image_dir.mkdir(parents=True, exist_ok=True)

            combined_df['image_file_path'] = [
                str(image_dir / f"{i}.jpg") for i in range(len(combined_df))
            ]

            # Only the file path belongs in the CSV; the bulky 'image'
            # column is dropped from the exported copy.
            final_df_for_csv = combined_df.drop(columns=['image'])

            logger.info(f"Saving cleaned metadata to {self.config.cleaned_data_path}")
            final_df_for_csv.to_csv(self.config.cleaned_data_path, index=False)

            logger.info(f"Deterministically saving images to {image_dir}...")
            # Iterate over just the two needed columns instead of iterrows(),
            # which would build a full Series for every row.
            for file_path, image_record in zip(
                combined_df['image_file_path'], combined_df['image']
            ):
                self._save_as_jpeg(image_record, Path(file_path))

        except Exception as e:
            # logger.exception records the traceback alongside the message.
            logger.exception(f"Failed during data preparation. Error: {e}")
            raise