File size: 1,698 Bytes
eacd6a2
 
 
30672d3
 
eacd6a2
30672d3
 
 
 
 
eacd6a2
30672d3
eacd6a2
30672d3
 
eacd6a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30672d3
eacd6a2
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# src/cnnClassifier/components/data_ingestion.py

from datasets import load_dataset
from cnnClassifier import logger
from cnnClassifier.entity.config_entity import DataIngestionConfig
from pathlib import Path

class DataIngestion:
    """Downloads the FairFace dataset from the Hugging Face Hub and persists
    it to the pipeline's artifacts directory so DVC can track it."""

    def __init__(self, config: DataIngestionConfig):
        # Configuration carries dataset_name, dataset_config, root_dir,
        # and local_data_dir (paths/names resolved by the config stage).
        self.config = config

    def download_dataset(self) -> None:
        """
        Download and save the FairFace dataset from the Hugging Face Hub.

        Uses `datasets.load_dataset`, which handles download, checksum
        verification, and caching, then materializes the resulting
        DatasetDict (typically 'train' and 'validation' splits) under
        `config.local_data_dir` via `save_to_disk`.

        Raises:
            Exception: re-raises whatever `load_dataset` or `save_to_disk`
                raised, after logging it with the full traceback.
        """
        try:
            logger.info(
                "Downloading dataset '%s' from Hugging Face Hub...",
                self.config.dataset_name,
            )

            # load_dataset handles everything: download, verification, and caching.
            # It returns a DatasetDict, typically with 'train' and 'validation' splits.
            fairface_dataset = load_dataset(
                self.config.dataset_name,
                name=self.config.dataset_config,
                cache_dir=self.config.root_dir,  # use our root_dir for caching
            )

            # Save the downloaded dataset to our specified artifacts directory.
            # This makes it a persistent part of our DVC pipeline.
            save_path = Path(self.config.local_data_dir)
            fairface_dataset.save_to_disk(save_path)

            logger.info("Dataset successfully downloaded and saved to %s", save_path)

            # Optional: log the structure of the downloaded dataset for debugging.
            logger.info("Dataset splits: %s", list(fairface_dataset.keys()))
            logger.info("Training set features: %s", fairface_dataset["train"].features)

        except Exception:
            # logger.exception records the message AND the traceback;
            # bare `raise` re-raises the original exception unmodified.
            logger.exception("Failed to download or save dataset.")
            raise