Spaces:
Runtime error
Runtime error
| # src/cnnClassifier/components/data_ingestion.py | |
| from datasets import load_dataset | |
| from cnnClassifier import logger | |
| from cnnClassifier.entity.config_entity import DataIngestionConfig | |
| from pathlib import Path | |
class DataIngestion:
    """Stage component that downloads the FairFace dataset from the
    Hugging Face Hub and persists it into the pipeline's artifact directory.
    """

    def __init__(self, config: DataIngestionConfig):
        # Configuration object carrying dataset_name, dataset_config,
        # root_dir (cache location) and local_data_dir (save location).
        self.config = config

    def download_dataset(self):
        """
        Downloads and saves the FairFace dataset from the Hugging Face Hub.

        Raises:
            Exception: re-raised (with original traceback) if the download
                or the save-to-disk step fails.
        """
        try:
            logger.info(f"Downloading dataset '{self.config.dataset_name}' from Hugging Face Hub...")

            # load_dataset handles everything: download, verification, and caching.
            # It returns a DatasetDict, typically with 'train' and 'validation' splits.
            fairface_dataset = load_dataset(
                self.config.dataset_name,
                name=self.config.dataset_config,
                cache_dir=self.config.root_dir,  # use our root_dir for caching
            )

            # Save the downloaded dataset to our specified artifacts directory.
            # This makes it a persistent part of our DVC pipeline.
            save_path = Path(self.config.local_data_dir)
            fairface_dataset.save_to_disk(save_path)
            logger.info(f"Dataset successfully downloaded and saved to {save_path}")

            # Optional: log the structure of the downloaded dataset.
            logger.info(f"Dataset splits: {list(fairface_dataset.keys())}")
            logger.info(f"Training set features: {fairface_dataset['train'].features}")
        except Exception:
            # logger.exception records the full traceback (logger.error with
            # just the message would drop it); bare `raise` re-raises the
            # original exception unchanged for the caller.
            logger.exception("Failed to download or save dataset.")
            raise