{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "8f33ab85", "metadata": {}, "outputs": [], "source": [ "import os" ] }, { "cell_type": "code", "execution_count": 2, "id": "5b55e660", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'f:\\\\Projects\\\\End-to-End-Chest-Cancer-Classification-using-MLflow-and-DVC\\\\research'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%pwd" ] }, { "cell_type": "code", "execution_count": 3, "id": "b7338c82", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'f:\\\\Projects\\\\End-to-End-Chest-Cancer-Classification-using-MLflow-and-DVC'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.chdir(\"../\")\n", "%pwd" ] }, { "cell_type": "code", "execution_count": 4, "id": "a770b8df", "metadata": {}, "outputs": [], "source": [ "from dataclasses import dataclass\n", "from pathlib import Path\n", "\n", "\n", "@dataclass(frozen=True)\n", "class DataIngestionConfig:\n", " root_dir: Path\n", " source_URL: str\n", " local_data_file: Path\n", " unzip_dir: Path" ] }, { "cell_type": "code", "execution_count": 6, "id": "979add90", "metadata": {}, "outputs": [], "source": [ "from cnnClassifier.constants import *\n", "from cnnClassifier.utils.common import read_yaml, create_directories\n", "class ConfigurationManager:\n", " def __init__(\n", " self,\n", " config_filepath = CONFIG_FILE_PATH,\n", " params_filepath = PARAMS_FILE_PATH):\n", "\n", " self.config = read_yaml(config_filepath)\n", " self.params = read_yaml(params_filepath)\n", "\n", " create_directories([self.config.artifacts_root])\n", "\n", " def get_data_ingestion_config(self) -> DataIngestionConfig:\n", " config = self.config.data_ingestion\n", "\n", " create_directories([config.root_dir])\n", "\n", " data_ingestion_config = DataIngestionConfig(\n", " root_dir=config.root_dir,\n", " source_URL=config.source_URL,\n", " local_data_file=config.local_data_file,\n", " unzip_dir=config.unzip_dir \n", " )\n", "\n", " return data_ingestion_config" ] }, { "cell_type": "code", "execution_count": 9, "id": "e4fd8f68", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2025-08-18 00:24:08,669: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", "[2025-08-18 00:24:08,684: INFO: common: yaml file: params.yaml loaded successfully]\n", "[2025-08-18 00:24:08,686: INFO: common: created directory at: artifacts]\n", "[2025-08-18 00:24:08,688: INFO: common: created directory at: artifacts/data_ingestion]\n", "[2025-08-18 00:24:08,692: INFO: 78466947: Downloading data from https://drive.google.com/file/d/1z0mreUtRmR-P-magILsDR3T7M6IkGXtY/view?usp=sharing into file artifacts/data_ingestion/data.zip]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Downloading...\n", "From (original): https://drive.google.com/uc?/export=download&id=1z0mreUtRmR-P-magILsDR3T7M6IkGXtY\n", "From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1z0mreUtRmR-P-magILsDR3T7M6IkGXtY&confirm=t&uuid=954f5f66-c0d6-4c40-a993-933880515813\n", "To: f:\\Projects\\End-to-End-Chest-Cancer-Classification-using-MLflow-and-DVC\\artifacts\\data_ingestion\\data.zip\n", "100%|██████████| 49.0M/49.0M [00:24<00:00, 2.03MB/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[2025-08-18 00:24:36,267: INFO: 78466947: Downloaded data from https://drive.google.com/file/d/1z0mreUtRmR-P-magILsDR3T7M6IkGXtY/view?usp=sharing into file artifacts/data_ingestion/data.zip]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "import os\n", "import zipfile\n", "import gdown\n", "from cnnClassifier import logger\n", "from cnnClassifier.utils.common import get_size\n", "\n", "class DataIngestion:\n", " def __init__(self, config: DataIngestionConfig):\n", " self.config = config\n", "\n", "\n", " \n", " \n", " def download_file(self)-> str:\n", " '''\n", " Fetch data from the url\n", " '''\n", "\n", " try: \n", " dataset_url = self.config.source_URL\n", " zip_download_dir = self.config.local_data_file\n", " os.makedirs(\"artifacts/data_ingestion\", exist_ok=True)\n", " logger.info(f\"Downloading data from {dataset_url} into file {zip_download_dir}\")\n", "\n", " file_id = dataset_url.split(\"/\")[-2]\n", " prefix = 'https://drive.google.com/uc?/export=download&id='\n", " gdown.download(prefix+file_id,zip_download_dir)\n", "\n", " logger.info(f\"Downloaded data from {dataset_url} into file {zip_download_dir}\")\n", "\n", " except Exception as e:\n", " raise e\n", " \n", " \n", " def extract_zip_file(self):\n", " \"\"\"\n", " zip_file_path: str\n", " Extracts the zip file into the data directory\n", " Function returns None\n", " \"\"\"\n", " unzip_path = self.config.unzip_dir\n", " os.makedirs(unzip_path, exist_ok=True)\n", " with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:\n", " zip_ref.extractall(unzip_path)\n", "try:\n", " config = ConfigurationManager()\n", " data_ingestion_config = config.get_data_ingestion_config()\n", " data_ingestion = DataIngestion(config=data_ingestion_config)\n", " data_ingestion.download_file()\n", " data_ingestion.extract_zip_file()\n", "except Exception as e:\n", " raise e" ] } ], "metadata": { "kernelspec": { "display_name": "cnn_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }