Spaces:
Sleeping
Sleeping
File size: 2,728 Bytes
4821854 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import os
import pandas as pd
from pathlib import Path
from vitClassifier import logger
from vitClassifier.entity.config_entity import DataIngestionConfig
import kaggle
class DataIngestion:
    """Fetch a Kaggle dataset and index its image files into per-split CSVs.

    The instance is driven entirely by ``config``; the attributes read from it
    are ``source_kaggle_dataset_id``, ``unzip_dir``, ``train_df_path``,
    ``test_df_path`` and ``val_df_path``.
    """

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion configuration for later use."""
        self.config = config

    def download_dataset(self):
        """Download and unzip the Kaggle dataset unless it is already present.

        The dataset is considered present when ``<unzip_dir>/chest_xray``
        exists; in that case nothing is downloaded and the Kaggle API is not
        contacted at all.

        Raises:
            Exception: any error raised while authenticating or downloading is
                logged and re-raised unchanged.
        """
        download_path = Path(self.config.unzip_dir)
        expected_data_folder = download_path / "chest_xray"

        # Check the cache *before* authenticating so that a run with the data
        # already on disk does not require Kaggle credentials.
        if expected_data_folder.exists():
            logger.info(f"Dataset already exists at {expected_data_folder}. Skipping download.")
            return

        dataset_id = self.config.source_kaggle_dataset_id
        try:
            logger.info("Authenticating with Kaggle API...")
            kaggle.api.authenticate()
            logger.info("Authentication successful.")

            logger.info(f"Downloading dataset '{dataset_id}' to '{download_path}'...")
            kaggle.api.dataset_download_files(
                dataset=dataset_id, path=download_path, unzip=True, quiet=False
            )
            logger.info("Dataset downloaded and unzipped successfully.")
        except Exception as e:
            logger.error(f"Failed to download dataset from Kaggle. Error: {e}")
            # Bare ``raise`` re-raises the active exception with its traceback.
            raise

    def create_dataframes(self):
        """Scan the train, test and val directories and write one CSV per split.

        Each CSV has two columns: ``image`` (the file path as a string) and
        ``label`` (the name of the directory that directly contains the file).
        Only files matching ``<split>/*/*.jpeg`` are indexed, in sorted order.
        """
        source_root = Path(self.config.unzip_dir) / "chest_xray"

        def _create_df_for_split(split_name: str, save_path: Path):
            """Index one split directory and save the result to ``save_path``."""
            split_path = source_root / split_name
            # One class sub-folder level below the split directory; the
            # original comment names these NORMAL and PNEUMONIA.
            image_files = sorted(split_path.glob('*/*.jpeg'))
            if not image_files:
                # Keep the best-effort behaviour (an empty CSV is still
                # written) but make the problem visible in the logs.
                logger.warning(f"No .jpeg files found for split '{split_name}' under {split_path}")

            df = pd.DataFrame(
                {
                    "image": [str(file) for file in image_files],
                    "label": [file.parent.name for file in image_files],
                }
            )

            # ``to_csv`` does not create missing directories itself.
            Path(save_path).parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(save_path, index=False)
            logger.info(f"Created and saved {split_name} DataFrame to {save_path}")

        # Create DataFrames for each split
        _create_df_for_split("train", self.config.train_df_path)
        _create_df_for_split("test", self.config.test_df_path)
        _create_df_for_split("val", self.config.val_df_path)

    def ingest_data(self):
        """Run the full ingestion: download the dataset, then build the CSVs."""
        logger.info("Starting data ingestion process.")
        self.download_dataset()
        self.create_dataframes()
        logger.info("Data ingestion process completed.")