Spaces:
Sleeping
Sleeping
File size: 2,866 Bytes
import os
import shutil
from pathlib import Path
from mlpipeline.entity import DataIngestionConfig, DataIngestionArtifact
from mlpipeline.logging.logger import get_logger
from mlpipeline.exception import DataIngestionException
import sys
# Module-level logger obtained from the project's logging helper, keyed by
# this module's import name; shared by every method of the class below.
logger = get_logger(__name__)
class DataIngestion:
    """Download a Kaggle competition archive and stage its train/test CSVs.

    The configuration object must expose ``root_dir`` (where the archive is
    downloaded and the ``*_raw.csv`` copies are written) and ``unzip_dir``
    (where the archive is extracted). Both may be ``str`` or ``Path``.
    """

    # Competition downloaded when the caller does not name one.
    DEFAULT_COMPETITION = "playground-series-s6e2"

    def __init__(self, config: DataIngestionConfig):
        """Store the configuration; the Kaggle client is created lazily."""
        self.config = config
        self.kaggle_api = None

    def _authenticate_kaggle(self):
        """Create and authenticate the Kaggle client on first use.

        Does nothing when a client has already been authenticated.

        Raises:
            DataIngestionException: If importing the Kaggle client or
                authenticating with it fails.
        """
        if self.kaggle_api is not None:
            return

        try:
            # Imported here so the kaggle package is only needed when a
            # download is actually requested.
            from kaggle.api.kaggle_api_extended import KaggleApi

            api = KaggleApi()
            api.authenticate()
        except Exception as e:
            raise DataIngestionException(
                f"Failed to authenticate with Kaggle API: {e}",
                sys
            ) from e

        # Publish the client only after authentication succeeded, so a failed
        # attempt is retried on the next call instead of leaving an
        # unauthenticated client cached on the instance.
        self.kaggle_api = api
        logger.info("Kaggle API authenticated successfully")

    def download_data(
        self, competition_name: str = DEFAULT_COMPETITION
    ) -> DataIngestionArtifact:
        """Download, extract and stage the competition's train/test files.

        Args:
            competition_name: Kaggle competition slug to download. Defaults
                to ``DEFAULT_COMPETITION``.

        Returns:
            A ``DataIngestionArtifact`` whose ``data_file_path`` points at
            the staged ``train_raw.csv`` inside ``root_dir``.

        Raises:
            DataIngestionException: If authentication, download, extraction
                or staging fails, including when ``train.csv`` or
                ``test.csv`` is absent from ``unzip_dir`` afterwards.
        """
        try:
            logger.info("Starting data ingestion")

            # Authenticate only when downloading
            self._authenticate_kaggle()

            # Normalise to Path so the "/" joins below also work when the
            # config carries plain strings.
            root_dir = Path(self.config.root_dir)
            unzip_dir = Path(self.config.unzip_dir)
            os.makedirs(root_dir, exist_ok=True)

            logger.info(f"Downloading dataset from Kaggle competition: {competition_name}")
            self.kaggle_api.competition_download_files(
                competition_name,
                path=str(root_dir)
            )

            zip_file = root_dir / f"{competition_name}.zip"
            if zip_file.exists():
                logger.info(f"Extracting {zip_file}")
                shutil.unpack_archive(zip_file, unzip_dir)
                # The archive is no longer needed once its contents are out.
                zip_file.unlink()
            else:
                # Not fatal by itself: files from an earlier extraction may
                # still be present in unzip_dir and are checked below.
                logger.info(f"Archive {zip_file} not found; skipping extraction")

            train_file = unzip_dir / "train.csv"
            test_file = unzip_dir / "test.csv"
            missing = [f.name for f in (train_file, test_file) if not f.exists()]
            if missing:
                raise FileNotFoundError(
                    "Train or test file not found after extraction "
                    f"(missing in {unzip_dir}: {', '.join(missing)})"
                )

            train_raw = root_dir / "train_raw.csv"
            test_raw = root_dir / "test_raw.csv"
            shutil.copy(train_file, train_raw)
            shutil.copy(test_file, test_raw)
            logger.info(f"Data saved: {train_raw}, {test_raw}")

            return DataIngestionArtifact(
                data_file_path=train_raw,
                is_ingested=True,
                message="Data ingestion completed successfully"
            )
        except DataIngestionException:
            # Already a domain error (e.g. from authentication); re-raise
            # as-is rather than wrapping it a second time.
            raise
        except Exception as e:
            raise DataIngestionException(str(e), sys) from e
|