File size: 2,866 Bytes
a7d80f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0addae6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a7d80f2
 
 
 
 
0addae6
 
 
a7d80f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import shutil
from pathlib import Path

from mlpipeline.entity import DataIngestionConfig, DataIngestionArtifact
from mlpipeline.logging.logger import get_logger
from mlpipeline.exception import DataIngestionException
import sys

logger = get_logger(__name__)


class DataIngestion:
    """Download a Kaggle competition archive and stage its raw CSV files.

    The configuration object must expose ``root_dir`` (download and staging
    directory) and ``unzip_dir`` (extraction directory). Both are coerced to
    ``pathlib.Path`` before use, so plain string paths are accepted as well.
    """

    # Competition downloaded when ``download_data`` is called with no argument.
    DEFAULT_COMPETITION = "playground-series-s6e2"

    def __init__(self, config: DataIngestionConfig):
        self.config = config
        # Stays None until a client has authenticated successfully.
        self.kaggle_api = None

    def _authenticate_kaggle(self):
        """Create and authenticate the Kaggle client on first use.

        The ``kaggle`` package is imported lazily, so it is only required
        when a download is actually requested.

        Raises:
            DataIngestionException: if the import, construction, or
                authentication of the client fails.
        """
        if self.kaggle_api is not None:
            return

        try:
            from kaggle.api.kaggle_api_extended import KaggleApi

            api = KaggleApi()
            api.authenticate()
        except Exception as e:
            raise DataIngestionException(
                f"Failed to authenticate with Kaggle API: {e}",
                sys
            ) from e

        # Cache the client only after authenticate() succeeded; caching it
        # earlier would make later calls skip authentication and reuse an
        # unauthenticated client.
        self.kaggle_api = api
        logger.info("Kaggle API authenticated successfully")

    def download_data(
        self, competition_name: str = DEFAULT_COMPETITION
    ) -> DataIngestionArtifact:
        """Download, extract, and stage the competition's train/test CSVs.

        Steps: authenticate, download ``<competition_name>.zip`` into
        ``root_dir``, unpack it into ``unzip_dir`` and delete the archive,
        then copy ``train.csv`` / ``test.csv`` from ``unzip_dir`` to
        ``train_raw.csv`` / ``test_raw.csv`` in ``root_dir``.

        Args:
            competition_name: Kaggle competition slug to download. Defaults
                to ``DEFAULT_COMPETITION``.

        Returns:
            A ``DataIngestionArtifact`` whose ``data_file_path`` is the
            staged ``train_raw.csv`` and whose ``is_ingested`` is ``True``.

        Raises:
            DataIngestionException: on any failure, including a missing
                ``train.csv`` or ``test.csv`` after extraction.
        """
        try:
            logger.info("Starting data ingestion")

            # Authenticate only when downloading
            self._authenticate_kaggle()

            root_dir = Path(self.config.root_dir)
            unzip_dir = Path(self.config.unzip_dir)
            os.makedirs(root_dir, exist_ok=True)

            logger.info(f"Downloading dataset from Kaggle competition: {competition_name}")
            self.kaggle_api.competition_download_files(
                competition_name,
                path=root_dir
            )

            zip_file = root_dir / f"{competition_name}.zip"

            if zip_file.exists():
                logger.info(f"Extracting {zip_file}")
                shutil.unpack_archive(zip_file, unzip_dir)
                zip_file.unlink()
            else:
                # Not fatal by itself: the CSV existence check below decides
                # whether ingestion can proceed.
                logger.info(f"Archive {zip_file} not found; skipping extraction")

            train_file = unzip_dir / "train.csv"
            test_file = unzip_dir / "test.csv"

            if not (train_file.exists() and test_file.exists()):
                raise FileNotFoundError("Train or test file not found after extraction")

            train_raw = root_dir / "train_raw.csv"
            test_raw = root_dir / "test_raw.csv"

            shutil.copy(train_file, train_raw)
            shutil.copy(test_file, test_raw)

            logger.info(f"Data saved: {train_raw}, {test_raw}")

            return DataIngestionArtifact(
                data_file_path=train_raw,
                is_ingested=True,
                message="Data ingestion completed successfully"
            )

        except DataIngestionException:
            # Already a domain error (e.g. from authentication); re-raise
            # unchanged instead of wrapping it a second time.
            raise
        except Exception as e:
            raise DataIngestionException(str(e), sys) from e