File size: 1,526 Bytes
a7d80f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
from sklearn.model_selection import train_test_split
from mlpipeline.entity import DataTransformationConfig, DataTransformationArtifact
from mlpipeline.logging.logger import get_logger
from mlpipeline.exception import DataTransformationException
import sys
import os

logger = get_logger(__name__)


class DataTransformation:
    """Split a CSV dataset into train and test CSV files.

    All inputs come from the ``config`` object passed to the constructor;
    :meth:`transform` reads its ``data_path``, ``test_size``,
    ``random_state``, ``root_dir``, ``train_path`` and ``test_path``
    attributes.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the configuration used by :meth:`transform`.

        Args:
            config: Object providing the paths and split parameters listed
                in the class docstring.
        """
        self.config = config

    def transform(self) -> DataTransformationArtifact:
        """Read the source CSV, split it, and write both parts to disk.

        The CSV at ``config.data_path`` is loaded with pandas and split with
        ``train_test_split`` using ``config.test_size`` and
        ``config.random_state``. The two parts are written, without the
        index column, to ``config.train_path`` and ``config.test_path``.

        Returns:
            A ``DataTransformationArtifact`` carrying the two output paths,
            ``is_transformed=True`` and a message with both frame shapes.

        Raises:
            DataTransformationException: If any step fails. The original
                exception is attached as ``__cause__``.
        """
        try:
            logger.info("Starting data transformation")

            df = pd.read_csv(self.config.data_path)

            train_df, test_df = train_test_split(
                df,
                test_size=self.config.test_size,
                random_state=self.config.random_state,
            )

            os.makedirs(self.config.root_dir, exist_ok=True)

            # to_csv does not create missing directories, and nothing here
            # guarantees the output files live directly under root_dir, so
            # make sure each file's own parent directory exists as well.
            for out_path in (self.config.train_path, self.config.test_path):
                if not isinstance(out_path, (str, os.PathLike)):
                    # Not a filesystem path; leave it to to_csv.
                    continue
                parent_dir = os.path.dirname(out_path)
                if parent_dir:  # empty for a bare filename in the CWD
                    os.makedirs(parent_dir, exist_ok=True)

            train_df.to_csv(self.config.train_path, index=False)
            test_df.to_csv(self.config.test_path, index=False)

            logger.info(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

            return DataTransformationArtifact(
                train_file_path=self.config.train_path,
                test_file_path=self.config.test_path,
                is_transformed=True,
                message=f"Train: {train_df.shape}, Test: {test_df.shape}"
            )
        except Exception as e:
            # Chain explicitly so the root cause stays attached to the
            # pipeline-level exception that callers catch.
            raise DataTransformationException(str(e), sys) from e