import os
import sys

import pandas as pd
from sklearn.model_selection import train_test_split

from mlpipeline.entity import DataTransformationConfig, DataTransformationArtifact
from mlpipeline.exception import DataTransformationException
from mlpipeline.logging.logger import get_logger

logger = get_logger(__name__)


class DataTransformation:
    """Split a CSV dataset into train and test CSV files.

    All inputs (source path, split parameters, output locations) are read
    from the ``DataTransformationConfig`` supplied at construction time.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the configuration used by :meth:`transform`.

        Args:
            config: Object exposing ``data_path``, ``test_size``,
                ``random_state``, ``root_dir``, ``train_path`` and
                ``test_path`` attributes.
        """
        self.config = config

    def transform(self) -> DataTransformationArtifact:
        """Read the source CSV, split it, and write both partitions to disk.

        The data at ``config.data_path`` is loaded with pandas and split via
        ``train_test_split`` using ``config.test_size`` and
        ``config.random_state``. ``config.root_dir`` is created if missing,
        then the partitions are written (without the index column) to
        ``config.train_path`` and ``config.test_path``.

        Returns:
            A ``DataTransformationArtifact`` carrying the two output paths,
            ``is_transformed=True`` and a message with both frame shapes.

        Raises:
            DataTransformationException: If any step fails. The original
                exception is attached as ``__cause__``.
        """
        try:
            logger.info("Starting data transformation")

            df = pd.read_csv(self.config.data_path)
            train_df, test_df = train_test_split(
                df,
                test_size=self.config.test_size,
                random_state=self.config.random_state,
            )

            os.makedirs(self.config.root_dir, exist_ok=True)
            train_df.to_csv(self.config.train_path, index=False)
            test_df.to_csv(self.config.test_path, index=False)

            logger.info(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

            return DataTransformationArtifact(
                train_file_path=self.config.train_path,
                test_file_path=self.config.test_path,
                is_transformed=True,
                message=f"Train: {train_df.shape}, Test: {test_df.shape}",
            )
        except Exception as e:
            # Chain explicitly so the underlying failure (missing file, bad
            # split parameters, unwritable path, ...) stays attached to the
            # domain exception instead of appearing as an incidental context.
            # NOTE(review): `sys` is presumably passed so the exception class
            # can pull traceback details via sys.exc_info() - confirm.
            raise DataTransformationException(str(e), sys) from e