AutoML_MLOps_PipeLine / src /mlpipeline /components /data_transformation.py
Abeshith's picture
Add pipeline stages implementation
a7d80f2
import pandas as pd
from sklearn.model_selection import train_test_split
from mlpipeline.entity import DataTransformationConfig, DataTransformationArtifact
from mlpipeline.logging.logger import get_logger
from mlpipeline.exception import DataTransformationException
import sys
import os
# Module-level logger, created by the project's get_logger helper with this module's __name__.
logger = get_logger(__name__)
class DataTransformation:
    """Split a CSV dataset into train and test CSV files.

    The supplied config is read for ``data_path``, ``test_size``,
    ``random_state``, ``root_dir``, ``train_path`` and ``test_path``.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the configuration used by :meth:`transform`."""
        self.config = config

    def transform(self) -> DataTransformationArtifact:
        """Read the source CSV, split it, and write both parts to disk.

        The CSV at ``config.data_path`` is split with ``train_test_split``
        using ``config.test_size`` and ``config.random_state``; the two
        resulting frames are written (without the index column) to
        ``config.train_path`` and ``config.test_path``.

        Returns:
            A ``DataTransformationArtifact`` carrying the two output paths,
            ``is_transformed=True`` and a message with both frame shapes.

        Raises:
            DataTransformationException: If any step fails; the original
                exception is attached as ``__cause__``.
        """
        try:
            logger.info("Starting data transformation")
            df = pd.read_csv(self.config.data_path)
            train_df, test_df = train_test_split(
                df,
                test_size=self.config.test_size,
                random_state=self.config.random_state,
            )

            os.makedirs(self.config.root_dir, exist_ok=True)
            # The output files are not guaranteed to live directly under
            # root_dir, so make sure each one's parent directory exists too.
            for out_path in (self.config.train_path, self.config.test_path):
                parent_dir = os.path.dirname(out_path)
                if parent_dir:  # empty for a bare filename in the CWD
                    os.makedirs(parent_dir, exist_ok=True)

            train_df.to_csv(self.config.train_path, index=False)
            test_df.to_csv(self.config.test_path, index=False)
            logger.info(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

            return DataTransformationArtifact(
                train_file_path=self.config.train_path,
                test_file_path=self.config.test_path,
                is_transformed=True,
                message=f"Train: {train_df.shape}, Test: {test_df.shape}",
            )
        except Exception as e:
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise DataTransformationException(str(e), sys) from e