# Page header captured along with the file (not part of the module):
# Spaces: Running
# File size: 1,526 Bytes
# a7d80f2 (identifier shown in the file-viewer header; presumably the revision id -- TODO confirm)
import pandas as pd
from sklearn.model_selection import train_test_split
from mlpipeline.entity import DataTransformationConfig, DataTransformationArtifact
from mlpipeline.logging.logger import get_logger
from mlpipeline.exception import DataTransformationException
import sys
import os
# Module-level logger obtained from the project's get_logger helper, keyed by this module's name.
logger = get_logger(__name__)
class DataTransformation:
    """Split a CSV dataset into train and test CSV files.

    All paths and split parameters come from the supplied
    ``DataTransformationConfig``; the attributes read from it are
    ``data_path``, ``test_size``, ``random_state``, ``root_dir``,
    ``train_path`` and ``test_path``.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the configuration used by :meth:`transform`.

        Args:
            config: Settings object providing the input path, the split
                parameters and the output locations.
        """
        self.config = config

    def transform(self) -> DataTransformationArtifact:
        """Read the input CSV, split it, and write both partitions to disk.

        The file at ``config.data_path`` is loaded with pandas and split with
        ``train_test_split`` using ``config.test_size`` and
        ``config.random_state``. ``config.root_dir`` is created if missing,
        then the partitions are written to ``config.train_path`` and
        ``config.test_path`` without the index column.

        Returns:
            A ``DataTransformationArtifact`` carrying the two output paths,
            ``is_transformed=True`` and a message with both partition shapes.

        Raises:
            DataTransformationException: If any step fails. The original
                exception is attached as ``__cause__``.
        """
        try:
            logger.info("Starting data transformation")
            df = pd.read_csv(self.config.data_path)

            train_df, test_df = train_test_split(
                df,
                test_size=self.config.test_size,
                random_state=self.config.random_state,
            )

            # Only root_dir is created here; train_path/test_path are assumed
            # to live under it -- TODO confirm against the config definition.
            os.makedirs(self.config.root_dir, exist_ok=True)
            train_df.to_csv(self.config.train_path, index=False)
            test_df.to_csv(self.config.test_path, index=False)

            logger.info(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

            return DataTransformationArtifact(
                train_file_path=self.config.train_path,
                test_file_path=self.config.test_path,
                is_transformed=True,
                message=f"Train: {train_df.shape}, Test: {test_df.shape}",
            )
        except Exception as e:
            # Chain explicitly so the root cause stays on the traceback as
            # __cause__ rather than only as implicit context.
            raise DataTransformationException(str(e), sys) from e