Spaces:
Runtime error
Runtime error
| import logging | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from typing import Tuple | |
| class DataSplitter: | |
| def __init__(self, df: pd.DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 42): | |
| """ | |
| Initialize the DataSplitter with a DataFrame and parameters for splitting. | |
| Parameters: | |
| df : pd.DataFrame | |
| The input dataframe to be split. | |
| target_column : str | |
| The name of the target column in the dataframe. | |
| test_size : float, optional | |
| The proportion of the dataset to include in the test split. Default is 0.2. | |
| random_state : int, optional | |
| Controls the shuffling applied to the data before splitting. Default is 42. | |
| """ | |
| self.df = df | |
| self.target_column = target_column | |
| self.test_size = test_size | |
| self.random_state = random_state | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
| # Check if target_column exists in the DataFrame | |
| if self.target_column not in self.df.columns: | |
| raise ValueError(f"Target column '{self.target_column}' does not exist in the DataFrame.") | |
| logging.info("DataSplitter initialized successfully.") | |
| def split_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: | |
| """ | |
| Split the dataframe into train and test sets. | |
| Returns: | |
| Tuple of X_train, X_test, y_train, y_test: | |
| X_train : pd.DataFrame | |
| Training set features. | |
| X_test : pd.DataFrame | |
| Testing set features. | |
| y_train : pd.Series | |
| Training set target variable. | |
| y_test : pd.Series | |
| Testing set target variable. | |
| """ | |
| logging.info(f"Starting train-test split with test_size={self.test_size} and random_state={self.random_state}.") | |
| X = self.df.drop(columns=[self.target_column], axis=1) | |
| y = self.df[self.target_column] | |
| logging.info(f"Feature set shape: {X.shape}") | |
| logging.info(f"Target set shape: {y.shape}") | |
| X_train, X_test, y_train, y_test = train_test_split( | |
| X, y, test_size=self.test_size, random_state=self.random_state | |
| ) | |
| logging.info(f"Train feature set shape: {X_train.shape}") | |
| logging.info(f"Test feature set shape: {X_test.shape}") | |
| logging.info(f"Train target set shape: {y_train.shape}") | |
| logging.info(f"Test target set shape: {y_test.shape}") | |
| logging.info("Train-test split completed successfully.") | |
| return X_train, X_test, y_train, y_test | |