| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Args:
        file_path: Passed straight through to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame
def clean_data(df):
    """Return a copy of ``df`` without incomplete or repeated rows.

    Rows containing missing values are removed first (``dropna`` with its
    default options); duplicate rows are then removed from what remains
    (``drop_duplicates`` with its default options). The original index
    labels of the surviving rows are kept.

    Args:
        df: The DataFrame to clean.

    Returns:
        A new DataFrame holding only the complete, de-duplicated rows.
    """
    return df.dropna().drop_duplicates()
def preprocess_data(df, target_column):
    """Separate a DataFrame into a feature table and a target column.

    Args:
        df: The DataFrame holding both features and the target.
        target_column: Label of the column to use as the target.

    Returns:
        A ``(X, y)`` tuple: ``X`` is ``df`` with ``target_column`` dropped,
        and ``y`` is ``df[target_column]``.
    """
    # The drop is evaluated first, so a missing column fails there.
    features = df.drop(columns=[target_column])
    target = df[target_column]
    return features, target
def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and target into training and testing subsets.

    Args:
        X: Feature data, forwarded to ``train_test_split``.
        y: Target data, forwarded to ``train_test_split``.
        test_size: Forwarded as ``test_size``; defaults to 0.2.
        random_state: Forwarded as ``random_state``; defaults to 42.

    Returns:
        Whatever ``train_test_split`` returns for these arguments. Per the
        scikit-learn documentation, that is the train/test split of ``X``
        followed by the train/test split of ``y``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return partitions