import pandas as pd
import numpy as np

# scikit-learn is only required by convert_string_to_numeric(). Guarding the
# import keeps the pandas-only helpers below usable when it is not installed;
# when it is installed the module behaves exactly as before.
try:
    from sklearn.preprocessing import LabelEncoder
except ImportError:
    LabelEncoder = None


def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in every column of *df*.

    Columns whose dtype is ``float64`` or ``int64`` are filled with the
    column mean; every other column is filled with its most frequent value
    (the first entry of ``Series.mode()``).

    Args:
        df: Frame to clean. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            # Nothing to fill; skip computing a mean/mode for this column.
            continue

        if column.dtype in ("float64", "int64"):
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # Every value is missing, so there is no mode to fill with.
                # Leave the column as-is instead of raising on ``modes[0]``.
                continue
            fill_value = modes.iloc[0]

        # Assign the result back rather than calling fillna(inplace=True) on
        # ``df[col]``: the in-place form operates on a temporary Series and
        # does not update ``df`` under pandas Copy-on-Write.
        df[col] = column.fillna(fill_value)
    return df


def remove_outliers_iqr(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows that fall outside 1.5 * IQR on any numeric column.

    Only ``float64`` and ``int64`` columns are inspected. Columns are
    processed one after another, so the quartiles of each column are
    computed on the rows that survived the previous columns' filters.

    Rows whose value in an inspected column is missing are dropped as well,
    because a missing value never satisfies the range test.

    Args:
        df: Frame to filter. It is not modified.

    Returns:
        The filtered frame. When ``df`` has no ``float64``/``int64`` columns
        the input object itself is returned.
    """
    numeric_columns = df.select_dtypes(include=["float64", "int64"]).columns
    for col in numeric_columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # between() is inclusive on both ends, matching ``>=`` / ``<=``.
        df = df[df[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
    return df


def cap_extreme_values(df: pd.DataFrame) -> pd.DataFrame:
    """Clip numeric columns to their 1st and 99th percentiles.

    Only ``float64`` and ``int64`` columns are clipped.

    Args:
        df: Frame to cap. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in df.select_dtypes(include=["float64", "int64"]).columns:
        lower_limit = df[col].quantile(0.01)
        upper_limit = df[col].quantile(0.99)
        df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    return df


def convert_string_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Label-encode every ``object``-dtype column of *df*.

    A single ``LabelEncoder`` is re-fitted for each column, so the fitted
    state for earlier columns is not retained after this function returns.

    Args:
        df: Frame to encode. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.

    Raises:
        ImportError: If scikit-learn is not installed.
    """
    if LabelEncoder is None:
        raise ImportError(
            "scikit-learn is required for convert_string_to_numeric()"
        )
    label_encoder = LabelEncoder()
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = label_encoder.fit_transform(df[col])
    return df