import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df):
    """
    Fill missing values in every column of ``df`` and return the frame.

    Floating-point columns (any width) are filled with the column mean.
    All other columns are filled with the column's most frequent value
    (the first entry of ``Series.mode()``), which keeps integer, string and
    categorical-like columns within their existing set of values.

    Columns without missing values are left untouched, and so are columns
    that contain no observed value at all, since there is nothing to impute
    from.

    The frame is modified in place and also returned, so both
    ``preprocess_data(df)`` and ``df = preprocess_data(df)`` work.
    """
    for col in df.columns:
        series = df[col]
        if not series.isna().any():
            continue

        if pd.api.types.is_float_dtype(series):
            fill_value = series.mean()
            if pd.isna(fill_value):
                # Column is entirely missing: no mean to impute with.
                continue
        else:
            modes = series.mode()
            if modes.empty:
                # Column is entirely missing: no mode to impute with.
                continue
            fill_value = modes.iloc[0]

        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: the chained in-place form
        # operates on a temporary under pandas Copy-on-Write and would leave
        # ``df`` unchanged.
        df[col] = series.fillna(fill_value)
    return df

def remove_outliers_iqr(df, multiplier=1.5):
    """
    Remove rows containing outliers according to the IQR rule.

    For each numeric column, a row is kept when its value lies within
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` (bounds inclusive),
    where ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns.

    Missing values are not treated as outliers: a row whose value in a
    numeric column is NaN is kept. Without this, a single all-NaN numeric
    column would empty the whole frame.

    Parameters:
        df: Input DataFrame. It is not modified.
        multiplier: Width of the fences in units of IQR (default 1.5).

    Returns:
        A DataFrame containing only the rows that passed every column's
        check; the original index labels are preserved.
    """
    # "number" covers every numeric width (int32, float32, ...), not only
    # the 64-bit ones, and excludes booleans.
    numeric_columns = df.select_dtypes(include="number").columns
    for col in numeric_columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - multiplier * iqr
        upper_fence = q3 + multiplier * iqr
        # Comparisons against NaN are False, so explicitly keep missing
        # values rather than dropping them as if they were outliers.
        keep = values.between(lower_fence, upper_fence) | values.isna()
        df = df[keep]
    return df

def cap_extreme_values(df, lower_quantile=0.01, upper_quantile=0.99):
    """
    Cap (winsorize) extreme values in every numeric column of ``df``.

    Values below the column's ``lower_quantile`` are raised to that quantile
    and values above the column's ``upper_quantile`` are lowered to it. With
    the defaults this clips to the 1st and 99th percentiles.

    The frame is modified in place (each numeric column is replaced by its
    clipped version) and also returned. Non-numeric columns are untouched.

    Parameters:
        df: DataFrame whose numeric columns are clipped.
        lower_quantile: Quantile in [0, 1] used as the lower cap.
        upper_quantile: Quantile in [0, 1] used as the upper cap; must not
            be smaller than ``lower_quantile``.

    Raises:
        ValueError: If the quantiles are outside [0, 1] or out of order.
    """
    if not 0 <= lower_quantile <= upper_quantile <= 1:
        raise ValueError(
            "quantiles must satisfy 0 <= lower_quantile <= upper_quantile <= 1"
        )

    # "number" covers every numeric width (int32, float32, ...), not only
    # the 64-bit ones, and excludes booleans.
    for col in df.select_dtypes(include="number").columns:
        series = df[col]
        lower_limit = series.quantile(lower_quantile)
        upper_limit = series.quantile(upper_quantile)
        df[col] = series.clip(lower=lower_limit, upper=upper_limit)
    return df

def convert_string_to_numeric(df):
    """
    Label-encode every object-dtype column of ``df``.

    Each selected column is replaced by the output of
    ``LabelEncoder.fit_transform`` on that column. The frame is modified in
    place and also returned. The fitted encoder is not kept, so this
    function does not expose the mapping it used.
    """
    encoder = LabelEncoder()
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        # The same encoder object is fitted again on each column in turn.
        encoded = encoder.fit_transform(df[name])
        df[name] = encoded
    return df