Spaces:
Runtime error
Runtime error
| import numpy as np | |
def contains_missing_value(df):
    """
    Report whether the DataFrame holds at least one missing value.
    Parameters:
    - df (DataFrame): The DataFrame to inspect.
    Returns:
    - A truthy boolean when any cell is null, a falsy one otherwise.
    """
    null_mask = df.isna().to_numpy()
    return null_mask.any()
def fill_null_values(df, mean_list, median_list, mode_list, new_category_list, interpolation_list):
    """
    Impute missing values, applying one strategy per group of columns.
    Parameters:
    - df (DataFrame): The DataFrame with missing values.
    - mean_list (list): Columns whose gaps are filled with the column mean.
    - median_list (list): Columns whose gaps are filled with the column median.
    - mode_list (list): Columns whose gaps are filled with the column mode.
    - new_category_list (list): Columns that used to receive a literal 'NaN'
      category; they are now filled by interpolation instead.
    - interpolation_list (list): Columns whose gaps are filled by interpolation.
    Returns:
    - df (DataFrame): The DataFrame after every requested strategy has run.
    """
    # Each entry pairs a column group with the helper that fills it. The
    # helpers run in this order and each receives the previous one's output.
    # NOTE(review): new_category_list is routed through interpolation, which
    # presumably only helps for numeric columns - confirm against callers.
    strategies = (
        (mean_list, fill_with_mean),
        (median_list, fill_with_median),
        (mode_list, fill_with_mode),
        (new_category_list, fill_with_interpolation),
        (interpolation_list, fill_with_interpolation),
    )
    for columns, fill in strategies:
        # Empty or None groups are skipped entirely.
        if columns:
            df = fill(df, columns)
    return df
def remove_high_null(df, threshold_row=0.5, threshold_col=0.7):
    """
    Drop the columns, then the rows, whose share of null values exceeds a threshold.
    - param df: Pandas DataFrame to be processed.
    - param threshold_row: Rows with a null proportion strictly above this are dropped (default 0.5).
    - param threshold_col: Columns with a null proportion strictly above this are dropped (default 0.7).
    - return: New DataFrame with the high-null columns and rows removed.
    """
    # Columns go first, so the row proportions below are measured only over
    # the columns that survive.
    column_null_share = df.isnull().mean()
    sparse_columns = column_null_share[column_null_share.gt(threshold_col)].index
    trimmed = df.drop(columns=sparse_columns)

    row_null_share = trimmed.isnull().mean(axis=1)
    sparse_rows = row_null_share[row_null_share.gt(threshold_row)].index
    return trimmed.drop(index=sparse_rows)
def fill_with_mean(df, attributes):
    """
    Fill missing values in the given columns with each column's mean.
    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to fill; names absent from df are skipped.
    Returns:
    - df (DataFrame): The same DataFrame object, after filling.
    """
    for column in attributes:
        if column not in df.columns:
            continue
        column_mean = df[column].mean()
        df[column] = df[column].fillna(column_mean)
    return df
def fill_with_median(df, attributes):
    """
    Fill missing values in the given columns with each column's median.
    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to fill; names absent from df are skipped.
    Returns:
    - df (DataFrame): The same DataFrame object, after filling.
    """
    for column in attributes:
        if column not in df.columns:
            continue
        column_median = df[column].median()
        df[column] = df[column].fillna(column_median)
    return df
def fill_with_mode(df, attributes):
    """
    Fill missing values in the given columns with each column's mode.
    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to fill; names absent from df are skipped.
    Returns:
    - df (DataFrame): The same DataFrame object, after filling.
    """
    for column in attributes:
        if column not in df.columns:
            continue
        # Compute the mode once per column; it is a full pass over the data.
        modes = df[column].mode()
        if modes.empty:
            # A column with no non-null values has no mode, so leave it as is.
            continue
        # mode() returns its values sorted, so the first entry settles ties.
        df[column] = df[column].fillna(modes.iloc[0])
    return df
def fill_with_interpolation(df, attributes, method='linear', limit_direction=None):
    """
    Fill missing values in the given columns by interpolation.
    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to fill; names absent from df are skipped.
    - method (str): Interpolation method passed to Series.interpolate. Default is
      'linear'; others include 'time', 'index', 'nearest', 'quadratic', 'cubic'.
    - limit_direction (str or None): Optional 'forward', 'backward' or 'both',
      passed to Series.interpolate. With the default (None) pandas chooses the
      direction, and for 'linear' that leaves NaNs before the first valid value
      unfilled. Pass 'both' to fill those as well.
    Returns:
    - df (DataFrame): The same DataFrame object, after interpolation.
    """
    # Only forward limit_direction when the caller set it, so the default call
    # is exactly the original `interpolate(method=method)`.
    options = {'method': method}
    if limit_direction is not None:
        options['limit_direction'] = limit_direction
    for column in attributes:
        if column in df.columns:
            df[column] = df[column].interpolate(**options)
    return df
# Deprecated: fill_null_values now interpolates these columns instead of
# calling this helper.
def fill_with_NaN(df, attributes):
    """
    Fill missing values in the given columns with the literal string 'NaN'.
    Parameters:
    - df (DataFrame): The DataFrame to update; its columns are reassigned in place.
    - attributes (list): Column names to fill; names absent from df are skipped.
    Returns:
    - df (DataFrame): The same DataFrame object, after filling.
    """
    present = [name for name in attributes if name in df.columns]
    for name in present:
        df[name] = df[name].fillna('NaN')
    return df
def replace_placeholders_with_nan(df):
    """
    Replaces common placeholders for missing values in object columns with np.nan.
    Matching is case-insensitive: each cell is converted with str() and
    lower-cased before it is compared against the placeholder set.
    Parameters:
    - df (DataFrame): The DataFrame to process; its object columns are reassigned in place.
    Returns:
    - df (DataFrame): Updated DataFrame with placeholders replaced.
    """
    # Cells are lower-cased before the lookup, so the placeholders must be
    # lower-case too. With the mixed-case list, "NA" and "NULL" never matched.
    placeholders = frozenset(
        token.lower()
        for token in ("NA", "NULL", "?", "", "NaN", "None", "N/A", "n/a", "nan", "none")
    )
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].apply(
                lambda x: np.nan if str(x).lower() in placeholders else x
            )
    return df