Spaces:
Sleeping
Sleeping
| """Preprocessing utilities for the Adult dataset. | |
| Exports: | |
| - preprocess_adult(df): returns a cleaned DataFrame with numeric feature columns and an 'income' label column that is left unencoded. | |
| """ | |
| from typing import List | |
| import numpy as np | |
| import pandas as pd | |
def _strip_and_normalize_strings(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """Trim whitespace and unify missing-value markers in text columns.

    Each column in ``cols`` is converted to strings and stripped of leading
    and trailing whitespace. Cells equal to ``'?'`` after stripping, as well
    as cells that were missing (NaN/None/NA) on input, become ``'Unknown'``.

    Args:
        df: Frame to clean. It is modified in place.
        cols: Names of the columns to normalize.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for c in cols:
        # Record missing cells first: astype(str) would otherwise turn them
        # into literal 'nan' / 'None' / '<NA>' strings, i.e. bogus categories.
        was_missing = df[c].isna()
        cleaned = (
            df[c]
            .astype(str)
            .str.strip()
            .replace({'?': 'Unknown'})
        )
        df[c] = cleaned.mask(was_missing, 'Unknown')
    return df


def _is_text_column(series: pd.Series) -> bool:
    """Return True for object-dtype and pandas string-dtype columns."""
    # Comparing against 'object' alone misses the pandas string extension
    # dtype, which would leave such columns un-normalized and un-encoded.
    dtype = series.dtype
    return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)


def preprocess_adult(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and encode the Adult dataset into numeric features.

    Steps, in order: text columns are stripped and their missing / ``'?'``
    cells set to ``'Unknown'``; the known numeric columns are coerced to
    numbers (unparseable cells become NaN); remaining gaps are filled with
    the column median for numeric columns or ``'Unknown'`` otherwise; text
    feature columns are one-hot encoded with the first level dropped.

    Args:
        df: DataFrame containing Adult columns including 'income'. The
            input is not modified; the function works on a copy.

    Returns:
        DataFrame whose text feature columns are replaced by indicator
        columns. 'income' is left unencoded and placed last.

    Raises:
        ValueError: If ``df`` has no 'income' column.
    """
    if 'income' not in df.columns:
        raise ValueError("Expected 'income' column in Adult dataframe")
    df = df.copy()

    # Normalize string columns (also maps missing cells to 'Unknown').
    text_cols = [c for c in df.columns if _is_text_column(df[c])]
    df = _strip_and_normalize_strings(df, text_cols)

    # Ensure common numeric cols are numeric; junk such as 'Unknown' -> NaN.
    numeric_candidates = [
        'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'
    ]
    for c in numeric_candidates:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

    # Used only when a column has no valid value at all (median is NaN).
    numeric_fallbacks = {
        'age': 35,
        'fnlwgt': 100000,
        'education_num': 9,  # HS-grad equivalent
        'capital_gain': 0,
        'capital_loss': 0,
        'hours_per_week': 40,
    }

    # Fill NaNs: numeric with median, everything else with 'Unknown'.
    for c in df.columns:
        if c == 'income':
            continue
        col = df[c]
        if not col.isna().any():
            continue  # nothing to fill; leave the column untouched
        if pd.api.types.is_numeric_dtype(col):
            fill_value = col.median()
            if pd.isna(fill_value):
                fill_value = numeric_fallbacks.get(c, 0)  # 0 is the default fallback
            df[c] = col.fillna(fill_value)
        else:
            df[c] = col.fillna('Unknown')

    # One-hot encode categorical features except the target. Recomputed here
    # because numeric candidates that started as text are numeric by now.
    cat_cols = [c for c in df.columns if c != 'income' and _is_text_column(df[c])]
    df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

    # Keep label as string categories; sklearn supports string labels.
    # Ensure 'income' column is last for readability.
    ordered = [c for c in df_encoded.columns if c != 'income'] + ['income']
    return df_encoded[ordered]