# hicxai-condition-2 / src / preprocessing.py
# Suvh
# Update to v1.1-chatty-luna (2025-12-07)
# 070061f
"""Preprocessing utilities for the Adult dataset.
Exports:
- preprocess_adult(df): returns a cleaned, numeric DataFrame with an 'income' label column.
"""
from typing import List
import numpy as np
import pandas as pd
def _strip_and_normalize_strings(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """Trim whitespace and unify missing-value markers in the given columns.

    Each column in *cols* is cast to string, stripped of leading/trailing
    whitespace, and any cell that is exactly '?' after stripping is replaced
    by 'Unknown'. Cells that were missing (NaN/None) before the cast are also
    set to 'Unknown'.

    Args:
        df: Frame to clean. It is modified in place.
        cols: Names of the columns to normalise.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for c in cols:
        # Record missing cells *before* the cast: astype(str) either turns
        # them into the literal text 'nan'/'None' or keeps them missing,
        # and in both cases the '?' mapping below would not catch them.
        missing = df[c].isna()
        cleaned = (
            df[c]
            .astype(str)
            .str.strip()
            .replace({'?': 'Unknown'})
        )
        df[c] = cleaned.mask(missing, 'Unknown')
    return df
def _is_text_like(series: pd.Series) -> bool:
    """Return True if *series* holds text/categorical data that needs encoding."""
    dtype = series.dtype
    # Checking only ``dtype == 'object'`` misses pandas' dedicated string
    # dtype and categorical columns, which would then be left un-encoded.
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


def preprocess_adult(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and encode the Adult dataset into numeric features.

    Steps, in order:
      1. Text-like columns (object, string or categorical dtype) have missing
         values set to 'Unknown', are stripped of surrounding whitespace, and
         have '?' mapped to 'Unknown'. This includes 'income' when it is
         text-like, so a missing label becomes the string 'Unknown'.
      2. The known numeric Adult columns are coerced with ``pd.to_numeric``;
         unparseable values become NaN.
      3. Remaining NaNs (except in 'income') are filled: numeric columns with
         their median, or a fixed per-column default when the median itself
         is NaN; every other column with 'Unknown'.
      4. Text-like feature columns are one-hot encoded with ``drop_first=True``.
      5. 'income' is moved to the last position.

    Args:
        df: DataFrame containing Adult columns including 'income'. It is not
            modified; the function works on a copy.

    Returns:
        A new DataFrame with encoded features and 'income' as the last column.
        The label values themselves are not encoded.

    Raises:
        ValueError: If ``df`` has no 'income' column.
    """
    if 'income' not in df.columns:
        raise ValueError("Expected 'income' column in Adult dataframe")
    df = df.copy()

    # Normalize string columns
    text_cols = [c for c in df.columns if _is_text_like(df[c])]
    for c in text_cols:
        col = df[c]
        if isinstance(col.dtype, pd.CategoricalDtype):
            # fillna cannot add a value outside the declared categories,
            # so fall back to plain objects before filling.
            col = col.astype(object)
        df[c] = col.fillna('Unknown')
    df = _strip_and_normalize_strings(df, text_cols)

    # Columns that must be numeric, mapped to the value used when a column
    # has no valid entry at all (its median is NaN).
    numeric_fallbacks = {
        'age': 35,
        'fnlwgt': 100000,
        'education_num': 9,  # HS-grad equivalent
        'capital_gain': 0,
        'capital_loss': 0,
        'hours_per_week': 40,
    }
    for c in numeric_fallbacks:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

    # Fill NaNs: numeric with median, everything else with 'Unknown'
    for c in df.columns:
        if c == 'income' or not df[c].isna().any():
            continue  # label is left alone; complete columns need no work
        if pd.api.types.is_numeric_dtype(df[c]):
            fill_value = df[c].median()
            if pd.isna(fill_value):
                # Entirely empty column: use the per-column default, or 0
                # for numeric columns not listed above.
                fill_value = numeric_fallbacks.get(c, 0)
        else:
            fill_value = 'Unknown'
        df[c] = df[c].fillna(fill_value)

    # One-hot encode categorical features except the target
    cat_cols = [c for c in df.columns if c != 'income' and _is_text_like(df[c])]
    df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

    # Keep label as string categories; sklearn supports string labels.
    # Ensure 'income' column is last for readability.
    ordered = [c for c in df_encoded.columns if c != 'income'] + ['income']
    return df_encoded[ordered]