mlops2 / src /features.py
marintosti12
rework features
46fad31
import pandas as pd
import numpy as np
def safe_div(a, b):
    """Divide ``a`` by ``b`` element-wise, yielding 0 where ``b`` equals 0.

    Args:
        a: Numerator (scalar, NumPy array, or pandas Series).
        b: Denominator (scalar, NumPy array, or pandas Series).

    Returns:
        A NumPy array (0-d for scalar inputs) holding ``a / b``, with 0 at
        every position where ``b == 0``. A NaN in ``b`` is not equal to 0,
        so it propagates as NaN in the result.
    """
    # The quotient is computed for every element, including those with a
    # zero denominator that are masked out below. Silence the divide/invalid
    # floating-point signals for that throw-away work, and use the ufunc so
    # plain Python scalars give inf/nan instead of raising ZeroDivisionError.
    with np.errstate(divide="ignore", invalid="ignore"):
        quotient = np.true_divide(a, b)
    return np.where(np.equal(b, 0), 0, quotient)
def compute_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model feature frame from a raw application DataFrame.

    Works on a copy of ``df`` and, in order:

    1. Replaces the value 365243 in ``DAYS_EMPLOYED`` with NaN
       (presumably a "no employment" placeholder; confirm against the
       data dictionary).
    2. Drops the listed ``*_MODE`` / ``*_MEDI`` columns when present.
    3. Adds ``<col>_ISNA`` 0/1 indicators for selected columns when present.
    4. Adds ratio / age features derived from the required columns.
    5. Fills numeric NaNs with the per-column median of this frame, and
       categorical/object NaNs with the column mode (``"Unknown"`` when a
       column has no non-null value).

    Args:
        df: Input frame. Must contain ``CNT_CHILDREN``, ``CNT_FAM_MEMBERS``,
            ``AMT_INCOME_TOTAL``, ``AMT_CREDIT``, ``AMT_ANNUITY``,
            ``DAYS_BIRTH`` and ``DAYS_EMPLOYED``.

    Returns:
        A new DataFrame with the engineered and imputed columns; the input
        is not modified.

    Raises:
        KeyError: If any required column is missing.
    """
    required = (
        'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
        'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
    )
    # Fail early with one clear message instead of a bare KeyError raised
    # half-way through the transformation.
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"compute_features: missing required column(s): {missing}")

    df = df.copy()

    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    to_drop = [
        'COMMONAREA_MODE', 'COMMONAREA_MEDI',
        'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_MEDI',
        'LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_MEDI',
        'FLOORSMIN_MODE', 'FLOORSMIN_MEDI',
        'YEARS_BUILD_MODE', 'YEARS_BUILD_MEDI',
        'LANDAREA_MODE', 'LANDAREA_MEDI',
        'BASEMENTAREA_MODE', 'BASEMENTAREA_MEDI',
        'ELEVATORS_MODE', 'ELEVATORS_MEDI',
    ]
    df = df.drop(columns=[c for c in to_drop if c in df.columns])

    # Missingness indicators must be recorded before imputation erases them.
    isna_cols = [
        'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OWN_CAR_AGE',
        'COMMONAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'LIVINGAPARTMENTS_AVG',
        'FLOORSMIN_AVG', 'YEARS_BUILD_AVG', 'LANDAREA_AVG', 'BASEMENTAREA_AVG',
        'NONLIVINGAREA_AVG', 'ELEVATORS_AVG', 'FONDKAPREMONT_MODE',
    ]
    for c in isna_cols:
        if c in df.columns:
            df[c + "_ISNA"] = df[c].isna().astype(int)

    df['CHILDREN_RATIO'] = safe_div(df['CNT_CHILDREN'], df['CNT_FAM_MEMBERS'])
    df['INCOME_PER_PERSON'] = safe_div(df['AMT_INCOME_TOTAL'], df['CNT_FAM_MEMBERS'])
    df['AGE'] = -df['DAYS_BIRTH'] / 365.25
    df['AGE_PER_MEMBER'] = safe_div(df['AGE'], df['CNT_FAM_MEMBERS'])

    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    # A zero denominator in the plain divisions above yields +/-inf, which
    # fillna() does not touch. Turn those into NaN so the median imputation
    # below covers them like any other missing value.
    ratio_cols = [
        'DAYS_EMPLOYED_PERC', 'INCOME_CREDIT_PERC',
        'ANNUITY_INCOME_PERC', 'PAYMENT_RATE',
    ]
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)

    num_cols = df.select_dtypes(include=['number']).columns.tolist()
    if num_cols:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    for c in cat_cols:
        # mode() is empty when the column has no non-null value.
        modes = df[c].mode(dropna=True)
        fill_val = modes.iloc[0] if not modes.empty else "Unknown"
        # Filling a categorical column with a value outside its declared
        # categories raises, so register the fill value first.
        if (
            isinstance(df[c].dtype, pd.CategoricalDtype)
            and fill_val not in df[c].cat.categories
        ):
            df[c] = df[c].cat.add_categories([fill_val])
        df[c] = df[c].fillna(fill_val)

    return df