Spaces:
Running
Running
| import pandas as pd | |
| import numpy as np | |
def safe_div(a, b):
    """Divide ``a`` by ``b`` element-wise, returning 0 where ``b`` is zero.

    Args:
        a: Numerator; a scalar or array-like (e.g. a pandas Series).
        b: Denominator; a scalar or array-like broadcastable against ``a``.

    Returns:
        A float64 NumPy array (0-d for scalar inputs) holding ``a / b``, with
        0.0 at every position where ``b == 0``. NaN inputs propagate as NaN.
        Inputs are combined positionally (pandas indexes are not aligned).
    """
    num, den = np.broadcast_arrays(
        np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    )
    # Pre-fill with the zero-denominator result. np.divide only writes the
    # positions selected by `where`, so a division by zero is never evaluated:
    # no RuntimeWarning for arrays and no ZeroDivisionError for Python scalars.
    out = np.zeros(num.shape, dtype=float)
    np.divide(num, den, out=out, where=den != 0)
    return out
def compute_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the engineered feature frame from a raw application table.

    Works on a copy; the input frame is never modified. Steps, in order:

    1. Replace the value 365243 in ``DAYS_EMPLOYED`` with NaN.
    2. Drop a fixed list of ``*_MODE`` / ``*_MEDI`` columns when present.
    3. Add ``<col>_ISNA`` 0/1 flags for a fixed list of columns when present.
    4. Add ratio features derived from the required columns.
    5. Fill missing numeric values with each column's median.
    6. Fill missing object/category values with each column's most frequent
       value, or ``"Unknown"`` when the column has no non-missing value.

    Args:
        df: Input frame. Must contain ``CNT_CHILDREN``, ``CNT_FAM_MEMBERS``,
            ``AMT_INCOME_TOTAL``, ``DAYS_BIRTH``, ``DAYS_EMPLOYED``,
            ``AMT_CREDIT`` and ``AMT_ANNUITY``.

    Returns:
        A new DataFrame with the columns added, dropped and imputed as above.

    Raises:
        KeyError: If any required column is missing (all are listed).
    """
    required = (
        'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH',
        'DAYS_EMPLOYED', 'AMT_CREDIT', 'AMT_ANNUITY',
    )
    # Fail early with every missing column named, instead of a KeyError for
    # whichever column happens to be indexed first further down.
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(
            f"compute_features: missing required columns: {missing}"
        )

    df = df.copy()

    # 365243 is presumably a placeholder for "no employment date" -- confirm
    # against the data dictionary. It is treated as missing from here on.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    to_drop = [
        'COMMONAREA_MODE', 'COMMONAREA_MEDI',
        'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_MEDI',
        'LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_MEDI',
        'FLOORSMIN_MODE', 'FLOORSMIN_MEDI',
        'YEARS_BUILD_MODE', 'YEARS_BUILD_MEDI',
        'LANDAREA_MODE', 'LANDAREA_MEDI',
        'BASEMENTAREA_MODE', 'BASEMENTAREA_MEDI',
        'ELEVATORS_MODE', 'ELEVATORS_MEDI',
    ]
    # errors='ignore' skips names that are absent from the frame.
    df = df.drop(columns=to_drop, errors='ignore')

    isna_cols = [
        'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OWN_CAR_AGE',
        'COMMONAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'LIVINGAPARTMENTS_AVG',
        'FLOORSMIN_AVG', 'YEARS_BUILD_AVG', 'LANDAREA_AVG', 'BASEMENTAREA_AVG',
        'NONLIVINGAREA_AVG', 'ELEVATORS_AVG', 'FONDKAPREMONT_MODE',
    ]
    # Flags are created before imputation so they record the original gaps.
    for c in isna_cols:
        if c in df.columns:
            df[c + "_ISNA"] = df[c].isna().astype(int)

    # Family-size ratios: safe_div yields 0 where CNT_FAM_MEMBERS is 0.
    df['CHILDREN_RATIO'] = safe_div(df['CNT_CHILDREN'], df['CNT_FAM_MEMBERS'])
    df['INCOME_PER_PERSON'] = safe_div(df['AMT_INCOME_TOTAL'], df['CNT_FAM_MEMBERS'])
    # assumes DAYS_BIRTH counts days as a negative number -- TODO confirm
    df['AGE'] = -df['DAYS_BIRTH'] / 365.25
    df['AGE_PER_MEMBER'] = safe_div(df['AGE'], df['CNT_FAM_MEMBERS'])

    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    # A zero denominator above yields +/-inf, which fillna() does not treat as
    # missing. Convert it to NaN so the median imputation below covers it.
    ratio_cols = [
        'DAYS_EMPLOYED_PERC', 'INCOME_CREDIT_PERC',
        'ANNUITY_INCOME_PERC', 'PAYMENT_RATE',
    ]
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)

    # NOTE(review): medians and modes come from the frame passed in, so a
    # numeric column that is entirely NaN has a NaN median and stays NaN.
    num_cols = df.select_dtypes(include=['number']).columns.tolist()
    if num_cols:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    for c in cat_cols:
        col = df[c]
        # mode(dropna=True) is empty exactly when the column has no
        # non-missing value, so one check covers both cases.
        modes = col.mode(dropna=True)
        fill_val = modes.iloc[0] if not modes.empty else "Unknown"
        # A categorical column rejects fill values outside its categories,
        # so register the fallback before filling.
        if (
            isinstance(col.dtype, pd.CategoricalDtype)
            and fill_val not in col.cat.categories
        ):
            col = col.cat.add_categories([fill_val])
        df[c] = col.fillna(fill_val)

    return df