Spaces:
Sleeping
Sleeping
File size: 2,746 Bytes
0c661f9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | import numpy as np
import pandas as pd
def clean_and_engineer(df: pd.DataFrame) -> pd.DataFrame:
    """Clean loan records and derive ratio features.

    Every step is conditional on the columns it needs being present, so the
    function accepts frames that carry only a subset of the raw fields.

    Steps, in order:

    1. Correct the known ``'Personaal'`` typo in ``loan_purpose``.
    2. Fill missing ``residence_type`` values with the column's most
       frequent value (left untouched when every value is missing).
    3. Apply business-rule row filters relative to ``loan_amount``:
       ``processing_fee`` below 3%, ``gst`` below 20%, and
       ``net_disbursement`` not exceeding the loan amount. Rows whose
       comparison is not satisfied (including NaN results) are dropped.
    4. Derive ``loan_to_income``, ``delinquency_ratio`` (a percentage) and
       ``avg_dpd_per_delinquency``. A zero denominator yields 0. When the
       inputs for a feature are absent, an already-present feature column
       is kept; otherwise the feature is set to 0.
    5. Drop identifier, date and raw-amount columns that are present.

    Args:
        df: Input records. The caller's frame is not modified.

    Returns:
        A new DataFrame with filtered rows, the engineered feature columns,
        and the raw columns listed in step 5 removed.
    """
    df = df.copy()

    # -------------------------
    # Fix known data issues
    # -------------------------
    if 'loan_purpose' in df.columns:
        df['loan_purpose'] = df['loan_purpose'].replace('Personaal', 'Personal')

    # -------------------------
    # Handle missing residence_type
    # -------------------------
    if 'residence_type' in df.columns and df['residence_type'].isna().any():
        modes = df['residence_type'].mode()
        # mode() ignores missing values, so it is empty when the whole column
        # is missing; there is nothing sensible to impute in that case.
        if not modes.empty:
            df['residence_type'] = df['residence_type'].fillna(modes.iloc[0])

    # -------------------------
    # Business rules (only if columns exist)
    # -------------------------
    if 'loan_amount' in df.columns:
        if 'processing_fee' in df.columns:
            df = df[(df['processing_fee'] / df['loan_amount']) < 0.03]
        if 'gst' in df.columns:
            df = df[(df['gst'] / df['loan_amount']) < 0.20]
        if 'net_disbursement' in df.columns:
            df = df[df['net_disbursement'] <= df['loan_amount']]

    # -------------------------
    # Feature Engineering
    # -------------------------
    if 'loan_amount' in df.columns and 'income' in df.columns:
        ratio = (df['loan_amount'] / df['income']).round(2)
        # A zero income would give inf (or NaN for 0/0); use 0 instead, the
        # same zero-denominator convention as the other ratio features.
        df['loan_to_income'] = ratio.where(df['income'] != 0, 0)
    elif 'loan_to_income' not in df.columns:
        df['loan_to_income'] = 0

    if 'total_loan_months' in df.columns and 'delinquent_months' in df.columns:
        pct_delinquent = (
            (df['delinquent_months'] * 100) / df['total_loan_months']
        ).round(1)
        df['delinquency_ratio'] = np.where(
            df['total_loan_months'] > 0, pct_delinquent, 0
        )
    elif 'delinquency_ratio' not in df.columns:
        df['delinquency_ratio'] = 0

    if 'total_dpd' in df.columns and 'delinquent_months' in df.columns:
        dpd_per_month = (df['total_dpd'] / df['delinquent_months']).round(1)
        df['avg_dpd_per_delinquency'] = np.where(
            df['delinquent_months'] > 0, dpd_per_month, 0
        )
    elif 'avg_dpd_per_delinquency' not in df.columns:
        df['avg_dpd_per_delinquency'] = 0

    # -------------------------
    # Drop columns if present (training only fields)
    # -------------------------
    drop_cols = [
        'cust_id', 'loan_id',
        'disbursal_date', 'installment_start_dt',
        'loan_amount', 'income',
        'total_loan_months', 'delinquent_months', 'total_dpd',
        'sanction_amount', 'processing_fee', 'gst', 'net_disbursement',
        'principal_outstanding',
    ]
    # errors='ignore' skips any listed column that is not in the frame.
    return df.drop(columns=drop_cols, errors='ignore')
|