import os
import joblib
import pandas as pd
import numpy as np
from feature_builder import prepare_input_features
# --------------------------------
# PATHS
# --------------------------------
# Resolve all artifact paths relative to this file so the script works
# regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Ensure these match the new XGBoost models you trained
CLASSIFIER_PATH = os.path.join(BASE_DIR, "models", "emi_classifier_final.pkl")
REGRESSOR_PATH = os.path.join(BASE_DIR, "models", "emi_model_optimized.pkl")
LABEL_ENCODER_PATH = os.path.join(BASE_DIR, "models", "label_encoder.pkl")
# --------------------------------
# LOAD MODELS (ONCE)
# --------------------------------
# Loaded at import time so every call to predict_emi() reuses the same
# in-memory models instead of deserializing the pickles per request.
# NOTE(review): joblib.load on untrusted pickle files executes arbitrary
# code — only ship models you trained yourself.
classifier = joblib.load(CLASSIFIER_PATH)
regressor = joblib.load(REGRESSOR_PATH)
# We need the label encoder to know which index corresponds to "Eligible"
label_encoder = joblib.load(LABEL_ENCODER_PATH)
# --------------------------------
# PREDICTION FUNCTION
# --------------------------------
def predict_emi(raw_input: dict):
    """Predict EMI eligibility and the maximum affordable EMI for one applicant.

    Args:
        raw_input: Raw applicant fields; converted into the model feature
            frame by ``prepare_input_features`` (schema defined there).

    Returns:
        tuple[str, float]: ``(eligibility_label, max_emi)`` where the label
        is one of ``"Eligible"``, ``"High Risk"``, ``"Not Eligible"`` and
        ``max_emi`` is rounded to 2 decimals (forced to 0.0 when the final
        label is "Not Eligible").
    """
    # 1. FEATURE ENGINEERING
    input_df = prepare_input_features(raw_input)

    # 2. PROBABILITY-BASED CLASSIFICATION
    # label_encoder.classes_ is ordered by encoded index, so zipping it with
    # the probability vector yields the same class-name -> probability map as
    # calling inverse_transform once per class, without the per-class overhead.
    # Example: {'Eligible': 0.38, 'Not_Eligible': 0.62}
    probs = classifier.predict_proba(input_df)[0]
    prob_map = dict(zip(label_encoder.classes_, probs))

    # 3. THRESHOLD LOGIC (Anti-Bias)
    # Because of the 4.5:1 imbalance, the model is "shy" to predict Eligible.
    # We lower the requirement to 35% to give good candidates a fair chance.
    if prob_map.get("Eligible", 0) > 0.35:
        ml_label = "Eligible"
    elif prob_map.get("High_Risk", 0) > 0.40:
        ml_label = "High Risk"
    else:
        ml_label = "Not Eligible"

    # 4. HARD BUSINESS RULES (The "Banker's Veto")
    # These rules override the ML if the risk is objectively too high.
    credit_score = input_df["credit_score"].iloc[0]
    # debt_to_income may be absent from the engineered features; default to 0.
    dti = input_df.get("debt_to_income", pd.Series([0])).iloc[0]
    # Relaxed but safe rejection criteria
    is_hard_reject = credit_score < 400 or dti > 0.85

    # 5. FINAL ELIGIBILITY LOGIC
    eligibility_label = "Not Eligible" if is_hard_reject else ml_label

    # 6. REGRESSION PREDICTION (Amount)
    # Any "Not Eligible" outcome forces Max EMI to 0, so skip the regressor
    # entirely on that path — its prediction would be discarded anyway.
    if eligibility_label == "Not Eligible":
        max_emi = 0.0
    else:
        max_emi = max(float(regressor.predict(input_df)[0]), 0.0)  # safety clamp

    return eligibility_label, round(max_emi, 2)