import os
import joblib
import pandas as pd
import numpy as np
from feature_builder import prepare_input_features

# --------------------------------
# PATHS
# --------------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Ensure these match the new XGBoost models you trained
CLASSIFIER_PATH = os.path.join(BASE_DIR, "models", "emi_classifier_final.pkl")
REGRESSOR_PATH = os.path.join(BASE_DIR, "models", "emi_model_optimized.pkl")
LABEL_ENCODER_PATH = os.path.join(BASE_DIR, "models", "label_encoder.pkl")

# --------------------------------
# DECISION THRESHOLDS
# --------------------------------
# Minimum class probability required to assign each label. The "Eligible"
# bar is deliberately below 0.5: per the original author's note, the training
# data has a ~4.5:1 class imbalance that makes the model reluctant to predict
# "Eligible".
ELIGIBLE_PROB_THRESHOLD = 0.35
HIGH_RISK_PROB_THRESHOLD = 0.40

# Hard business rules that override the ML decision.
MIN_CREDIT_SCORE = 400
MAX_DEBT_TO_INCOME = 0.85

# --------------------------------
# LOAD MODELS (ONCE)
# --------------------------------
# NOTE: joblib.load unpickles the files; only load model artifacts that come
# from a trusted source.
classifier = joblib.load(CLASSIFIER_PATH)
regressor = joblib.load(REGRESSOR_PATH)

# We need the label encoder to know which index corresponds to "Eligible"
label_encoder = joblib.load(LABEL_ENCODER_PATH)


# --------------------------------
# PREDICTION FUNCTION
# --------------------------------
def predict_emi(raw_input: dict) -> tuple:
    """Predict EMI eligibility and the maximum EMI for one applicant.

    The decision combines three stages: probability thresholds applied to
    the classifier output, a regression estimate of the EMI amount, and
    hard business rules that can veto the ML result.

    Args:
        raw_input: Raw applicant fields; passed unchanged to
            ``prepare_input_features``.

    Returns:
        A ``(eligibility_label, max_emi)`` tuple where ``eligibility_label``
        is one of ``"Eligible"``, ``"High Risk"`` or ``"Not Eligible"`` and
        ``max_emi`` is a non-negative float rounded to 2 decimals. It is
        always ``0.0`` when the label is ``"Not Eligible"``.

    Raises:
        KeyError: If the engineered features lack a ``credit_score`` column.
    """
    # 1. FEATURE ENGINEERING
    input_df = prepare_input_features(raw_input)

    # 2. PROBABILITY-BASED CLASSIFICATION
    # Probabilities for all classes of the single input row.
    probs = classifier.predict_proba(input_df)[0]

    # Map class name -> probability, e.g. {'Eligible': 0.38, ...}.
    # Assumes probability column i corresponds to label-encoder code i
    # (TODO confirm against how the classifier was trained). Decoding all
    # indices in one call avoids one inverse_transform call per class.
    class_names = label_encoder.inverse_transform(np.arange(len(probs)))
    prob_map = dict(zip(class_names, probs))

    # 3. THRESHOLD LOGIC (Anti-Bias)
    # "Eligible" is checked first, so it wins whenever it clears its
    # (lowered) threshold, regardless of the other class probabilities.
    eligible_prob = prob_map.get("Eligible", 0)
    high_risk_prob = prob_map.get("High_Risk", 0)

    if eligible_prob > ELIGIBLE_PROB_THRESHOLD:
        ml_label = "Eligible"
    elif high_risk_prob > HIGH_RISK_PROB_THRESHOLD:
        ml_label = "High Risk"
    else:
        ml_label = "Not Eligible"

    # 4. REGRESSION PREDICTION (Amount)
    max_emi = float(regressor.predict(input_df)[0])
    # Safety clamp. A plain max(max_emi, 0.0) lets NaN (and inf) through,
    # because comparisons against NaN are always False, so non-finite
    # predictions are explicitly treated as 0.0 as well.
    if not np.isfinite(max_emi) or max_emi < 0.0:
        max_emi = 0.0

    # 5. HARD BUSINESS RULES (The "Banker's Veto")
    # These rules override the ML if the risk is objectively too high.
    credit_score = input_df["credit_score"].iloc[0]
    # A missing debt_to_income column is treated as a ratio of 0.
    dti = input_df.get("debt_to_income", pd.Series([0])).iloc[0]

    # Relaxed but safe rejection criteria
    is_hard_reject = credit_score < MIN_CREDIT_SCORE or dti > MAX_DEBT_TO_INCOME

    # 6. FINAL ELIGIBILITY LOGIC
    # A veto or an ML "Not Eligible" both force Max EMI to 0 for consistency.
    if is_hard_reject or ml_label == "Not Eligible":
        return "Not Eligible", 0.0

    return ml_label, round(max_emi, 2)