from __future__ import annotations from pathlib import Path from typing import Any, Dict from shutil import copyfile import joblib import numpy as np import pandas as pd from huggingface_hub import hf_hub_download REPO_ID = "Fola-lad/loan-artifacts" ARTIFACT_DIR = Path("artifacts") ARTIFACT_DIR.mkdir(exist_ok=True) def _get_artifact(filename: str) -> Path: downloaded = hf_hub_download(repo_id=REPO_ID, filename=filename) dst = ARTIFACT_DIR / filename if not dst.exists(): copyfile(downloaded, dst) return dst missing_value_handler = joblib.load(_get_artifact("missing_value_handler.joblib")) preprocessor = joblib.load(_get_artifact("preprocessor.joblib")) model = joblib.load(_get_artifact("loan_model.joblib")) label_encoder = joblib.load(_get_artifact("label_encoder.joblib")) CLEANED_FEATURE_COLS = [ "Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area", "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History", ] EXPECTED_INPUT_COLS = ["Loan_ID"] + CLEANED_FEATURE_COLS def _safe_log(series: pd.Series) -> np.ndarray: v = pd.to_numeric(series, errors="coerce").fillna(0).to_numpy(dtype=float) v = np.where(v > 0, v, 1.0) return np.log(v) def feature_engineering(df: pd.DataFrame) -> pd.DataFrame: df = df.copy() df["Dependents"] = df["Dependents"].replace("3+", "3") df["Dependents"] = pd.to_numeric(df["Dependents"], errors="coerce") df["ApplicantIncome"] = pd.to_numeric(df["ApplicantIncome"], errors="coerce") df["CoapplicantIncome"] = pd.to_numeric(df["CoapplicantIncome"], errors="coerce") df["Total_Income"] = df["ApplicantIncome"] + df["CoapplicantIncome"] df["LoanAmount_Log"] = _safe_log(df["LoanAmount"]) df["Total_Income_Log"] = _safe_log(df["Total_Income"]) df = df.drop( columns=["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Total_Income", "Loan_ID"], errors="ignore", ) return df def _normalize_input(df: pd.DataFrame) -> pd.DataFrame: df = df.copy() for c in EXPECTED_INPUT_COLS: if c not in df.columns: df[c] = np.nan df = df[EXPECTED_INPUT_COLS] df["ApplicantIncome"] = pd.to_numeric(df["ApplicantIncome"], errors="coerce") df["CoapplicantIncome"] = pd.to_numeric(df["CoapplicantIncome"], errors="coerce") df["LoanAmount"] = pd.to_numeric(df["LoanAmount"], errors="coerce") df["Loan_Amount_Term"] = pd.to_numeric(df["Loan_Amount_Term"], errors="coerce") df["Credit_History"] = pd.to_numeric(df["Credit_History"], errors="coerce") return df def _prepare_features(raw_df: pd.DataFrame): raw_df = _normalize_input(raw_df) cleaned_arr = missing_value_handler.transform(raw_df) cleaned_df = pd.DataFrame(cleaned_arr, columns=CLEANED_FEATURE_COLS, index=raw_df.index) fe_input = pd.concat([raw_df[["Loan_ID"]], cleaned_df], axis=1) fe_df = feature_engineering(fe_input) return preprocessor.transform(fe_df) def predict_one(payload: Dict[str, Any]) -> Dict[str, Any]: df = pd.DataFrame([payload]) X = _prepare_features(df) pred = model.predict(X) proba = model.predict_proba(X)[0] label = label_encoder.inverse_transform(pred)[0] return {"Loan_Status": str(label), "confidence": float(np.max(proba))} def predict_batch(df: pd.DataFrame) -> pd.DataFrame: X = _prepare_features(df) preds = model.predict(X) confs = model.predict_proba(X).max(axis=1) labels = label_encoder.inverse_transform(preds) out = df.copy() out["Loan_Status"] = labels out["confidence"] = confs.astype(float) return out