Predictor-app / predictor.py
Fola-lad's picture
Add app + predictor with HF artifact download
d5d57ea
from __future__ import annotations
from pathlib import Path
from typing import Any, Dict
from shutil import copyfile
import joblib
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
# Hugging Face Hub repository that hosts the serialized pipeline artifacts;
# passed as ``repo_id`` to ``hf_hub_download`` in ``_get_artifact``.
REPO_ID = "Fola-lad/loan-artifacts"
# Local directory that receives a copy of each downloaded artifact. The path
# is relative, so it resolves against the process's current working directory.
ARTIFACT_DIR = Path("artifacts")
# Created at import time; ``exist_ok=True`` makes repeated imports harmless.
ARTIFACT_DIR.mkdir(exist_ok=True)
def _get_artifact(filename: str) -> Path:
    """Return the local path of an artifact, fetching it from the Hub if needed.

    Args:
        filename: Name of the file inside the ``REPO_ID`` repository. It is
            also used as the file's relative path under ``ARTIFACT_DIR``.

    Returns:
        Path to the artifact inside ``ARTIFACT_DIR``.

    Raises:
        Any exception raised by ``hf_hub_download`` when the file is not
        available locally and cannot be fetched, or ``OSError`` if the copy
        into ``ARTIFACT_DIR`` fails.
    """
    dst = ARTIFACT_DIR / filename
    if dst.exists():
        # An existing local copy was never overwritten, so the Hub lookup can
        # be skipped entirely. This keeps the module importable offline.
        return dst

    downloaded = hf_hub_download(repo_id=REPO_ID, filename=filename)
    # ``filename`` may contain sub-directories; make sure they exist locally.
    dst.parent.mkdir(parents=True, exist_ok=True)
    # Copy to a temporary sibling first and rename afterwards so that an
    # interrupted copy can never leave a truncated file at ``dst``.
    tmp = dst.with_name(dst.name + ".tmp")
    copyfile(downloaded, tmp)
    tmp.replace(dst)
    return dst
# Fetch and deserialize every pipeline artifact once, at import time, so the
# functions below can use them as module-level globals.
# NOTE(review): ``joblib.load`` unpickles its input, which can execute
# arbitrary code. This is only safe while ``REPO_ID`` is a trusted repository.
missing_value_handler = joblib.load(_get_artifact("missing_value_handler.joblib"))
preprocessor = joblib.load(_get_artifact("preprocessor.joblib"))
model = joblib.load(_get_artifact("loan_model.joblib"))
label_encoder = joblib.load(_get_artifact("label_encoder.joblib"))
# Feature columns, in the order used to label the array returned by
# ``missing_value_handler.transform`` (see ``_prepare_features``).
CLEANED_FEATURE_COLS = [
    "Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area",
    "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History",
]
# Raw input schema: the record identifier followed by every feature column.
EXPECTED_INPUT_COLS = ["Loan_ID", *CLEANED_FEATURE_COLS]
def _safe_log(series: pd.Series) -> np.ndarray:
v = pd.to_numeric(series, errors="coerce").fillna(0).to_numpy(dtype=float)
v = np.where(v > 0, v, 1.0)
return np.log(v)
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the model's engineered features from a cleaned frame.

    The input must contain ``Dependents``, ``ApplicantIncome``,
    ``CoapplicantIncome`` and ``LoanAmount``. The input frame is not
    modified; a new frame is returned in which:

    * ``Dependents`` is numeric, with the category ``"3+"`` read as 3;
    * ``LoanAmount_Log`` holds the safe log of ``LoanAmount``;
    * ``Total_Income_Log`` holds the safe log of applicant plus
      co-applicant income;
    * the raw income / loan columns and ``Loan_ID`` are removed.
    """
    dependents = pd.to_numeric(df["Dependents"].replace("3+", "3"), errors="coerce")
    applicant_income = pd.to_numeric(df["ApplicantIncome"], errors="coerce")
    coapplicant_income = pd.to_numeric(df["CoapplicantIncome"], errors="coerce")

    # ``assign`` works on a copy: ``Dependents`` is replaced in place and the
    # two log features are appended in the order given here.
    engineered = df.assign(
        Dependents=dependents,
        LoanAmount_Log=_safe_log(df["LoanAmount"]),
        Total_Income_Log=_safe_log(applicant_income + coapplicant_income),
    )

    # ``errors="ignore"`` tolerates inputs that lack some of these columns
    # (e.g. a frame without ``Loan_ID`` or ``Total_Income``).
    obsolete = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Total_Income", "Loan_ID"]
    return engineered.drop(columns=obsolete, errors="ignore")
def _normalize_input(df: pd.DataFrame) -> pd.DataFrame:
    """Coerce a raw input frame to the ``EXPECTED_INPUT_COLS`` schema.

    Columns absent from *df* are added as all-NaN, extra columns are
    discarded, the result is ordered like ``EXPECTED_INPUT_COLS``, and the
    numeric fields are parsed with unparseable values turned into NaN.
    The caller's frame is left untouched.
    """
    absent = {name: np.nan for name in EXPECTED_INPUT_COLS if name not in df.columns}
    # ``assign`` copies the frame before adding the placeholder columns.
    normalized = df.assign(**absent)[EXPECTED_INPUT_COLS]

    numeric_fields = (
        "ApplicantIncome",
        "CoapplicantIncome",
        "LoanAmount",
        "Loan_Amount_Term",
        "Credit_History",
    )
    for field in numeric_fields:
        normalized[field] = pd.to_numeric(normalized[field], errors="coerce")
    return normalized
def _prepare_features(raw_df: pd.DataFrame):
    """Run raw input through the full pre-model pipeline.

    Steps: schema normalization, ``missing_value_handler.transform``,
    feature engineering, then ``preprocessor.transform``. The return value
    is whatever ``preprocessor.transform`` produces and is fed directly to
    ``model`` by the prediction functions.
    """
    normalized = _normalize_input(raw_df)

    # The handler's output is treated as a plain array; its columns are
    # labelled with CLEANED_FEATURE_COLS and the original row index restored.
    # NOTE(review): presumably the handler was fitted with "Loan_ID" passed
    # through/dropped so that it emits exactly these columns; confirm.
    imputed = pd.DataFrame(
        missing_value_handler.transform(normalized),
        columns=CLEANED_FEATURE_COLS,
        index=normalized.index,
    )

    # Re-attach the identifier so the frame matches the raw schema again;
    # ``feature_engineering`` drops it before the preprocessor runs.
    with_identifier = pd.concat([normalized[["Loan_ID"]], imputed], axis=1)
    return preprocessor.transform(feature_engineering(with_identifier))
def predict_one(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Score a single application given as a mapping of column name to value.

    Returns:
        A dict with ``"Loan_Status"`` (the decoded predicted label, as a
        string) and ``"confidence"`` (the highest class probability reported
        by the model, as a float).
    """
    # Wrap the payload in a one-row frame so it follows the batch pipeline.
    features = _prepare_features(pd.DataFrame([payload]))
    predicted = model.predict(features)
    class_probabilities = model.predict_proba(features)[0]
    decoded = label_encoder.inverse_transform(predicted)
    return {
        "Loan_Status": str(decoded[0]),
        "confidence": float(np.max(class_probabilities)),
    }
def predict_batch(df: pd.DataFrame) -> pd.DataFrame:
    """Score every row of *df* and return a copy with the results appended.

    Args:
        df: Raw applications, one per row. Missing expected columns are
            filled with NaN by the preparation pipeline.

    Returns:
        A copy of *df* with two extra columns: ``Loan_Status`` (decoded
        predicted label) and ``confidence`` (highest class probability per
        row, as float). The input frame is not modified. A frame with zero
        rows yields a zero-row result that still carries both columns.
    """
    out = df.copy()
    if len(df) == 0:
        # Nothing to score. The downstream transformers and model are not
        # guaranteed to accept zero-row input, so return the empty result
        # shape directly instead of letting them raise.
        out["Loan_Status"] = np.array([], dtype=object)
        out["confidence"] = np.array([], dtype=float)
        return out

    X = _prepare_features(df)
    preds = model.predict(X)
    # Row-wise maximum over the class-probability matrix.
    confs = model.predict_proba(X).max(axis=1)
    out["Loan_Status"] = label_encoder.inverse_transform(preds)
    out["confidence"] = confs.astype(float)
    return out