File size: 3,687 Bytes
ede8054
 
 
d5d57ea
 
ede8054
 
 
 
 
 
d5d57ea
ede8054
 
 
 
 
d5d57ea
ede8054
 
d5d57ea
ede8054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict
from shutil import copyfile

import joblib
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download

# Hugging Face Hub repository that the serialized artifacts are fetched from.
REPO_ID = "Fola-lad/loan-artifacts"

# Local directory that fetched artifacts are copied into. The path is
# relative, so it resolves against the current working directory, and the
# directory is created as a side effect of importing this module.
ARTIFACT_DIR = Path("artifacts")
ARTIFACT_DIR.mkdir(exist_ok=True)

def _get_artifact(filename: str) -> Path:
    """Return a local path to ``filename``, fetching it from the Hub if needed.

    A copy that already exists under ``ARTIFACT_DIR`` is returned directly and
    the Hub is not contacted. Otherwise the file is downloaded from
    ``REPO_ID`` with ``hf_hub_download`` and copied into ``ARTIFACT_DIR``.

    Args:
        filename: Path of the artifact inside the Hub repository.

    Returns:
        Path of the artifact under ``ARTIFACT_DIR``.
    """
    dst = ARTIFACT_DIR / filename
    if dst.exists():
        # An existing local copy is never overwritten, so there is nothing
        # to gain from a Hub lookup (which could also fail when offline).
        return dst

    downloaded = hf_hub_download(repo_id=REPO_ID, filename=filename)
    # ``filename`` may contain sub-directories; make sure they exist locally.
    dst.parent.mkdir(parents=True, exist_ok=True)
    copyfile(downloaded, dst)
    return dst

# Fitted objects used by the prediction functions below. They are loaded at
# import time, so importing this module calls _get_artifact (and therefore
# may download from the Hub) once per artifact.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; this is only safe as long as the contents of REPO_ID are trusted.
missing_value_handler = joblib.load(_get_artifact("missing_value_handler.joblib"))
preprocessor = joblib.load(_get_artifact("preprocessor.joblib"))
model = joblib.load(_get_artifact("loan_model.joblib"))
label_encoder = joblib.load(_get_artifact("label_encoder.joblib"))

# Column names given to the output of missing_value_handler.transform when
# _prepare_features wraps it back into a DataFrame; the order here must match
# the order of the columns in that output.
CLEANED_FEATURE_COLS = [
    "Gender",
    "Married",
    "Dependents",
    "Education",
    "Self_Employed",
    "Property_Area",
    "ApplicantIncome",
    "CoapplicantIncome",
    "LoanAmount",
    "Loan_Amount_Term",
    "Credit_History",
]

# Columns (and their order) that _normalize_input enforces on raw input
# before it is handed to missing_value_handler: the identifier column
# followed by the feature columns.
EXPECTED_INPUT_COLS = ["Loan_ID"] + CLEANED_FEATURE_COLS

def _safe_log(series: pd.Series) -> np.ndarray:
    """Return the natural log of ``series`` with unusable entries mapped to 0.

    Entries that are missing, cannot be parsed as numbers, or are not
    strictly positive yield ``0.0`` (the log of 1); every other entry yields
    its natural logarithm.
    """
    numeric = pd.to_numeric(series, errors="coerce").fillna(0)
    values = numeric.to_numpy(dtype=float)

    # Start from zeros and only evaluate the log where the input is positive;
    # the untouched slots keep 0.0, which equals log(1.0).
    result = np.zeros_like(values)
    np.log(values, out=result, where=values > 0)
    return result

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Build the engineered feature columns from a cleaned input frame.

    Operates on a copy, so ``df`` is left untouched. ``Dependents`` is made
    numeric after mapping ``"3+"`` to ``"3"``; the two income columns are
    coerced to numbers and summed; ``LoanAmount_Log`` and
    ``Total_Income_Log`` are added using ``_safe_log``. The raw income and
    loan-amount columns, the income total and ``Loan_ID`` are dropped from
    the result when present.
    """
    out = df.copy()

    dependents = out["Dependents"].replace("3+", "3")
    out["Dependents"] = pd.to_numeric(dependents, errors="coerce")

    for income_col in ("ApplicantIncome", "CoapplicantIncome"):
        out[income_col] = pd.to_numeric(out[income_col], errors="coerce")

    total_income = out["ApplicantIncome"] + out["CoapplicantIncome"]
    out["LoanAmount_Log"] = _safe_log(out["LoanAmount"])
    out["Total_Income_Log"] = _safe_log(total_income)

    # "Total_Income" stays in the list so that a column of that name coming
    # in with the input is removed as well.
    superseded = [
        "ApplicantIncome",
        "CoapplicantIncome",
        "LoanAmount",
        "Total_Income",
        "Loan_ID",
    ]
    return out.drop(columns=superseded, errors="ignore")

def _normalize_input(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame holding exactly ``EXPECTED_INPUT_COLS``, in order.

    Columns missing from ``df`` are created and filled with NaN, columns not
    listed in ``EXPECTED_INPUT_COLS`` are dropped, and the numeric columns
    are coerced with ``pd.to_numeric(errors="coerce")`` so that unparseable
    entries become NaN. ``df`` itself is not modified.

    Args:
        df: Raw input rows; any subset or superset of the expected columns.

    Returns:
        A DataFrame with the same index as ``df`` and the expected columns.
    """
    # ``reindex`` adds the missing columns as NaN, drops extras and fixes the
    # order in one step. It returns an independent frame, so the assignments
    # below do not raise pandas' SettingWithCopyWarning the way assigning
    # into a ``df[cols]`` selection can.
    normalized = df.reindex(columns=EXPECTED_INPUT_COLS)

    numeric_cols = (
        "ApplicantIncome",
        "CoapplicantIncome",
        "LoanAmount",
        "Loan_Amount_Term",
        "Credit_History",
    )
    for col in numeric_cols:
        normalized[col] = pd.to_numeric(normalized[col], errors="coerce")

    return normalized

def _prepare_features(raw_df: pd.DataFrame):
    """Turn raw input rows into the value returned by ``preprocessor.transform``.

    The rows are normalised with ``_normalize_input``, passed through
    ``missing_value_handler``, wrapped back into a DataFrame labelled with
    ``CLEANED_FEATURE_COLS`` on the normalised index, joined with the
    ``Loan_ID`` column, run through ``feature_engineering`` and finally
    through ``preprocessor``.
    """
    normalized = _normalize_input(raw_df)

    cleaned = pd.DataFrame(
        missing_value_handler.transform(normalized),
        columns=CLEANED_FEATURE_COLS,
        index=normalized.index,
    )

    with_id = pd.concat([normalized[["Loan_ID"]], cleaned], axis=1)
    engineered = feature_engineering(with_id)
    return preprocessor.transform(engineered)

def predict_one(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Predict the loan status for a single application.

    Args:
        payload: Mapping of input column name to value for one row.

    Returns:
        A dict with ``"Loan_Status"`` (the decoded label as a string) and
        ``"confidence"`` (the largest ``predict_proba`` value for the row,
        as a float).
    """
    features = _prepare_features(pd.DataFrame([payload]))

    encoded = model.predict(features)
    row_proba = model.predict_proba(features)[0]
    decoded = label_encoder.inverse_transform(encoded)

    return {
        "Loan_Status": str(decoded[0]),
        "confidence": float(np.max(row_proba)),
    }

def predict_batch(df: pd.DataFrame) -> pd.DataFrame:
    """Predict the loan status for every row of ``df``.

    Args:
        df: Raw input rows.

    Returns:
        A copy of ``df`` with two columns set: ``"Loan_Status"`` (decoded
        labels) and ``"confidence"`` (the per-row maximum of
        ``predict_proba``, as floats). ``df`` itself is not modified.
    """
    features = _prepare_features(df)

    encoded = model.predict(features)
    confidences = model.predict_proba(features).max(axis=1)
    decoded = label_encoder.inverse_transform(encoded)

    # ``assign`` works on a copy and adds the columns in keyword order.
    return df.assign(
        Loan_Status=decoded,
        confidence=confidences.astype(float),
    )