Spaces:
Sleeping
Sleeping
Gutema-1990 committed on
Commit ·
97b9b33
1
Parent(s): f5f4d12
the model artifact is removed to be deployed separately
Browse files- .dockerignore +5 -0
- .gitignore copy +3 -0
- Dockerfile +22 -0
- README copy.md +1 -0
- api/app.py +337 -0
- api/model/explain_meta.json +92 -0
- api/model/xgboost_booster.json +0 -0
- api/model_training/train_model.py +364 -0
- requirements.txt +15 -0
- test.py +140 -0
- test_data.json +243 -0
.dockerignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fly.toml
|
| 2 |
+
.git/
|
| 3 |
+
__pycache__/
|
| 4 |
+
.envrc
|
| 5 |
+
.venv/
|
.gitignore copy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
catboost_info/
|
| 2 |
+
api/__pycache__/
|
| 3 |
+
api/catboost_info/
|
Dockerfile
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- Build stage: install dependencies into a self-contained virtualenv ----
FROM python:3.12.12 AS builder

ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1
WORKDIR /app

RUN python -m venv .venv
COPY requirements.txt ./
# --no-cache-dir keeps the pip layer (and therefore the final image) smaller.
RUN .venv/bin/pip install --no-cache-dir -r requirements.txt

# ---- Runtime stage: slim base + native libs + the prebuilt venv + app code ----
FROM python:3.12.12-slim
# ENV does not carry across stages; re-declare for the runtime image so
# uvicorn logs stream unbuffered and no .pyc files are written.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1
WORKDIR /app

# Native libs required by xgboost / scikit-learn wheels
RUN apt-get update \
    && apt-get install -y --no-install-recommends libgomp1 libopenblas0-pthread \
    && rm -rf /var/lib/apt/lists/*

COPY --from=builder /app/.venv .venv/
COPY . .
# Document the port the app listens on (matches the CMD below).
EXPOSE 7860
CMD ["/app/.venv/bin/uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "7860"]
README copy.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# frankscore-deployment
|
api/app.py
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Dict, List, Optional

import joblib
import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import xgboost as xgb
# Compatibility shim for pickles created with newer sklearn that include _RemainderColsList
import sklearn.compose._column_transformer as _ct  # type: ignore
if not hasattr(_ct, "_RemainderColsList"):
    # Minimal stand-in so joblib.load can resolve the class when unpickling
    # a ColumnTransformer saved by a newer scikit-learn version.
    class _RemainderColsList(list):  # type: ignore
        pass
    _ct._RemainderColsList = _RemainderColsList

# Repository root (one level above the api/ package).
ROOT = Path(__file__).resolve().parents[1]
# Model artifacts live next to this file under api/model/.
MODEL_DIR = Path(__file__).resolve().parent / "model"
MODEL_PATH = MODEL_DIR / "xgboost_pipeline.pkl"      # full sklearn pipeline (preprocess + model)
BOOSTER_PATH = MODEL_DIR / "xgboost_booster.json"    # raw booster, used for SHAP contributions
META_PATH = MODEL_DIR / "explain_meta.json"          # feature-name metadata for explanations

# Fail fast at import time if any artifact is missing — the service is
# useless without them and a late failure would be harder to diagnose.
if not MODEL_PATH.exists():
    raise FileNotFoundError(f"Model file missing at {MODEL_PATH}")
if not META_PATH.exists():
    raise FileNotFoundError(f"Explainability meta missing at {META_PATH}")
if not BOOSTER_PATH.exists():
    raise FileNotFoundError(f"Booster file missing at {BOOSTER_PATH}")

PIPELINE = joblib.load(MODEL_PATH)
META = json.loads(META_PATH.read_text())

# Raw input column order the pipeline was fitted on.
EXPECTED_FEATURES: List[str] = list(getattr(PIPELINE, "feature_names_in_", []))
# The ColumnTransformer step; needed to transform frames for the raw booster.
PREPROCESS = PIPELINE.named_steps.get("preprocess") if hasattr(PIPELINE, "named_steps") else None
if PREPROCESS is None:
    raise RuntimeError("Pipeline missing 'preprocess' step; cannot infer columns.")

# Fall back to the preprocessor's recorded input names if the pipeline
# itself does not expose feature_names_in_.
if not EXPECTED_FEATURES:
    EXPECTED_FEATURES = list(getattr(PREPROCESS, "feature_names_in_", []))
if not EXPECTED_FEATURES:
    raise RuntimeError("Unable to determine expected feature names from the pipeline.")

# Map transformer name ("num"/"cat") -> raw columns it consumes.
_col_map = {name: cols for name, _, cols in getattr(PREPROCESS, "transformers_", [])}
NUM_FEATURES: List[str] = list(_col_map.get("num", []))
CAT_FEATURES: List[str] = list(_col_map.get("cat", []))
# Post-transform (one-hot expanded) feature names; prefer the metadata file,
# fall back to asking the fitted preprocessor.
PRE_FEATURE_NAMES = META.get("pre_feature_names") or list(getattr(PREPROCESS, "get_feature_names_out", lambda: [])())
# All raw column names, used to map transformed names back to raw features.
RAW_FEATURE_SET = set((META.get("raw_num_cols") or []) + (META.get("raw_cat_cols") or []))
# Business-level grouping of raw features, used to aggregate SHAP values
# into explanations a non-technical reader can follow.
FEATURE_GROUPS = {
    "Borrowing History & Maturity": [
        "account_age_days",
        "avg_past_amount",
        "avg_past_daily_burden",
        "avg_time_bw_loans",
        "borrower_history_strength",
        "days_since_last_loan",
        "loan_frequency_per_year",
        "num_previous_loans",
        "std_past_amount",
        "std_past_daily_burden",
        "trend_in_amount",
        "trend_in_burden",
    ],
    "Repayment Speed & Delinquency": [
        "num_previous_defaults",
        "past_default_rate",
        "repayment_consistency",
    ],
    "Current Loan Size, Pricing & Burden": [
        "Total_Amount",
        "Total_Amount_to_Repay",
        "amount_bucket",
        "burden_percentile",
        "daily_burden",
        "duration",
        "duration_bucket",
        "interest_rate",
    ],
    "Affordability & Risk Ratios": [
        "amount_ratio",
        "burden_ratio",
        "repayment_intensity",
    ],
    "Seasonality & Timing": [
        "days_to_local_festival",
        "days_to_salary_day",
        "month",
        "quarter",
        "week_of_year",
    ],
    "Operational, Referral & Lender Signals": [
        "lender_exposure_ratio",
        "lender_id",
        "lender_risk_profile",
    ],
    "Time-based Trends & Volatility": [
        "latest_amount_ma3",
    ],
}
# Inverted index: raw feature name -> group label.
FEATURE_GROUP_LOOKUP: Dict[str, str] = {}
for group, variables in FEATURE_GROUPS.items():
    for var in variables:
        FEATURE_GROUP_LOOKUP[var] = group

app = FastAPI(title="FrankScore", version="1.0.0")
class PredictionRequest(BaseModel):
    """Request body for POST /predict."""
    # Each record maps raw feature name -> value; missing features become NaN.
    records: List[Dict[str, Any]] = Field(..., description="List of borrower feature dictionaries")


class PredictionResponse(BaseModel):
    """Response for POST /predict: one probability of default per record."""
    probabilities: List[float]


class ScoreRequest(BaseModel):
    """Request body for POST /score."""
    probabilities: List[float] = Field(..., description="Probabilities of default (0-1)")


class ScoreResponse(BaseModel):
    """Response for POST /score: 0-100 scores derived from the probabilities."""
    scores: List[float]


class ExplainRequest(BaseModel):
    """Request body for POST /explain and POST /predict_explain."""
    records: List[Dict[str, Any]]
    top_k: Optional[int] = Field(default=10, ge=1, le=100, description="Number of top features to return per record")


class FeatureContribution(BaseModel):
    """A single transformed feature and its SHAP contribution."""
    feature: str
    shap_value: float


class GroupContribution(BaseModel):
    """SHAP contributions aggregated over one business-level feature group."""
    group: str
    total_shap_value: float
    features: List[FeatureContribution]


class ExplainItem(BaseModel):
    """Explanation for one record: probability plus grouped SHAP breakdown."""
    probability: float
    base_value: float
    group_contributions: List[GroupContribution]


class ExplainResponse(BaseModel):
    """Response for POST /explain."""
    explanations: List[ExplainItem]


class PredictExplainItem(BaseModel):
    """Combined prediction and explanation for one record."""
    probability: float
    score: float
    base_value: float
    group_contributions: List[GroupContribution]


class PredictExplainResponse(BaseModel):
    """Response for POST /predict_explain."""
    results: List[PredictExplainItem]
def prepare_frame(records: List[Dict[str, Any]]) -> pd.DataFrame:
    """Turn raw request records into a DataFrame the pipeline can consume.

    Missing expected columns are added as NaN, columns are put in training
    order, numeric columns are coerced and categoricals forced to object
    dtype. Raises HTTP 400 when the record list is empty.
    """
    if not records:
        raise HTTPException(status_code=400, detail="No records provided.")
    frame = pd.DataFrame(records)
    # Ensure every column the model was trained on is present.
    for feature in EXPECTED_FEATURES:
        if feature not in frame.columns:
            frame[feature] = np.nan
    # Reorder (and drop extras) to match the training layout exactly.
    frame = frame[EXPECTED_FEATURES]
    if NUM_FEATURES:
        frame[NUM_FEATURES] = frame[NUM_FEATURES].apply(pd.to_numeric, errors="coerce")
    if CAT_FEATURES:
        frame[CAT_FEATURES] = frame[CAT_FEATURES].astype("object")
    return frame
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def pd_to_score(p: np.ndarray, base_score: float = 50, base_odds: float = 9, pdo: float = 20) -> np.ndarray:
    """Map probabilities of default onto a 0-100 score via log-odds scaling.

    Uses the classic points-to-double-odds formulation: a borrower with odds
    of *base_odds* gets *base_score* points, and every *pdo* points doubles
    the odds. Probabilities are clipped away from 0/1 to keep log() finite,
    and the resulting score is clipped to [0, 100].
    """
    clipped = np.clip(p, 1e-6, 1 - 1e-6)
    slope = pdo / np.log(2)
    offset = base_score - slope * np.log(base_odds)
    good_bad_odds = (1 - clipped) / clipped
    raw_score = offset + slope * np.log(good_bad_odds)
    return np.clip(raw_score, 0, 100)
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _sanitize_feature_name(name: str) -> str:
|
| 188 |
+
sanitized = name
|
| 189 |
+
for ch, repl in {"[": "", "]": "", "<": "lt", ">": "gt", " ": "_", ",": "_", "=": "_"}.items():
|
| 190 |
+
sanitized = sanitized.replace(ch, repl)
|
| 191 |
+
return sanitized
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def _base_feature_name(name: str) -> str:
    """Map a transformed column name back to its raw feature name.

    Drops the ColumnTransformer prefix ('num__'/'cat__'), then — for one-hot
    columns like 'amount_bucket_q1' — strips trailing '_'-separated tokens
    until a name in RAW_FEATURE_SET is found. Falls back to the stripped
    stem if no raw feature matches.
    """
    stem = name.split("__", 1)[1] if "__" in name else name
    if stem in RAW_FEATURE_SET:
        return stem
    tokens = stem.split("_")
    # Try progressively shorter prefixes: a_b_c -> a_b -> a
    for end in range(len(tokens) - 1, 0, -1):
        candidate = "_".join(tokens[:end])
        if candidate in RAW_FEATURE_SET:
            return candidate
    return stem
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def get_booster():
    """Lazily load and cache the raw XGBoost booster used for SHAP output.

    The booster is loaded once from BOOSTER_PATH and memoised on the function
    object. Some xgboost versions persist ``base_score`` as a bracketed
    string (e.g. "[0.5]") that later calls reject, so the attribute is
    normalised to a plain float string — but only when it actually fails to
    parse (the original code could reference the cleaned value even when no
    cleaning had happened).

    Returns:
        xgb.Booster: the cached booster instance.
    """
    if not hasattr(get_booster, "_booster"):
        booster = xgb.Booster()
        booster.load_model(str(BOOSTER_PATH))
        raw = booster.attr("base_score")
        if raw:
            try:
                float(raw)  # already a clean float string -> nothing to fix
            except ValueError:
                # Strip wrapping brackets ("[0.5]" -> "0.5"); fall back to
                # the xgboost default of 0.5 if still unparseable.
                try:
                    cleaned = str(float(raw.strip("[]")))
                except Exception:
                    cleaned = "0.5"
                booster.set_param({"base_score": cleaned})
                booster.set_attr(base_score=cleaned)
        get_booster._booster = booster
    return get_booster._booster
| 227 |
+
|
| 228 |
+
|
| 229 |
+
@app.post("/predict", response_model=PredictionResponse)
def predict(req: PredictionRequest) -> PredictionResponse:
    """Return the probability of default (positive class) for each record."""
    features = prepare_frame(req.records)
    positive_probs = PIPELINE.predict_proba(features)[:, 1]
    return PredictionResponse(probabilities=positive_probs.tolist())
| 234 |
+
|
| 235 |
+
|
| 236 |
+
@app.get("/health")
def health() -> Dict[str, str]:
    """Liveness probe: reports service status and the model artifact path."""
    payload = {"status": "ok"}
    payload["model_path"] = str(MODEL_PATH)
    return payload
| 239 |
+
|
| 240 |
+
|
| 241 |
+
@app.post("/score", response_model=ScoreResponse)
def score(req: ScoreRequest) -> ScoreResponse:
    """Convert default probabilities into 0-100 credit scores.

    Raises HTTP 400 when the probability list is empty.
    """
    if not req.probabilities:
        raise HTTPException(status_code=400, detail="No probabilities provided.")
    probs = np.asarray(req.probabilities, dtype=float)
    mapped = pd_to_score(probs)
    return ScoreResponse(scores=mapped.tolist())
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def _grouped_contributions(records: List[Dict[str, Any]], top_k: Optional[int]):
    """Shared core of /explain and /predict_explain.

    Runs the full pipeline for probabilities, then the raw booster with
    ``pred_contribs=True`` for per-feature SHAP values, and aggregates the
    contributions by business feature group.

    Returns:
        (probas, base_vals, per_row_groups) where ``per_row_groups[i]`` is
        the List[GroupContribution] for record i, groups and their features
        sorted by descending absolute SHAP value, features truncated to
        ``top_k`` per group when given.

    Raises:
        HTTPException: 500 if the booster's contribution matrix does not
        have one extra column (the bias/base value).
    """
    frame = prepare_frame(records)
    probas = PIPELINE.predict_proba(frame)[:, 1]
    booster = get_booster()
    X_proc = PREPROCESS.transform(frame)
    # Prefer the names recorded at training time; otherwise fall back to
    # positional placeholders so DMatrix construction still succeeds.
    feat_names = (
        np.array(PRE_FEATURE_NAMES)
        if PRE_FEATURE_NAMES
        else np.array([f"f{i}" for i in range(X_proc.shape[1])])
    )
    sanitized_names = [_sanitize_feature_name(n) for n in feat_names]
    dmat = xgb.DMatrix(X_proc, feature_names=sanitized_names)
    contribs = booster.predict(dmat, pred_contribs=True)
    # pred_contribs returns one column per feature plus a trailing bias column.
    if contribs.shape[1] != X_proc.shape[1] + 1:
        raise HTTPException(status_code=500, detail="Unexpected contribution shape from booster.")
    base_vals = contribs[:, -1]
    feat_contribs = contribs[:, :-1]
    per_row_groups: List[List[GroupContribution]] = []
    for row_vals in feat_contribs:
        group_totals: Dict[str, float] = {}
        group_details: Dict[str, List[FeatureContribution]] = {}
        for name, val in zip(feat_names, row_vals):
            base = _base_feature_name(str(name))
            # Features outside the curated groups land in "Other".
            group = FEATURE_GROUP_LOOKUP.get(base, "Other")
            group_totals[group] = group_totals.get(group, 0.0) + float(val)
            group_details.setdefault(group, []).append(
                FeatureContribution(feature=str(name), shap_value=float(val))
            )
        group_contribs: List[GroupContribution] = []
        for grp, total in sorted(group_totals.items(), key=lambda kv: abs(kv[1]), reverse=True):
            feats = sorted(group_details.get(grp, []), key=lambda fc: abs(fc.shap_value), reverse=True)
            if top_k:
                feats = feats[:top_k]
            group_contribs.append(GroupContribution(group=grp, total_shap_value=total, features=feats))
        per_row_groups.append(group_contribs)
    return probas, base_vals, per_row_groups


@app.post("/explain", response_model=ExplainResponse)
def explain(req: ExplainRequest) -> ExplainResponse:
    """Return per-group SHAP contribution breakdowns for each record."""
    if not req.records:
        raise HTTPException(status_code=400, detail="No records provided.")
    probas, base_vals, per_row_groups = _grouped_contributions(req.records, req.top_k)
    explanations = [
        ExplainItem(
            probability=float(probas[i]),
            base_value=float(base_vals[i]),
            group_contributions=groups,
        )
        for i, groups in enumerate(per_row_groups)
    ]
    return ExplainResponse(explanations=explanations)


@app.post("/predict_explain", response_model=PredictExplainResponse)
def predict_explain(req: ExplainRequest) -> PredictExplainResponse:
    """Predict probability, 0-100 score, and grouped SHAP explanation per record."""
    if not req.records:
        raise HTTPException(status_code=400, detail="No records provided.")
    probas, base_vals, per_row_groups = _grouped_contributions(req.records, req.top_k)
    # Vectorize the score mapping once instead of one pd_to_score call per row.
    scores = pd_to_score(np.asarray(probas, dtype=float))
    items = [
        PredictExplainItem(
            probability=float(probas[i]),
            score=int(round(float(scores[i]))),
            base_value=float(base_vals[i]),
            group_contributions=groups,
        )
        for i, groups in enumerate(per_row_groups)
    ]
    return PredictExplainResponse(results=items)
api/model/explain_meta.json
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dataset": "full",
|
| 3 |
+
"target_col": "target",
|
| 4 |
+
"raw_num_cols": [
|
| 5 |
+
"num_previous_loans",
|
| 6 |
+
"num_previous_defaults",
|
| 7 |
+
"past_default_rate",
|
| 8 |
+
"days_since_last_loan",
|
| 9 |
+
"avg_time_bw_loans",
|
| 10 |
+
"avg_past_amount",
|
| 11 |
+
"avg_past_daily_burden",
|
| 12 |
+
"std_past_amount",
|
| 13 |
+
"std_past_daily_burden",
|
| 14 |
+
"trend_in_amount",
|
| 15 |
+
"trend_in_burden",
|
| 16 |
+
"Total_Amount",
|
| 17 |
+
"Total_Amount_to_Repay",
|
| 18 |
+
"duration",
|
| 19 |
+
"daily_burden",
|
| 20 |
+
"amount_ratio",
|
| 21 |
+
"burden_ratio",
|
| 22 |
+
"burden_percentile",
|
| 23 |
+
"borrower_history_strength",
|
| 24 |
+
"month",
|
| 25 |
+
"quarter",
|
| 26 |
+
"week_of_year",
|
| 27 |
+
"days_to_salary_day",
|
| 28 |
+
"days_to_local_festival",
|
| 29 |
+
"lender_id",
|
| 30 |
+
"lender_exposure_ratio",
|
| 31 |
+
"account_age_days",
|
| 32 |
+
"loan_frequency_per_year",
|
| 33 |
+
"repayment_consistency",
|
| 34 |
+
"latest_amount_ma3"
|
| 35 |
+
],
|
| 36 |
+
"raw_cat_cols": [
|
| 37 |
+
"duration_bucket",
|
| 38 |
+
"amount_bucket"
|
| 39 |
+
],
|
| 40 |
+
"pre_feature_names": [
|
| 41 |
+
"num__num_previous_loans",
|
| 42 |
+
"num__num_previous_defaults",
|
| 43 |
+
"num__past_default_rate",
|
| 44 |
+
"num__days_since_last_loan",
|
| 45 |
+
"num__avg_time_bw_loans",
|
| 46 |
+
"num__avg_past_amount",
|
| 47 |
+
"num__avg_past_daily_burden",
|
| 48 |
+
"num__std_past_amount",
|
| 49 |
+
"num__std_past_daily_burden",
|
| 50 |
+
"num__trend_in_amount",
|
| 51 |
+
"num__trend_in_burden",
|
| 52 |
+
"num__Total_Amount",
|
| 53 |
+
"num__Total_Amount_to_Repay",
|
| 54 |
+
"num__duration",
|
| 55 |
+
"num__daily_burden",
|
| 56 |
+
"num__amount_ratio",
|
| 57 |
+
"num__burden_ratio",
|
| 58 |
+
"num__burden_percentile",
|
| 59 |
+
"num__borrower_history_strength",
|
| 60 |
+
"num__month",
|
| 61 |
+
"num__quarter",
|
| 62 |
+
"num__week_of_year",
|
| 63 |
+
"num__days_to_salary_day",
|
| 64 |
+
"num__days_to_local_festival",
|
| 65 |
+
"num__lender_id",
|
| 66 |
+
"num__lender_exposure_ratio",
|
| 67 |
+
"num__account_age_days",
|
| 68 |
+
"num__loan_frequency_per_year",
|
| 69 |
+
"num__repayment_consistency",
|
| 70 |
+
"num__latest_amount_ma3",
|
| 71 |
+
"cat__duration_bucket_<=1m",
|
| 72 |
+
"cat__duration_bucket_<=1w",
|
| 73 |
+
"cat__duration_bucket_<=2m",
|
| 74 |
+
"cat__duration_bucket_<=2w",
|
| 75 |
+
"cat__duration_bucket_>2m",
|
| 76 |
+
"cat__amount_bucket_q1",
|
| 77 |
+
"cat__amount_bucket_q2",
|
| 78 |
+
"cat__amount_bucket_q3",
|
| 79 |
+
"cat__amount_bucket_q4"
|
| 80 |
+
],
|
| 81 |
+
"id_cols": [
|
| 82 |
+
"customer_id",
|
| 83 |
+
"tbl_loan_id"
|
| 84 |
+
],
|
| 85 |
+
"dropped_features": [
|
| 86 |
+
"interest_rate",
|
| 87 |
+
"lender_risk_profile",
|
| 88 |
+
"pseudo_disb_date",
|
| 89 |
+
"repayment_intensity"
|
| 90 |
+
],
|
| 91 |
+
"split_used": "time_split(pseudo_disb_date)"
|
| 92 |
+
}
|
api/model/xgboost_booster.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
api/model_training/train_model.py
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib
matplotlib.use("Agg")  # Use non-GUI backend to avoid Tkinter cleanup warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
import xgboost as xgb


RANDOM_STATE = 42
# Repository root (two levels up from this file: code/model/train_models.py -> repo root)
ROOT = Path(__file__).resolve().parents[2]
# Directory containing the engineered-feature CSVs. Overridable via the
# FRANKSCORE_DATA_DIR environment variable so the script is not tied to one
# developer's machine; the default preserves the previous hard-coded path.
DATA_BASE = Path(
    os.environ.get(
        "FRANKSCORE_DATA_DIR",
        "/home/name-1/AI-Agent/frankscore/kenyan-dataset-issue/data/feature-generated",
    )
)

# Named datasets to train/evaluate on.
DATASETS: Dict[str, Path] = {
    "full": DATA_BASE / "kenya_engineered_features.csv",
    "borrower": DATA_BASE / "kenya_engineered_features_borrower_side.csv",
}
OUTPUT_DIR = ROOT / "code" / "model" / "outputs_for_demo"
TARGET_COL = "target"
# Identifier columns excluded from the feature matrix.
ID_COLS = ["customer_id", "tbl_loan_id"]
# Column-name candidates for group-aware and time-aware splitting.
GROUP_COL_CANDIDATES = ["customer_id", "customerId", "client_id"]
DATE_COL_CANDIDATES = ["pseudo_disb_date", "disb_date", "disbursement_date", "application_date", "loan_date"]
# Features deliberately excluded from training (leakage / unstable signals —
# NOTE(review): rationale not stated in source; confirm with model owner).
FEATURES_TO_DROP = {
    "interest_rate",
    "repayment_intensity",
    "lender_risk_profile",
    "pseudo_disb_date",
}
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def build_preprocessor(
    feature_frame: pd.DataFrame,
) -> Tuple[ColumnTransformer, List[str], List[str]]:
    """Create the num/cat ColumnTransformer for *feature_frame*.

    Object-dtype columns are treated as categorical (most-frequent impute +
    one-hot encode with unknowns ignored); everything else is numeric
    (median impute).

    Returns:
        (preprocessor, numeric_columns, categorical_columns)
    """
    categorical_cols = feature_frame.select_dtypes(include=["object"]).columns.tolist()
    numeric_cols = [col for col in feature_frame.columns if col not in categorical_cols]

    numeric_steps = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
    categorical_steps = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
    )
    transformer = ColumnTransformer(
        transformers=[
            ("num", numeric_steps, numeric_cols),
            ("cat", categorical_steps, categorical_cols),
        ]
    )
    return transformer, numeric_cols, categorical_cols
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def find_first_existing_col(df: pd.DataFrame, candidates: List[str]) -> str | None:
    """Return the first name in *candidates* that is a column of *df*, else None."""
    return next((col for col in candidates if col in df.columns), None)
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def split_data_leakage_safe(
    df: pd.DataFrame, X: pd.DataFrame, y: pd.Series
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, str]:
    """
    Split X/y into train/test while minimising leakage.

    Priority:
    1) time split if a date column exists
    2) group split on customer id
    3) stratified fallback

    Returns (X_train, X_test, y_train, y_test, split_tag) where split_tag
    names the strategy actually used.
    """
    date_col = find_first_existing_col(df, DATE_COL_CANDIDATES)
    group_col = find_first_existing_col(df, GROUP_COL_CANDIDATES)

    if date_col is not None:
        tmp = df[[date_col]].copy()
        tmp[date_col] = pd.to_datetime(tmp[date_col], errors="coerce")
        # Only trust the date column if >80% of values parse cleanly;
        # otherwise fall through to group/stratified splitting.
        if tmp[date_col].notna().mean() > 0.8:
            # Chronological 80/20: oldest rows train, newest rows test.
            order = tmp[date_col].sort_values().index
            cutoff = int(len(order) * 0.8)
            train_idx = order[:cutoff]
            test_idx = order[cutoff:]
            return (
                X.loc[train_idx],
                X.loc[test_idx],
                y.loc[train_idx],
                y.loc[test_idx],
                f"time_split({date_col})",
            )

    if group_col is not None:
        # Keep all loans of one customer on the same side of the split.
        groups = df[group_col]
        gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
        train_idx, test_idx = next(gss.split(X, y, groups=groups))
        return (
            X.iloc[train_idx],
            X.iloc[test_idx],
            y.iloc[train_idx],
            y.iloc[test_idx],
            f"group_split({group_col})",
        )

    # Last resort: plain stratified random split (no leakage protection).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )
    return X_train, X_test, y_train, y_test, "stratified_random_split"
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def get_models(scale_pos_weight: float) -> Dict[str, object]:
    """Build the candidate classifiers to compare.

    Args:
        scale_pos_weight: neg/pos ratio of the training labels, passed to
            XGBoost to counter class imbalance. The other models use their
            own class-weighting mechanisms (or none, for CatBoost here).

    Returns:
        Mapping of model name -> unfitted estimator.
    """
    # Using moderate defaults to keep runtime reasonable.
    return {
        "random_forest": RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            n_jobs=-1,
            class_weight="balanced",
            random_state=RANDOM_STATE,
        ),
        "xgboost": XGBClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
            random_state=RANDOM_STATE,
            scale_pos_weight=scale_pos_weight,
        ),
        "lightgbm": LGBMClassifier(
            n_estimators=400,
            learning_rate=0.05,
            max_depth=-1,  # no depth limit; LightGBM controls growth by leaves
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=RANDOM_STATE,
            n_jobs=-1,
            class_weight="balanced",
        ),
        "catboost": CatBoostClassifier(
            iterations=400,
            depth=8,
            learning_rate=0.05,
            loss_function="Logloss",
            eval_metric="AUC",
            verbose=0,  # suppress per-iteration training output
            random_seed=RANDOM_STATE,
        ),
    }
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def plot_roc(y_true: np.ndarray, y_score: np.ndarray, title: str, path: Path) -> None:
    """Save a ROC curve (with AUC in the legend) for the given scores to *path*."""
    false_pos, true_pos, _ = roc_curve(y_true, y_score)
    area = roc_auc_score(y_true, y_score)
    plt.figure()
    plt.plot(false_pos, true_pos, label=f"AUC = {area:.3f}")
    # Diagonal chance line for reference.
    plt.plot([0, 1], [0, 1], linestyle="--", color="grey")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def plot_pr(y_true: np.ndarray, y_score: np.ndarray, title: str, path: Path) -> None:
    """Render a precision-recall curve (average precision in the legend) to *path*."""
    ap = average_precision_score(y_true, y_score)
    prec, rec, _ = precision_recall_curve(y_true, y_score)
    plt.figure()
    plt.plot(rec, prec, label=f"AP = {ap:.3f}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(title)
    plt.legend(loc="lower left")
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def plot_confusion(y_true: np.ndarray, y_pred: np.ndarray, title: str, path: Path) -> None:
    """Draw a confusion-matrix heatmap for hard predictions and save it to *path*."""
    counts = confusion_matrix(y_true, y_pred)
    plt.figure()
    # Integer annotations, no colour bar: counts are what matter here.
    sns.heatmap(counts, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def evaluate_models(dataset_name: str, data_path: Path) -> None:
    """Train every candidate model on one dataset and persist all artifacts.

    For each model this fits a preprocess+model pipeline, computes metrics on a
    leakage-safe test split, and writes per-model artifacts (classification
    report, ROC/PR/confusion plots, pickled pipeline) plus dataset-level
    artifacts (explainability metadata, background sample, metrics summary,
    manifest) under ``OUTPUT_DIR / dataset_name``.

    Args:
        dataset_name: Label used for the output sub-directory and in metrics rows.
        data_path: CSV file containing features plus the ``TARGET_COL`` column.

    Raises:
        SystemExit: If ``TARGET_COL`` is missing from the CSV.
    """
    print(f"=== Training on {dataset_name} dataset ===")
    df = pd.read_csv(data_path)
    if TARGET_COL not in df.columns:
        raise SystemExit(f"target column missing in {data_path}")

    # Drop the target, identifier columns, and any explicitly excluded features.
    X = df.drop(columns=[TARGET_COL] + ID_COLS, errors="ignore")
    X = X.drop(columns=[c for c in FEATURES_TO_DROP if c in X.columns], errors="ignore")
    y = df[TARGET_COL]

    preprocessor, num_cols, cat_cols = build_preprocessor(X)

    X_train, X_test, y_train, y_test, split_tag = split_data_leakage_safe(df, X, y)
    print(f"Split used: {split_tag}")
    # Negative/positive ratio for XGBoost's imbalance handling; 1.0 guards
    # against division by zero when the training split has no positives.
    pos = y_train.sum()
    neg = len(y_train) - pos
    scale_pos_weight = float(neg / pos) if pos > 0 else 1.0

    models = get_models(scale_pos_weight)
    ds_out = OUTPUT_DIR / dataset_name
    ds_out.mkdir(parents=True, exist_ok=True)

    # Save a small background sample for downstream explainability tooling.
    background_path = ds_out / "explain_background.csv"
    df.sample(min(len(df), 200), random_state=RANDOM_STATE).to_csv(background_path, index=False)

    metrics_rows = []
    report_manifest = {}
    # Captured once from the first fitted pipeline; identical across models
    # because they all share the same preprocessor.
    pre_feature_names = None

    for model_name, model in models.items():
        print(f"Training {model_name}...")
        clf = Pipeline(steps=[("preprocess", preprocessor), ("model", model)])
        clf.fit(X_train, y_train)
        if pre_feature_names is None:
            pre_feature_names = clf.named_steps["preprocess"].get_feature_names_out().tolist()
        # Probability of the positive (default) class; hard labels at the 0.5 cut.
        probas = clf.predict_proba(X_test)[:, 1]
        preds = (probas >= 0.5).astype(int)

        metrics = {
            "dataset": dataset_name,
            "split": split_tag,
            "model": model_name,
            "auc_roc": roc_auc_score(y_test, probas),
            "auc_pr": average_precision_score(y_test, probas),
            "accuracy": accuracy_score(y_test, preds),
            "precision": precision_score(y_test, preds, zero_division=0),
            "recall": recall_score(y_test, preds, zero_division=0),
            "f1": f1_score(y_test, preds, zero_division=0),
        }
        metrics_rows.append(metrics)

        # Classification report
        cls_report = classification_report(
            y_test,
            preds,
            target_names=["non_default", "default"],
            digits=3,
            zero_division=0,
        )
        report_path = ds_out / f"classification_report_{model_name}.txt"
        report_path.write_text(cls_report)
        report_manifest[f"classification_report_{model_name}"] = str(report_path)

        # Plots
        roc_path = ds_out / f"roc_{model_name}.png"
        pr_path = ds_out / f"pr_{model_name}.png"
        cm_path = ds_out / f"confusion_matrix_{model_name}.png"
        model_path = ds_out / f"{model_name}_pipeline.pkl"

        plot_roc(y_test, probas, f"{dataset_name.upper()} - {model_name} ROC", roc_path)
        plot_pr(y_test, probas, f"{dataset_name.upper()} - {model_name} PR", pr_path)
        plot_confusion(
            y_test, preds, f"{dataset_name.upper()} - {model_name} Confusion", cm_path
        )
        joblib.dump(clf, model_path)

        report_manifest[f"roc_{model_name}"] = str(roc_path)
        report_manifest[f"pr_{model_name}"] = str(pr_path)
        report_manifest[f"confusion_{model_name}"] = str(cm_path)
        report_manifest[f"model_{model_name}"] = str(model_path)

        if model_name == "xgboost":
            # Export the raw booster separately so the API can serve it
            # without unpickling the whole sklearn pipeline.
            booster = clf.named_steps["model"].get_booster()
            # Some xgboost versions store base_score as a bracketed string
            # (e.g. "[0.5]"), which breaks JSON round-tripping; normalize it
            # to a plain float string, falling back to "0.5".
            base_score = booster.attr("base_score")
            if base_score:
                try:
                    float(base_score)
                except ValueError:
                    cleaned = base_score.strip("[]")
                    try:
                        cleaned_val = str(float(cleaned))
                    except Exception:
                        cleaned_val = "0.5"
                    booster.set_param({"base_score": cleaned_val})
                    booster.set_attr(base_score=cleaned_val)
            booster_path = ds_out / f"{model_name}_booster.json"
            booster.save_model(str(booster_path))
            report_manifest[f"booster_{model_name}"] = str(booster_path)

    # Defensive default if the loop somehow never ran (empty models dict).
    if pre_feature_names is None:
        pre_feature_names = []

    # Metadata consumed by the explainability endpoint to map raw input
    # columns onto the preprocessor's expanded feature names.
    explain_meta = {
        "dataset": dataset_name,
        "target_col": TARGET_COL,
        "raw_num_cols": num_cols,
        "raw_cat_cols": cat_cols,
        "pre_feature_names": pre_feature_names,
        "id_cols": ID_COLS,
        "dropped_features": sorted(list(FEATURES_TO_DROP)),
        "split_used": split_tag,
    }
    meta_path = ds_out / "explain_meta.json"
    meta_path.write_text(json.dumps(explain_meta, indent=2))
    report_manifest["explain_meta"] = str(meta_path)
    report_manifest["explain_background"] = str(background_path)

    # Best model (by ROC AUC) sorts first within the dataset.
    metrics_df = pd.DataFrame(metrics_rows).sort_values(
        ["dataset", "auc_roc"], ascending=[True, False]
    )
    metrics_path = ds_out / "metrics_summary.csv"
    metrics_df.to_csv(metrics_path, index=False)
    print(f"Saved metrics -> {metrics_path}")

    manifest_path = ds_out / "artifacts.json"
    manifest_path.write_text(json.dumps(report_manifest, indent=2))
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def main() -> None:
    """Run model training/evaluation for every configured dataset present on disk."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for name, path in DATASETS.items():
        if path.exists():
            evaluate_models(name, path)
        else:
            print(f"Skipping {name}, missing file: {path}")


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API runtime dependencies
|
| 2 |
+
fastapi[standard]
|
| 3 |
+
uvicorn[standard]
|
| 4 |
+
pydantic
|
| 5 |
+
numpy
|
| 6 |
+
pandas
|
| 7 |
+
scikit-learn==1.6.1 # match model pickling version; avoids SimpleImputer _fill_dtype errors
|
| 8 |
+
joblib
|
| 9 |
+
xgboost
|
| 10 |
+
|
| 11 |
+
# Model training extras
|
| 12 |
+
catboost
|
| 13 |
+
lightgbm
|
| 14 |
+
matplotlib
|
| 15 |
+
seaborn
|
test.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Test script for /v1/predict endpoint.

This script demonstrates the correct request format:
- inputId: required string
- payload: object containing the feature data
"""

import hashlib
import hmac
import sys
import time
import uuid

import requests

# ============================================
# CONFIGURATION (from tenant creation)
# ============================================
# SECURITY NOTE(review): these credentials are hard-coded and committed to the
# repository. They should be rotated and loaded from environment variables.
CLIENT_ID = "acme-bank-463edc0a"
CLIENT_SECRET = "yPqsrtBizHgDvnK-NpkgVXMXw3WbV_s_JGK-c2pWr3U"
HMAC_SECRET = "OSSBJgx2QToeQhGtQgzwS_8Kf1QvTraq6M67uNrBKEo"

BASE_URL = "https://frankscore-backend.onrender.com"

# Fail fast instead of hanging forever if the backend is unreachable.
REQUEST_TIMEOUT = 30

# ============================================
# STEP 1: Login as Tenant
# ============================================
print("Step 1: Logging in...")
login_response = requests.post(
    f"{BASE_URL}/auth/login",
    json={
        "clientId": CLIENT_ID,
        "clientSecret": CLIENT_SECRET
    },
    timeout=REQUEST_TIMEOUT,
)

if login_response.status_code != 200:
    print(f"❌ Login failed: {login_response.status_code}")
    print(login_response.text)
    sys.exit(1)

login_data = login_response.json()
jwt_token = login_data["access_token"]  # login response uses snake_case for the token key
print(f"✅ Logged in. JWT: {jwt_token[:20]}...")

# ============================================
# STEP 2: Prepare End-User Identity
# ============================================
end_user_id = "user-alice-123"  # Your customer
timestamp = str(int(time.time()))
request_id = str(uuid.uuid4())

# ============================================
# STEP 3: Compute HMAC Signature
# ============================================
# The server recomputes this from the same pipe-delimited fields, so order matters.
signing_string = f"{end_user_id}|{timestamp}|{request_id}"
signature = hmac.new(
    HMAC_SECRET.encode('utf-8'),  # SECRET KEY (never sent!)
    signing_string.encode('utf-8'),
    hashlib.sha256
).hexdigest()

print(f"📝 Signing string: {signing_string}")
print(f"🔐 Signature: {signature[:20]}...")

# ============================================
# STEP 4: Make Prediction Request
# ============================================
print("\nStep 4: Making prediction request...")

# IMPORTANT: The request format is:
# {
#   "inputId": "string",   # REQUIRED
#   "payload": { ... }     # The features go here
# }

request_body = {
    "inputId": "loan-app-78945",  # REQUIRED - unique identifier for this request
    "payload": {
        "num_previous_loans": 9,
        "num_previous_defaults": 4,
        "past_default_rate": 0.44,
        "days_since_last_loan": 2,
        "avg_time_bw_loans": 20,
        "avg_past_amount": 26000,
        "avg_past_daily_burden": 950,
        "std_past_amount": 4000,
        "std_past_daily_burden": 180,
        "trend_in_amount": 1.3,
        "trend_in_burden": 1.35,
        "Total_Amount": 30000,
        "Total_Amount_to_Repay": 36000,
        "duration": 20,
        "daily_burden": 1500,
        "amount_ratio": 2.0,
        "burden_ratio": 1.8,
        "duration_bucket": "20",
        "amount_bucket": "high",
        "burden_percentile": 0.95,
        "borrower_history_strength": "weak",
        "month": 1,
        "quarter": 1,
        "week_of_year": 3,
        "days_to_salary_day": 28,
        "days_to_local_festival": 2,
        "lender_id": "L_high3",
        "lender_exposure_ratio": 0.4,
        "account_age_days": 150,
        "loan_frequency_per_year": 12,
        "repayment_consistency": 0.4,
        "latest_amount_ma3": 28000
    }
}

response = requests.post(
    f"{BASE_URL}/v1/predict_explain",
    headers={
        "Authorization": f"Bearer {jwt_token}",
        "Content-Type": "application/json",
        "X-End-User-Id": end_user_id,
        "X-End-User-Timestamp": timestamp,
        "X-Request-Id": request_id,
        "X-End-User-Signature": signature
    },
    json=request_body,
    timeout=REQUEST_TIMEOUT,
)

print(f"\nResponse Status: {response.status_code}")

if response.status_code == 200:
    result = response.json()
    print("✅ Prediction successful!")
    print(f"   Input ID: {result.get('inputId')}")
    print(f"   Score: {result.get('score')}")
    if result.get('topFeatures'):
        print(f"   Top Features: {len(result.get('topFeatures'))} features")
    print(f"\nFull response: {result}")
else:
    print(f"❌ Prediction failed: {response.status_code}")
    print(response.text)
|
test_data.json
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
{
|
| 3 |
+
"records": [
|
| 4 |
+
{
|
| 5 |
+
"num_previous_loans": 8,
|
| 6 |
+
"num_previous_defaults": 0,
|
| 7 |
+
"past_default_rate": 0.0,
|
| 8 |
+
"days_since_last_loan": 40,
|
| 9 |
+
"avg_time_bw_loans": 120,
|
| 10 |
+
"avg_past_amount": 8000,
|
| 11 |
+
"avg_past_daily_burden": 200,
|
| 12 |
+
"std_past_amount": 500,
|
| 13 |
+
"std_past_daily_burden": 20,
|
| 14 |
+
"trend_in_amount": 1.05,
|
| 15 |
+
"trend_in_burden": 0.9,
|
| 16 |
+
"Total_Amount": 6000,
|
| 17 |
+
"Total_Amount_to_Repay": 7200,
|
| 18 |
+
"duration": 45,
|
| 19 |
+
"daily_burden": 160,
|
| 20 |
+
"amount_ratio": 0.4,
|
| 21 |
+
"burden_ratio": 0.25,
|
| 22 |
+
"duration_bucket": "45",
|
| 23 |
+
"amount_bucket": "low",
|
| 24 |
+
"burden_percentile": 0.15,
|
| 25 |
+
"borrower_history_strength": "strong",
|
| 26 |
+
"month": 5,
|
| 27 |
+
"quarter": 2,
|
| 28 |
+
"week_of_year": 18,
|
| 29 |
+
"days_to_salary_day": 5,
|
| 30 |
+
"days_to_local_festival": 40,
|
| 31 |
+
"lender_id": "L_low1",
|
| 32 |
+
"lender_exposure_ratio": 0.05,
|
| 33 |
+
"account_age_days": 900,
|
| 34 |
+
"loan_frequency_per_year": 3,
|
| 35 |
+
"repayment_consistency": 0.98,
|
| 36 |
+
"latest_amount_ma3": 5500
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"num_previous_loans": 6,
|
| 40 |
+
"num_previous_defaults": 0,
|
| 41 |
+
"past_default_rate": 0.0,
|
| 42 |
+
"days_since_last_loan": 25,
|
| 43 |
+
"avg_time_bw_loans": 90,
|
| 44 |
+
"avg_past_amount": 12000,
|
| 45 |
+
"avg_past_daily_burden": 300,
|
| 46 |
+
"std_past_amount": 700,
|
| 47 |
+
"std_past_daily_burden": 30,
|
| 48 |
+
"trend_in_amount": 1.0,
|
| 49 |
+
"trend_in_burden": 0.95,
|
| 50 |
+
"Total_Amount": 10000,
|
| 51 |
+
"Total_Amount_to_Repay": 11500,
|
| 52 |
+
"duration": 60,
|
| 53 |
+
"daily_burden": 190,
|
| 54 |
+
"amount_ratio": 0.55,
|
| 55 |
+
"burden_ratio": 0.35,
|
| 56 |
+
"duration_bucket": "60",
|
| 57 |
+
"amount_bucket": "mid",
|
| 58 |
+
"burden_percentile": 0.25,
|
| 59 |
+
"borrower_history_strength": "strong",
|
| 60 |
+
"month": 7,
|
| 61 |
+
"quarter": 3,
|
| 62 |
+
"week_of_year": 27,
|
| 63 |
+
"days_to_salary_day": 12,
|
| 64 |
+
"days_to_local_festival": 25,
|
| 65 |
+
"lender_id": "L_low2",
|
| 66 |
+
"lender_exposure_ratio": 0.08,
|
| 67 |
+
"account_age_days": 750,
|
| 68 |
+
"loan_frequency_per_year": 4,
|
| 69 |
+
"repayment_consistency": 0.95,
|
| 70 |
+
"latest_amount_ma3": 10500
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"num_previous_loans": 4,
|
| 74 |
+
"num_previous_defaults": 0,
|
| 75 |
+
"past_default_rate": 0.0,
|
| 76 |
+
"days_since_last_loan": 15,
|
| 77 |
+
"avg_time_bw_loans": 60,
|
| 78 |
+
"avg_past_amount": 15000,
|
| 79 |
+
"avg_past_daily_burden": 450,
|
| 80 |
+
"std_past_amount": 1200,
|
| 81 |
+
"std_past_daily_burden": 40,
|
| 82 |
+
"trend_in_amount": 1.05,
|
| 83 |
+
"trend_in_burden": 1.0,
|
| 84 |
+
"Total_Amount": 15000,
|
| 85 |
+
"Total_Amount_to_Repay": 17500,
|
| 86 |
+
"duration": 45,
|
| 87 |
+
"daily_burden": 389,
|
| 88 |
+
"amount_ratio": 0.8,
|
| 89 |
+
"burden_ratio": 0.55,
|
| 90 |
+
"duration_bucket": "45",
|
| 91 |
+
"amount_bucket": "mid",
|
| 92 |
+
"burden_percentile": 0.45,
|
| 93 |
+
"borrower_history_strength": "medium",
|
| 94 |
+
"month": 2,
|
| 95 |
+
"quarter": 1,
|
| 96 |
+
"week_of_year": 8,
|
| 97 |
+
"days_to_salary_day": 18,
|
| 98 |
+
"days_to_local_festival": 50,
|
| 99 |
+
"lender_id": "L_mid1",
|
| 100 |
+
"lender_exposure_ratio": 0.12,
|
| 101 |
+
"account_age_days": 500,
|
| 102 |
+
"loan_frequency_per_year": 5,
|
| 103 |
+
"repayment_consistency": 0.88,
|
| 104 |
+
"latest_amount_ma3": 16000
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"num_previous_loans": 3,
|
| 108 |
+
"num_previous_defaults": 0,
|
| 109 |
+
"past_default_rate": 0.0,
|
| 110 |
+
"days_since_last_loan": 10,
|
| 111 |
+
"avg_time_bw_loans": 45,
|
| 112 |
+
"avg_past_amount": 17000,
|
| 113 |
+
"avg_past_daily_burden": 520,
|
| 114 |
+
"std_past_amount": 1500,
|
| 115 |
+
"std_past_daily_burden": 60,
|
| 116 |
+
"trend_in_amount": 1.1,
|
| 117 |
+
"trend_in_burden": 1.05,
|
| 118 |
+
"Total_Amount": 20000,
|
| 119 |
+
"Total_Amount_to_Repay": 23000,
|
| 120 |
+
"duration": 30,
|
| 121 |
+
"daily_burden": 750,
|
| 122 |
+
"amount_ratio": 1.2,
|
| 123 |
+
"burden_ratio": 0.9,
|
| 124 |
+
"duration_bucket": "30",
|
| 125 |
+
"amount_bucket": "mid",
|
| 126 |
+
"burden_percentile": 0.65,
|
| 127 |
+
"borrower_history_strength": "medium",
|
| 128 |
+
"month": 5,
|
| 129 |
+
"quarter": 2,
|
| 130 |
+
"week_of_year": 18,
|
| 131 |
+
"days_to_salary_day": 10,
|
| 132 |
+
"days_to_local_festival": 40,
|
| 133 |
+
"lender_id": "L_mid2",
|
| 134 |
+
"lender_exposure_ratio": 0.18,
|
| 135 |
+
"account_age_days": 400,
|
| 136 |
+
"loan_frequency_per_year": 6,
|
| 137 |
+
"repayment_consistency": 0.82,
|
| 138 |
+
"latest_amount_ma3": 18000
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"num_previous_loans": 5,
|
| 142 |
+
"num_previous_defaults": 1,
|
| 143 |
+
"past_default_rate": 0.2,
|
| 144 |
+
"days_since_last_loan": 7,
|
| 145 |
+
"avg_time_bw_loans": 40,
|
| 146 |
+
"avg_past_amount": 18000,
|
| 147 |
+
"avg_past_daily_burden": 600,
|
| 148 |
+
"std_past_amount": 2200,
|
| 149 |
+
"std_past_daily_burden": 90,
|
| 150 |
+
"trend_in_amount": 1.15,
|
| 151 |
+
"trend_in_burden": 1.1,
|
| 152 |
+
"Total_Amount": 22000,
|
| 153 |
+
"Total_Amount_to_Repay": 26000,
|
| 154 |
+
"duration": 30,
|
| 155 |
+
"daily_burden": 867,
|
| 156 |
+
"amount_ratio": 1.35,
|
| 157 |
+
"burden_ratio": 1.05,
|
| 158 |
+
"duration_bucket": "30",
|
| 159 |
+
"amount_bucket": "high",
|
| 160 |
+
"burden_percentile": 0.75,
|
| 161 |
+
"borrower_history_strength": "weak",
|
| 162 |
+
"month": 9,
|
| 163 |
+
"quarter": 3,
|
| 164 |
+
"week_of_year": 36,
|
| 165 |
+
"days_to_salary_day": 20,
|
| 166 |
+
"days_to_local_festival": 10,
|
| 167 |
+
"lender_id": "L_high1",
|
| 168 |
+
"lender_exposure_ratio": 0.25,
|
| 169 |
+
"account_age_days": 300,
|
| 170 |
+
"loan_frequency_per_year": 8,
|
| 171 |
+
"repayment_consistency": 0.7,
|
| 172 |
+
"latest_amount_ma3": 21000
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"num_previous_loans": 7,
|
| 176 |
+
"num_previous_defaults": 2,
|
| 177 |
+
"past_default_rate": 0.29,
|
| 178 |
+
"days_since_last_loan": 5,
|
| 179 |
+
"avg_time_bw_loans": 30,
|
| 180 |
+
"avg_past_amount": 22000,
|
| 181 |
+
"avg_past_daily_burden": 750,
|
| 182 |
+
"std_past_amount": 3000,
|
| 183 |
+
"std_past_daily_burden": 120,
|
| 184 |
+
"trend_in_amount": 1.2,
|
| 185 |
+
"trend_in_burden": 1.2,
|
| 186 |
+
"Total_Amount": 25000,
|
| 187 |
+
"Total_Amount_to_Repay": 30000,
|
| 188 |
+
"duration": 25,
|
| 189 |
+
"daily_burden": 1200,
|
| 190 |
+
"amount_ratio": 1.6,
|
| 191 |
+
"burden_ratio": 1.3,
|
| 192 |
+
"duration_bucket": "25",
|
| 193 |
+
"amount_bucket": "high",
|
| 194 |
+
"burden_percentile": 0.85,
|
| 195 |
+
"borrower_history_strength": "weak",
|
| 196 |
+
"month": 11,
|
| 197 |
+
"quarter": 4,
|
| 198 |
+
"week_of_year": 46,
|
| 199 |
+
"days_to_salary_day": 25,
|
| 200 |
+
"days_to_local_festival": 5,
|
| 201 |
+
"lender_id": "L_high2",
|
| 202 |
+
"lender_exposure_ratio": 0.32,
|
| 203 |
+
"account_age_days": 250,
|
| 204 |
+
"loan_frequency_per_year": 9,
|
| 205 |
+
"repayment_consistency": 0.6,
|
| 206 |
+
"latest_amount_ma3": 24000
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"num_previous_loans": 9,
|
| 210 |
+
"num_previous_defaults": 4,
|
| 211 |
+
"past_default_rate": 0.44,
|
| 212 |
+
"days_since_last_loan": 2,
|
| 213 |
+
"avg_time_bw_loans": 20,
|
| 214 |
+
"avg_past_amount": 26000,
|
| 215 |
+
"avg_past_daily_burden": 950,
|
| 216 |
+
"std_past_amount": 4000,
|
| 217 |
+
"std_past_daily_burden": 180,
|
| 218 |
+
"trend_in_amount": 1.3,
|
| 219 |
+
"trend_in_burden": 1.35,
|
| 220 |
+
"Total_Amount": 30000,
|
| 221 |
+
"Total_Amount_to_Repay": 36000,
|
| 222 |
+
"duration": 20,
|
| 223 |
+
"daily_burden": 1500,
|
| 224 |
+
"amount_ratio": 2.0,
|
| 225 |
+
"burden_ratio": 1.8,
|
| 226 |
+
"duration_bucket": "20",
|
| 227 |
+
"amount_bucket": "high",
|
| 228 |
+
"burden_percentile": 0.95,
|
| 229 |
+
"borrower_history_strength": "weak",
|
| 230 |
+
"month": 1,
|
| 231 |
+
"quarter": 1,
|
| 232 |
+
"week_of_year": 3,
|
| 233 |
+
"days_to_salary_day": 28,
|
| 234 |
+
"days_to_local_festival": 2,
|
| 235 |
+
"lender_id": "L_high3",
|
| 236 |
+
"lender_exposure_ratio": 0.4,
|
| 237 |
+
"account_age_days": 150,
|
| 238 |
+
"loan_frequency_per_year": 12,
|
| 239 |
+
"repayment_consistency": 0.4,
|
| 240 |
+
"latest_amount_ma3": 28000
|
| 241 |
+
}
|
| 242 |
+
]
|
| 243 |
+
}
|