Gutema-1990 commited on
Commit
97b9b33
·
1 Parent(s): f5f4d12

The model artifact is removed from this image; it will be deployed separately.

Browse files
.dockerignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fly.toml
2
+ .git/
3
+ __pycache__/
4
+ .envrc
5
+ .venv/
.gitignore copy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ catboost_info/
2
+ api/__pycache__/
3
+ api/catboost_info/
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12.12 AS builder
2
+
3
+ ENV PYTHONUNBUFFERED=1 \
4
+ PYTHONDONTWRITEBYTECODE=1
5
+ WORKDIR /app
6
+
7
+
8
+ RUN python -m venv .venv
9
+ COPY requirements.txt ./
10
+ RUN .venv/bin/pip install -r requirements.txt
11
+
12
+ FROM python:3.12.12-slim
13
+ WORKDIR /app
14
+
15
+ # Native libs required by xgboost / scikit-learn wheels
16
+ RUN apt-get update \
17
+ && apt-get install -y --no-install-recommends libgomp1 libopenblas0-pthread \
18
+ && rm -rf /var/lib/apt/lists/*
19
+
20
+ COPY --from=builder /app/.venv .venv/
21
+ COPY . .
22
+ CMD ["/app/.venv/bin/uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "7860"]
README copy.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # frankscore-deployment
api/app.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ import joblib
8
+ import numpy as np
9
+ import pandas as pd
10
+ from fastapi import FastAPI, HTTPException
11
+ from pydantic import BaseModel, Field
12
+ import xgboost as xgb
13
# Compatibility shim for pickles created with newer sklearn that include _RemainderColsList
# NOTE(review): this reaches into a private sklearn module; it only defines the
# missing class when absent so unpickling the ColumnTransformer succeeds.
# Verify it is still needed when the pinned scikit-learn version changes.
import sklearn.compose._column_transformer as _ct  # type: ignore
if not hasattr(_ct, "_RemainderColsList"):
    class _RemainderColsList(list):  # type: ignore
        # Stand-in with list semantics; the pickle only needs the name to resolve.
        pass
    _ct._RemainderColsList = _RemainderColsList
19
+
20
# Artifact locations: the model directory sits next to this module.
ROOT = Path(__file__).resolve().parents[1]
MODEL_DIR = Path(__file__).resolve().parent / "model"
MODEL_PATH = MODEL_DIR / "xgboost_pipeline.pkl"       # full sklearn pipeline (preprocess + model)
BOOSTER_PATH = MODEL_DIR / "xgboost_booster.json"     # raw booster used for SHAP contributions
META_PATH = MODEL_DIR / "explain_meta.json"           # column names / grouping metadata

# Fail fast at import time if any artifact is missing — the service is useless without them.
if not MODEL_PATH.exists():
    raise FileNotFoundError(f"Model file missing at {MODEL_PATH}")
if not META_PATH.exists():
    raise FileNotFoundError(f"Explainability meta missing at {META_PATH}")
if not BOOSTER_PATH.exists():
    raise FileNotFoundError(f"Booster file missing at {BOOSTER_PATH}")

PIPELINE = joblib.load(MODEL_PATH)
META = json.loads(META_PATH.read_text())

# Expected raw input columns: prefer the pipeline's own record, falling back to
# the preprocess step's record.
EXPECTED_FEATURES = list(getattr(PIPELINE, "feature_names_in_", []))
PREPROCESS = PIPELINE.named_steps.get("preprocess") if hasattr(PIPELINE, "named_steps") else None
if PREPROCESS is None:
    raise RuntimeError("Pipeline missing 'preprocess' step; cannot infer columns.")

if not EXPECTED_FEATURES:
    EXPECTED_FEATURES = list(getattr(PREPROCESS, "feature_names_in_", []))
if not EXPECTED_FEATURES:
    raise RuntimeError("Unable to determine expected feature names from the pipeline.")

# Map the ColumnTransformer's fitted transformers ("num"/"cat") back to raw column lists.
_col_map = {name: cols for name, _, cols in getattr(PREPROCESS, "transformers_", [])}
NUM_FEATURES = list(_col_map.get("num", []))
CAT_FEATURES = list(_col_map.get("cat", []))
# Post-preprocessing feature names (one per booster input); metadata wins over introspection.
PRE_FEATURE_NAMES = META.get("pre_feature_names") or list(getattr(PREPROCESS, "get_feature_names_out", lambda: [])())
# Raw column names, used to map one-hot encoded names back to their source column.
RAW_FEATURE_SET = set((META.get("raw_num_cols") or []) + (META.get("raw_cat_cols") or []))
51
# Business-facing grouping of raw features; SHAP contributions are aggregated
# per group in the /explain and /predict_explain responses.
FEATURE_GROUPS = {
    "Borrowing History & Maturity": [
        "account_age_days",
        "avg_past_amount",
        "avg_past_daily_burden",
        "avg_time_bw_loans",
        "borrower_history_strength",
        "days_since_last_loan",
        "loan_frequency_per_year",
        "num_previous_loans",
        "std_past_amount",
        "std_past_daily_burden",
        "trend_in_amount",
        "trend_in_burden",
    ],
    "Repayment Speed & Delinquency": [
        "num_previous_defaults",
        "past_default_rate",
        "repayment_consistency",
    ],
    "Current Loan Size, Pricing & Burden": [
        "Total_Amount",
        "Total_Amount_to_Repay",
        "amount_bucket",
        "burden_percentile",
        "daily_burden",
        "duration",
        "duration_bucket",
        "interest_rate",
    ],
    "Affordability & Risk Ratios": [
        "amount_ratio",
        "burden_ratio",
        "repayment_intensity",
    ],
    "Seasonality & Timing": [
        "days_to_local_festival",
        "days_to_salary_day",
        "month",
        "quarter",
        "week_of_year",
    ],
    "Operational, Referral & Lender Signals": [
        "lender_exposure_ratio",
        "lender_id",
        "lender_risk_profile",
    ],
    "Time-based Trends & Volatility": [
        "latest_amount_ma3",
    ],
}
# Inverted index: raw feature name -> group label, for O(1) lookup per feature.
FEATURE_GROUP_LOOKUP: Dict[str, str] = {}
for group, variables in FEATURE_GROUPS.items():
    for var in variables:
        FEATURE_GROUP_LOOKUP[var] = group

app = FastAPI(title="FrankScore", version="1.0.0")
108
+
109
+
110
class PredictionRequest(BaseModel):
    """Batch prediction input: one dict of raw features per borrower."""
    records: List[Dict[str, Any]] = Field(..., description="List of borrower feature dictionaries")


class PredictionResponse(BaseModel):
    """Default probabilities, aligned with the request records."""
    probabilities: List[float]


class ScoreRequest(BaseModel):
    """Probabilities to convert to credit scores."""
    probabilities: List[float] = Field(..., description="Probabilities of default (0-1)")


class ScoreResponse(BaseModel):
    """Credit scores on the 0-100 scale, aligned with the request."""
    scores: List[float]


class ExplainRequest(BaseModel):
    """Explanation input: raw records plus how many top features to keep per group."""
    records: List[Dict[str, Any]]
    top_k: Optional[int] = Field(default=10, ge=1, le=100, description="Number of top features to return per record")


class FeatureContribution(BaseModel):
    """A single (post-preprocessing) feature's SHAP contribution."""
    feature: str
    shap_value: float


class GroupContribution(BaseModel):
    """SHAP contributions aggregated over one business feature group."""
    group: str
    total_shap_value: float
    features: List[FeatureContribution]


class ExplainItem(BaseModel):
    """Explanation for one record: probability, SHAP base value, grouped contributions."""
    probability: float
    base_value: float
    group_contributions: List[GroupContribution]


class ExplainResponse(BaseModel):
    """Per-record explanations, aligned with the request."""
    explanations: List[ExplainItem]


class PredictExplainItem(BaseModel):
    """Combined prediction + explanation for one record."""
    probability: float
    score: float
    base_value: float
    group_contributions: List[GroupContribution]


class PredictExplainResponse(BaseModel):
    """Per-record combined results, aligned with the request."""
    results: List[PredictExplainItem]
161
+
162
+
163
+ def prepare_frame(records: List[Dict[str, Any]]) -> pd.DataFrame:
164
+ if not records:
165
+ raise HTTPException(status_code=400, detail="No records provided.")
166
+ df = pd.DataFrame(records)
167
+ for col in EXPECTED_FEATURES:
168
+ if col not in df.columns:
169
+ df[col] = np.nan
170
+ df = df[EXPECTED_FEATURES]
171
+ if NUM_FEATURES:
172
+ df[NUM_FEATURES] = df[NUM_FEATURES].apply(pd.to_numeric, errors="coerce")
173
+ if CAT_FEATURES:
174
+ df[CAT_FEATURES] = df[CAT_FEATURES].astype("object")
175
+ return df
176
+
177
+
178
def pd_to_score(p: np.ndarray, base_score: float = 50, base_odds: float = 9, pdo: float = 20) -> np.ndarray:
    """Map probabilities of default onto a 0-100 credit-score scale.

    Standard points-to-double-odds scaling: a borrower with odds equal to
    *base_odds* scores *base_score*, and every *pdo* points doubles the odds.
    Probabilities are clipped away from 0/1 to keep the log finite; the final
    score is clipped to [0, 100].
    """
    clipped = np.clip(p, 1e-6, 1 - 1e-6)
    slope = pdo / np.log(2)
    offset = base_score - slope * np.log(base_odds)
    good_bad_odds = (1 - clipped) / clipped
    raw_score = offset + slope * np.log(good_bad_odds)
    return np.clip(raw_score, 0, 100)
185
+
186
+
187
+ def _sanitize_feature_name(name: str) -> str:
188
+ sanitized = name
189
+ for ch, repl in {"[": "", "]": "", "<": "lt", ">": "gt", " ": "_", ",": "_", "=": "_"}.items():
190
+ sanitized = sanitized.replace(ch, repl)
191
+ return sanitized
192
+
193
+
194
def _base_feature_name(name: str) -> str:
    """Map a post-preprocessing feature name back to its raw column name.

    Strips the ColumnTransformer prefix (everything up to the first "__"),
    then, for one-hot encoded names like "amount_bucket_q1", searches for the
    longest leading underscore-joined prefix that is a known raw column.
    Falls back to the stripped name when no raw column matches.
    """
    _, sep, remainder = name.partition("__")
    base = remainder if sep else name
    if base in RAW_FEATURE_SET:
        return base
    pieces = base.split("_")
    # Try progressively shorter prefixes: longest match wins.
    for cut in range(len(pieces) - 1, 0, -1):
        candidate = "_".join(pieces[:cut])
        if candidate in RAW_FEATURE_SET:
            return candidate
    return base
207
+
208
+
209
def get_booster():
    """Return the cached raw XGBoost booster, loading it on first use.

    The booster is cached as a function attribute so the JSON model is read
    only once per process. On load, the booster's "base_score" attribute is
    validated: some serialized models store it in a bracketed form (e.g.
    "[0.5]") that float() rejects, which would break pred_contribs. In that
    case the value is cleaned (falling back to "0.5" if unparseable) and
    written back via set_param/set_attr.
    """
    if not hasattr(get_booster, "_booster"):
        booster = xgb.Booster()
        booster.load_model(str(BOOSTER_PATH))
        base_score = booster.attr("base_score")
        if base_score:
            try:
                # Already a plain float string: nothing to fix.
                float(base_score)
            except ValueError:
                # Strip surrounding brackets and re-parse; default to 0.5 otherwise.
                cleaned = base_score.strip("[]")
                try:
                    cleaned_val = str(float(cleaned))
                except Exception:
                    cleaned_val = "0.5"
                booster.set_param({"base_score": cleaned_val})
                booster.set_attr(base_score=cleaned_val)
        get_booster._booster = booster
    return get_booster._booster
227
+
228
+
229
@app.post("/predict", response_model=PredictionResponse)
def predict(req: PredictionRequest) -> PredictionResponse:
    """Return the probability of default for each submitted record."""
    # Column 1 of predict_proba is the positive (default) class.
    probabilities = PIPELINE.predict_proba(prepare_frame(req.records))[:, 1].tolist()
    return PredictionResponse(probabilities=probabilities)
234
+
235
+
236
@app.get("/health")
def health() -> Dict[str, str]:
    """Liveness probe: reports service status and the loaded model path."""
    return dict(status="ok", model_path=str(MODEL_PATH))
239
+
240
+
241
@app.post("/score", response_model=ScoreResponse)
def score(req: ScoreRequest) -> ScoreResponse:
    """Convert default probabilities into 0-100 credit scores.

    Raises:
        HTTPException(400): if the probability list is empty.
    """
    if not req.probabilities:
        raise HTTPException(status_code=400, detail="No probabilities provided.")
    probs = np.asarray(req.probabilities, dtype=float)
    return ScoreResponse(scores=pd_to_score(probs).tolist())
248
+
249
+
250
@app.post("/explain", response_model=ExplainResponse)
def explain(req: ExplainRequest) -> ExplainResponse:
    """Explain predictions via per-feature SHAP values aggregated by feature group.

    For each record: predicts the default probability with the full pipeline,
    computes per-feature contributions from the raw booster (pred_contribs),
    maps each post-preprocessing feature back to its raw column, buckets it
    into a business feature group, and returns groups sorted by absolute total
    contribution with at most top_k features each.

    Raises:
        HTTPException(400): if no records are provided.
        HTTPException(500): if the booster's contribution matrix has an
            unexpected width.
    """
    if not req.records:
        raise HTTPException(status_code=400, detail="No records provided.")
    frame = prepare_frame(req.records)
    probas = PIPELINE.predict_proba(frame)[:, 1]
    booster = get_booster()
    # Transform to the booster's input space; names must match the training layout.
    X_proc = PREPROCESS.transform(frame)
    feat_names = np.array(PRE_FEATURE_NAMES) if PRE_FEATURE_NAMES else np.array([f"f{i}" for i in range(X_proc.shape[1])])
    sanitized_names = [_sanitize_feature_name(n) for n in feat_names]
    dmat = xgb.DMatrix(X_proc, feature_names=sanitized_names)
    # pred_contribs returns one column per feature plus a trailing bias column.
    contribs = booster.predict(dmat, pred_contribs=True)
    if contribs.shape[1] != X_proc.shape[1] + 1:
        raise HTTPException(status_code=500, detail="Unexpected contribution shape from booster.")
    base_vals = contribs[:, -1]
    feat_contribs = contribs[:, :-1]
    explanations: List[ExplainItem] = []
    for i in range(feat_contribs.shape[0]):
        row_vals = feat_contribs[i]
        group_totals: Dict[str, float] = {}
        group_details: Dict[str, List[FeatureContribution]] = {}
        for name, val in zip(feat_names, row_vals):
            # One-hot columns fold back to their raw column, hence to its group.
            base = _base_feature_name(str(name))
            group = FEATURE_GROUP_LOOKUP.get(base, "Other")
            group_totals[group] = group_totals.get(group, 0.0) + float(val)
            group_details.setdefault(group, []).append(
                FeatureContribution(feature=str(name), shap_value=float(val))
            )
        group_contribs: List[GroupContribution] = []
        # Groups ordered by |total contribution|; features within a group likewise.
        for grp, total in sorted(group_totals.items(), key=lambda kv: abs(kv[1]), reverse=True):
            feats = sorted(group_details.get(grp, []), key=lambda fc: abs(fc.shap_value), reverse=True)
            if req.top_k:
                feats = feats[:req.top_k]
            group_contribs.append(GroupContribution(group=grp, total_shap_value=total, features=feats))
        explanations.append(
            ExplainItem(
                probability=float(probas[i]),
                base_value=float(base_vals[i]),
                group_contributions=group_contribs,
            )
        )
    return ExplainResponse(explanations=explanations)
292
+
293
+
294
@app.post("/predict_explain", response_model=PredictExplainResponse)
def predict_explain(req: ExplainRequest) -> PredictExplainResponse:
    """Combined endpoint: probability, 0-100 credit score, and grouped SHAP explanation.

    Same explanation pipeline as /explain, plus a credit score per record.
    Improvements over the original: scores are computed once with a single
    vectorized pd_to_score call instead of building a one-element array per
    record inside the loop, and the grouping logic is factored into a local
    helper. External behavior and response schema are unchanged.

    Raises:
        HTTPException(400): if no records are provided.
        HTTPException(500): if the booster's contribution matrix has an
            unexpected width.
    """
    if not req.records:
        raise HTTPException(status_code=400, detail="No records provided.")
    frame = prepare_frame(req.records)
    probas = PIPELINE.predict_proba(frame)[:, 1]
    booster = get_booster()
    X_proc = PREPROCESS.transform(frame)
    feat_names = np.array(PRE_FEATURE_NAMES) if PRE_FEATURE_NAMES else np.array([f"f{i}" for i in range(X_proc.shape[1])])
    sanitized_names = [_sanitize_feature_name(n) for n in feat_names]
    dmat = xgb.DMatrix(X_proc, feature_names=sanitized_names)
    # pred_contribs: one column per feature plus a trailing bias column.
    contribs = booster.predict(dmat, pred_contribs=True)
    if contribs.shape[1] != X_proc.shape[1] + 1:
        raise HTTPException(status_code=500, detail="Unexpected contribution shape from booster.")
    base_vals = contribs[:, -1]
    feat_contribs = contribs[:, :-1]
    # Hoisted out of the loop: one vectorized call for all records.
    scores = pd_to_score(probas)

    def _grouped(row_vals: np.ndarray) -> List[GroupContribution]:
        # Aggregate one record's per-feature SHAP values into feature groups,
        # sorted by absolute contribution, keeping at most top_k features per group.
        group_totals: Dict[str, float] = {}
        group_details: Dict[str, List[FeatureContribution]] = {}
        for name, val in zip(feat_names, row_vals):
            group = FEATURE_GROUP_LOOKUP.get(_base_feature_name(str(name)), "Other")
            group_totals[group] = group_totals.get(group, 0.0) + float(val)
            group_details.setdefault(group, []).append(
                FeatureContribution(feature=str(name), shap_value=float(val))
            )
        out: List[GroupContribution] = []
        for grp, total in sorted(group_totals.items(), key=lambda kv: abs(kv[1]), reverse=True):
            feats = sorted(group_details[grp], key=lambda fc: abs(fc.shap_value), reverse=True)
            if req.top_k:
                feats = feats[:req.top_k]
            out.append(GroupContribution(group=grp, total_shap_value=total, features=feats))
        return out

    items = [
        PredictExplainItem(
            probability=float(probas[i]),
            score=int(round(float(scores[i]))),  # rounded to an integer score, as before
            base_value=float(base_vals[i]),
            group_contributions=_grouped(feat_contribs[i]),
        )
        for i in range(feat_contribs.shape[0])
    ]
    return PredictExplainResponse(results=items)
api/model/explain_meta.json ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "full",
3
+ "target_col": "target",
4
+ "raw_num_cols": [
5
+ "num_previous_loans",
6
+ "num_previous_defaults",
7
+ "past_default_rate",
8
+ "days_since_last_loan",
9
+ "avg_time_bw_loans",
10
+ "avg_past_amount",
11
+ "avg_past_daily_burden",
12
+ "std_past_amount",
13
+ "std_past_daily_burden",
14
+ "trend_in_amount",
15
+ "trend_in_burden",
16
+ "Total_Amount",
17
+ "Total_Amount_to_Repay",
18
+ "duration",
19
+ "daily_burden",
20
+ "amount_ratio",
21
+ "burden_ratio",
22
+ "burden_percentile",
23
+ "borrower_history_strength",
24
+ "month",
25
+ "quarter",
26
+ "week_of_year",
27
+ "days_to_salary_day",
28
+ "days_to_local_festival",
29
+ "lender_id",
30
+ "lender_exposure_ratio",
31
+ "account_age_days",
32
+ "loan_frequency_per_year",
33
+ "repayment_consistency",
34
+ "latest_amount_ma3"
35
+ ],
36
+ "raw_cat_cols": [
37
+ "duration_bucket",
38
+ "amount_bucket"
39
+ ],
40
+ "pre_feature_names": [
41
+ "num__num_previous_loans",
42
+ "num__num_previous_defaults",
43
+ "num__past_default_rate",
44
+ "num__days_since_last_loan",
45
+ "num__avg_time_bw_loans",
46
+ "num__avg_past_amount",
47
+ "num__avg_past_daily_burden",
48
+ "num__std_past_amount",
49
+ "num__std_past_daily_burden",
50
+ "num__trend_in_amount",
51
+ "num__trend_in_burden",
52
+ "num__Total_Amount",
53
+ "num__Total_Amount_to_Repay",
54
+ "num__duration",
55
+ "num__daily_burden",
56
+ "num__amount_ratio",
57
+ "num__burden_ratio",
58
+ "num__burden_percentile",
59
+ "num__borrower_history_strength",
60
+ "num__month",
61
+ "num__quarter",
62
+ "num__week_of_year",
63
+ "num__days_to_salary_day",
64
+ "num__days_to_local_festival",
65
+ "num__lender_id",
66
+ "num__lender_exposure_ratio",
67
+ "num__account_age_days",
68
+ "num__loan_frequency_per_year",
69
+ "num__repayment_consistency",
70
+ "num__latest_amount_ma3",
71
+ "cat__duration_bucket_<=1m",
72
+ "cat__duration_bucket_<=1w",
73
+ "cat__duration_bucket_<=2m",
74
+ "cat__duration_bucket_<=2w",
75
+ "cat__duration_bucket_>2m",
76
+ "cat__amount_bucket_q1",
77
+ "cat__amount_bucket_q2",
78
+ "cat__amount_bucket_q3",
79
+ "cat__amount_bucket_q4"
80
+ ],
81
+ "id_cols": [
82
+ "customer_id",
83
+ "tbl_loan_id"
84
+ ],
85
+ "dropped_features": [
86
+ "interest_rate",
87
+ "lender_risk_profile",
88
+ "pseudo_disb_date",
89
+ "repayment_intensity"
90
+ ],
91
+ "split_used": "time_split(pseudo_disb_date)"
92
+ }
api/model/xgboost_booster.json ADDED
The diff for this file is too large to render. See raw diff
 
api/model_training/train_model.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Dict, List, Tuple
6
+
7
+ import matplotlib
8
+ matplotlib.use("Agg") # Use non-GUI backend to avoid Tkinter cleanup warnings
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
+ import pandas as pd
12
+ import seaborn as sns
13
+ import joblib
14
+ from catboost import CatBoostClassifier
15
+ from lightgbm import LGBMClassifier
16
+ from sklearn.compose import ColumnTransformer
17
+ from sklearn.ensemble import RandomForestClassifier
18
+ from sklearn.impute import SimpleImputer
19
+ from sklearn.metrics import (
20
+ accuracy_score,
21
+ average_precision_score,
22
+ classification_report,
23
+ confusion_matrix,
24
+ f1_score,
25
+ precision_recall_curve,
26
+ precision_score,
27
+ recall_score,
28
+ roc_auc_score,
29
+ roc_curve,
30
+ )
31
+ from sklearn.model_selection import GroupShuffleSplit, train_test_split
32
+ from sklearn.pipeline import Pipeline
33
+ from sklearn.preprocessing import OneHotEncoder
34
+ from xgboost import XGBClassifier
35
+ import xgboost as xgb
36
+
37
+
38
# Single seed used for every split / model for reproducibility.
RANDOM_STATE = 42
# Repository root (two levels up from this file: code/model/train_models.py -> repo root)
ROOT = Path(__file__).resolve().parents[2]
# NOTE(review): machine-specific absolute path — this only works on the original
# author's machine; consider an environment variable or a path relative to ROOT.
DATA_BASE = Path(
    "/home/name-1/AI-Agent/frankscore/kenyan-dataset-issue/data/feature-generated"
)

# Two training variants: the full engineered feature set and a borrower-only view.
DATASETS: Dict[str, Path] = {
    "full": DATA_BASE / "kenya_engineered_features.csv",
    "borrower": DATA_BASE / "kenya_engineered_features_borrower_side.csv",
}
OUTPUT_DIR = ROOT / "code" / "model" / "outputs_for_demo"
TARGET_COL = "target"
# Identifier columns excluded from features.
ID_COLS = ["customer_id", "tbl_loan_id"]
# Candidate columns for group-based and time-based leakage-safe splitting.
GROUP_COL_CANDIDATES = ["customer_id", "customerId", "client_id"]
DATE_COL_CANDIDATES = ["pseudo_disb_date", "disb_date", "disbursement_date", "application_date", "loan_date"]
# Features deliberately excluded from training (mirrored in explain_meta.json).
FEATURES_TO_DROP = {
    "interest_rate",
    "repayment_intensity",
    "lender_risk_profile",
    "pseudo_disb_date",
}
60
+
61
+
62
def build_preprocessor(
    feature_frame: pd.DataFrame,
) -> Tuple[ColumnTransformer, List[str], List[str]]:
    """Build the shared preprocessing transformer for all models.

    Object-dtype columns are treated as categorical (mode-imputed then
    one-hot encoded with unknowns ignored); everything else is numeric
    (median-imputed). Returns the transformer plus the numeric and
    categorical column name lists.
    """
    cat_cols = feature_frame.select_dtypes(include=["object"]).columns.tolist()
    num_cols = [col for col in feature_frame.columns if col not in cat_cols]

    numeric_steps = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
    categorical_steps = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
    )
    transformer = ColumnTransformer(
        transformers=[
            ("num", numeric_steps, num_cols),
            ("cat", categorical_steps, cat_cols),
        ]
    )
    return transformer, num_cols, cat_cols
+
87
+
88
+ def find_first_existing_col(df: pd.DataFrame, candidates: List[str]) -> str | None:
89
+ for c in candidates:
90
+ if c in df.columns:
91
+ return c
92
+ return None
93
+
94
+
95
def split_data_leakage_safe(
    df: pd.DataFrame, X: pd.DataFrame, y: pd.Series
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, str]:
    """
    Priority:
      1) time split if a date column exists
      2) group split on customer id
      3) stratified fallback

    Returns (X_train, X_test, y_train, y_test, tag) where *tag* names the
    strategy used, so it can be recorded alongside metrics.
    """
    date_col = find_first_existing_col(df, DATE_COL_CANDIDATES)
    group_col = find_first_existing_col(df, GROUP_COL_CANDIDATES)

    if date_col is not None:
        tmp = df[[date_col]].copy()
        tmp[date_col] = pd.to_datetime(tmp[date_col], errors="coerce")
        # Only trust the time split if at least 80% of dates parse cleanly.
        if tmp[date_col].notna().mean() > 0.8:
            # Chronological 80/20 split: train on the earliest loans, test on the latest.
            order = tmp[date_col].sort_values().index
            cutoff = int(len(order) * 0.8)
            train_idx = order[:cutoff]
            test_idx = order[cutoff:]
            return (
                X.loc[train_idx],
                X.loc[test_idx],
                y.loc[train_idx],
                y.loc[test_idx],
                f"time_split({date_col})",
            )

    if group_col is not None:
        # Keep all loans of one customer on the same side of the split.
        groups = df[group_col]
        gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
        train_idx, test_idx = next(gss.split(X, y, groups=groups))
        return (
            X.iloc[train_idx],
            X.iloc[test_idx],
            y.iloc[train_idx],
            y.iloc[test_idx],
            f"group_split({group_col})",
        )

    # Last resort: plain stratified random split (no leakage protection).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )
    return X_train, X_test, y_train, y_test, "stratified_random_split"
139
+
140
+
141
def get_models(scale_pos_weight: float) -> Dict[str, object]:
    """Return the candidate classifiers, keyed by short name.

    *scale_pos_weight* (negatives/positives in the training fold) is passed to
    XGBoost for class-imbalance handling; RandomForest and LightGBM use
    class_weight="balanced" instead. CatBoost is left at defaults here.
    """
    # Using moderate defaults to keep runtime reasonable.
    return {
        "random_forest": RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            n_jobs=-1,
            class_weight="balanced",
            random_state=RANDOM_STATE,
        ),
        "xgboost": XGBClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
            random_state=RANDOM_STATE,
            scale_pos_weight=scale_pos_weight,
        ),
        "lightgbm": LGBMClassifier(
            n_estimators=400,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=RANDOM_STATE,
            n_jobs=-1,
            class_weight="balanced",
        ),
        "catboost": CatBoostClassifier(
            iterations=400,
            depth=8,
            learning_rate=0.05,
            loss_function="Logloss",
            eval_metric="AUC",
            verbose=0,
            random_seed=RANDOM_STATE,
        ),
    }
182
+
183
+
184
def plot_roc(y_true: np.ndarray, y_score: np.ndarray, title: str, path: Path) -> None:
    """Render and save a ROC curve (with AUC in the legend) to *path*."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_val = roc_auc_score(y_true, y_score)
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC = {auc_val:.3f}")
    # Diagonal = random-classifier baseline.
    plt.plot([0, 1], [0, 1], linestyle="--", color="grey")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()
197
+
198
+
199
def plot_pr(y_true: np.ndarray, y_score: np.ndarray, title: str, path: Path) -> None:
    """Render and save a precision-recall curve (with average precision) to *path*."""
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    ap = average_precision_score(y_true, y_score)
    plt.figure()
    plt.plot(recall, precision, label=f"AP = {ap:.3f}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(title)
    plt.legend(loc="lower left")
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()
211
+
212
+
213
def plot_confusion(y_true: np.ndarray, y_pred: np.ndarray, title: str, path: Path) -> None:
    """Render and save an annotated confusion-matrix heatmap to *path*."""
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure()
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()
223
+
224
+
225
def evaluate_models(dataset_name: str, data_path: Path) -> None:
    """Train, evaluate, and persist every candidate model on one dataset.

    For each model: fits a preprocess+model pipeline on a leakage-safe split,
    writes metrics, classification report, ROC/PR/confusion plots, and the
    pickled pipeline into OUTPUT_DIR/<dataset_name>. For XGBoost it also
    exports the raw booster JSON (with the base_score format workaround) used
    by the serving API for SHAP. Finally writes explain_meta.json, a metrics
    summary CSV, and an artifacts.json manifest.
    """
    print(f"=== Training on {dataset_name} dataset ===")
    df = pd.read_csv(data_path)
    if TARGET_COL not in df.columns:
        raise SystemExit(f"target column missing in {data_path}")

    # Drop identifier columns and deliberately excluded features.
    X = df.drop(columns=[TARGET_COL] + ID_COLS, errors="ignore")
    X = X.drop(columns=[c for c in FEATURES_TO_DROP if c in X.columns], errors="ignore")
    y = df[TARGET_COL]

    preprocessor, num_cols, cat_cols = build_preprocessor(X)

    X_train, X_test, y_train, y_test, split_tag = split_data_leakage_safe(df, X, y)
    print(f"Split used: {split_tag}")
    # Class-imbalance ratio for XGBoost (negatives per positive).
    pos = y_train.sum()
    neg = len(y_train) - pos
    scale_pos_weight = float(neg / pos) if pos > 0 else 1.0

    models = get_models(scale_pos_weight)
    ds_out = OUTPUT_DIR / dataset_name
    ds_out.mkdir(parents=True, exist_ok=True)

    # Save a small background sample for downstream explainability tooling.
    background_path = ds_out / "explain_background.csv"
    df.sample(min(len(df), 200), random_state=RANDOM_STATE).to_csv(background_path, index=False)

    metrics_rows = []
    report_manifest = {}
    # Captured once from the first fitted pipeline (identical across models).
    pre_feature_names = None

    for model_name, model in models.items():
        print(f"Training {model_name}...")
        clf = Pipeline(steps=[("preprocess", preprocessor), ("model", model)])
        clf.fit(X_train, y_train)
        if pre_feature_names is None:
            pre_feature_names = clf.named_steps["preprocess"].get_feature_names_out().tolist()
        probas = clf.predict_proba(X_test)[:, 1]
        # Hard labels at the default 0.5 threshold.
        preds = (probas >= 0.5).astype(int)

        metrics = {
            "dataset": dataset_name,
            "split": split_tag,
            "model": model_name,
            "auc_roc": roc_auc_score(y_test, probas),
            "auc_pr": average_precision_score(y_test, probas),
            "accuracy": accuracy_score(y_test, preds),
            "precision": precision_score(y_test, preds, zero_division=0),
            "recall": recall_score(y_test, preds, zero_division=0),
            "f1": f1_score(y_test, preds, zero_division=0),
        }
        metrics_rows.append(metrics)

        # Classification report
        cls_report = classification_report(
            y_test,
            preds,
            target_names=["non_default", "default"],
            digits=3,
            zero_division=0,
        )
        report_path = ds_out / f"classification_report_{model_name}.txt"
        report_path.write_text(cls_report)
        report_manifest[f"classification_report_{model_name}"] = str(report_path)

        # Plots
        roc_path = ds_out / f"roc_{model_name}.png"
        pr_path = ds_out / f"pr_{model_name}.png"
        cm_path = ds_out / f"confusion_matrix_{model_name}.png"
        model_path = ds_out / f"{model_name}_pipeline.pkl"

        plot_roc(y_test, probas, f"{dataset_name.upper()} - {model_name} ROC", roc_path)
        plot_pr(y_test, probas, f"{dataset_name.upper()} - {model_name} PR", pr_path)
        plot_confusion(
            y_test, preds, f"{dataset_name.upper()} - {model_name} Confusion", cm_path
        )
        joblib.dump(clf, model_path)

        report_manifest[f"roc_{model_name}"] = str(roc_path)
        report_manifest[f"pr_{model_name}"] = str(pr_path)
        report_manifest[f"confusion_{model_name}"] = str(cm_path)
        report_manifest[f"model_{model_name}"] = str(model_path)

        if model_name == "xgboost":
            # Export the raw booster for the serving API's SHAP path; clean a
            # bracketed base_score attr (e.g. "[0.5]") that float() rejects —
            # mirrors the same workaround in api/app.py:get_booster.
            booster = clf.named_steps["model"].get_booster()
            base_score = booster.attr("base_score")
            if base_score:
                try:
                    float(base_score)
                except ValueError:
                    cleaned = base_score.strip("[]")
                    try:
                        cleaned_val = str(float(cleaned))
                    except Exception:
                        cleaned_val = "0.5"
                    booster.set_param({"base_score": cleaned_val})
                    booster.set_attr(base_score=cleaned_val)
            booster_path = ds_out / f"{model_name}_booster.json"
            booster.save_model(str(booster_path))
            report_manifest[f"booster_{model_name}"] = str(booster_path)

    if pre_feature_names is None:
        pre_feature_names = []

    # Metadata consumed by the serving API's explanation endpoints.
    explain_meta = {
        "dataset": dataset_name,
        "target_col": TARGET_COL,
        "raw_num_cols": num_cols,
        "raw_cat_cols": cat_cols,
        "pre_feature_names": pre_feature_names,
        "id_cols": ID_COLS,
        "dropped_features": sorted(list(FEATURES_TO_DROP)),
        "split_used": split_tag,
    }
    meta_path = ds_out / "explain_meta.json"
    meta_path.write_text(json.dumps(explain_meta, indent=2))
    report_manifest["explain_meta"] = str(meta_path)
    report_manifest["explain_background"] = str(background_path)

    metrics_df = pd.DataFrame(metrics_rows).sort_values(
        ["dataset", "auc_roc"], ascending=[True, False]
    )
    metrics_path = ds_out / "metrics_summary.csv"
    metrics_df.to_csv(metrics_path, index=False)
    print(f"Saved metrics -> {metrics_path}")

    manifest_path = ds_out / "artifacts.json"
    manifest_path.write_text(json.dumps(report_manifest, indent=2))
352
+
353
+
354
def main() -> None:
    """Run training/evaluation for every configured dataset whose CSV exists."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for dataset_name, csv_path in DATASETS.items():
        if csv_path.exists():
            evaluate_models(dataset_name, csv_path)
        else:
            print(f"Skipping {dataset_name}, missing file: {csv_path}")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # API runtime dependencies
2
+ fastapi[standard]
3
+ uvicorn[standard]
4
+ pydantic
5
+ numpy
6
+ pandas
7
+ scikit-learn==1.6.1 # match model pickling version; avoids SimpleImputer _fill_dtype errors
8
+ joblib
9
+ xgboost
10
+
11
+ # Model training extras
12
+ catboost
13
+ lightgbm
14
+ matplotlib
15
+ seaborn
test.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Test script for the /v1/predict_explain endpoint.

This script demonstrates the correct request format:
- inputId: required string
- payload: object containing the feature data
"""

import hashlib
import hmac
import os
import time
import uuid

import requests

# ============================================
# CONFIGURATION (from tenant creation)
# ============================================
# SECURITY: credentials must not be committed to source control. They are now
# read from the environment; the inline fallbacks preserve the previous
# behavior for the demo tenant, but any secret that was ever committed here
# should be considered compromised and rotated.
CLIENT_ID = os.environ.get("FRANKSCORE_CLIENT_ID", "acme-bank-463edc0a")
CLIENT_SECRET = os.environ.get("FRANKSCORE_CLIENT_SECRET", "yPqsrtBizHgDvnK-NpkgVXMXw3WbV_s_JGK-c2pWr3U")
HMAC_SECRET = os.environ.get("FRANKSCORE_HMAC_SECRET", "OSSBJgx2QToeQhGtQgzwS_8Kf1QvTraq6M67uNrBKEo")

BASE_URL = os.environ.get("FRANKSCORE_BASE_URL", "https://frankscore-backend.onrender.com")

# ============================================
# STEP 1: Login as Tenant
# ============================================
print("Step 1: Logging in...")
login_response = requests.post(
    f"{BASE_URL}/auth/login",
    json={
        "clientId": CLIENT_ID,
        "clientSecret": CLIENT_SECRET
    }
)

if login_response.status_code != 200:
    print(f"❌ Login failed: {login_response.status_code}")
    print(login_response.text)
    raise SystemExit(1)

login_data = login_response.json()
jwt_token = login_data["access_token"]  # snake_case key, per the OAuth2 token convention
print(f"✅ Logged in. JWT: {jwt_token[:20]}...")

# ============================================
# STEP 2: Prepare End-User Identity
# ============================================
end_user_id = "user-alice-123"  # Your customer
timestamp = str(int(time.time()))
request_id = str(uuid.uuid4())

# ============================================
# STEP 3: Compute HMAC Signature
# ============================================
# The signature proves the request was produced by a holder of HMAC_SECRET;
# only the hex digest is transmitted, never the secret itself.
signing_string = f"{end_user_id}|{timestamp}|{request_id}"
signature = hmac.new(
    HMAC_SECRET.encode('utf-8'),  # SECRET KEY (never sent!)
    signing_string.encode('utf-8'),
    hashlib.sha256
).hexdigest()

print(f"📝 Signing string: {signing_string}")
print(f"🔐 Signature: {signature[:20]}...")

# ============================================
# STEP 4: Make Prediction Request
# ============================================
print("\nStep 4: Making prediction request...")

# IMPORTANT: The request format is:
# {
#   "inputId": "string",   # REQUIRED
#   "payload": { ... }     # The features go here
# }

request_body = {
    "inputId": "loan-app-78945",  # REQUIRED - unique identifier for this request
    "payload": {
        "num_previous_loans": 9,
        "num_previous_defaults": 4,
        "past_default_rate": 0.44,
        "days_since_last_loan": 2,
        "avg_time_bw_loans": 20,
        "avg_past_amount": 26000,
        "avg_past_daily_burden": 950,
        "std_past_amount": 4000,
        "std_past_daily_burden": 180,
        "trend_in_amount": 1.3,
        "trend_in_burden": 1.35,
        "Total_Amount": 30000,
        "Total_Amount_to_Repay": 36000,
        "duration": 20,
        "daily_burden": 1500,
        "amount_ratio": 2.0,
        "burden_ratio": 1.8,
        "duration_bucket": "20",
        "amount_bucket": "high",
        "burden_percentile": 0.95,
        "borrower_history_strength": "weak",
        "month": 1,
        "quarter": 1,
        "week_of_year": 3,
        "days_to_salary_day": 28,
        "days_to_local_festival": 2,
        "lender_id": "L_high3",
        "lender_exposure_ratio": 0.4,
        "account_age_days": 150,
        "loan_frequency_per_year": 12,
        "repayment_consistency": 0.4,
        "latest_amount_ma3": 28000
    }
}

response = requests.post(
    f"{BASE_URL}/v1/predict_explain",
    headers={
        "Authorization": f"Bearer {jwt_token}",
        "Content-Type": "application/json",
        "X-End-User-Id": end_user_id,
        "X-End-User-Timestamp": timestamp,
        "X-Request-Id": request_id,
        "X-End-User-Signature": signature
    },
    json=request_body
)

print(f"\nResponse Status: {response.status_code}")

if response.status_code == 200:
    result = response.json()
    print("✅ Prediction successful!")
    print(f"   Input ID: {result.get('inputId')}")
    print(f"   Score: {result.get('score')}")
    if result.get('topFeatures'):
        print(f"   Top Features: {len(result.get('topFeatures'))} features")
    print(f"\nFull response: {result}")
else:
    print(f"❌ Prediction failed: {response.status_code}")
    print(response.text)
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {
3
+ "records": [
4
+ {
5
+ "num_previous_loans": 8,
6
+ "num_previous_defaults": 0,
7
+ "past_default_rate": 0.0,
8
+ "days_since_last_loan": 40,
9
+ "avg_time_bw_loans": 120,
10
+ "avg_past_amount": 8000,
11
+ "avg_past_daily_burden": 200,
12
+ "std_past_amount": 500,
13
+ "std_past_daily_burden": 20,
14
+ "trend_in_amount": 1.05,
15
+ "trend_in_burden": 0.9,
16
+ "Total_Amount": 6000,
17
+ "Total_Amount_to_Repay": 7200,
18
+ "duration": 45,
19
+ "daily_burden": 160,
20
+ "amount_ratio": 0.4,
21
+ "burden_ratio": 0.25,
22
+ "duration_bucket": "45",
23
+ "amount_bucket": "low",
24
+ "burden_percentile": 0.15,
25
+ "borrower_history_strength": "strong",
26
+ "month": 5,
27
+ "quarter": 2,
28
+ "week_of_year": 18,
29
+ "days_to_salary_day": 5,
30
+ "days_to_local_festival": 40,
31
+ "lender_id": "L_low1",
32
+ "lender_exposure_ratio": 0.05,
33
+ "account_age_days": 900,
34
+ "loan_frequency_per_year": 3,
35
+ "repayment_consistency": 0.98,
36
+ "latest_amount_ma3": 5500
37
+ },
38
+ {
39
+ "num_previous_loans": 6,
40
+ "num_previous_defaults": 0,
41
+ "past_default_rate": 0.0,
42
+ "days_since_last_loan": 25,
43
+ "avg_time_bw_loans": 90,
44
+ "avg_past_amount": 12000,
45
+ "avg_past_daily_burden": 300,
46
+ "std_past_amount": 700,
47
+ "std_past_daily_burden": 30,
48
+ "trend_in_amount": 1.0,
49
+ "trend_in_burden": 0.95,
50
+ "Total_Amount": 10000,
51
+ "Total_Amount_to_Repay": 11500,
52
+ "duration": 60,
53
+ "daily_burden": 190,
54
+ "amount_ratio": 0.55,
55
+ "burden_ratio": 0.35,
56
+ "duration_bucket": "60",
57
+ "amount_bucket": "mid",
58
+ "burden_percentile": 0.25,
59
+ "borrower_history_strength": "strong",
60
+ "month": 7,
61
+ "quarter": 3,
62
+ "week_of_year": 27,
63
+ "days_to_salary_day": 12,
64
+ "days_to_local_festival": 25,
65
+ "lender_id": "L_low2",
66
+ "lender_exposure_ratio": 0.08,
67
+ "account_age_days": 750,
68
+ "loan_frequency_per_year": 4,
69
+ "repayment_consistency": 0.95,
70
+ "latest_amount_ma3": 10500
71
+ },
72
+ {
73
+ "num_previous_loans": 4,
74
+ "num_previous_defaults": 0,
75
+ "past_default_rate": 0.0,
76
+ "days_since_last_loan": 15,
77
+ "avg_time_bw_loans": 60,
78
+ "avg_past_amount": 15000,
79
+ "avg_past_daily_burden": 450,
80
+ "std_past_amount": 1200,
81
+ "std_past_daily_burden": 40,
82
+ "trend_in_amount": 1.05,
83
+ "trend_in_burden": 1.0,
84
+ "Total_Amount": 15000,
85
+ "Total_Amount_to_Repay": 17500,
86
+ "duration": 45,
87
+ "daily_burden": 389,
88
+ "amount_ratio": 0.8,
89
+ "burden_ratio": 0.55,
90
+ "duration_bucket": "45",
91
+ "amount_bucket": "mid",
92
+ "burden_percentile": 0.45,
93
+ "borrower_history_strength": "medium",
94
+ "month": 2,
95
+ "quarter": 1,
96
+ "week_of_year": 8,
97
+ "days_to_salary_day": 18,
98
+ "days_to_local_festival": 50,
99
+ "lender_id": "L_mid1",
100
+ "lender_exposure_ratio": 0.12,
101
+ "account_age_days": 500,
102
+ "loan_frequency_per_year": 5,
103
+ "repayment_consistency": 0.88,
104
+ "latest_amount_ma3": 16000
105
+ },
106
+ {
107
+ "num_previous_loans": 3,
108
+ "num_previous_defaults": 0,
109
+ "past_default_rate": 0.0,
110
+ "days_since_last_loan": 10,
111
+ "avg_time_bw_loans": 45,
112
+ "avg_past_amount": 17000,
113
+ "avg_past_daily_burden": 520,
114
+ "std_past_amount": 1500,
115
+ "std_past_daily_burden": 60,
116
+ "trend_in_amount": 1.1,
117
+ "trend_in_burden": 1.05,
118
+ "Total_Amount": 20000,
119
+ "Total_Amount_to_Repay": 23000,
120
+ "duration": 30,
121
+ "daily_burden": 750,
122
+ "amount_ratio": 1.2,
123
+ "burden_ratio": 0.9,
124
+ "duration_bucket": "30",
125
+ "amount_bucket": "mid",
126
+ "burden_percentile": 0.65,
127
+ "borrower_history_strength": "medium",
128
+ "month": 5,
129
+ "quarter": 2,
130
+ "week_of_year": 18,
131
+ "days_to_salary_day": 10,
132
+ "days_to_local_festival": 40,
133
+ "lender_id": "L_mid2",
134
+ "lender_exposure_ratio": 0.18,
135
+ "account_age_days": 400,
136
+ "loan_frequency_per_year": 6,
137
+ "repayment_consistency": 0.82,
138
+ "latest_amount_ma3": 18000
139
+ },
140
+ {
141
+ "num_previous_loans": 5,
142
+ "num_previous_defaults": 1,
143
+ "past_default_rate": 0.2,
144
+ "days_since_last_loan": 7,
145
+ "avg_time_bw_loans": 40,
146
+ "avg_past_amount": 18000,
147
+ "avg_past_daily_burden": 600,
148
+ "std_past_amount": 2200,
149
+ "std_past_daily_burden": 90,
150
+ "trend_in_amount": 1.15,
151
+ "trend_in_burden": 1.1,
152
+ "Total_Amount": 22000,
153
+ "Total_Amount_to_Repay": 26000,
154
+ "duration": 30,
155
+ "daily_burden": 867,
156
+ "amount_ratio": 1.35,
157
+ "burden_ratio": 1.05,
158
+ "duration_bucket": "30",
159
+ "amount_bucket": "high",
160
+ "burden_percentile": 0.75,
161
+ "borrower_history_strength": "weak",
162
+ "month": 9,
163
+ "quarter": 3,
164
+ "week_of_year": 36,
165
+ "days_to_salary_day": 20,
166
+ "days_to_local_festival": 10,
167
+ "lender_id": "L_high1",
168
+ "lender_exposure_ratio": 0.25,
169
+ "account_age_days": 300,
170
+ "loan_frequency_per_year": 8,
171
+ "repayment_consistency": 0.7,
172
+ "latest_amount_ma3": 21000
173
+ },
174
+ {
175
+ "num_previous_loans": 7,
176
+ "num_previous_defaults": 2,
177
+ "past_default_rate": 0.29,
178
+ "days_since_last_loan": 5,
179
+ "avg_time_bw_loans": 30,
180
+ "avg_past_amount": 22000,
181
+ "avg_past_daily_burden": 750,
182
+ "std_past_amount": 3000,
183
+ "std_past_daily_burden": 120,
184
+ "trend_in_amount": 1.2,
185
+ "trend_in_burden": 1.2,
186
+ "Total_Amount": 25000,
187
+ "Total_Amount_to_Repay": 30000,
188
+ "duration": 25,
189
+ "daily_burden": 1200,
190
+ "amount_ratio": 1.6,
191
+ "burden_ratio": 1.3,
192
+ "duration_bucket": "25",
193
+ "amount_bucket": "high",
194
+ "burden_percentile": 0.85,
195
+ "borrower_history_strength": "weak",
196
+ "month": 11,
197
+ "quarter": 4,
198
+ "week_of_year": 46,
199
+ "days_to_salary_day": 25,
200
+ "days_to_local_festival": 5,
201
+ "lender_id": "L_high2",
202
+ "lender_exposure_ratio": 0.32,
203
+ "account_age_days": 250,
204
+ "loan_frequency_per_year": 9,
205
+ "repayment_consistency": 0.6,
206
+ "latest_amount_ma3": 24000
207
+ },
208
+ {
209
+ "num_previous_loans": 9,
210
+ "num_previous_defaults": 4,
211
+ "past_default_rate": 0.44,
212
+ "days_since_last_loan": 2,
213
+ "avg_time_bw_loans": 20,
214
+ "avg_past_amount": 26000,
215
+ "avg_past_daily_burden": 950,
216
+ "std_past_amount": 4000,
217
+ "std_past_daily_burden": 180,
218
+ "trend_in_amount": 1.3,
219
+ "trend_in_burden": 1.35,
220
+ "Total_Amount": 30000,
221
+ "Total_Amount_to_Repay": 36000,
222
+ "duration": 20,
223
+ "daily_burden": 1500,
224
+ "amount_ratio": 2.0,
225
+ "burden_ratio": 1.8,
226
+ "duration_bucket": "20",
227
+ "amount_bucket": "high",
228
+ "burden_percentile": 0.95,
229
+ "borrower_history_strength": "weak",
230
+ "month": 1,
231
+ "quarter": 1,
232
+ "week_of_year": 3,
233
+ "days_to_salary_day": 28,
234
+ "days_to_local_festival": 2,
235
+ "lender_id": "L_high3",
236
+ "lender_exposure_ratio": 0.4,
237
+ "account_age_days": 150,
238
+ "loan_frequency_per_year": 12,
239
+ "repayment_consistency": 0.4,
240
+ "latest_amount_ma3": 28000
241
+ }
242
+ ]
243
+ }