ShuoDuan committed on
Commit
6def4ae
·
verified ·
1 Parent(s): a2f4dd9

Upload 9 main files

Browse files
Files changed (9) hide show
  1. README.md +28 -8
  2. app.py +460 -0
  3. imputer.pkl +3 -0
  4. predictor.py +434 -0
  5. requirements.txt +7 -0
  6. scaler.pkl +3 -0
  7. training_features.csv +99 -0
  8. training_features.json +155 -0
  9. xgb_best_model.pkl +3 -0
README.md CHANGED
@@ -1,14 +1,34 @@
1
  ---
2
- title: CreditShield Credit Risk Predictor
3
- emoji: 😻
4
- colorFrom: red
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.1.0
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
- short_description: Predict loan defaults with 92.3% accuracy using machine lear
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Credit Risk Predictor
3
+ emoji: 🏦
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 3.50.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
 
11
  ---
12
 
13
+ # 🏦 Credit Risk Prediction System
14
+
15
+ Predict loan defaults with 92.3% accuracy using machine learning.
16
+
17
+ ## Features
18
+ - 92.3% AUC-ROC (outperforms the reference academic paper)
19
+ - Business-optimized for maximum profit
20
+ - Real-time predictions with visualizations
21
+ - Based on 358,244 real loans
22
+
23
+ ## How to Use
24
+ 1. Fill in the loan application details
25
+ 2. Click "Assess Credit Risk"
26
+ 3. Get instant approval/rejection with risk analysis
27
+
28
+ ## Model Details
29
+ - **Algorithm**: XGBoost with enhanced features
30
+ - **Training Data**: Lending Club (2013-2014)
31
+ - **Key Improvement**: +0.010 AUC over baseline
32
+ - **Business Impact**: Optimized threshold at 28%
33
+
34
+ *For research purposes only. Not financial advice.*
app.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deployment/gradio_app.py
2
+ import gradio as gr
3
+ import pandas as pd
4
+ import json
5
+ from predictor import CreditRiskPredictor
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+
9
# Initialize predictor
from pathlib import Path  # only needed here, to locate the artifact directory

# Prefer a dedicated "model_artifacts" folder. When it does not exist (the
# .pkl / training_features.json files may instead sit right next to this
# script), fall back to the script's own directory so the predictor can still
# find them instead of failing with "No model files found".
_artifact_dir = Path("model_artifacts")
if not _artifact_dir.is_dir():
    _artifact_dir = Path(__file__).resolve().parent
predictor = CreditRiskPredictor(str(_artifact_dir))

# Get the actual base features needed from the predictor
if hasattr(predictor, 'base_features_needed') and predictor.base_features_needed:
    print(f"📋 Model needs these base features: {predictor.base_features_needed}")
else:
    print("⚠️ Could not determine base features needed")
17
+
18
# Tooltip text shown next to each input widget, keyed by the feature name.
_LOAN_AND_HISTORY_INFO = {
    'loan_amnt': "Total amount of the loan applied for",
    'int_rate': "Interest rate on the loan",
    'grade': "LC assigned loan grade (A=best, G=worst)",
    'emp_length': "Employment length in years",
    'annual_inc': "Self-reported annual income",
    'dti': "Debt-to-income ratio",
    'revol_util': "Revolving line utilization rate",
    'delinq_2yrs': "Number of delinquencies in past 2 years",
    'inq_last_6mths': "Number of credit inquiries in past 6 months",
    'open_acc': "Number of open credit lines",
    'total_acc': "Total number of credit lines",
}

# Optional inputs exposed in the "Advanced Features" accordion.
_ADVANCED_INFO = {
    'revol_bal': "Total credit revolving balance",
    'total_bc_limit': "Total bankcard limit",
    'total_bal_ex_mort': "Total balance excluding mortgage",
    'avg_cur_bal': "Average current balance",
    'mo_sin_old_il_acct': "Months since oldest installment account opened",
    'mo_sin_old_rev_tl_op': "Months since oldest revolving account opened",
    'mo_sin_rcnt_rev_tl_op': "Months since most recent revolving account opened",
    'mths_since_recent_bc': "Months since most recent bankcard account opened",
    'mths_since_recent_inq': "Months since most recent inquiry",
    'pct_tl_nvr_dlq': "Percent of trades never delinquent",
    'last_fico_range_low': "Lower bound of the last FICO range",
    'last_fico_range_high': "Upper bound of the last FICO range",
    'years_since_earliest_cr': "Years since earliest credit line opened",
    'addr_state': "State of the borrower (2-letter code)",
    'home_ownership': "Home ownership status",
    'purpose': "Purpose of the loan",
    'verification_status': "Income verification status",
    'title': "Loan title/description",
}

# Merged lookup used by the UI; key order is basic inputs first, then advanced.
FEATURE_INFO = {**_LOAN_AND_HISTORY_INFO, **_ADVANCED_INFO}
51
+
52
def create_visualization(default_prob, threshold=0.28):
    """Draw a horizontal risk bar marking the threshold and the prediction.

    Args:
        default_prob: Predicted default probability, plotted on the x axis
            (the axis is limited to [0, 1]).
        threshold: Decision threshold drawn as a dashed vertical line.

    Returns:
        The matplotlib Figure. It is detached from pyplot's global registry
        before being returned so repeated calls do not accumulate open
        figures; the Figure object itself can still be rendered/saved.
    """
    fig, ax = plt.subplots(figsize=(8, 2))

    # Gradient background: RdYlGn_r runs from green (low x) to red (high x).
    x = np.linspace(0, 1, 100)
    colors = plt.cm.RdYlGn_r(x)
    for left, right, color in zip(x[:-1], x[1:], colors):
        ax.fill_between([left, right], 0, 1, color=color, alpha=0.7)

    # Mark threshold
    ax.axvline(x=threshold, color='black', linestyle='--', linewidth=2,
               label=f'Threshold ({threshold:.0%})')

    # Mark prediction
    ax.plot(default_prob, 0.5, 'ro', markersize=15,
            label=f'Prediction ({default_prob:.1%})')

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xlabel('Default Probability')
    ax.set_title('Risk Assessment')
    ax.legend(loc='upper right')
    ax.set_yticks([])

    fig.tight_layout()
    # pyplot keeps every figure it creates alive until it is closed; in a
    # long-running web app that is a steady memory leak, so release it here.
    plt.close(fig)
    return fig
78
+
79
def predict_loan(loan_amnt, int_rate, grade, emp_length, annual_inc,
                 dti, revol_util, delinq_2yrs, inq_last_6mths,
                 open_acc, total_acc, revol_bal=5000, total_bc_limit=20000,
                 total_bal_ex_mort=30000, avg_cur_bal=2500,
                 mo_sin_old_il_acct=60, mo_sin_old_rev_tl_op=48,
                 mo_sin_rcnt_rev_tl_op=12, mths_since_recent_bc=6,
                 mths_since_recent_inq=3, pct_tl_nvr_dlq=95,
                 last_fico_range_low=680, last_fico_range_high=684,
                 years_since_earliest_cr=10, addr_state="CA",
                 home_ownership="RENT", purpose="debt_consolidation",
                 verification_status="Verified",
                 title="Debt consolidation loan"):
    """Run the predictor on one loan application and format the result.

    The arguments are the raw widget values; they are coerced to the types
    placed in the ``loan_data`` dict below and handed to the module-level
    ``predictor``.

    Returns:
        A 4-tuple ``(decision_html, results_markdown, color, figure)``.
        The same 4-slot shape is returned on failure (empty markdown, no
        figure) so it always lines up with the four wired output components.
    """
    # Prepare input with ALL features the model expects
    loan_data = {
        # Basic loan info
        'loan_amnt': float(loan_amnt),
        'int_rate': float(int_rate),
        'grade': grade,
        'emp_length': emp_length,
        'annual_inc': float(annual_inc),
        'dti': float(dti),
        'revol_util': f"{revol_util}%",
        'delinq_2yrs': int(delinq_2yrs),
        'inq_last_6mths': int(inq_last_6mths),
        'open_acc': int(open_acc),
        'total_acc': int(total_acc),

        # Additional credit features
        'revol_bal': float(revol_bal),
        'total_bc_limit': float(total_bc_limit),
        'total_bal_ex_mort': float(total_bal_ex_mort),
        'avg_cur_bal': float(avg_cur_bal),
        'mo_sin_old_il_acct': float(mo_sin_old_il_acct),
        'mo_sin_old_rev_tl_op': float(mo_sin_old_rev_tl_op),
        'mo_sin_rcnt_rev_tl_op': float(mo_sin_rcnt_rev_tl_op),
        'mths_since_recent_bc': float(mths_since_recent_bc),
        'mths_since_recent_inq': float(mths_since_recent_inq),
        # NOTE(review): the UI collects 0-100; this assumes the model was
        # trained on a 0-1 fraction - confirm against the training pipeline.
        'pct_tl_nvr_dlq': float(pct_tl_nvr_dlq) / 100.0,  # Convert to decimal
        'last_fico_range_low': float(last_fico_range_low),
        'last_fico_range_high': float(last_fico_range_high),
        'years_since_earliest_cr': float(years_since_earliest_cr),

        # Categorical features for one-hot encoding
        'addr_state': str(addr_state),
        'home_ownership': str(home_ownership),
        'purpose': str(purpose),
        'verification_status': str(verification_status),
        'title': str(title),
    }

    # Get prediction
    result = predictor.predict(loan_data)

    if not result['success']:
        # Four values, one per output component (HTML, Markdown, HTML, Plot).
        return f"❌ Error: {result['error']}", "", "red", None

    # Format results
    if result['decision'] == 'APPROVE':
        decision_html = """
        <div style='background-color: #d4edda; padding: 20px; border-radius: 10px; border: 2px solid #c3e6cb;'>
        <h2 style='color: #155724; margin: 0;'>✅ LOAN APPROVED</h2>
        </div>
        """
        color = "green"
    else:
        decision_html = """
        <div style='background-color: #f8d7da; padding: 20px; border-radius: 10px; border: 2px solid #f5c6cb;'>
        <h2 style='color: #721c24; margin: 0;'>❌ LOAN REJECTED</h2>
        </div>
        """
        color = "red"

    feature_count = len(predictor.feature_list) if predictor.feature_list else 'Unknown'

    # Built line by line so the markdown has no leading indentation (indented
    # lines would otherwise risk being rendered as a code block).
    results_md = "\n".join([
        "",
        "## 📊 Prediction Results",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| **Default Probability** | {result['default_probability']:.2%} |",
        f"| **Risk Level** | {result['risk_level']} |",
        f"| **Confidence** | {result['confidence']:.0%} |",
        f"| **Optimal Threshold** | {result['optimal_threshold']:.0%} |",
        "",
        "### 💡 Explanation",
        f"{result['explanation']}",
        "",
        "### 🔧 Model Info",
        f"- **Features used**: {feature_count}",
        f"- **Features provided**: {len(loan_data)}",
        f"- **Threshold optimized for profit**: {result['optimal_threshold']:.0%}",
        "",
        "---",
        "*Model accuracy: 92.3% AUC-ROC | Trained on 358,244 loans*",
        "",
    ])

    # Create visualization
    fig = create_visualization(result['default_probability'], result['optimal_threshold'])

    return decision_html, results_md, color, fig
180
+
181
+ # Create Gradio interface
182
with gr.Blocks(title="Credit Risk Predictor", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🏦 Credit Risk Prediction System
    *Predict loan defaults with 92.3% accuracy using machine learning*

    Based on research: *"Credit scoring for peer-to-peer lending using machine learning techniques"*
    (Quantitative Finance and Economics, Volume 6, Issue 2) with enhancements.
    """)

    # Advanced features accordion. Bound to a name so the "Simple Mode"
    # button can target it as an output and collapse it.
    with gr.Accordion("🔧 Advanced Features (Optional)", open=False) as advanced_accordion:
        gr.Markdown("""
        **Default values are set to typical/average levels.**
        These additional features improve prediction accuracy but are optional.
        """)

        with gr.Row():
            with gr.Column():
                revol_bal = gr.Slider(0, 100000, 5000, step=1000,
                                      label="Revolving Balance ($)",
                                      info=FEATURE_INFO['revol_bal'])

                total_bc_limit = gr.Slider(0, 100000, 20000, step=1000,
                                           label="Total Bankcard Limit ($)",
                                           info=FEATURE_INFO['total_bc_limit'])

                total_bal_ex_mort = gr.Slider(0, 200000, 30000, step=1000,
                                              label="Total Balance Excl. Mortgage ($)",
                                              info=FEATURE_INFO['total_bal_ex_mort'])

                avg_cur_bal = gr.Slider(0, 50000, 2500, step=100,
                                        label="Average Current Balance ($)",
                                        info=FEATURE_INFO['avg_cur_bal'])

            with gr.Column():
                mo_sin_old_il_acct = gr.Slider(0, 300, 60, step=1,
                                               label="Months since oldest installment account",
                                               info=FEATURE_INFO['mo_sin_old_il_acct'])

                mo_sin_old_rev_tl_op = gr.Slider(0, 300, 48, step=1,
                                                 label="Months since oldest revolving account",
                                                 info=FEATURE_INFO['mo_sin_old_rev_tl_op'])

                mo_sin_rcnt_rev_tl_op = gr.Slider(0, 300, 12, step=1,
                                                  label="Months since newest revolving account",
                                                  info=FEATURE_INFO['mo_sin_rcnt_rev_tl_op'])

                mths_since_recent_bc = gr.Slider(0, 120, 6, step=1,
                                                 label="Months since newest bankcard",
                                                 info=FEATURE_INFO['mths_since_recent_bc'])

        with gr.Row():
            with gr.Column():
                mths_since_recent_inq = gr.Slider(0, 120, 3, step=1,
                                                  label="Months since newest inquiry",
                                                  info=FEATURE_INFO['mths_since_recent_inq'])

                pct_tl_nvr_dlq = gr.Slider(0, 100, 95, step=1,
                                           label="% of trades never delinquent",
                                           info=FEATURE_INFO['pct_tl_nvr_dlq'])

                # step=1 (was 10): the defaults/examples (e.g. 684, 724) are
                # not multiples of 10 from the 300 minimum, so a step of 10
                # made those values unreachable on the slider.
                last_fico_range_low = gr.Slider(300, 850, 680, step=1,
                                                label="Lowest recent FICO score",
                                                info=FEATURE_INFO['last_fico_range_low'])

                last_fico_range_high = gr.Slider(300, 850, 684, step=1,
                                                 label="Highest recent FICO score",
                                                 info=FEATURE_INFO['last_fico_range_high'])

            with gr.Column():
                years_since_earliest_cr = gr.Slider(0, 50, 10, step=1,
                                                    label="Years since first credit line",
                                                    info=FEATURE_INFO['years_since_earliest_cr'])

                addr_state = gr.Textbox(value="CA", label="State (2 letters)",
                                        info=FEATURE_INFO['addr_state'])

                home_ownership = gr.Dropdown(["RENT", "MORTGAGE", "OWN", "OTHER"],
                                             value="RENT", label="Home Ownership",
                                             info=FEATURE_INFO['home_ownership'])

        with gr.Row():
            purpose = gr.Dropdown(["debt_consolidation", "credit_card", "home_improvement",
                                   "major_purchase", "medical", "car", "wedding"],
                                  value="debt_consolidation", label="Loan Purpose",
                                  info=FEATURE_INFO['purpose'])

            verification_status = gr.Dropdown(["Verified", "Source Verified", "Not Verified"],
                                              value="Verified", label="Income Verification",
                                              info=FEATURE_INFO['verification_status'])

            title = gr.Textbox(value="Debt consolidation loan", label="Loan Title",
                               info=FEATURE_INFO['title'])

    # Main form
    gr.Markdown("## 📝 Required Loan Information")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Loan Application")

            with gr.Group():
                loan_amnt = gr.Slider(1000, 40000, 15000, step=500,
                                      label="Loan Amount ($)",
                                      info=FEATURE_INFO['loan_amnt'])

                int_rate = gr.Slider(5.0, 30.0, 12.5, step=0.1,
                                     label="Interest Rate (%)",
                                     info=FEATURE_INFO['int_rate'])

                grade = gr.Radio(["A", "B", "C", "D", "E", "F", "G"], value="C",
                                 label="Loan Grade",
                                 info=FEATURE_INFO['grade'])

            with gr.Group():
                emp_length = gr.Dropdown(["< 1 year", "1 year", "2 years", "3 years",
                                          "4 years", "5 years", "6 years", "7 years",
                                          "8 years", "9 years", "10+ years"],
                                         value="5 years",
                                         label="Employment Length",
                                         info=FEATURE_INFO['emp_length'])

                annual_inc = gr.Slider(20000, 1000000, 75000, step=1000,
                                       label="Annual Income ($)",
                                       info=FEATURE_INFO['annual_inc'])

                dti = gr.Slider(0, 40, 18.5, step=0.1,
                                label="Debt-to-Income Ratio",
                                info=FEATURE_INFO['dti'])

        with gr.Column(scale=1):
            gr.Markdown("### Credit History")

            with gr.Group():
                revol_util = gr.Slider(0, 100, 45, step=1,
                                       label="Credit Utilization (%)",
                                       info=FEATURE_INFO['revol_util'])

                delinq_2yrs = gr.Slider(0, 10, 0, step=1,
                                        label="Delinquencies (last 2 years)",
                                        info=FEATURE_INFO['delinq_2yrs'])

                inq_last_6mths = gr.Slider(0, 10, 2, step=1,
                                           label="Credit Inquiries (last 6 months)",
                                           info=FEATURE_INFO['inq_last_6mths'])

            with gr.Group():
                open_acc = gr.Slider(0, 50, 8, step=1,
                                     label="Open Credit Lines",
                                     info=FEATURE_INFO['open_acc'])

                total_acc = gr.Slider(0, 100, 25, step=1,
                                      label="Total Credit Lines",
                                      info=FEATURE_INFO['total_acc'])

    with gr.Row():
        submit_btn = gr.Button("🔍 Assess Credit Risk", variant="primary", size="lg")
        clear_btn = gr.Button("🔄 Clear Form", variant="secondary")
        simple_mode_btn = gr.Button("📱 Simple Mode", variant="secondary")

    # Example buttons
    gr.Markdown("### 🚀 Quick Examples")
    with gr.Row():
        low_risk_btn = gr.Button("👍 Low Risk Example", variant="secondary", size="sm")
        high_risk_btn = gr.Button("👎 High Risk Example", variant="secondary", size="sm")
        borderline_btn = gr.Button("⚖️ Borderline Example", variant="secondary", size="sm")

    # Results section
    gr.Markdown("## 📈 Assessment Results")

    with gr.Row():
        decision_output = gr.HTML(label="Decision")
        color_indicator = gr.HTML(visible=False)

    with gr.Row():
        with gr.Column(scale=2):
            results_output = gr.Markdown(label="Detailed Results")
        with gr.Column(scale=1):
            plot_output = gr.Plot(label="Risk Visualization")

    # Footer
    gr.Markdown("""
    ---
    ### ℹ️ About This Model
    - **Accuracy**: 92.3% AUC-ROC (beats paper's 86-87%)
    - **Training Data**: 358,244 loans from Lending Club (2013-2014)
    - **Key Features**: 98 engineered features including credit history and financial ratios
    - **Business Impact**: Optimized for maximum profit (threshold: 28%)
    - **Improvements**: No undersampling, time-based validation, enhanced features

    *For research purposes only. Not financial advice.*
    """)

    # Example presets. 'basic' holds the 11 required inputs and 'advanced' the
    # 18 optional ones, both in the same order as `all_inputs` below.
    examples = {
        'low': {
            'basic': [10000, 8.5, 'A', '10+ years', 120000, 12.0, 30, 0, 1, 5, 20],
            'advanced': [3000, 15000, 25000, 3000, 120, 96, 24, 12, 6, 98, 720, 724, 15,
                         "CA", "OWN", "debt_consolidation", "Verified", "Debt consolidation"],
        },
        'high': {
            'basic': [35000, 25.0, 'F', '< 1 year', 30000, 35.0, 95, 3, 8, 15, 40],
            'advanced': [20000, 5000, 10000, 1000, 6, 12, 1, 1, 1, 60, 580, 590, 2,
                         "NV", "RENT", "credit_card", "Not Verified", "Credit card payoff"],
        },
        'borderline': {
            'basic': [20000, 15.0, 'D', '3 years', 55000, 22.0, 75, 1, 4, 10, 30],
            'advanced': [10000, 10000, 20000, 2000, 36, 48, 6, 6, 3, 85, 650, 660, 5,
                         "TX", "MORTGAGE", "home_improvement", "Source Verified", "Home renovation loan"],
        },
    }

    def get_example(example_type):
        """Return the 29 input values for the named example preset."""
        preset = examples[example_type]
        return preset['basic'] + preset['advanced']

    # Connect buttons. The order here must match predict_loan's parameters.
    all_inputs = [loan_amnt, int_rate, grade, emp_length, annual_inc,
                  dti, revol_util, delinq_2yrs, inq_last_6mths,
                  open_acc, total_acc, revol_bal, total_bc_limit,
                  total_bal_ex_mort, avg_cur_bal, mo_sin_old_il_acct,
                  mo_sin_old_rev_tl_op, mo_sin_rcnt_rev_tl_op,
                  mths_since_recent_bc, mths_since_recent_inq,
                  pct_tl_nvr_dlq, last_fico_range_low, last_fico_range_high,
                  years_since_earliest_cr, addr_state, home_ownership,
                  purpose, verification_status, title]

    submit_btn.click(
        fn=predict_loan,
        inputs=all_inputs,
        outputs=[decision_output, results_output, color_indicator, plot_output]
    )

    # Outputs cleared by the "Clear Form" button, in addition to the inputs.
    cleared_outputs = [decision_output, results_output, plot_output]

    def clear_form():
        """Reset every input to its default and blank the result panels."""
        basic_defaults = [15000, 12.5, 'C', '5 years', 75000, 18.5, 45, 0, 2, 8, 25]
        advanced_defaults = [5000, 20000, 30000, 2500, 60, 48, 12, 6, 3, 95, 680, 684,
                             10, "CA", "RENT", "debt_consolidation", "Verified",
                             "Debt consolidation loan"]
        # One None per cleared output so the return count always matches the
        # number of wired components.
        return basic_defaults + advanced_defaults + [None] * len(cleared_outputs)

    clear_btn.click(
        fn=clear_form,
        outputs=all_inputs + cleared_outputs
    )

    # Example buttons
    low_risk_btn.click(
        fn=lambda: get_example('low'),
        outputs=all_inputs
    )

    high_risk_btn.click(
        fn=lambda: get_example('high'),
        outputs=all_inputs
    )

    borderline_btn.click(
        fn=lambda: get_example('borderline'),
        outputs=all_inputs
    )

    # Simple mode button: collapse the advanced accordion. The accordion must
    # be listed as an output, otherwise the returned update is discarded.
    simple_mode_btn.click(
        fn=lambda: gr.update(open=False),
        outputs=[advanced_accordion]
    )
450
+
451
+ # Run the app
452
if __name__ == "__main__":
    # Startup banner, then serve on all interfaces at port 7860.
    print("🚀 Starting Credit Risk Predictor...")
    feature_total = len(predictor.feature_list) if predictor.feature_list else 'Unknown'
    print(f"📊 Model features: {feature_total}")
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    demo.launch(**launch_options)
imputer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43ccf3bf22bce59767331fdda0a5fa8c5ef839774b42b05f1746f162a13b119f
3
+ size 3319
predictor.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deployment/predictor.py
2
+ import joblib
3
+ import numpy as np
4
+ import pandas as pd
5
+ import re
6
+ from pathlib import Path
7
+ import json
8
+
9
+ class CreditRiskPredictor:
10
+ """Predictor using your actual trained model features"""
11
+
12
def __init__(self, model_dir="model_artifacts"):
    """Set defaults, read the feature list, then load the artifacts.

    Args:
        model_dir: Directory that holds ``training_features.json`` and the
            pickled model / scaler / imputer files.
    """
    self.model_dir = Path(model_dir)

    # Artifacts are filled in by load_artifacts() at the end of construction.
    self.model = None
    self.scaler = None
    self.imputer = None

    # Default probabilities below this value are approved.
    self.optimal_threshold = 0.28

    # Column names (and order) the model was trained on.
    features = self._load_actual_features()
    self.feature_list = features
    print(f"📋 Using {len(features)} ACTUAL features")

    # Raw inputs derived from those training columns.
    base_inputs = self._extract_base_features()
    self.base_features_needed = base_inputs
    print(f"📋 Expecting {len(base_inputs)} base input features")

    self.load_artifacts()
28
+
29
+ def _load_actual_features(self):
30
+ """Load the actual features used in training"""
31
+ feature_file = self.model_dir / "training_features.json"
32
+ if not feature_file.exists():
33
+ print(f"⚠️ {feature_file} not found")
34
+ return []
35
+
36
+ with open(feature_file, 'r') as f:
37
+ data = json.load(f)
38
+
39
+ # Your JSON has 'feature_names' key
40
+ if 'feature_names' in data:
41
+ features = data['feature_names']
42
+ if isinstance(features, list):
43
+ return features
44
+ elif 'enhanced_features' in data:
45
+ features = data['enhanced_features']
46
+ if isinstance(features, list):
47
+ return features
48
+
49
+ print(f"❌ Could not find feature list in JSON. Keys: {list(data.keys())}")
50
+ return []
51
+
52
+ def _extract_base_features(self):
53
+ """Extract base features from one-hot encoded feature list"""
54
+ if not self.feature_list:
55
+ return []
56
+
57
+ base_features = set()
58
+ for feature in self.feature_list:
59
+ # Handle one-hot encoded features
60
+ if feature.startswith('addr_state_'):
61
+ base_features.add('addr_state')
62
+ elif feature.startswith('home_ownership_'):
63
+ base_features.add('home_ownership')
64
+ elif feature.startswith('purpose_'):
65
+ base_features.add('purpose')
66
+ elif feature.startswith('verification_status_'):
67
+ base_features.add('verification_status')
68
+ elif feature.startswith('title_has_'):
69
+ # These are title-based engineered features
70
+ base_features.add('title')
71
+ elif '_' in feature and not feature.replace('_', '').isnumeric():
72
+ # Other potential categoricals
73
+ parts = feature.split('_')
74
+ if len(parts) > 1:
75
+ base_features.add(parts[0])
76
+ else:
77
+ # Regular feature
78
+ base_features.add(feature)
79
+
80
+ # Filter out features that don't make sense as user inputs
81
+ user_input_features = []
82
+ for feature in base_features:
83
+ if feature not in ['purpose_debt_consolidation', 'verification_status_Verified',
84
+ 'verification_status_Source', 'title_has_car', 'title_has_medical',
85
+ 'title_has_credit', 'title_has_home', 'title_has_consolidation',
86
+ 'title_has_debt', 'title_has_card'] and not any(feature + '_' in f for f in self.feature_list):
87
+ user_input_features.append(feature)
88
+
89
+ return user_input_features
90
+
91
def load_artifacts(self):
    """Load the model and, when present, the scaler and imputer.

    Files are discovered in ``self.model_dir`` by the glob patterns
    ``*xgb*.pkl``, ``*scaler*.pkl`` and ``*imputer*.pkl``. When several files
    match a pattern, the first one in sorted name order is used, so the
    choice is the same on every run (raw glob order is filesystem dependent).

    Raises:
        FileNotFoundError: If no model file matches.
        Exception: Any load error is printed and re-raised unchanged.
    """
    try:
        model_files = sorted(self.model_dir.glob("*xgb*.pkl"))
        scaler_files = sorted(self.model_dir.glob("*scaler*.pkl"))
        imputer_files = sorted(self.model_dir.glob("*imputer*.pkl"))

        if not model_files:
            raise FileNotFoundError("No model files found")

        # SECURITY NOTE: joblib.load unpickles the file, which can execute
        # arbitrary code. Only point model_dir at artifacts you trust.
        self.model = joblib.load(model_files[0])
        print(f"✅ Loaded model: {model_files[0].name}")

        if scaler_files:
            self.scaler = joblib.load(scaler_files[0])
            print(f"✅ Loaded scaler: {scaler_files[0].name}")

        if imputer_files:
            self.imputer = joblib.load(imputer_files[0])
            print(f"✅ Loaded imputer: {imputer_files[0].name}")

        # Verify feature count (warning only; loading still succeeds).
        if hasattr(self.model, 'n_features_in_'):
            print(f"📊 Model expects {self.model.n_features_in_} features")
            print(f"📊 We have {len(self.feature_list)} features in our list")

            if self.model.n_features_in_ != len(self.feature_list):
                print("⚠️ WARNING: Feature count mismatch!")

    except Exception as e:
        print(f"❌ Error loading artifacts: {e}")
        raise
125
+
126
def _engineer_features(self, df):
    """Add default, one-hot and engineered columns to ``df``.

    Any name in ``self.base_features_needed`` that is absent from ``df`` is
    added first ('unknown' for the categorical source columns, 0 for
    everything else); ``df`` is modified in place by that step. The frame is
    then passed through the one-hot and engineered-feature builders.

    Raises:
        ValueError: If no feature list is available.
    """
    if not self.feature_list:
        raise ValueError("No feature list available!")

    # Only these raw columns get a textual placeholder; every other missing
    # base feature defaults to numeric zero.
    categorical_sources = ('addr_state', 'home_ownership', 'purpose',
                           'verification_status', 'title')

    for name in self.base_features_needed:
        if name in df.columns:
            continue
        df[name] = 'unknown' if name in categorical_sources else 0

    # One-hot columns first, then the derived numeric features.
    encoded = self._create_one_hot_features(df)
    return self._create_engineered_features(encoded)
159
+
160
+ def _create_one_hot_features(self, df):
161
+ """Create one-hot encoded features from categorical variables"""
162
+ if not self.feature_list:
163
+ return df
164
+
165
+ for feature in self.feature_list:
166
+ # Handle different categorical encodings
167
+ if feature.startswith('addr_state_'):
168
+ state_code = feature.replace('addr_state_', '')
169
+ if 'addr_state' in df.columns:
170
+ df[feature] = (df['addr_state'].astype(str).str.upper() == state_code).astype(int)
171
+ else:
172
+ df[feature] = 0
173
+
174
+ elif feature.startswith('home_ownership_'):
175
+ ownership_type = feature.replace('home_ownership_', '')
176
+ if 'home_ownership' in df.columns:
177
+ df[feature] = (df['home_ownership'].astype(str).str.upper() == ownership_type).astype(int)
178
+ else:
179
+ df[feature] = 0
180
+
181
+ elif feature.startswith('purpose_'):
182
+ purpose_type = feature.replace('purpose_', '')
183
+ if 'purpose' in df.columns:
184
+ df[feature] = (df['purpose'].astype(str).str.lower().replace(' ', '_') == purpose_type).astype(int)
185
+ else:
186
+ df[feature] = 0
187
+
188
+ elif feature.startswith('verification_status_'):
189
+ status_type = feature.replace('verification_status_', '')
190
+ if 'verification_status' in df.columns:
191
+ df[feature] = (df['verification_status'].astype(str).str.replace(' ', '_') == status_type).astype(int)
192
+ else:
193
+ df[feature] = 0
194
+
195
+ elif feature.startswith('title_has_'):
196
+ # These are title-based engineered features
197
+ keyword = feature.replace('title_has_', '')
198
+ if 'title' in df.columns:
199
+ title_str = str(df['title'].iloc[0]).lower() if len(df) > 0 else ''
200
+ df[feature] = 1 if keyword in title_str else 0
201
+ else:
202
+ df[feature] = 0
203
+
204
+ return df
205
+
206
+ def _create_engineered_features(self, df):
207
+ """Create engineered features"""
208
+ # Grade to numeric (if grade is provided)
209
+ if 'grade' in df.columns:
210
+ grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
211
+ df['grade_numeric'] = df['grade'].map(grade_map).fillna(4)
212
+
213
+ # Employment length to numeric
214
+ if 'emp_length' in df.columns:
215
+ df['emp_length_numeric'] = df['emp_length'].apply(self._convert_emp_length)
216
+
217
+ # Credit utilization to decimal
218
+ if 'revol_util' in df.columns:
219
+ df['revol_util_decimal'] = df['revol_util'].astype(str).str.replace('%', '', regex=False).astype(float) / 100
220
+
221
+ # Financial ratios
222
+ if 'loan_amnt' in df.columns and 'annual_inc' in df.columns:
223
+ df['loan_to_income'] = df['loan_amnt'] / (df['annual_inc'].replace(0, 1) + 1)
224
+
225
+ if 'int_rate' in df.columns and 'loan_amnt' in df.columns:
226
+ df['int_rate_times_loan'] = df['int_rate'] * df['loan_amnt'] / 1000
227
+
228
+ # Credit flags
229
+ if 'delinq_2yrs' in df.columns:
230
+ df['has_delinq_history'] = (df['delinq_2yrs'] > 0).astype(int)
231
+
232
+ # Subprime indicator
233
+ if 'grade_numeric' in df.columns and 'dti' in df.columns:
234
+ df['subprime_high_dti'] = ((df['grade_numeric'] >= 4) & (df['dti'] > 20)).astype(int)
235
+
236
+ # Title-based features
237
+ if 'title' in df.columns:
238
+ title_str = str(df['title'].iloc[0]).lower() if len(df) > 0 else ''
239
+ df['title_length'] = len(title_str)
240
+ df['title_word_count'] = len(title_str.split())
241
+
242
+ # Years since earliest credit line (simplified)
243
+ if 'years_since_earliest_cr' not in df.columns:
244
+ df['years_since_earliest_cr'] = 10 # Default value
245
+
246
+ # Set defaults for any missing engineered features
247
+ for feature in self.feature_list:
248
+ if feature not in df.columns and not feature.startswith(('addr_state_', 'home_ownership_',
249
+ 'purpose_', 'verification_status_', 'title_has_')):
250
+ # Default values based on feature type
251
+ if 'fico' in feature.lower():
252
+ df[feature] = 700 # Average FICO score
253
+ elif any(x in feature for x in ['rate', 'util', 'pct', 'ratio']):
254
+ df[feature] = 0.5 # Percentage default
255
+ elif any(x in feature for x in ['loan', 'amt', 'bal', 'limit', 'inc']):
256
+ df[feature] = 0 # Monetary default
257
+ elif any(x in feature for x in ['month', 'mo', 'mth', 'year']):
258
+ df[feature] = 0 # Time default
259
+ else:
260
+ df[feature] = 0
261
+
262
+ return df
263
+
264
+ def _convert_emp_length(self, val):
265
+ """Convert employment length string to numeric"""
266
+ if pd.isna(val):
267
+ return 3.0 # Default
268
+ val = str(val).lower()
269
+ if '10+' in val:
270
+ return 10.0
271
+ elif '< 1' in val:
272
+ return 0.5
273
+ else:
274
+ numbers = re.findall(r'\d+', val)
275
+ return float(numbers[0]) if numbers else 3.0
276
+
277
def preprocess_input(self, input_dict):
    """Convert one raw loan application into a model-ready feature matrix.

    Args:
        input_dict: Mapping of raw field name to value for a single loan.

    Returns:
        numpy.ndarray: Array of shape ``(1, len(self.feature_list))`` with
        columns ordered exactly as ``self.feature_list``.

    Raises:
        ValueError: If ``self.feature_list`` is empty.
    """
    if not self.feature_list:
        raise ValueError("No feature list available!")

    df = pd.DataFrame([input_dict])

    # Engineer all features including one-hot
    df = self._engineer_features(df)

    # Select the model's features in training order in one step. Building
    # the frame column by column on an empty DataFrame is order-dependent:
    # a scalar 0 assigned before the first real column is later reindexed
    # to NaN. reindex() fills only the *missing* columns with 0 and leaves
    # genuine NaNs in place for the imputer.
    processed_df = df.reindex(columns=self.feature_list, fill_value=0)

    # Anything still non-numeric becomes NaN instead of breaking the
    # transformers; the result is always a float matrix.
    processed_df = processed_df.apply(pd.to_numeric, errors='coerce').astype(float)

    # Debug: Show we have the right number of features
    print(f"🔧 Created dataframe with {len(processed_df.columns)} features")

    # Impute first, then scale. Failures are reported but deliberately not
    # raised (best effort); note that a failed step means the model
    # receives un-imputed / un-scaled values.
    for label, transformer in (("Imputer", self.imputer), ("Scaler", self.scaler)):
        if transformer is None or processed_df.empty:
            continue
        try:
            processed_df = pd.DataFrame(
                transformer.transform(processed_df),
                columns=self.feature_list,
            )
        except Exception as e:
            print(f"⚠️ {label} error: {e}")

    return processed_df.values
321
+
322
def predict(self, input_dict):
    """Score one loan application and return an approve/reject decision.

    Args:
        input_dict: Mapping of raw field name to value for a single loan.

    Returns:
        dict: On success, ``success=True`` plus ``default_probability``,
        ``decision`` ("APPROVE" when the probability is below
        ``self.optimal_threshold``, otherwise "REJECT"), ``risk_level``,
        ``confidence``, ``optimal_threshold`` and ``explanation``.
        On any failure, ``{'success': False, 'error': <message>,
        'decision': 'ERROR'}``; the exception is printed, not raised.
    """
    try:
        # Preprocess
        features = self.preprocess_input(input_dict)

        if features.size == 0:
            raise ValueError("No features generated!")

        # Debug info
        print(f"🔧 Processed features shape: {features.shape}")

        # Predict. Convert to a builtin float immediately so every derived
        # field is a plain Python number rather than a NumPy scalar.
        default_prob = float(self.model.predict_proba(features)[0, 1])

        # Decision
        decision = "APPROVE" if default_prob < self.optimal_threshold else "REJECT"

        return {
            'success': True,
            'default_probability': default_prob,
            'decision': decision,
            'risk_level': self._get_risk_level(default_prob),
            'confidence': float(self._get_confidence(default_prob)),
            'optimal_threshold': self.optimal_threshold,
            'explanation': f"Default probability: {default_prob:.1%} (threshold: {self.optimal_threshold:.1%})"
        }

    except Exception as e:
        # Top-level boundary: report the failure to the caller as data.
        import traceback
        print(f"❌ Prediction error: {e}")
        traceback.print_exc()
        return {
            'success': False,
            'error': str(e),
            'decision': 'ERROR'
        }
359
+
360
+ def _get_risk_level(self, prob):
361
+ if prob < 0.2: return "LOW"
362
+ elif prob < 0.4: return "MEDIUM"
363
+ elif prob < 0.6: return "HIGH"
364
+ else: return "VERY HIGH"
365
+
366
+ def _get_confidence(self, prob):
367
+ distance = abs(prob - self.optimal_threshold)
368
+ return max(0.5, 1.0 - distance * 2)
369
+
370
+ # Test with the exact features your model expects
371
if __name__ == "__main__":
    # Manual smoke test: load the predictor from "model_artifacts" and score
    # one hand-written loan application, printing every field of the result.
    divider = "=" * 60

    print("🧪 Testing CreditRiskPredictor...")
    print(divider)

    predictor = CreditRiskPredictor("model_artifacts")

    if predictor.feature_list:
        # Sample application. Key order is kept as-is because it becomes the
        # column order of the raw input frame.
        test_loan = {
            # Basic loan info
            'loan_amnt': 15000,
            'int_rate': 12.5,

            # Categorical fields (expected to be one-hot encoded downstream)
            'addr_state': 'CA',
            'home_ownership': 'RENT',
            'purpose': 'debt_consolidation',
            'verification_status': 'Verified',

            # Free-text title used for the title-based features
            'title': 'Debt consolidation loan for credit card payoff',

            # Credit-profile fields
            'dti': 18.5,
            'annual_inc': 75000,
            'revol_util': '45%',
            'delinq_2yrs': 0,
            'inq_last_6mths': 2,
            'open_acc': 8,
            'total_acc': 25,
            'revol_bal': 5000,
            'total_bc_limit': 20000,
            'total_bal_ex_mort': 30000,
            'avg_cur_bal': 2500,
            'mo_sin_old_il_acct': 60,
            'mo_sin_old_rev_tl_op': 48,
            'mo_sin_rcnt_rev_tl_op': 12,
            'mths_since_recent_bc': 6,
            'mths_since_recent_inq': 3,
            'pct_tl_nvr_dlq': 0.95,
            'last_fico_range_low': 680,
            'last_fico_range_high': 684,

            # Extra fields feeding the engineered features
            'grade': 'C',
            'emp_length': '5 years',
            'years_since_earliest_cr': 10,
        }

        print("\n📊 Making test prediction...")
        print(f"Using input with {len(test_loan)} fields")

        result = predictor.predict(test_loan)

        print("\n" + divider)
        print("📈 PREDICTION RESULTS:")
        print(divider)
        for key, value in result.items():
            # The explanation line is only shown for successful predictions.
            if key == 'explanation' and not result['success']:
                continue
            print(f"{key:25}: {value}")
    else:
        print("\n❌ Cannot proceed without features!")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==3.50.0
2
+ pandas==1.5.0
3
+ numpy==1.24.0
4
+ scikit-learn==1.2.0
5
+ xgboost==1.7.6
6
+ joblib==1.2.0
7
+ matplotlib==3.7.0
scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c4b10627fd238874896d566a58e6d09c1a24275882a4c8aec293ff0cad2e9b0
3
+ size 2935
training_features.csv ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ feature_name
2
+ mths_since_recent_inq
3
+ purpose_debt_consolidation
4
+ addr_state_NE
5
+ purpose_other
6
+ purpose_wedding
7
+ title_length
8
+ purpose_vacation
9
+ addr_state_SD
10
+ addr_state_OH
11
+ addr_state_VT
12
+ addr_state_WY
13
+ purpose_home_improvement
14
+ addr_state_AR
15
+ dti
16
+ purpose_small_business
17
+ last_fico_range_low
18
+ addr_state_KY
19
+ emp_length_numeric
20
+ addr_state_TN
21
+ addr_state_UT
22
+ title_has_car
23
+ addr_state_WA
24
+ addr_state_KS
25
+ addr_state_VA
26
+ pct_tl_nvr_dlq
27
+ int_rate_times_loan
28
+ addr_state_ID
29
+ loan_to_income
30
+ addr_state_MA
31
+ addr_state_ME
32
+ mths_since_recent_bc
33
+ addr_state_DE
34
+ addr_state_MT
35
+ addr_state_DC
36
+ home_ownership_MORTGAGE
37
+ addr_state_IA
38
+ addr_state_LA
39
+ mo_sin_old_rev_tl_op
40
+ title_has_medical
41
+ addr_state_NY
42
+ addr_state_IL
43
+ title_has_credit
44
+ purpose_major_purchase
45
+ addr_state_AL
46
+ addr_state_CA
47
+ verification_status_Verified
48
+ verification_status_Source Verified
49
+ total_bal_ex_mort
50
+ addr_state_MD
51
+ purpose_medical
52
+ addr_state_MS
53
+ revol_util_decimal
54
+ addr_state_CT
55
+ years_since_earliest_cr
56
+ title_has_home
57
+ mo_sin_old_il_acct
58
+ addr_state_NC
59
+ addr_state_RI
60
+ addr_state_CO
61
+ addr_state_OR
62
+ addr_state_AZ
63
+ addr_state_NV
64
+ addr_state_MI
65
+ addr_state_IN
66
+ total_bc_limit
67
+ home_ownership_OWN
68
+ addr_state_SC
69
+ subprime_high_dti
70
+ home_ownership_RENT
71
+ mo_sin_rcnt_rev_tl_op
72
+ title_has_consolidation
73
+ has_delinq_history
74
+ addr_state_FL
75
+ title_has_debt
76
+ addr_state_NJ
77
+ addr_state_WV
78
+ addr_state_NH
79
+ addr_state_HI
80
+ title_has_card
81
+ addr_state_TX
82
+ annual_inc
83
+ addr_state_GA
84
+ addr_state_WI
85
+ addr_state_MO
86
+ addr_state_MN
87
+ total_acc
88
+ grade_numeric
89
+ addr_state_PA
90
+ addr_state_NM
91
+ purpose_renewable_energy
92
+ addr_state_OK
93
+ last_fico_range_high
94
+ revol_bal
95
+ purpose_house
96
+ purpose_moving
97
+ title_word_count
98
+ avg_cur_bal
99
+ purpose_credit_card
training_features.json ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_names": [
3
+ "mths_since_recent_inq",
4
+ "purpose_debt_consolidation",
5
+ "addr_state_NE",
6
+ "purpose_other",
7
+ "purpose_wedding",
8
+ "title_length",
9
+ "purpose_vacation",
10
+ "addr_state_SD",
11
+ "addr_state_OH",
12
+ "addr_state_VT",
13
+ "addr_state_WY",
14
+ "purpose_home_improvement",
15
+ "addr_state_AR",
16
+ "dti",
17
+ "purpose_small_business",
18
+ "last_fico_range_low",
19
+ "addr_state_KY",
20
+ "emp_length_numeric",
21
+ "addr_state_TN",
22
+ "addr_state_UT",
23
+ "title_has_car",
24
+ "addr_state_WA",
25
+ "addr_state_KS",
26
+ "addr_state_VA",
27
+ "pct_tl_nvr_dlq",
28
+ "int_rate_times_loan",
29
+ "addr_state_ID",
30
+ "loan_to_income",
31
+ "addr_state_MA",
32
+ "addr_state_ME",
33
+ "mths_since_recent_bc",
34
+ "addr_state_DE",
35
+ "addr_state_MT",
36
+ "addr_state_DC",
37
+ "home_ownership_MORTGAGE",
38
+ "addr_state_IA",
39
+ "addr_state_LA",
40
+ "mo_sin_old_rev_tl_op",
41
+ "title_has_medical",
42
+ "addr_state_NY",
43
+ "addr_state_IL",
44
+ "title_has_credit",
45
+ "purpose_major_purchase",
46
+ "addr_state_AL",
47
+ "addr_state_CA",
48
+ "verification_status_Verified",
49
+ "verification_status_Source Verified",
50
+ "total_bal_ex_mort",
51
+ "addr_state_MD",
52
+ "purpose_medical",
53
+ "addr_state_MS",
54
+ "revol_util_decimal",
55
+ "addr_state_CT",
56
+ "years_since_earliest_cr",
57
+ "title_has_home",
58
+ "mo_sin_old_il_acct",
59
+ "addr_state_NC",
60
+ "addr_state_RI",
61
+ "addr_state_CO",
62
+ "addr_state_OR",
63
+ "addr_state_AZ",
64
+ "addr_state_NV",
65
+ "addr_state_MI",
66
+ "addr_state_IN",
67
+ "total_bc_limit",
68
+ "home_ownership_OWN",
69
+ "addr_state_SC",
70
+ "subprime_high_dti",
71
+ "home_ownership_RENT",
72
+ "mo_sin_rcnt_rev_tl_op",
73
+ "title_has_consolidation",
74
+ "has_delinq_history",
75
+ "addr_state_FL",
76
+ "title_has_debt",
77
+ "addr_state_NJ",
78
+ "addr_state_WV",
79
+ "addr_state_NH",
80
+ "addr_state_HI",
81
+ "title_has_card",
82
+ "addr_state_TX",
83
+ "annual_inc",
84
+ "addr_state_GA",
85
+ "addr_state_WI",
86
+ "addr_state_MO",
87
+ "addr_state_MN",
88
+ "total_acc",
89
+ "grade_numeric",
90
+ "addr_state_PA",
91
+ "addr_state_NM",
92
+ "purpose_renewable_energy",
93
+ "addr_state_OK",
94
+ "last_fico_range_high",
95
+ "revol_bal",
96
+ "purpose_house",
97
+ "purpose_moving",
98
+ "title_word_count",
99
+ "avg_cur_bal",
100
+ "purpose_credit_card"
101
+ ],
102
+ "feature_count": 98,
103
+ "categorical_features": [
104
+ "purpose_debt_consolidation",
105
+ "addr_state_NE",
106
+ "purpose_other",
107
+ "purpose_wedding",
108
+ "purpose_vacation",
109
+ "addr_state_SD",
110
+ "addr_state_OH",
111
+ "addr_state_VT",
112
+ "addr_state_WY",
113
+ "purpose_home_improvement",
114
+ "addr_state_AR",
115
+ "purpose_small_business",
116
+ "addr_state_KY",
117
+ "addr_state_TN",
118
+ "addr_state_UT",
119
+ "addr_state_WA",
120
+ "addr_state_KS",
121
+ "addr_state_VA",
122
+ "addr_state_ID",
123
+ "addr_state_MA",
124
+ "addr_state_ME",
125
+ "addr_state_DE",
126
+ "addr_state_MT",
127
+ "addr_state_DC",
128
+ "home_ownership_MORTGAGE",
129
+ "addr_state_IA",
130
+ "addr_state_LA",
131
+ "addr_state_NY",
132
+ "addr_state_IL",
133
+ "purpose_major_purchase",
134
+ "addr_state_AL",
135
+ "addr_state_CA",
136
+ "verification_status_Verified",
137
+ "verification_status_Source Verified",
138
+ "addr_state_MD",
139
+ "purpose_medical",
140
+ "addr_state_MS",
141
+ "addr_state_CT",
142
+ "addr_state_NC",
143
+ "addr_state_RI",
144
+ "addr_state_CO",
145
+ "addr_state_OR",
146
+ "addr_state_AZ",
147
+ "addr_state_NV",
148
+ "addr_state_MI",
149
+ "addr_state_IN",
150
+ "home_ownership_OWN",
151
+ "addr_state_SC",
152
+ "home_ownership_RENT",
153
+ "addr_state_FL"
154
+ ]
155
+ }
xgb_best_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bb8b39bb961670917a07abb4a256d08bc70c7245a59df0bee99820e709f5b61
3
+ size 2583734