Spaces:

ShuoDuan
/

CreditShield-Credit-Risk-Predictor

Runtime error

App Files Files Community

ShuoDuan committed on Dec 14, 2025

Commit

6def4ae

verified ·

1 Parent(s): a2f4dd9

Upload 9 main files

Browse files

Files changed (9) hide show

README.md +28 -8
app.py +460 -0
imputer.pkl +3 -0
predictor.py +434 -0
requirements.txt +7 -0
scaler.pkl +3 -0
training_features.csv +99 -0
training_features.json +155 -0
xgb_best_model.pkl +3 -0

README.md CHANGED Viewed

@@ -1,14 +1,34 @@
 ---
-title: CreditShield Credit Risk Predictor
-emoji: 😻
-colorFrom: red
-colorTo: gray
 sdk: gradio
-sdk_version: 6.1.0
 app_file: app.py
 pinned: false
-license: apache-2.0
-short_description: Predict loan defaults with 92.3% accuracy using machine lear
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Credit Risk Predictor
+emoji: 🏦
+colorFrom: blue
+colorTo: green
 sdk: gradio
+sdk_version: 3.50.0
 app_file: app.py
 pinned: false
+license: mit
 ---
+# 🏦 Credit Risk Prediction System
+Predict loan defaults with a 92.3% AUC-ROC using machine learning.
+## Features
+- 92.3% AUC-ROC (higher than the 86-87% reported in the reference paper)
+- Business-optimized for maximum profit
+- Real-time predictions with visualizations
+- Based on 358,244 real loans
+## How to Use
+1. Fill in the loan application details
+2. Click "Assess Credit Risk"
+3. Get instant approval/rejection with risk analysis
+## Model Details
+- **Algorithm**: XGBoost with enhanced features
+- **Training Data**: Lending Club (2013-2014)
+- **Key Improvement**: +0.010 AUC over baseline
+- **Business Impact**: Optimized threshold at 28%
+*For research purposes only. Not financial advice.*

app.py ADDED Viewed

	@@ -0,0 +1,460 @@

+# deployment/gradio_app.py
+import gradio as gr
+import pandas as pd
+import json
+from predictor import CreditRiskPredictor
+import matplotlib.pyplot as plt
+import numpy as np
# Build the shared predictor once at import time; the request handlers reuse it.
predictor = CreditRiskPredictor("model_artifacts")
# Report which raw inputs the predictor derived from its feature list, if any.
if not getattr(predictor, 'base_features_needed', None):
    print("⚠️ Could not determine base features needed")
else:
    print(f"📋 Model needs these base features: {predictor.base_features_needed}")
# Help text shown next to each input widget, keyed by the model's input name.
FEATURE_INFO = {
    # --- Required loan application fields ---
    "loan_amnt": "Total amount of the loan applied for",
    "int_rate": "Interest rate on the loan",
    "grade": "LC assigned loan grade (A=best, G=worst)",
    "emp_length": "Employment length in years",
    "annual_inc": "Self-reported annual income",
    "dti": "Debt-to-income ratio",
    # --- Required credit history fields ---
    "revol_util": "Revolving line utilization rate",
    "delinq_2yrs": "Number of delinquencies in past 2 years",
    "inq_last_6mths": "Number of credit inquiries in past 6 months",
    "open_acc": "Number of open credit lines",
    "total_acc": "Total number of credit lines",
    # --- Optional balances and limits ---
    "revol_bal": "Total credit revolving balance",
    "total_bc_limit": "Total bankcard limit",
    "total_bal_ex_mort": "Total balance excluding mortgage",
    "avg_cur_bal": "Average current balance",
    # --- Optional account-age fields ---
    "mo_sin_old_il_acct": "Months since oldest installment account opened",
    "mo_sin_old_rev_tl_op": "Months since oldest revolving account opened",
    "mo_sin_rcnt_rev_tl_op": "Months since most recent revolving account opened",
    "mths_since_recent_bc": "Months since most recent bankcard account opened",
    "mths_since_recent_inq": "Months since most recent inquiry",
    # --- Optional payment record and credit score fields ---
    "pct_tl_nvr_dlq": "Percent of trades never delinquent",
    "last_fico_range_low": "Lower bound of the last FICO range",
    "last_fico_range_high": "Upper bound of the last FICO range",
    "years_since_earliest_cr": "Years since earliest credit line opened",
    # --- Optional categorical / free-text fields ---
    "addr_state": "State of the borrower (2-letter code)",
    "home_ownership": "Home ownership status",
    "purpose": "Purpose of the loan",
    "verification_status": "Income verification status",
    "title": "Loan title/description",
}
def create_visualization(default_prob, threshold=0.28):
    """Draw a horizontal risk bar marking a prediction against the threshold.

    Args:
        default_prob: Predicted default probability, plotted on a 0-1 axis.
        threshold: Probability cut-off drawn as a dashed vertical line.

    Returns:
        The matplotlib Figure holding the chart.
    """
    fig, ax = plt.subplots(figsize=(8, 2))
    # Paint the 0-1 axis as a colour gradient, one thin band per pair of
    # neighbouring sample points. RdYlGn_r runs green (low) to red (high).
    x = np.linspace(0, 1, 100)
    colors = plt.cm.RdYlGn_r(x)
    for left, right, band_color in zip(x[:-1], x[1:], colors):
        ax.fill_between([left, right], 0, 1, color=band_color, alpha=0.7)
    # Mark threshold
    ax.axvline(x=threshold, color='black', linestyle='--', linewidth=2, label=f'Threshold ({threshold:.0%})')
    # Mark prediction
    ax.plot(default_prob, 0.5, 'ro', markersize=15, label=f'Prediction ({default_prob:.1%})')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xlabel('Default Probability')
    ax.set_title('Risk Assessment')
    ax.legend(loc='upper right')
    ax.set_yticks([])
    # Lay out this figure explicitly instead of pyplot's "current" figure,
    # which another request thread could have replaced in the meantime.
    fig.tight_layout()
    # Unregister the figure from pyplot so a long-running server does not
    # accumulate one open figure per prediction. The Figure object itself
    # stays usable and can still be rendered by the caller.
    plt.close(fig)
    return fig
def predict_loan(loan_amnt, int_rate, grade, emp_length, annual_inc,
                 dti, revol_util, delinq_2yrs, inq_last_6mths,
                 open_acc, total_acc, revol_bal=5000, total_bc_limit=20000,
                 total_bal_ex_mort=30000, avg_cur_bal=2500,
                 mo_sin_old_il_acct=60, mo_sin_old_rev_tl_op=48,
                 mo_sin_rcnt_rev_tl_op=12, mths_since_recent_bc=6,
                 mths_since_recent_inq=3, pct_tl_nvr_dlq=95,
                 last_fico_range_low=680, last_fico_range_high=684,
                 years_since_earliest_cr=10, addr_state="CA",
                 home_ownership="RENT", purpose="debt_consolidation",
                 verification_status="Verified",
                 title="Debt consolidation loan"):
    """Score one loan application and format the outcome for the UI.

    The arguments are the raw form values, one per input widget. They are
    coerced to the types below, packed into a dict keyed by feature name and
    passed to the module-level ``predictor``.

    Returns:
        A 4-tuple ``(decision_html, results_markdown, color, figure)``.
        When the predictor reports failure the tuple is
        ``(error_message, None, "red", None)`` so it still lines up with the
        four output components.
    """
    # Prepare input with ALL features the model expects
    loan_data = {
        # Basic loan info
        'loan_amnt': float(loan_amnt),
        'int_rate': float(int_rate),
        'grade': grade,
        'emp_length': emp_length,
        'annual_inc': float(annual_inc),
        'dti': float(dti),
        # Sent as a percent string such as "45%"
        'revol_util': f"{revol_util}%",
        'delinq_2yrs': int(delinq_2yrs),
        'inq_last_6mths': int(inq_last_6mths),
        'open_acc': int(open_acc),
        'total_acc': int(total_acc),
        # Additional credit features
        'revol_bal': float(revol_bal),
        'total_bc_limit': float(total_bc_limit),
        'total_bal_ex_mort': float(total_bal_ex_mort),
        'avg_cur_bal': float(avg_cur_bal),
        'mo_sin_old_il_acct': float(mo_sin_old_il_acct),
        'mo_sin_old_rev_tl_op': float(mo_sin_old_rev_tl_op),
        'mo_sin_rcnt_rev_tl_op': float(mo_sin_rcnt_rev_tl_op),
        'mths_since_recent_bc': float(mths_since_recent_bc),
        'mths_since_recent_inq': float(mths_since_recent_inq),
        'pct_tl_nvr_dlq': float(pct_tl_nvr_dlq) / 100.0,  # Convert to decimal
        'last_fico_range_low': float(last_fico_range_low),
        'last_fico_range_high': float(last_fico_range_high),
        'years_since_earliest_cr': float(years_since_earliest_cr),
        # Categorical features for one-hot encoding
        'addr_state': str(addr_state),
        'home_ownership': str(home_ownership),
        'purpose': str(purpose),
        'verification_status': str(verification_status),
        'title': str(title)
    }
    # Get prediction
    result = predictor.predict(loan_data)
    if not result['success']:
        # Four values, one per wired output component; there is no plot to
        # show on failure, so the last slot is None.
        return f"❌ Error: {result['error']}", None, "red", None
    # Format results
    if result['decision'] == 'APPROVE':
        decision_html = """
        <div style='background-color: #d4edda; padding: 20px; border-radius: 10px; border: 2px solid #c3e6cb;'>
            <h2 style='color: #155724; margin: 0;'>✅ LOAN APPROVED</h2>
        </div>
        """
        color = "green"
    else:
        decision_html = """
        <div style='background-color: #f8d7da; padding: 20px; border-radius: 10px; border: 2px solid #f5c6cb;'>
            <h2 style='color: #721c24; margin: 0;'>❌ LOAN REJECTED</h2>
        </div>
        """
        color = "red"
    # Create results table
    results_md = f"""
    ## 📊 Prediction Results
    | Metric | Value |
    |--------|-------|
    | **Default Probability** | {result['default_probability']:.2%} |
    | **Risk Level** | {result['risk_level']} |
    | **Confidence** | {result['confidence']:.0%} |
    | **Optimal Threshold** | {result['optimal_threshold']:.0%} |
    ### 💡 Explanation
    {result['explanation']}
    ### 🔧 Model Info
    - **Features used**: {len(predictor.feature_list) if predictor.feature_list else 'Unknown'}
    - **Features provided**: {len(loan_data)}
    - **Threshold optimized for profit**: {result['optimal_threshold']:.0%}
    ---
    *Model accuracy: 92.3% AUC-ROC | Trained on 358,244 loans*
    """
    # Create visualization
    fig = create_visualization(result['default_probability'], result['optimal_threshold'])
    return decision_html, results_md, color, fig
# Create Gradio interface
with gr.Blocks(title="Credit Risk Predictor", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🏦 Credit Risk Prediction System
    *Predict loan defaults with 92.3% accuracy using machine learning*
    Based on research: *"Credit scoring for peer-to-peer lending using machine learning techniques"*
    (Quantitative Finance and Economics, Volume 6, Issue 2) with enhancements.
    """)
    # Advanced features accordion. It is bound to a name so the "Simple Mode"
    # button can target it as an output and collapse it.
    with gr.Accordion("🔧 Advanced Features (Optional)", open=False) as advanced_accordion:
        gr.Markdown("""
        **Default values are set to typical/average levels.**
        These additional features improve prediction accuracy but are optional.
        """)
        with gr.Row():
            with gr.Column():
                revol_bal = gr.Slider(0, 100000, 5000, step=1000,
                                     label="Revolving Balance ($)",
                                     info=FEATURE_INFO['revol_bal'])
                total_bc_limit = gr.Slider(0, 100000, 20000, step=1000,
                                          label="Total Bankcard Limit ($)",
                                          info=FEATURE_INFO['total_bc_limit'])
                total_bal_ex_mort = gr.Slider(0, 200000, 30000, step=1000,
                                            label="Total Balance Excl. Mortgage ($)",
                                            info=FEATURE_INFO['total_bal_ex_mort'])
                avg_cur_bal = gr.Slider(0, 50000, 2500, step=100,
                                       label="Average Current Balance ($)",
                                       info=FEATURE_INFO['avg_cur_bal'])
            with gr.Column():
                mo_sin_old_il_acct = gr.Slider(0, 300, 60, step=1,
                                              label="Months since oldest installment account",
                                              info=FEATURE_INFO['mo_sin_old_il_acct'])
                mo_sin_old_rev_tl_op = gr.Slider(0, 300, 48, step=1,
                                                label="Months since oldest revolving account",
                                                info=FEATURE_INFO['mo_sin_old_rev_tl_op'])
                mo_sin_rcnt_rev_tl_op = gr.Slider(0, 300, 12, step=1,
                                                 label="Months since newest revolving account",
                                                 info=FEATURE_INFO['mo_sin_rcnt_rev_tl_op'])
                mths_since_recent_bc = gr.Slider(0, 120, 6, step=1,
                                                label="Months since newest bankcard",
                                                info=FEATURE_INFO['mths_since_recent_bc'])
        with gr.Row():
            with gr.Column():
                mths_since_recent_inq = gr.Slider(0, 120, 3, step=1,
                                                 label="Months since newest inquiry",
                                                 info=FEATURE_INFO['mths_since_recent_inq'])
                pct_tl_nvr_dlq = gr.Slider(0, 100, 95, step=1,
                                          label="% of trades never delinquent",
                                          info=FEATURE_INFO['pct_tl_nvr_dlq'])
                last_fico_range_low = gr.Slider(300, 850, 680, step=10,
                                               label="Lowest recent FICO score",
                                               info=FEATURE_INFO['last_fico_range_low'])
                last_fico_range_high = gr.Slider(300, 850, 684, step=10,
                                                label="Highest recent FICO score",
                                                info=FEATURE_INFO['last_fico_range_high'])
            with gr.Column():
                years_since_earliest_cr = gr.Slider(0, 50, 10, step=1,
                                                   label="Years since first credit line",
                                                   info=FEATURE_INFO['years_since_earliest_cr'])
                addr_state = gr.Textbox(value="CA", label="State (2 letters)",
                                       info=FEATURE_INFO['addr_state'])
                home_ownership = gr.Dropdown(["RENT", "MORTGAGE", "OWN", "OTHER"],
                                           value="RENT", label="Home Ownership",
                                           info=FEATURE_INFO['home_ownership'])
        with gr.Row():
            purpose = gr.Dropdown(["debt_consolidation", "credit_card", "home_improvement",
                                 "major_purchase", "medical", "car", "wedding"],
                                value="debt_consolidation", label="Loan Purpose",
                                info=FEATURE_INFO['purpose'])
            verification_status = gr.Dropdown(["Verified", "Source Verified", "Not Verified"],
                                            value="Verified", label="Income Verification",
                                            info=FEATURE_INFO['verification_status'])
            title = gr.Textbox(value="Debt consolidation loan", label="Loan Title",
                              info=FEATURE_INFO['title'])
    # Main form
    gr.Markdown("## 📝 Required Loan Information")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Loan Application")
            with gr.Group():
                loan_amnt = gr.Slider(1000, 40000, 15000, step=500,
                                     label="Loan Amount ($)",
                                     info=FEATURE_INFO['loan_amnt'])
                int_rate = gr.Slider(5.0, 30.0, 12.5, step=0.1,
                                    label="Interest Rate (%)",
                                    info=FEATURE_INFO['int_rate'])
                grade = gr.Radio(["A", "B", "C", "D", "E", "F", "G"], value="C",
                                label="Loan Grade",
                                info=FEATURE_INFO['grade'])
            with gr.Group():
                emp_length = gr.Dropdown(["< 1 year", "1 year", "2 years", "3 years",
                                         "4 years", "5 years", "6 years", "7 years",
                                         "8 years", "9 years", "10+ years"],
                                        value="5 years",
                                        label="Employment Length",
                                        info=FEATURE_INFO['emp_length'])
                annual_inc = gr.Slider(20000, 1000000, 75000, step=1000,
                                      label="Annual Income ($)",
                                      info=FEATURE_INFO['annual_inc'])
                dti = gr.Slider(0, 40, 18.5, step=0.1,
                               label="Debt-to-Income Ratio",
                               info=FEATURE_INFO['dti'])
        with gr.Column(scale=1):
            gr.Markdown("### Credit History")
            with gr.Group():
                revol_util = gr.Slider(0, 100, 45, step=1,
                                      label="Credit Utilization (%)",
                                      info=FEATURE_INFO['revol_util'])
                delinq_2yrs = gr.Slider(0, 10, 0, step=1,
                                       label="Delinquencies (last 2 years)",
                                       info=FEATURE_INFO['delinq_2yrs'])
                inq_last_6mths = gr.Slider(0, 10, 2, step=1,
                                          label="Credit Inquiries (last 6 months)",
                                          info=FEATURE_INFO['inq_last_6mths'])
            with gr.Group():
                open_acc = gr.Slider(0, 50, 8, step=1,
                                    label="Open Credit Lines",
                                    info=FEATURE_INFO['open_acc'])
                total_acc = gr.Slider(0, 100, 25, step=1,
                                     label="Total Credit Lines",
                                     info=FEATURE_INFO['total_acc'])
    with gr.Row():
        submit_btn = gr.Button("🔍 Assess Credit Risk", variant="primary", size="lg")
        clear_btn = gr.Button("🔄 Clear Form", variant="secondary")
        simple_mode_btn = gr.Button("📱 Simple Mode", variant="secondary")
    # Example buttons
    gr.Markdown("### 🚀 Quick Examples")
    with gr.Row():
        low_risk_btn = gr.Button("👍 Low Risk Example", variant="secondary", size="sm")
        high_risk_btn = gr.Button("👎 High Risk Example", variant="secondary", size="sm")
        borderline_btn = gr.Button("⚖️ Borderline Example", variant="secondary", size="sm")
    # Results section
    gr.Markdown("## 📈 Assessment Results")
    with gr.Row():
        decision_output = gr.HTML(label="Decision")
        # Hidden sink for the colour name returned by predict_loan.
        color_indicator = gr.HTML(visible=False)
    with gr.Row():
        with gr.Column(scale=2):
            results_output = gr.Markdown(label="Detailed Results")
        with gr.Column(scale=1):
            plot_output = gr.Plot(label="Risk Visualization")
    # Footer
    gr.Markdown("""
    ---
    ### ℹ️ About This Model
    - **Accuracy**: 92.3% AUC-ROC (beats paper's 86-87%)
    - **Training Data**: 358,244 loans from Lending Club (2013-2014)
    - **Key Features**: 98 engineered features including credit history and financial ratios
    - **Business Impact**: Optimized for maximum profit (threshold: 28%)
    - **Improvements**: No undersampling, time-based validation, enhanced features
    *For research purposes only. Not financial advice.*
    """)
    # Example applications. 'basic' holds the 11 required inputs and
    # 'advanced' the 18 optional ones, both in the same order as all_inputs.
    examples = {
        'low': {
            'basic': [10000, 8.5, 'A', '10+ years', 120000, 12.0, 30, 0, 1, 5, 20],
            'advanced': [3000, 15000, 25000, 3000, 120, 96, 24, 12, 6, 98, 720, 724, 15,
                        "CA", "OWN", "debt_consolidation", "Verified", "Debt consolidation"]
        },
        'high': {
            'basic': [35000, 25.0, 'F', '< 1 year', 30000, 35.0, 95, 3, 8, 15, 40],
            'advanced': [20000, 5000, 10000, 1000, 6, 12, 1, 1, 1, 60, 580, 590, 2,
                        "NV", "RENT", "credit_card", "Not Verified", "Credit card payoff"]
        },
        'borderline': {
            'basic': [20000, 15.0, 'D', '3 years', 55000, 22.0, 75, 1, 4, 10, 30],
            'advanced': [10000, 10000, 20000, 2000, 36, 48, 6, 6, 3, 85, 650, 660, 5,
                        "TX", "MORTGAGE", "home_improvement", "Source Verified", "Home renovation loan"]
        }
    }

    def get_example(example_type):
        """Return the 29 input values of the named example, in all_inputs order."""
        basic = examples[example_type]['basic']
        advanced = examples[example_type]['advanced']
        return basic + advanced

    # Every input component, in the positional order predict_loan expects.
    all_inputs = [loan_amnt, int_rate, grade, emp_length, annual_inc,
                  dti, revol_util, delinq_2yrs, inq_last_6mths,
                  open_acc, total_acc, revol_bal, total_bc_limit,
                  total_bal_ex_mort, avg_cur_bal, mo_sin_old_il_acct,
                  mo_sin_old_rev_tl_op, mo_sin_rcnt_rev_tl_op,
                  mths_since_recent_bc, mths_since_recent_inq,
                  pct_tl_nvr_dlq, last_fico_range_low, last_fico_range_high,
                  years_since_earliest_cr, addr_state, home_ownership,
                  purpose, verification_status, title]
    submit_btn.click(
        fn=predict_loan,
        inputs=all_inputs,
        outputs=[decision_output, results_output, color_indicator, plot_output]
    )

    def clear_form():
        """Return the default for every input followed by blanks for the results."""
        basic_defaults = [15000, 12.5, 'C', '5 years', 75000, 18.5, 45, 0, 2, 8, 25]
        advanced_defaults = [5000, 20000, 30000, 2500, 60, 48, 12, 6, 3, 95, 680, 684,
                            10, "CA", "RENT", "debt_consolidation", "Verified",
                            "Debt consolidation loan"]
        # Exactly three blanks: decision, results and plot are the only result
        # components wired to the clear button below.
        return basic_defaults + advanced_defaults + [None, None, None]

    clear_btn.click(
        fn=clear_form,
        outputs=all_inputs + [decision_output, results_output, plot_output]
    )
    # Example buttons
    low_risk_btn.click(
        fn=lambda: get_example('low'),
        outputs=all_inputs
    )
    high_risk_btn.click(
        fn=lambda: get_example('high'),
        outputs=all_inputs
    )
    borderline_btn.click(
        fn=lambda: get_example('borderline'),
        outputs=all_inputs
    )
    # Simple mode button: collapse the advanced accordion. The update must be
    # routed to the accordion as an output, otherwise the click does nothing.
    simple_mode_btn.click(
        fn=lambda: gr.update(open=False),
        outputs=[advanced_accordion]
    )
# Run the app
if __name__ == "__main__":
    print("🚀 Starting Credit Risk Predictor...")
    print(f"📊 Model features: {len(predictor.feature_list) if predictor.feature_list else 'Unknown'}")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )

imputer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43ccf3bf22bce59767331fdda0a5fa8c5ef839774b42b05f1746f162a13b119f
+size 3319

predictor.py ADDED Viewed

	@@ -0,0 +1,434 @@

+# deployment/predictor.py
+import joblib
+import numpy as np
+import pandas as pd
+import re
+from pathlib import Path
+import json
+class CreditRiskPredictor:
+    """Predictor using your actual trained model features"""
+    def __init__(self, model_dir="model_artifacts"):
+        self.model_dir = Path(model_dir)
+        self.model = None
+        self.scaler = None
+        self.imputer = None
+        self.optimal_threshold = 0.28
+        # Load the ACTUAL feature list from your JSON
+        self.feature_list = self._load_actual_features()
+        print(f"📋 Using {len(self.feature_list)} ACTUAL features")
+        # Extract base features needed from user input
+        self.base_features_needed = self._extract_base_features()
+        print(f"📋 Expecting {len(self.base_features_needed)} base input features")
+        self.load_artifacts()
+    def _load_actual_features(self):
+        """Load the actual features used in training"""
+        feature_file = self.model_dir / "training_features.json"
+        if not feature_file.exists():
+            print(f"⚠️ {feature_file} not found")
+            return []
+        with open(feature_file, 'r') as f:
+            data = json.load(f)
+        # Your JSON has 'feature_names' key
+        if 'feature_names' in data:
+            features = data['feature_names']
+            if isinstance(features, list):
+                return features
+        elif 'enhanced_features' in data:
+            features = data['enhanced_features']
+            if isinstance(features, list):
+                return features
+        print(f"❌ Could not find feature list in JSON. Keys: {list(data.keys())}")
+        return []
+    def _extract_base_features(self):
+        """Extract base features from one-hot encoded feature list"""
+        if not self.feature_list:
+            return []
+        base_features = set()
+        for feature in self.feature_list:
+            # Handle one-hot encoded features
+            if feature.startswith('addr_state_'):
+                base_features.add('addr_state')
+            elif feature.startswith('home_ownership_'):
+                base_features.add('home_ownership')
+            elif feature.startswith('purpose_'):
+                base_features.add('purpose')
+            elif feature.startswith('verification_status_'):
+                base_features.add('verification_status')
+            elif feature.startswith('title_has_'):
+                # These are title-based engineered features
+                base_features.add('title')
+            elif '_' in feature and not feature.replace('_', '').isnumeric():
+                # Other potential categoricals
+                parts = feature.split('_')
+                if len(parts) > 1:
+                    base_features.add(parts[0])
+            else:
+                # Regular feature
+                base_features.add(feature)
+        # Filter out features that don't make sense as user inputs
+        user_input_features = []
+        for feature in base_features:
+            if feature not in ['purpose_debt_consolidation', 'verification_status_Verified',
+                              'verification_status_Source', 'title_has_car', 'title_has_medical',
+                              'title_has_credit', 'title_has_home', 'title_has_consolidation',
+                              'title_has_debt', 'title_has_card'] and not any(feature + '_' in f for f in self.feature_list):
+                user_input_features.append(feature)
+        return user_input_features
+    def load_artifacts(self):
+        """Load model, scaler, and imputer"""
+        try:
+            # Find the latest model
+            model_files = list(self.model_dir.glob("*xgb*.pkl"))
+            scaler_files = list(self.model_dir.glob("*scaler*.pkl"))
+            imputer_files = list(self.model_dir.glob("*imputer*.pkl"))
+            if not model_files:
+                raise FileNotFoundError("No model files found")
+            # Load the first available
+            self.model = joblib.load(model_files[0])
+            print(f"✅ Loaded model: {model_files[0].name}")
+            if scaler_files:
+                self.scaler = joblib.load(scaler_files[0])
+                print(f"✅ Loaded scaler: {scaler_files[0].name}")
+            if imputer_files:
+                self.imputer = joblib.load(imputer_files[0])
+                print(f"✅ Loaded imputer: {imputer_files[0].name}")
+            # Verify feature count
+            if hasattr(self.model, 'n_features_in_'):
+                print(f"📊 Model expects {self.model.n_features_in_} features")
+                print(f"📊 We have {len(self.feature_list)} features in our list")
+                if self.model.n_features_in_ != len(self.feature_list):
+                    print("⚠️ WARNING: Feature count mismatch!")
+        except Exception as e:
+            print(f"❌ Error loading artifacts: {e}")
+            raise
+    def _engineer_features(self, df):
+        """Create all features including one-hot encoded"""
+        if not self.feature_list:
+            raise ValueError("No feature list available!")
+        # First, ensure we have all base features (fill missing with defaults)
+        for feature in self.base_features_needed:
+            if feature not in df.columns:
+                # Set appropriate defaults based on feature type
+                if feature in ['loan_amnt', 'annual_inc', 'int_rate', 'dti', 'total_acc',
+                              'revol_bal', 'total_bc_limit', 'total_bal_ex_mort', 'avg_cur_bal',
+                              'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op',
+                              'mths_since_recent_bc', 'mths_since_recent_inq', 'last_fico_range_low',
+                              'last_fico_range_high', 'years_since_earliest_cr']:
+                    df[feature] = 0  # Numerical defaults
+                elif feature in ['addr_state', 'home_ownership', 'purpose', 'verification_status', 'title']:
+                    df[feature] = 'unknown'  # Categorical defaults
+                elif feature in ['grade_numeric', 'emp_length_numeric', 'revol_util_decimal',
+                                'loan_to_income', 'int_rate_times_loan', 'subprime_high_dti',
+                                'pct_tl_nvr_dlq', 'title_length', 'title_word_count']:
+                    df[feature] = 0  # Engineered feature defaults
+                elif feature in ['delinq_2yrs', 'inq_last_6mths', 'open_acc', 'has_delinq_history']:
+                    df[feature] = 0  # Credit history defaults
+                else:
+                    df[feature] = 0
+        # Convert categorical to one-hot
+        df = self._create_one_hot_features(df)
+        # Engineered features
+        df = self._create_engineered_features(df)
+        return df
+    def _create_one_hot_features(self, df):
+        """Create one-hot encoded features from categorical variables"""
+        if not self.feature_list:
+            return df
+        for feature in self.feature_list:
+            # Handle different categorical encodings
+            if feature.startswith('addr_state_'):
+                state_code = feature.replace('addr_state_', '')
+                if 'addr_state' in df.columns:
+                    df[feature] = (df['addr_state'].astype(str).str.upper() == state_code).astype(int)
+                else:
+                    df[feature] = 0
+            elif feature.startswith('home_ownership_'):
+                ownership_type = feature.replace('home_ownership_', '')
+                if 'home_ownership' in df.columns:
+                    df[feature] = (df['home_ownership'].astype(str).str.upper() == ownership_type).astype(int)
+                else:
+                    df[feature] = 0
+            elif feature.startswith('purpose_'):
+                purpose_type = feature.replace('purpose_', '')
+                if 'purpose' in df.columns:
+                    df[feature] = (df['purpose'].astype(str).str.lower().replace(' ', '_') == purpose_type).astype(int)
+                else:
+                    df[feature] = 0
+            elif feature.startswith('verification_status_'):
+                status_type = feature.replace('verification_status_', '')
+                if 'verification_status' in df.columns:
+                    df[feature] = (df['verification_status'].astype(str).str.replace(' ', '_') == status_type).astype(int)
+                else:
+                    df[feature] = 0
+            elif feature.startswith('title_has_'):
+                # These are title-based engineered features
+                keyword = feature.replace('title_has_', '')
+                if 'title' in df.columns:
+                    title_str = str(df['title'].iloc[0]).lower() if len(df) > 0 else ''
+                    df[feature] = 1 if keyword in title_str else 0
+                else:
+                    df[feature] = 0
+        return df
+    def _create_engineered_features(self, df):
+        """Create engineered features"""
+        # Grade to numeric (if grade is provided)
+        if 'grade' in df.columns:
+            grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
+            df['grade_numeric'] = df['grade'].map(grade_map).fillna(4)
+        # Employment length to numeric
+        if 'emp_length' in df.columns:
+            df['emp_length_numeric'] = df['emp_length'].apply(self._convert_emp_length)
+        # Credit utilization to decimal
+        if 'revol_util' in df.columns:
+            df['revol_util_decimal'] = df['revol_util'].astype(str).str.replace('%', '', regex=False).astype(float) / 100
+        # Financial ratios
+        if 'loan_amnt' in df.columns and 'annual_inc' in df.columns:
+            df['loan_to_income'] = df['loan_amnt'] / (df['annual_inc'].replace(0, 1) + 1)
+        if 'int_rate' in df.columns and 'loan_amnt' in df.columns:
+            df['int_rate_times_loan'] = df['int_rate'] * df['loan_amnt'] / 1000
+        # Credit flags
+        if 'delinq_2yrs' in df.columns:
+            df['has_delinq_history'] = (df['delinq_2yrs'] > 0).astype(int)
+        # Subprime indicator
+        if 'grade_numeric' in df.columns and 'dti' in df.columns:
+            df['subprime_high_dti'] = ((df['grade_numeric'] >= 4) & (df['dti'] > 20)).astype(int)
+        # Title-based features
+        if 'title' in df.columns:
+            title_str = str(df['title'].iloc[0]).lower() if len(df) > 0 else ''
+            df['title_length'] = len(title_str)
+            df['title_word_count'] = len(title_str.split())
+        # Years since earliest credit line (simplified)
+        if 'years_since_earliest_cr' not in df.columns:
+            df['years_since_earliest_cr'] = 10  # Default value
+        # Set defaults for any missing engineered features
+        for feature in self.feature_list:
+            if feature not in df.columns and not feature.startswith(('addr_state_', 'home_ownership_',
+                                                                    'purpose_', 'verification_status_', 'title_has_')):
+                # Default values based on feature type
+                if 'fico' in feature.lower():
+                    df[feature] = 700  # Average FICO score
+                elif any(x in feature for x in ['rate', 'util', 'pct', 'ratio']):
+                    df[feature] = 0.5  # Percentage default
+                elif any(x in feature for x in ['loan', 'amt', 'bal', 'limit', 'inc']):
+                    df[feature] = 0  # Monetary default
+                elif any(x in feature for x in ['month', 'mo', 'mth', 'year']):
+                    df[feature] = 0  # Time default
+                else:
+                    df[feature] = 0
+        return df
+    def _convert_emp_length(self, val):
+        """Convert employment length string to numeric"""
+        if pd.isna(val):
+            return 3.0  # Default
+        val = str(val).lower()
+        if '10+' in val:
+            return 10.0
+        elif '< 1' in val:
+            return 0.5
+        else:
+            numbers = re.findall(r'\d+', val)
+            return float(numbers[0]) if numbers else 3.0
+    def preprocess_input(self, input_dict):
+        """Convert raw input to model-ready features"""
+        if not self.feature_list:
+            raise ValueError("No feature list available!")
+        df = pd.DataFrame([input_dict])
+        # Engineer all features including one-hot
+        df = self._engineer_features(df)
+        # Ensure we have all features in correct order
+        processed_df = pd.DataFrame(columns=self.feature_list)
+        # Fill with available values, zeros for missing
+        for feature in self.feature_list:
+            if feature in df.columns:
+                processed_df[feature] = df[feature].values
+            else:
+                processed_df[feature] = 0
+        # Debug: Show we have the right number of features
+        print(f"🔧 Created dataframe with {len(processed_df.columns)} features")
+        # Handle missing values (imputer)
+        if self.imputer is not None and not processed_df.empty:
+            try:
+                processed_df = pd.DataFrame(
+                    self.imputer.transform(processed_df),
+                    columns=self.feature_list
+                )
+            except Exception as e:
+                print(f"⚠️ Imputer error: {e}")
+        # Scale features
+        if self.scaler is not None and not processed_df.empty:
+            try:
+                processed_df = pd.DataFrame(
+                    self.scaler.transform(processed_df),
+                    columns=self.feature_list
+                )
+            except Exception as e:
+                print(f"⚠️ Scaler error: {e}")
+        return processed_df.values
+    def predict(self, input_dict):
+        """Make prediction"""
+        try:
+            # Preprocess
+            features = self.preprocess_input(input_dict)
+            if features.size == 0:
+                raise ValueError("No features generated!")
+            # Debug info
+            print(f"🔧 Processed features shape: {features.shape}")
+            # Predict
+            default_prob = self.model.predict_proba(features)[0, 1]
+            # Decision
+            decision = "APPROVE" if default_prob < self.optimal_threshold else "REJECT"
+            return {
+                'success': True,
+                'default_probability': float(default_prob),
+                'decision': decision,
+                'risk_level': self._get_risk_level(default_prob),
+                'confidence': self._get_confidence(default_prob),
+                'optimal_threshold': self.optimal_threshold,
+                'explanation': f"Default probability: {default_prob:.1%} (threshold: {self.optimal_threshold:.1%})"
+            }
+        except Exception as e:
+            import traceback
+            print(f"❌ Prediction error: {e}")
+            traceback.print_exc()
+            return {
+                'success': False,
+                'error': str(e),
+                'decision': 'ERROR'
+            }
+    def _get_risk_level(self, prob):
+        if prob < 0.2: return "LOW"
+        elif prob < 0.4: return "MEDIUM"
+        elif prob < 0.6: return "HIGH"
+        else: return "VERY HIGH"
+    def _get_confidence(self, prob):
+        distance = abs(prob - self.optimal_threshold)
+        return max(0.5, 1.0 - distance * 2)
+# Test with the exact features your model expects
+if __name__ == "__main__":
+    print("🧪 Testing CreditRiskPredictor...")
+    print("=" * 60)
+    # Create predictor
+    predictor = CreditRiskPredictor("model_artifacts")
+    if not predictor.feature_list:
+        print("\n❌ Cannot proceed without features!")
+    else:
+        # Create a test input with ALL the features your model actually needs
+        # Based on your JSON, here's what to provide:
+        test_loan = {
+            # Basic loan info
+            'loan_amnt': 15000,
+            'int_rate': 12.5,
+            # Categorical features (will be one-hot encoded)
+            'addr_state': 'CA',  # Will create addr_state_CA = 1
+            'home_ownership': 'RENT',  # Will create home_ownership_RENT = 1
+            'purpose': 'debt_consolidation',  # Will create purpose_debt_consolidation = 1
+            'verification_status': 'Verified',  # Will create verification_status_Verified = 1
+            # Title for title-based features
+            'title': 'Debt consolidation loan for credit card payoff',
+            # Credit features from your feature list
+            'dti': 18.5,
+            'annual_inc': 75000,
+            'revol_util': '45%',
+            'delinq_2yrs': 0,
+            'inq_last_6mths': 2,
+            'open_acc': 8,
+            'total_acc': 25,
+            'revol_bal': 5000,
+            'total_bc_limit': 20000,
+            'total_bal_ex_mort': 30000,
+            'avg_cur_bal': 2500,
+            'mo_sin_old_il_acct': 60,
+            'mo_sin_old_rev_tl_op': 48,
+            'mo_sin_rcnt_rev_tl_op': 12,
+            'mths_since_recent_bc': 6,
+            'mths_since_recent_inq': 3,
+            'pct_tl_nvr_dlq': 0.95,
+            'last_fico_range_low': 680,
+            'last_fico_range_high': 684,
+            # Additional features that might be needed
+            'grade': 'C',
+            'emp_length': '5 years',
+            'years_since_earliest_cr': 10
+        }
+        print(f"\n📊 Making test prediction...")
+        print(f"Using input with {len(test_loan)} fields")
+        result = predictor.predict(test_loan)
+        print("\n" + "=" * 60)
+        print("📈 PREDICTION RESULTS:")
+        print("=" * 60)
+        for key, value in result.items():
+            if key != 'explanation' or result['success']:
+                print(f"{key:25}: {value}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio==3.50.0
+pandas==1.5.0
+numpy==1.24.0
+scikit-learn==1.2.0
+xgboost==1.7.6
+joblib==1.2.0
+matplotlib==3.7.0

scaler.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c4b10627fd238874896d566a58e6d09c1a24275882a4c8aec293ff0cad2e9b0
+size 2935

training_features.csv ADDED Viewed

	@@ -0,0 +1,99 @@

+feature_name
+mths_since_recent_inq
+purpose_debt_consolidation
+addr_state_NE
+purpose_other
+purpose_wedding
+title_length
+purpose_vacation
+addr_state_SD
+addr_state_OH
+addr_state_VT
+addr_state_WY
+purpose_home_improvement
+addr_state_AR
+dti
+purpose_small_business
+last_fico_range_low
+addr_state_KY
+emp_length_numeric
+addr_state_TN
+addr_state_UT
+title_has_car
+addr_state_WA
+addr_state_KS
+addr_state_VA
+pct_tl_nvr_dlq
+int_rate_times_loan
+addr_state_ID
+loan_to_income
+addr_state_MA
+addr_state_ME
+mths_since_recent_bc
+addr_state_DE
+addr_state_MT
+addr_state_DC
+home_ownership_MORTGAGE
+addr_state_IA
+addr_state_LA
+mo_sin_old_rev_tl_op
+title_has_medical
+addr_state_NY
+addr_state_IL
+title_has_credit
+purpose_major_purchase
+addr_state_AL
+addr_state_CA
+verification_status_Verified
+verification_status_Source Verified
+total_bal_ex_mort
+addr_state_MD
+purpose_medical
+addr_state_MS
+revol_util_decimal
+addr_state_CT
+years_since_earliest_cr
+title_has_home
+mo_sin_old_il_acct
+addr_state_NC
+addr_state_RI
+addr_state_CO
+addr_state_OR
+addr_state_AZ
+addr_state_NV
+addr_state_MI
+addr_state_IN
+total_bc_limit
+home_ownership_OWN
+addr_state_SC
+subprime_high_dti
+home_ownership_RENT
+mo_sin_rcnt_rev_tl_op
+title_has_consolidation
+has_delinq_history
+addr_state_FL
+title_has_debt
+addr_state_NJ
+addr_state_WV
+addr_state_NH
+addr_state_HI
+title_has_card
+addr_state_TX
+annual_inc
+addr_state_GA
+addr_state_WI
+addr_state_MO
+addr_state_MN
+total_acc
+grade_numeric
+addr_state_PA
+addr_state_NM
+purpose_renewable_energy
+addr_state_OK
+last_fico_range_high
+revol_bal
+purpose_house
+purpose_moving
+title_word_count
+avg_cur_bal
+purpose_credit_card

training_features.json ADDED Viewed

	@@ -0,0 +1,155 @@

+{
+  "feature_names": [
+    "mths_since_recent_inq",
+    "purpose_debt_consolidation",
+    "addr_state_NE",
+    "purpose_other",
+    "purpose_wedding",
+    "title_length",
+    "purpose_vacation",
+    "addr_state_SD",
+    "addr_state_OH",
+    "addr_state_VT",
+    "addr_state_WY",
+    "purpose_home_improvement",
+    "addr_state_AR",
+    "dti",
+    "purpose_small_business",
+    "last_fico_range_low",
+    "addr_state_KY",
+    "emp_length_numeric",
+    "addr_state_TN",
+    "addr_state_UT",
+    "title_has_car",
+    "addr_state_WA",
+    "addr_state_KS",
+    "addr_state_VA",
+    "pct_tl_nvr_dlq",
+    "int_rate_times_loan",
+    "addr_state_ID",
+    "loan_to_income",
+    "addr_state_MA",
+    "addr_state_ME",
+    "mths_since_recent_bc",
+    "addr_state_DE",
+    "addr_state_MT",
+    "addr_state_DC",
+    "home_ownership_MORTGAGE",
+    "addr_state_IA",
+    "addr_state_LA",
+    "mo_sin_old_rev_tl_op",
+    "title_has_medical",
+    "addr_state_NY",
+    "addr_state_IL",
+    "title_has_credit",
+    "purpose_major_purchase",
+    "addr_state_AL",
+    "addr_state_CA",
+    "verification_status_Verified",
+    "verification_status_Source Verified",
+    "total_bal_ex_mort",
+    "addr_state_MD",
+    "purpose_medical",
+    "addr_state_MS",
+    "revol_util_decimal",
+    "addr_state_CT",
+    "years_since_earliest_cr",
+    "title_has_home",
+    "mo_sin_old_il_acct",
+    "addr_state_NC",
+    "addr_state_RI",
+    "addr_state_CO",
+    "addr_state_OR",
+    "addr_state_AZ",
+    "addr_state_NV",
+    "addr_state_MI",
+    "addr_state_IN",
+    "total_bc_limit",
+    "home_ownership_OWN",
+    "addr_state_SC",
+    "subprime_high_dti",
+    "home_ownership_RENT",
+    "mo_sin_rcnt_rev_tl_op",
+    "title_has_consolidation",
+    "has_delinq_history",
+    "addr_state_FL",
+    "title_has_debt",
+    "addr_state_NJ",
+    "addr_state_WV",
+    "addr_state_NH",
+    "addr_state_HI",
+    "title_has_card",
+    "addr_state_TX",
+    "annual_inc",
+    "addr_state_GA",
+    "addr_state_WI",
+    "addr_state_MO",
+    "addr_state_MN",
+    "total_acc",
+    "grade_numeric",
+    "addr_state_PA",
+    "addr_state_NM",
+    "purpose_renewable_energy",
+    "addr_state_OK",
+    "last_fico_range_high",
+    "revol_bal",
+    "purpose_house",
+    "purpose_moving",
+    "title_word_count",
+    "avg_cur_bal",
+    "purpose_credit_card"
+  ],
+  "feature_count": 98,
+  "categorical_features": [
+    "purpose_debt_consolidation",
+    "addr_state_NE",
+    "purpose_other",
+    "purpose_wedding",
+    "purpose_vacation",
+    "addr_state_SD",
+    "addr_state_OH",
+    "addr_state_VT",
+    "addr_state_WY",
+    "purpose_home_improvement",
+    "addr_state_AR",
+    "purpose_small_business",
+    "addr_state_KY",
+    "addr_state_TN",
+    "addr_state_UT",
+    "addr_state_WA",
+    "addr_state_KS",
+    "addr_state_VA",
+    "addr_state_ID",
+    "addr_state_MA",
+    "addr_state_ME",
+    "addr_state_DE",
+    "addr_state_MT",
+    "addr_state_DC",
+    "home_ownership_MORTGAGE",
+    "addr_state_IA",
+    "addr_state_LA",
+    "addr_state_NY",
+    "addr_state_IL",
+    "purpose_major_purchase",
+    "addr_state_AL",
+    "addr_state_CA",
+    "verification_status_Verified",
+    "verification_status_Source Verified",
+    "addr_state_MD",
+    "purpose_medical",
+    "addr_state_MS",
+    "addr_state_CT",
+    "addr_state_NC",
+    "addr_state_RI",
+    "addr_state_CO",
+    "addr_state_OR",
+    "addr_state_AZ",
+    "addr_state_NV",
+    "addr_state_MI",
+    "addr_state_IN",
+    "home_ownership_OWN",
+    "addr_state_SC",
+    "home_ownership_RENT",
+    "addr_state_FL"
+  ]
+}

xgb_best_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8bb8b39bb961670917a07abb4a256d08bc70c7245a59df0bee99820e709f5b61
+size 2583734