Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """loan.py""" | |
| # Import necessary libraries | |
| from IPython.display import display | |
| import numpy as np | |
| import pandas as pd | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import warnings | |
| import ipywidgets as widgets | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LogisticRegression | |
| from imblearn.over_sampling import SMOTE | |
| from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score | |
| import gradio as gr | |
| from imblearn.pipeline import Pipeline as ImbPipeline | |
| import joblib | |
| from datasets import load_dataset # Import the Hugging Face dataset library | |
# Silence FutureWarning messages raised by the libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the dataset directly from the Hugging Face Hub.
dataset = load_dataset("AnguloM/loan_data")

# Only the 'train' split is read here. Use the Dataset's own exporter to get a
# pandas DataFrame instead of pushing the Dataset object through the generic
# DataFrame constructor.
df_train = dataset['train'].to_pandas()

# Hold out 20% of the rows as df_test; the fixed seed keeps the split
# reproducible. (train_test_split is already imported at the top of the file.)
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)
# Per-column overview of the training frame: dtype, non-null count and the
# total number of rows (the scalar is broadcast to every row of the summary).
info_df = pd.DataFrame({
    "Column": df_train.columns,
    "Data Type": df_train.dtypes,
    "Non-Null Count": df_train.notnull().sum(),
    "Total Count": len(df_train),
})

# Share of populated cells per column, rendered as a percentage string.
info_df['Non-Null Percentage'] = (
    (info_df['Non-Null Count'] / info_df['Total Count'] * 100).round(2).astype(str) + '%'
)

# Style the table: left-aligned cells, coloured header row, and a gradient on
# the two numeric count columns only.
info_df_styled = (
    info_df.style
    .set_properties(**{'text-align': 'left'})
    .set_table_styles(
        [{'selector': 'th',
          'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
    )
    .background_gradient(subset=['Non-Null Count', 'Total Count'], cmap="Oranges")
)

# Capture the styled table in an output widget.
table_widget = widgets.Output()
with table_widget:
    display(info_df_styled)

# Capture the per-column missing-value counts in a second output widget.
message_widget = widgets.Output()
with message_widget:
    print(f"\033[1;31mMissing values detected in any columns:\033[0m\n{df_train.isnull().sum()}")

# Show both widgets side by side. A bare `widgets.HBox(...)` expression is only
# rendered when it happens to be the last statement of a notebook cell, so it
# is passed to display() explicitly, like the VBox further down in this file.
display(widgets.HBox([table_widget, message_widget]))
# Store the repayment flag as a categorical column. assign() returns a new
# frame, so this does not write into a frame derived from a split (which can
# trigger pandas' SettingWithCopyWarning with in-place column assignment).
df_train = df_train.assign(
    **{'not.fully.paid': df_train['not.fully.paid'].astype('category')}
)

# Keep only numeric columns for the correlation matrix. 'number' selects every
# numeric dtype regardless of the platform's default integer width; the
# categorical column created above is excluded.
df_numeric = df_train.select_dtypes(include='number')

# One figure with two side-by-side axes. (No separate plt.figure() call:
# plt.subplots() creates its own figure, and an extra one would be left empty.)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left: pairwise correlations between the numeric columns.
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', ax=axes[0], fmt='.2f')
axes[0].set_title('Correlation Matrix')

# Right: row counts per value of 'not.fully.paid'.
sns.countplot(x='not.fully.paid', data=df_train, ax=axes[1])
axes[1].set_title('Distribution of Loan Repayment Status')

# Adjust spacing so the two panels do not overlap, then render.
plt.tight_layout()
plt.show()
# Columns fed to the model: numeric ones are standardised, the categorical one
# is one-hot encoded. Columns of X that are not listed here are not passed on
# by the ColumnTransformer (its default is to drop the remainder).
NUMERIC_FEATURES = [
    'int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
    'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths',
    'delinq.2yrs', 'pub.rec',
]
CATEGORICAL_FEATURES = ['purpose']

# Prepare data for training: work on a copy and separate the target
# ('credit.policy') from the remaining columns.
data = df_train.copy()
X = data.drop('credit.policy', axis=1)
y = data['credit.policy']

# Split the data into training (80%) and testing (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing step. handle_unknown='ignore' makes the encoder emit an
# all-zero group for a 'purpose' value it did not see while fitting, instead of
# raising at predict time (the encoder's default behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_FEATURES),
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
    ]
)

# imbalanced-learn pipeline: preprocess -> SMOTE oversampling -> classifier.
# The sampler step only takes part in fit(); predictions use the preprocessor
# and the classifier alone.
imb_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),
    ('classifier', LogisticRegression(max_iter=1000000)),
])

# Train the full pipeline (preprocessing + SMOTE + model) on the training split.
imb_model_pipeline.fit(X_train, y_train)
# Score the held-out split: default hard labels, plus the probability taken
# from the second column of predict_proba.
y_pred = imb_model_pipeline.predict(X_test)
y_pred_proba = imb_model_pipeline.predict_proba(X_test)[:, 1]

# Apply a custom cut-off to the probabilities; a value below 0.5 labels more
# rows as class 1 than the default decision rule would.
threshold = 0.3
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

# Classification report for the thresholded labels, as a dict so it can be
# turned into a table (one row per class / average).
classification_rep = classification_report(y_test, y_pred_adjusted, output_dict=True)
classification_df = pd.DataFrame(classification_rep).T

# Same header styling as the other tables in this script, centred cells.
header_style = [
    {'selector': 'th',
     'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}
]
classification_df_styled = (
    classification_df.style
    .set_properties(**{'text-align': 'center'})
    .set_table_styles(header_style)
)

# Capture the styled report in an output widget.
table_widget = widgets.Output()
with table_widget:
    display(classification_df_styled)

# AUC-ROC is computed from the raw probabilities, not the thresholded labels.
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Capture the AUC-ROC line in its own output widget.
auc_widget = widgets.Output()
with auc_widget:
    print("\033[1;31mAUC-ROC:\033[0m", f"{auc_roc:.4f}")

# Stack the report above the AUC-ROC line and render both.
display(widgets.VBox([table_widget, auc_widget]))

# Confusion matrix for the thresholded labels, drawn as an annotated heatmap
# on the current axes.
cm = confusion_matrix(y_test, y_pred_adjusted)
cm_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_axes.set_title("Confusion Matrix")
cm_axes.set_xlabel("Predicted")
cm_axes.set_ylabel("Actual")
plt.show()
from huggingface_hub import hf_hub_download

# Location of the serialized pipeline on the Hugging Face Hub.
MODEL_REPO_ID = "AnguloM/LoanSmart_Predict_Loan_Approval_with_Confidence"
MODEL_FILENAME = "loan_approval_pipeline.pkl"

# Download the serialized pipeline and get the path of the local copy.
model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)

# SECURITY NOTE: joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load artifacts from a repository you control or trust.
# (joblib is already imported at the top of the file.)
pipeline = joblib.load(model_path)
# Prediction function
def predict_approval(int_rate, installment, log_annual_inc, dti, fico,
                     days_with_cr_line, revol_bal, revol_util, inq_last_6mths,
                     delinq_2yrs, pub_rec, purpose):
    """Return the approval message for a single loan application.

    The twelve arguments are placed in a one-row DataFrame under the column
    names used below and passed to the module-level ``pipeline``.

    Returns:
        "Loan Approved" when the pipeline predicts 1 for the row,
        otherwise "Loan Not Approved".
    """
    # Insertion order of this dict fixes the column order of the frame.
    applicant = {
        'int.rate': int_rate,
        'installment': installment,
        'log.annual.inc': log_annual_inc,
        'dti': dti,
        'fico': fico,
        'days.with.cr.line': days_with_cr_line,
        'revol.bal': revol_bal,
        'revol.util': revol_util,
        'inq.last.6mths': inq_last_6mths,
        'delinq.2yrs': delinq_2yrs,
        'pub.rec': pub_rec,
        'purpose': purpose,
    }
    features = pd.DataFrame([applicant])

    # The pipeline returns one prediction per row; there is exactly one row.
    prediction = pipeline.predict(features)[0]
    if prediction == 1:
        return "Loan Approved"
    return "Loan Not Approved"
# (minimum, maximum, step, label) for every numeric slider, listed in the
# positional order in which predict_approval takes its arguments.
# NOTE(review): the first slider is labelled as a percentage (0-25); confirm
# the loaded pipeline expects 'int.rate' on that same scale.
slider_specs = [
    (0.0, 25.0, 0.1, "Interest Rate (%)"),
    (0.0, 1000.0, 10.0, "Installment Amount"),
    (0.0, 15.0, 0.1, "Log of Annual Income"),
    (0.0, 50.0, 0.1, "Debt-to-Income Ratio"),
    (300, 850, 1, "FICO Credit Score"),
    (0.0, 50000.0, 100.0, "Days with Credit Line"),
    (0.0, 100000.0, 500.0, "Revolving Balance"),
    (0.0, 150.0, 0.1, "Revolving Utilization (%)"),
    (0, 10, 1, "Recent Inquiries (Last 6 Months)"),
    (0, 10, 1, "Delinquencies in Last 2 Years"),
    (0, 5, 1, "Public Records"),
]

# Input components for the Gradio interface: the sliders first, then the
# loan-purpose dropdown as the final (twelfth) input.
inputs = [
    gr.Slider(lower, upper, step=step, label=label)
    for lower, upper, step, label in slider_specs
]
# NOTE(review): these choices are passed to the pipeline verbatim as 'purpose';
# verify each one matches a category the pipeline was trained with.
inputs.append(
    gr.Dropdown(["credit_card", "debt_consolidation", "educational",
                 "home_improvement", "major_purchase", "small_business",
                 "other"], label="Loan Purpose")
)

# Build the Gradio interface for loan approval prediction and start it.
gr.Interface(fn=predict_approval, inputs=inputs, outputs="text").launch(share=True)