# Gradio app: train and evaluate loan-delinquency classifiers from an uploaded CSV.
import gradio as gr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (
classification_report,
accuracy_score,
precision_score,
recall_score,
f1_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# Fallback dataset loaded when the user does not upload a file.
DEFAULT_DATA_PATH = "/mnt/data/Loan_Delinquent_Analysis_Dataset.csv"
# Default name of the label column expected in the CSV.
TARGET_COL_DEFAULT = "Delinquency_Status"
def _build_model(model_name: str):
model_name = (model_name or "").strip()
if model_name == "Logistic Regression":
return LogisticRegression(max_iter=2000)
if model_name == "Decision Tree":
return DecisionTreeClassifier(random_state=1)
if model_name == "Random Forest":
return RandomForestClassifier(random_state=1, n_estimators=200)
if model_name == "K-Nearest Neighbors (KNN)":
return KNeighborsClassifier()
if model_name == "Support Vector Machine (SVM)":
return SVC()
raise ValueError(f"Unknown model selection: {model_name}")
def train_from_csv(
    file_obj,
    model_name: str,
    target_col: str,
    test_size: float,
    random_state: int,
):
    """Train the selected classifier on a CSV and report evaluation metrics.

    Args:
        file_obj: Uploaded-file payload from ``gr.File``, or ``None`` to fall
            back to ``DEFAULT_DATA_PATH``. Depending on the Gradio version
            this is either a plain filepath string or a tempfile wrapper
            exposing the path via ``.name``; both are supported.
        model_name: One of the dropdown labels understood by ``_build_model``.
        target_col: Name of the label column in the CSV.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test split.

    Returns:
        Tuple of (one-row metrics DataFrame, classification-report text,
        human-readable run details string).

    Raises:
        gr.Error: If ``target_col`` is not a column of the loaded CSV.
    """
    # Resolve the CSV path. Fix: newer Gradio versions deliver gr.File values
    # as plain path strings, so the old unconditional `file_obj.name` access
    # raised AttributeError; getattr covers both payload shapes.
    if file_obj is None:
        path = DEFAULT_DATA_PATH
        source = f"Loaded default dataset from: {DEFAULT_DATA_PATH}"
    else:
        path = getattr(file_obj, "name", file_obj)
        source = f"Loaded uploaded dataset: {path}"
    df = pd.read_csv(path)
    if target_col not in df.columns:
        raise gr.Error(
            f"Target column '{target_col}' not found. Available columns: {list(df.columns)}"
        )
    # Basic cleanup: drop only rows whose label is missing.
    df = df.dropna(subset=[target_col]).copy()
    # Split features/target.
    X = df.drop(columns=[target_col])
    y = df[target_col]
    # One-hot encode anything non-numeric; pass numeric columns through.
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    num_cols = [c for c in X.columns if c not in cat_cols]
    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop",
        sparse_threshold=0.3,
    )
    model = _build_model(model_name)
    # Scale after encoding so LR/SVM/KNN see comparable feature ranges;
    # with_mean=False keeps sparse one-hot output sparse.
    pipe = Pipeline(
        steps=[
            ("preprocess", preprocess),
            ("scaler", StandardScaler(with_mean=False)),
            ("model", model),
        ]
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(test_size), random_state=int(random_state)
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Weighted averages so multi-class targets are summarized sensibly;
    # zero_division=0 avoids warnings when a class gets no predictions.
    train_acc = pipe.score(X_train, y_train)
    test_acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    report = classification_report(y_test, y_pred, digits=4, zero_division=0)
    metrics_df = pd.DataFrame(
        [{
            "Algorithm": model_name,
            "Training_Accuracy": train_acc,
            "Testing_Accuracy": test_acc,
            "Precision_weighted": precision,
            "Recall_weighted": recall,
            "F1_weighted": f1,
        }]
    )
    details = (
        f"{source}\n"
        f"Rows: {len(df):,} | Features: {X.shape[1]:,} | Target: '{target_col}'\n"
        f"Train size: {len(X_train):,} | Test size: {len(X_test):,}\n"
        f"Categorical cols: {len(cat_cols)} | Numeric cols: {len(num_cols)}"
    )
    return metrics_df, report, details
def build_demo():
    """Assemble and return the Gradio Blocks UI for the trainer."""
    with gr.Blocks(title="Loan Delinquency Model Trainer") as app:
        # Header / usage blurb.
        gr.Markdown(
            "## Loan Delinquency Model Trainer\n"
            "Drag-and-drop a **CSV**, choose a **model**, train, and review **Precision/Recall/F1** and the **classification report**."
        )
        # Row 1: dataset upload + estimator picker.
        with gr.Row():
            csv_input = gr.File(label="Upload CSV (drag & drop)", file_types=[".csv"])
            model_choice = gr.Dropdown(
                label="Select Model",
                choices=[
                    "Logistic Regression",
                    "Decision Tree",
                    "Random Forest",
                    "K-Nearest Neighbors (KNN)",
                    "Support Vector Machine (SVM)",
                ],
                value="Logistic Regression",
            )
        # Row 2: training hyper-parameters.
        with gr.Row():
            target_box = gr.Textbox(label="Target Column", value=TARGET_COL_DEFAULT)
            split_slider = gr.Slider(
                label="Test Size", minimum=0.1, maximum=0.5, value=0.3, step=0.05
            )
            seed_input = gr.Number(label="Random State", value=1, precision=0)
        run_button = gr.Button("Train Model", variant="primary")
        # Output widgets: summary table, full report text, run description.
        with gr.Row():
            metrics_table = gr.Dataframe(label="Model Performance (lab metrics)", wrap=True)
        with gr.Row():
            report_box = gr.Textbox(label="Classification Report", lines=14)
        with gr.Row():
            details_box = gr.Textbox(label="Run Details", lines=5)
        # Wire the button to the training function.
        run_button.click(
            fn=train_from_csv,
            inputs=[csv_input, model_choice, target_box, split_slider, seed_input],
            outputs=[metrics_table, report_box, details_box],
        )
        gr.Markdown(
            "**Note:** If you do not upload a file, the app will attempt to load the default dataset path:\n"
            f"`{DEFAULT_DATA_PATH}`"
        )
    return app
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    build_demo().launch()