# File size: 6,566 Bytes
# c2f8de8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import gradio as gr
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


# CSV read by train_from_csv() when no file is uploaded; also shown in the UI note.
DEFAULT_DATA_PATH = "/mnt/data/Loan_Delinquent_Analysis_Dataset.csv"
# Initial value of the "Target Column" textbox in the UI.
TARGET_COL_DEFAULT = "Delinquency_Status"


def _build_model(model_name: str):
    model_name = (model_name or "").strip()

    if model_name == "Logistic Regression":
        return LogisticRegression(max_iter=2000)
    if model_name == "Decision Tree":
        return DecisionTreeClassifier(random_state=1)
    if model_name == "Random Forest":
        return RandomForestClassifier(random_state=1, n_estimators=200)
    if model_name == "K-Nearest Neighbors (KNN)":
        return KNeighborsClassifier()
    if model_name == "Support Vector Machine (SVM)":
        return SVC()

    raise ValueError(f"Unknown model selection: {model_name}")


def _read_dataset(file_obj):
    """Load the training CSV and build a one-line description of its origin.

    Args:
        file_obj: ``None`` to read ``DEFAULT_DATA_PATH``; otherwise either a
            filesystem path (``str`` / ``os.PathLike``) or an object that
            exposes the path of the uploaded file as ``.name``.

    Returns:
        ``(df, source)``: the parsed DataFrame and a human-readable message
        naming the file that was loaded.

    Raises:
        gr.Error: If the file cannot be opened, decoded, or parsed as CSV.
    """
    # Function-scope import: `os` is not among the visible module-level imports.
    import os

    if file_obj is None:
        csv_path = DEFAULT_DATA_PATH
        source = f"Loaded default dataset from: {csv_path}"
    else:
        if isinstance(file_obj, (str, os.PathLike)):
            # Plain paths are used directly; `.name` on a pathlib.Path would
            # be only the basename, which is not what we want here.
            csv_path = file_obj
        else:
            csv_path = file_obj.name
        source = f"Loaded uploaded dataset: {csv_path}"

    try:
        df = pd.read_csv(csv_path)
    except (
        OSError,
        UnicodeDecodeError,
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
    ) as err:
        # Surface read problems the same way as the target-column check does.
        raise gr.Error(f"Could not read CSV '{csv_path}': {err}") from err

    return df, source


def train_from_csv(
    file_obj,
    model_name: str,
    target_col: str,
    test_size: float,
    random_state: int,
):
    """Train the selected classifier on a CSV and report hold-out metrics.

    Args:
        file_obj: Uploaded file (an object with a ``.name`` path, or a plain
            path), or ``None`` to fall back to ``DEFAULT_DATA_PATH``.
        model_name: Display name understood by ``_build_model``.
        target_col: Name of the label column. If the exact name is absent,
            the whitespace-stripped name is tried before giving up.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test split; ``None`` leaves the
            split unseeded.

    Returns:
        ``(metrics_df, report, details)``: a one-row DataFrame of accuracy and
        weighted precision/recall/F1, the text classification report, and a
        multi-line summary of the run.

    Raises:
        gr.Error: If the CSV cannot be read, the target column is missing, or
            no usable rows/feature columns remain.
        ValueError: Propagated from ``_build_model`` for an unknown model.
    """
    df, source = _read_dataset(file_obj)

    if target_col not in df.columns:
        # Tolerate stray whitespace typed into the textbox, but only after the
        # exact name has failed so oddly named columns stay addressable.
        stripped = (target_col or "").strip()
        if stripped not in df.columns:
            raise gr.Error(
                f"Target column '{target_col}' not found. Available columns: {list(df.columns)}"
            )
        target_col = stripped

    # Basic cleanup: drop rows with missing target
    df = df.dropna(subset=[target_col]).copy()
    if df.empty:
        raise gr.Error(
            f"No rows left after dropping rows with a missing '{target_col}' value."
        )

    # Split features/target
    X = df.drop(columns=[target_col])
    y = df[target_col]
    if X.shape[1] == 0:
        raise gr.Error("The dataset has no feature columns besides the target.")

    # Identify column types. Anything that is not a (non-bool) number is
    # treated as categorical, so string-extension or datetime columns are
    # one-hot encoded instead of being passed through to the scaler.
    is_bool = pd.api.types.is_bool_dtype
    is_numeric = pd.api.types.is_numeric_dtype
    cat_cols = [c for c in X.columns if is_bool(X[c]) or not is_numeric(X[c])]
    num_cols = [c for c in X.columns if c not in cat_cols]

    # Preprocess: one-hot for categoricals, passthrough numeric
    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop",
        sparse_threshold=0.3,
    )

    model = _build_model(model_name)

    # Scale for consistency across models (esp. LR/SVM/KNN). Use with_mean=False for sparse output.
    pipe = Pipeline(
        steps=[
            ("preprocess", preprocess),
            ("scaler", StandardScaler(with_mean=False)),
            ("model", model),
        ]
    )

    # A cleared "Random State" field arrives as None; pass it through rather
    # than failing on int(None).
    seed = None if random_state is None else int(random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(test_size), random_state=seed
    )

    # Train + predict
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Metrics (weighted to match common lab pattern)
    train_acc = pipe.score(X_train, y_train)
    test_acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    report = classification_report(y_test, y_pred, digits=4, zero_division=0)

    metrics_df = pd.DataFrame(
        [{
            "Algorithm": model_name,
            "Training_Accuracy": train_acc,
            "Testing_Accuracy": test_acc,
            "Precision_weighted": precision,
            "Recall_weighted": recall,
            "F1_weighted": f1,
        }]
    )

    details = (
        f"{source}\n"
        f"Rows: {len(df):,} | Features: {X.shape[1]:,} | Target: '{target_col}'\n"
        f"Train size: {len(X_train):,} | Test size: {len(X_test):,}\n"
        f"Categorical cols: {len(cat_cols)} | Numeric cols: {len(num_cols)}"
    )

    return metrics_df, report, details


def build_demo():
    """Assemble the Gradio Blocks interface for the model trainer.

    The layout is: an intro, a row with the CSV upload and model selector, a
    row with the target column / test size / random state inputs, the train
    button, three output rows (metrics table, classification report, run
    details), and a closing note about the default dataset path. The train
    button is wired to ``train_from_csv``.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    model_choices = [
        "Logistic Regression",
        "Decision Tree",
        "Random Forest",
        "K-Nearest Neighbors (KNN)",
        "Support Vector Machine (SVM)",
    ]
    intro_md = (
        "## Loan Delinquency Model Trainer\n"
        "Drag-and-drop a **CSV**, choose a **model**, train, and review **Precision/Recall/F1** and the **classification report**."
    )
    default_path_note = (
        "**Note:** If you do not upload a file, the app will attempt to load the default dataset path:\n"
        f"`{DEFAULT_DATA_PATH}`"
    )

    with gr.Blocks(title="Loan Delinquency Model Trainer") as app:
        gr.Markdown(intro_md)

        # Data source and algorithm selection.
        with gr.Row():
            csv_upload = gr.File(label="Upload CSV (drag & drop)", file_types=[".csv"])
            model_picker = gr.Dropdown(
                label="Select Model",
                choices=model_choices,
                value=model_choices[0],
            )

        # Training parameters.
        with gr.Row():
            target_box = gr.Textbox(label="Target Column", value=TARGET_COL_DEFAULT)
            split_slider = gr.Slider(
                label="Test Size", minimum=0.1, maximum=0.5, value=0.3, step=0.05
            )
            seed_box = gr.Number(label="Random State", value=1, precision=0)

        run_button = gr.Button("Train Model", variant="primary")

        # Results, one component per row.
        with gr.Row():
            metrics_table = gr.Dataframe(
                label="Model Performance (lab metrics)", wrap=True
            )
        with gr.Row():
            report_box = gr.Textbox(label="Classification Report", lines=14)
        with gr.Row():
            details_box = gr.Textbox(label="Run Details", lines=5)

        # Order must match train_from_csv's parameters and its returned tuple.
        callback_inputs = [csv_upload, model_picker, target_box, split_slider, seed_box]
        callback_outputs = [metrics_table, report_box, details_box]
        run_button.click(
            fn=train_from_csv,
            inputs=callback_inputs,
            outputs=callback_outputs,
        )

        gr.Markdown(default_path_note)

    return app


# Script entry point: build the UI and launch it only when this file is run
# directly, so importing the module has no side effects beyond definitions.
if __name__ == "__main__":
    demo = build_demo()
    demo.launch()