"""Gradio app for training and evaluating classifiers on a loan-delinquency CSV.

Upload a CSV (or fall back to a default dataset path), pick a model, and get
weighted precision/recall/F1 metrics plus a full classification report.
"""

import gradio as gr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

DEFAULT_DATA_PATH = "/mnt/data/Loan_Delinquent_Analysis_Dataset.csv"
TARGET_COL_DEFAULT = "Delinquency_Status"


def _build_model(model_name: str):
    """Return an unfitted scikit-learn estimator for the given dropdown label.

    Args:
        model_name: One of the dropdown choices shown in the UI.

    Raises:
        ValueError: If the label does not match a known model.
    """
    model_name = (model_name or "").strip()
    if model_name == "Logistic Regression":
        return LogisticRegression(max_iter=2000)
    if model_name == "Decision Tree":
        return DecisionTreeClassifier(random_state=1)
    if model_name == "Random Forest":
        return RandomForestClassifier(random_state=1, n_estimators=200)
    if model_name == "K-Nearest Neighbors (KNN)":
        return KNeighborsClassifier()
    if model_name == "Support Vector Machine (SVM)":
        return SVC()
    raise ValueError(f"Unknown model selection: {model_name}")


def train_from_csv(
    file_obj,
    model_name: str,
    target_col: str,
    test_size: float,
    random_state: int,
):
    """Train the selected model on an uploaded (or default) CSV and score it.

    Args:
        file_obj: Value from ``gr.File`` (tempfile wrapper, filepath string,
            or ``None`` to use ``DEFAULT_DATA_PATH``).
        model_name: Dropdown label passed to ``_build_model``.
        target_col: Name of the label column in the CSV.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test split.

    Returns:
        Tuple of (metrics DataFrame, classification-report text, run-details text).

    Raises:
        gr.Error: If ``target_col`` is not a column of the loaded CSV.
    """
    # Load CSV
    if file_obj is None:
        df = pd.read_csv(DEFAULT_DATA_PATH)
        source = f"Loaded default dataset from: {DEFAULT_DATA_PATH}"
    else:
        # gr.File may return a tempfile-like object with a .name path, or
        # (in newer Gradio versions) a plain filepath string — handle both.
        path = file_obj if isinstance(file_obj, str) else file_obj.name
        df = pd.read_csv(path)
        source = f"Loaded uploaded dataset: {path}"

    if target_col not in df.columns:
        raise gr.Error(
            f"Target column '{target_col}' not found. "
            f"Available columns: {list(df.columns)}"
        )

    # Basic cleanup: drop rows with missing target
    df = df.dropna(subset=[target_col]).copy()

    # Split features/target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Identify column types
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    num_cols = [c for c in X.columns if c not in cat_cols]

    # Preprocess: one-hot for categoricals, passthrough numeric
    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop",
        sparse_threshold=0.3,
    )

    model = _build_model(model_name)

    # Scale for consistency across models (esp. LR/SVM/KNN).
    # with_mean=False keeps scaling valid when the encoder emits sparse output.
    pipe = Pipeline(
        steps=[
            ("preprocess", preprocess),
            ("scaler", StandardScaler(with_mean=False)),
            ("model", model),
        ]
    )

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(test_size), random_state=int(random_state)
    )

    # Train + predict
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Metrics (weighted to match common lab pattern)
    train_acc = pipe.score(X_train, y_train)
    test_acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    report = classification_report(y_test, y_pred, digits=4, zero_division=0)

    metrics_df = pd.DataFrame(
        [{
            "Algorithm": model_name,
            "Training_Accuracy": train_acc,
            "Testing_Accuracy": test_acc,
            "Precision_weighted": precision,
            "Recall_weighted": recall,
            "F1_weighted": f1,
        }]
    )

    details = (
        f"{source}\n"
        f"Rows: {len(df):,} | Features: {X.shape[1]:,} | Target: '{target_col}'\n"
        f"Train size: {len(X_train):,} | Test size: {len(X_test):,}\n"
        f"Categorical cols: {len(cat_cols)} | Numeric cols: {len(num_cols)}"
    )

    return metrics_df, report, details


def build_demo():
    """Assemble and return the Gradio Blocks UI (inputs, button, outputs)."""
    with gr.Blocks(title="Loan Delinquency Model Trainer") as demo:
        gr.Markdown(
            "## Loan Delinquency Model Trainer\n"
            "Drag-and-drop a **CSV**, choose a **model**, train, and review "
            "**Precision/Recall/F1** and the **classification report**."
        )
        with gr.Row():
            file_in = gr.File(
                label="Upload CSV (drag & drop)",
                file_types=[".csv"],
            )
            model_in = gr.Dropdown(
                label="Select Model",
                choices=[
                    "Logistic Regression",
                    "Decision Tree",
                    "Random Forest",
                    "K-Nearest Neighbors (KNN)",
                    "Support Vector Machine (SVM)",
                ],
                value="Logistic Regression",
            )
        with gr.Row():
            target_in = gr.Textbox(
                label="Target Column",
                value=TARGET_COL_DEFAULT,
            )
            test_size_in = gr.Slider(
                label="Test Size",
                minimum=0.1,
                maximum=0.5,
                value=0.3,
                step=0.05,
            )
            rs_in = gr.Number(
                label="Random State",
                value=1,
                precision=0,
            )
        train_btn = gr.Button("Train Model", variant="primary")
        with gr.Row():
            metrics_out = gr.Dataframe(
                label="Model Performance (lab metrics)",
                wrap=True,
            )
        with gr.Row():
            report_out = gr.Textbox(
                label="Classification Report",
                lines=14,
            )
        with gr.Row():
            details_out = gr.Textbox(
                label="Run Details",
                lines=5,
            )
        train_btn.click(
            fn=train_from_csv,
            inputs=[file_in, model_in, target_in, test_size_in, rs_in],
            outputs=[metrics_out, report_out, details_out],
        )
        gr.Markdown(
            "**Note:** If you do not upload a file, the app will attempt to "
            "load the default dataset path:\n"
            f"`{DEFAULT_DATA_PATH}`"
        )
    return demo


if __name__ == "__main__":
    demo = build_demo()
    demo.launch()