Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler | |
| from sklearn.metrics import ( | |
| classification_report, | |
| accuracy_score, | |
| precision_score, | |
| recall_score, | |
| f1_score, | |
| ) | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.svm import SVC | |
# CSV that train_from_csv reads when no file has been uploaded.
DEFAULT_DATA_PATH: str = "/mnt/data/Loan_Delinquent_Analysis_Dataset.csv"
# Initial value shown in the "Target Column" textbox of the UI.
TARGET_COL_DEFAULT: str = "Delinquency_Status"
| def _build_model(model_name: str): | |
| model_name = (model_name or "").strip() | |
| if model_name == "Logistic Regression": | |
| return LogisticRegression(max_iter=2000) | |
| if model_name == "Decision Tree": | |
| return DecisionTreeClassifier(random_state=1) | |
| if model_name == "Random Forest": | |
| return RandomForestClassifier(random_state=1, n_estimators=200) | |
| if model_name == "K-Nearest Neighbors (KNN)": | |
| return KNeighborsClassifier() | |
| if model_name == "Support Vector Machine (SVM)": | |
| return SVC() | |
| raise ValueError(f"Unknown model selection: {model_name}") | |
def _read_dataset(file_obj):
    """Load the uploaded CSV, or the default dataset when nothing is uploaded.

    Args:
        file_obj: ``None``, a filesystem path, or an object exposing the
            path of the uploaded file as ``.name``.

    Returns:
        A ``(DataFrame, description)`` pair, where ``description`` is a
        one-line note saying which file was loaded.

    Raises:
        gr.Error: If the file is missing, unreadable, empty, or not
            parseable as CSV.
    """
    if file_obj is None:
        path = DEFAULT_DATA_PATH
        source = f"Loaded default dataset from: {path}"
    else:
        # NOTE(review): depending on the Gradio version/configuration the
        # File component may hand back a plain path string or a wrapper
        # with a ``.name`` path -- accept both.
        path = getattr(file_obj, "name", file_obj)
        source = f"Loaded uploaded dataset: {path}"

    try:
        df = pd.read_csv(path)
    except (
        OSError,
        UnicodeDecodeError,
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
    ) as err:
        # Surface I/O and parse problems in the UI, in the same style as the
        # target-column check, instead of as an unhandled exception.
        raise gr.Error(f"Could not read CSV '{path}': {err}") from err
    return df, source


def train_from_csv(
    file_obj,
    model_name: str,
    target_col: str,
    test_size: float,
    random_state: int,
):
    """Train the selected classifier on a CSV and report hold-out metrics.

    Rows whose target value is missing are dropped. Object/category/bool
    feature columns are one-hot encoded, every other column is passed
    through, and the combined matrix is scaled before it reaches the model.

    Args:
        file_obj: Upload from the File component, or ``None`` to fall back
            to ``DEFAULT_DATA_PATH``.
        model_name: Label understood by ``_build_model``.
        target_col: Name of the column to predict.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test split.

    Returns:
        A tuple ``(metrics_df, report, details)``: a one-row DataFrame of
        accuracy and weighted precision/recall/F1, the text classification
        report, and a multi-line summary of the run.

    Raises:
        gr.Error: If the CSV cannot be read, the target column is absent,
            no usable rows or feature columns remain, the model label is
            unknown, or the split settings are not numeric.
    """
    df, source = _read_dataset(file_obj)

    if target_col not in df.columns:
        raise gr.Error(
            f"Target column '{target_col}' not found. Available columns: {list(df.columns)}"
        )

    # Basic cleanup: drop rows with missing target.
    df = df.dropna(subset=[target_col]).copy()
    if df.empty:
        raise gr.Error(
            f"No rows remain after dropping missing values in '{target_col}'."
        )

    # Split features/target.
    X = df.drop(columns=[target_col])
    y = df[target_col]
    if X.shape[1] == 0:
        raise gr.Error("The dataset has no feature columns besides the target.")

    # Identify column types.
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    num_cols = [c for c in X.columns if c not in cat_cols]

    # Preprocess: one-hot for categoricals, numeric columns pass through
    # unchanged at this stage (they are scaled by the next pipeline step).
    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop",
        sparse_threshold=0.3,
    )

    try:
        model = _build_model(model_name)
    except ValueError as err:
        raise gr.Error(str(err)) from err

    # Scale for consistency across models (esp. LR/SVM/KNN).
    # with_mean=False keeps the scaler usable on sparse encoder output.
    pipe = Pipeline(
        steps=[
            ("preprocess", preprocess),
            ("scaler", StandardScaler(with_mean=False)),
            ("model", model),
        ]
    )

    # A cleared number field arrives as None; report that in the UI rather
    # than crashing inside float()/int().
    try:
        holdout_fraction = float(test_size)
        seed = int(random_state)
    except (TypeError, ValueError) as err:
        raise gr.Error("Test Size and Random State must both be numeric.") from err

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=holdout_fraction, random_state=seed
    )

    # Train + predict.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Metrics (weighted averages; zero_division=0 avoids warnings for
    # classes that never get predicted).
    train_acc = pipe.score(X_train, y_train)
    test_acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    report = classification_report(y_test, y_pred, digits=4, zero_division=0)

    metrics_df = pd.DataFrame(
        [{
            "Algorithm": model_name,
            "Training_Accuracy": train_acc,
            "Testing_Accuracy": test_acc,
            "Precision_weighted": precision,
            "Recall_weighted": recall,
            "F1_weighted": f1,
        }]
    )

    details = (
        f"{source}\n"
        f"Rows: {len(df):,} | Features: {X.shape[1]:,} | Target: '{target_col}'\n"
        f"Train size: {len(X_train):,} | Test size: {len(X_test):,}\n"
        f"Categorical cols: {len(cat_cols)} | Numeric cols: {len(num_cols)}"
    )
    return metrics_df, report, details
def build_demo():
    """Assemble the Gradio Blocks UI for the model trainer and return it.

    Layout, top to bottom: intro text, a row with the CSV upload and model
    dropdown, a row with target column / test size / random state, the
    train button, three output rows (metrics table, classification report,
    run details), and a note about the default dataset path. Clicking the
    button calls ``train_from_csv`` with the five inputs and fills the
    three outputs.
    """
    model_labels = [
        "Logistic Regression",
        "Decision Tree",
        "Random Forest",
        "K-Nearest Neighbors (KNN)",
        "Support Vector Machine (SVM)",
    ]
    intro_md = (
        "## Loan Delinquency Model Trainer\n"
        "Drag-and-drop a **CSV**, choose a **model**, train, and review **Precision/Recall/F1** and the **classification report**."
    )
    default_note_md = (
        "**Note:** If you do not upload a file, the app will attempt to load the default dataset path:\n"
        f"`{DEFAULT_DATA_PATH}`"
    )

    with gr.Blocks(title="Loan Delinquency Model Trainer") as demo:
        gr.Markdown(intro_md)

        # Data source and algorithm choice.
        with gr.Row():
            csv_upload = gr.File(label="Upload CSV (drag & drop)", file_types=[".csv"])
            model_choice = gr.Dropdown(
                label="Select Model",
                choices=model_labels,
                value="Logistic Regression",
            )

        # Training parameters.
        with gr.Row():
            target_box = gr.Textbox(label="Target Column", value=TARGET_COL_DEFAULT)
            holdout_slider = gr.Slider(
                label="Test Size", minimum=0.1, maximum=0.5, value=0.3, step=0.05
            )
            seed_box = gr.Number(label="Random State", value=1, precision=0)

        run_button = gr.Button("Train Model", variant="primary")

        # Outputs, one per row.
        with gr.Row():
            metrics_table = gr.Dataframe(
                label="Model Performance (lab metrics)", wrap=True
            )
        with gr.Row():
            report_box = gr.Textbox(label="Classification Report", lines=14)
        with gr.Row():
            details_box = gr.Textbox(label="Run Details", lines=5)

        run_button.click(
            fn=train_from_csv,
            inputs=[csv_upload, model_choice, target_box, holdout_slider, seed_box],
            outputs=[metrics_table, report_box, details_box],
        )

        gr.Markdown(default_note_md)

    return demo
# Script entry point: build the UI and start the Gradio server only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    demo = build_demo()
    demo.launch()