Spaces:
Sleeping
Sleeping
File size: 10,706 Bytes
66c203c 9a53dda 66c203c 9a53dda 66c203c 3c1c33b 66c203c 3c1c33b 66c203c 3c1c33b 66c203c 3c1c33b b911157 3c1c33b 66c203c 9a53dda 66c203c 3c1c33b 66c203c 9a53dda 66c203c 3c1c33b 66c203c 3c1c33b 66c203c 9a53dda 66c203c 9a53dda 66c203c 9a53dda 66c203c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
# Gradio app: CSV -> Preprocessing -> Logistic Regression with hyperparameter tuning
# Save this file as gradio_logreg_app.py and run: python gradio_logreg_app.py
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import gradio as gr
def _rewind_stream(stream):
    """Move ``stream`` back to its start when it supports seeking; otherwise do nothing."""
    try:
        stream.seek(0)
    except (AttributeError, OSError, ValueError):
        # Non-seekable or already-closed streams: let the reader itself
        # report the problem so the caller still gets an error tuple.
        pass


def load_csv(file_obj):
    """Load a tabular file into a DataFrame, trying CSV first and Excel second.

    Args:
        file_obj: A filesystem path (``str`` or any ``os.PathLike`` such as
            ``pathlib.Path``) or a file-like object exposing ``read``.

    Returns:
        A ``(dataframe, error)`` tuple. On success ``error`` is ``None``.
        On failure ``dataframe`` is ``None`` and ``error`` is a message that
        contains both the CSV and the Excel parser errors, or
        ``"Unsupported file type."`` when ``file_obj`` is neither a path nor
        a readable object.
    """
    import os  # local import so this function does not depend on file-level imports

    # Case: filesystem path (plain string or os.PathLike)
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
        try:
            return pd.read_csv(path), None
        except Exception as e_csv:
            try:
                return pd.read_excel(path), None
            except Exception as e_xls:
                return None, f"Failed to read file from path. CSV error: {e_csv} / Excel error: {e_xls}"

    # Case: file-like object
    if hasattr(file_obj, "read"):
        _rewind_stream(file_obj)
        try:
            return pd.read_csv(file_obj), None
        except Exception as e_csv:
            # The failed CSV attempt may have consumed part of the stream.
            _rewind_stream(file_obj)
            try:
                return pd.read_excel(file_obj), None
            except Exception as e_xls:
                return None, f"Failed to read file object. CSV error: {e_csv} / Excel error: {e_xls}"

    return None, "Unsupported file type."
def on_upload(file):
    """Handle a change of the upload widget.

    Loads the uploaded file via ``load_csv`` and prepares the values for the
    three outputs wired to this callback: the target-column dropdown, the
    status text and the DataFrame kept in state.

    Args:
        file: The value delivered by the file-upload component, or ``None``
            when no file is present.

    Returns:
        A ``(dropdown_update, status_message, dataframe)`` tuple. The
        dataframe is ``None`` when nothing was loaded.
    """
    # ``gr.update`` is used instead of ``gr.Dropdown.update`` because the
    # per-component ``update`` classmethods no longer exist in Gradio 4.
    if file is None:
        return gr.update(choices=[], value=None), "No file uploaded", None

    df, err = load_csv(file)
    if err:
        return gr.update(choices=[], value=None), f"Error: {err}", None

    cols = df.columns.tolist()
    # Pre-select the last column as the target when the file has columns.
    default_target = cols[-1] if cols else None
    status = f"Loaded {len(df)} rows, {len(cols)} columns"
    return gr.update(choices=cols, value=default_target), status, df
# Helper: build preprocessing + model pipeline
def _make_one_hot_encoder():
    """Create a dense-output ``OneHotEncoder`` that ignores unknown categories.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    later releases dropped the old spelling, so the new name is tried first.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only accept ``sparse``.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def build_pipeline(df, target_col, impute_strategy, apply_scaling, encode_categorical):
    """Build an unfitted preprocessing + logistic-regression pipeline.

    Args:
        df: DataFrame that includes ``target_col``; only the dtypes of the
            remaining columns are inspected.
        target_col: Name of the label column, excluded from the features.
        impute_strategy: ``SimpleImputer`` strategy for numeric columns, or
            ``'none'`` to skip numeric imputation.
        apply_scaling: When true, numeric columns are standard-scaled.
        encode_categorical: When true, non-numeric columns are imputed with
            the most frequent value and one-hot encoded.

    Returns:
        A ``Pipeline`` with the steps ``'preproc'`` and ``'clf'``
        (``LogisticRegression(max_iter=200)``).
    """
    features = df.drop(columns=[target_col])
    numeric_cols = features.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = features.select_dtypes(exclude=[np.number]).columns.tolist()

    transformers = []

    if numeric_cols:
        num_steps = []
        if impute_strategy != 'none':
            num_steps.append(('imputer', SimpleImputer(strategy=impute_strategy)))
        if apply_scaling:
            num_steps.append(('scaler', StandardScaler()))
        if num_steps:
            transformers.append(('num', Pipeline(steps=num_steps), numeric_cols))

    if categorical_cols and encode_categorical:
        cat_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohe', _make_one_hot_encoder()),
        ])
        transformers.append(('cat', cat_transformer, categorical_cols))

    if transformers:
        # Columns without a dedicated transformer are forwarded unchanged.
        preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')
    else:
        preprocessor = 'passthrough'

    return Pipeline(steps=[('preproc', preprocessor), ('clf', LogisticRegression(max_iter=200))])
# Training function
def train_model(df, target_col, test_size, random_state, impute_strategy, apply_scaling, encode_categorical,
                use_grid, c_min, c_max, c_steps, penalties, solver, cv_folds, max_iter, n_jobs):
    """Train a logistic-regression pipeline and evaluate it on a hold-out split.

    Args:
        df: Loaded DataFrame, or ``None`` when nothing has been uploaded.
        target_col: Name of the label column.
        test_size: Fraction of rows held out for testing.
        random_state: Seed passed to ``train_test_split``.
        impute_strategy, apply_scaling, encode_categorical: Preprocessing
            options forwarded to ``build_pipeline``.
        use_grid: When true, tune ``C``/``penalty`` with ``GridSearchCV``;
            otherwise fit once with ``C = (c_min + c_max) / 2`` and the first
            selected penalty.
        c_min, c_max, c_steps: Bounds and number of points of the linear
            ``C`` grid.
        penalties: Selected penalty names; ``'l2'`` is used when empty.
        solver: Solver name passed to ``LogisticRegression``.
        cv_folds: Number of cross-validation folds for the grid search.
        max_iter: Iteration limit for the classifier.
        n_jobs: Parallel jobs for the grid search.

    Returns:
        A ``(message, figure, report, model)`` tuple. When the input checks
        fail, ``message`` describes the problem and the other three items are
        ``None``.
    """
    # Basic checks
    if df is None:
        return "No data loaded", None, None, None
    if target_col not in df.columns:
        return f"Target column '{target_col}' not found", None, None, None

    # Drop rows where target is missing. ``dropna`` returns a new frame, so
    # the caller's DataFrame is not modified.
    data = df.dropna(subset=[target_col])

    # If target is not numeric (or is boolean), encode it as integer codes.
    y = data[target_col]
    if pd.api.types.is_bool_dtype(y) or not pd.api.types.is_numeric_dtype(y):
        # Keep the codes as a named, index-aligned Series: the pd.concat call
        # below cannot combine a DataFrame with a bare NumPy array.
        y = pd.Series(pd.factorize(y)[0], index=data.index, name=target_col)
    X = data.drop(columns=[target_col])

    # Stratify only when there is more than one class to balance.
    stratify = y if y.nunique() > 1 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=stratify)

    pipe = build_pipeline(pd.concat([X_train, y_train], axis=1), target_col,
                          impute_strategy, apply_scaling, encode_categorical)
    pipe.set_params(clf__max_iter=max_iter)

    if use_grid:
        # Build param grid for C and penalty; the solver is fixed to the
        # user's choice, so unsupported penalty/solver pairs surface as
        # scikit-learn fit errors.
        param_grid = {
            'clf__C': np.linspace(c_min, c_max, int(max(1, c_steps))),
            'clf__penalty': list(penalties) if penalties else ['l2'],
            'clf__solver': [solver],
        }
        search = GridSearchCV(pipe, param_grid, cv=cv_folds, n_jobs=n_jobs, scoring='accuracy')
        search.fit(X_train, y_train)
        model = search.best_estimator_
        extra = f"Best params: {search.best_params_}"
    else:
        # Set hyperparams from UI: midpoint of the C range, first penalty.
        pipe.set_params(
            clf__C=float((c_min + c_max) / 2),
            clf__penalty=penalties[0] if penalties else 'l2',
            clf__solver=solver,
        )
        pipe.fit(X_train, y_train)
        model = pipe
        extra = "Trained with provided hyperparameters"

    # Evaluate on the hold-out split (shared by both training modes).
    test_pred = model.predict(X_test)
    acc = accuracy_score(y_test, test_pred)
    report = classification_report(y_test, test_pred)
    cm = confusion_matrix(y_test, test_pred)

    # Plot confusion matrix
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(cm, interpolation='nearest')
    ax.set_title('Confusion matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    half_max = cm.max() / 2  # cells above this get white text for contrast
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, str(cm[i, j]), ha='center', va='center',
                    color='white' if cm[i, j] > half_max else 'black')
    fig.tight_layout()

    return f"Accuracy: {acc:.4f}\n{extra}", fig, report, model
# Build Gradio interface
# Layout: a top row with the upload/preview column and the settings column,
# and a bottom row with the results. Event handlers are defined inside the
# Blocks context so they can be wired to the components created here.
with gr.Blocks(title="CSV -> Logistic Regression (with tuning)") as demo:
    gr.Markdown("""
# CSV → Preprocessing → Logistic Regression
1. Upload a CSV or Excel file.
2. Select the target (label) column.
3. Choose preprocessing options and hyperparameters.
4. Train model and view accuracy, confusion matrix and classification report.
""")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload CSV/Excel file", file_types=['.csv', '.xls', '.xlsx'])
            load_status = gr.Textbox(label="File status", interactive=False)
            target_dropdown = gr.Dropdown(label="Select target column", choices=[], value=None)
            preview_button = gr.Button("Preview data")
            preview_output = gr.Dataframe(headers=None, interactive=False)
        with gr.Column(scale=1):
            gr.Markdown("**Preprocessing**")
            impute_radio = gr.Radio(['mean','median','most_frequent','constant','none'], value='mean', label='Numeric imputation (if needed)')
            scaler_checkbox = gr.Checkbox(label='Apply Standard Scaling', value=True)
            encode_checkbox = gr.Checkbox(label='One-Hot Encode categorical', value=True)
            gr.Markdown("**Train / Test & Randomness**")
            test_size = gr.Slider(0.05, 0.5, value=0.2, step=0.05, label='Test size')
            random_state = gr.Number(value=42, precision=0, label='Random state (int)')
            gr.Markdown("**Logistic Regression hyperparams**")
            use_grid = gr.Checkbox(label='Use GridSearchCV for hyperparameter tuning', value=True)
            c_min = gr.Number(value=0.01, label='C (min)')
            c_max = gr.Number(value=10.0, label='C (max)')
            c_steps = gr.Slider(1, 20, value=5, step=1, label='C steps (grid size)')
            penalties = gr.CheckboxGroup(['l1','l2','elasticnet','none'], label='Penalties to try (Grid only / or choose first)', value=['l2'])
            solver = gr.Dropdown(['lbfgs','liblinear','saga','sag','newton-cg'], value='lbfgs', label='Solver')
            max_iter = gr.Slider(50,1000,value=200,step=10,label='Max iterations')
            cv_folds = gr.Slider(2,10,value=5,step=1,label='CV folds for GridSearch')
            n_jobs = gr.Slider(1,8,value=1,step=1,label='n_jobs for GridSearch')
            train_btn = gr.Button("Train model")
    with gr.Row():
        with gr.Column():
            accuracy_text = gr.Textbox(label='Accuracy & notes', interactive=False)
            conf_plot = gr.Plot(label='Confusion Matrix')
        with gr.Column():
            class_report = gr.Textbox(label='Classification report', interactive=False)
            model_obj = gr.JSON(label='Trained model (sklearn pipeline as repr)')

    # State to keep dataframe
    df_state = gr.State()

    # Wire upload -> get columns
    file_input.change(fn=on_upload, inputs=[file_input], outputs=[target_dropdown, load_status, df_state])

    def preview(df):
        """Return the first 20 rows of the loaded data, or an empty frame when none is loaded."""
        if df is None:
            return pd.DataFrame()
        return df.head(20)

    preview_button.click(fn=preview, inputs=[df_state], outputs=[preview_output])

    def do_train(df, target, test_size_val, rand_state, impute_s, scale_flag, encode_flag,
                 use_grid_flag, cmin, cmax, csteps, penalties_sel, solver_sel, cv_f, max_it, n_jobs_val):
        """Coerce the widget values to the types ``train_model`` expects and run training.

        Returns the values for the four result components: status text,
        confusion-matrix figure, classification report and a JSON-friendly
        description of the fitted model.
        """
        msg, fig, report, model = train_model(
            df, target, test_size_val, int(rand_state), impute_s, scale_flag, encode_flag,
            use_grid_flag, float(cmin), float(cmax), int(csteps), penalties_sel, solver_sel,
            int(cv_f), int(max_it), int(n_jobs_val))
        # The JSON component parses plain strings as JSON text, which a
        # pipeline repr is not, so wrap the repr in a dict (and pass None
        # through when no model was trained).
        model_repr = None if model is None else {"pipeline": str(model)}
        return msg, fig, report, model_repr

    train_btn.click(
        fn=do_train,
        inputs=[df_state, target_dropdown, test_size, random_state, impute_radio, scaler_checkbox, encode_checkbox,
                use_grid, c_min, c_max, c_steps, penalties, solver, cv_folds, max_iter, n_jobs],
        outputs=[accuracy_text, conf_plot, class_report, model_obj])
# Script entry point: start the Gradio server when this file is run directly,
# listening on all network interfaces and without a public share link.
if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0")
|