Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,188 +1,179 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
from sklearn.model_selection import train_test_split
|
| 4 |
from sklearn.ensemble import RandomForestClassifier
|
| 5 |
-
from sklearn.
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def preprocess_dataframe(df, quantile_binning=False, count_words=False):
|
|
|
|
| 14 |
df = df.copy()
|
| 15 |
|
| 16 |
-
#
|
| 17 |
if quantile_binning:
|
| 18 |
-
|
|
|
|
| 19 |
try:
|
| 20 |
-
df[col + "_qbin"] = pd.qcut(df[col], q=4, labels=
|
| 21 |
-
except Exception:
|
| 22 |
-
|
| 23 |
|
| 24 |
-
#
|
| 25 |
if count_words:
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
return df
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
return None, [], pd.DataFrame(), "", ""
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
if
|
| 39 |
-
|
| 40 |
-
df = pd.read_csv(filepath, encoding='utf-8')
|
| 41 |
-
except UnicodeDecodeError:
|
| 42 |
-
df = pd.read_csv(filepath, encoding='latin1')
|
| 43 |
else:
|
| 44 |
-
df =
|
| 45 |
-
|
| 46 |
-
columns = list(df.columns)
|
| 47 |
-
preview = df.head(100)
|
| 48 |
-
missing = df.isnull().sum()
|
| 49 |
-
desc = df.describe(include='all').T
|
| 50 |
-
|
| 51 |
-
# Summary markdown table
|
| 52 |
-
summary_md = "### Data Summary\n\n| Column | Missing | Min | Max | Mean | Median | Unique |\n|---|---|---|---|---|---|---|\n"
|
| 53 |
-
for col in df.columns:
|
| 54 |
-
miss = missing[col]
|
| 55 |
-
min_val = desc.loc[col, 'min'] if 'min' in desc.columns and col in desc.index else "-"
|
| 56 |
-
max_val = desc.loc[col, 'max'] if 'max' in desc.columns and col in desc.index else "-"
|
| 57 |
-
mean_val = desc.loc[col, 'mean'] if 'mean' in desc.columns and col in desc.index else "-"
|
| 58 |
-
median_val = df[col].median() if pd.api.types.is_numeric_dtype(df[col]) else "-"
|
| 59 |
-
unique_val = df[col].nunique()
|
| 60 |
-
summary_md += f"| {col} | {miss} | {min_val} | {max_val} | {mean_val} | {median_val} | {unique_val} |\n"
|
| 61 |
-
|
| 62 |
-
return df, columns, preview, summary_md, ""
|
| 63 |
-
except Exception as e:
|
| 64 |
-
return None, [], pd.DataFrame(), "", f"β Error loading file: {e}"
|
| 65 |
-
|
| 66 |
-
# Step 3: Train RandomForest model on selected columns
|
| 67 |
-
def train_model(df, target_col, feature_cols):
|
| 68 |
-
if df is None or df.empty:
|
| 69 |
-
return "Please upload a valid dataset first.", None, ""
|
| 70 |
-
if target_col not in df.columns:
|
| 71 |
-
return "Target column not found.", None, ""
|
| 72 |
-
if not feature_cols:
|
| 73 |
-
return "Select at least one feature column.", None, ""
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
return "No data left after removing missing values.", None, ""
|
| 78 |
|
| 79 |
-
#
|
| 80 |
-
|
| 81 |
-
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
|
|
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
except ValueError as e:
|
| 89 |
-
return f"Error splitting data: {e}", None, ""
|
| 90 |
|
| 91 |
-
|
|
|
|
| 92 |
model.fit(X_train, y_train)
|
| 93 |
-
y_pred = model.predict(X_test)
|
| 94 |
|
|
|
|
|
|
|
| 95 |
report = classification_report(y_test, y_pred)
|
| 96 |
|
| 97 |
-
|
| 98 |
-
cm = confusion_matrix(y_test, y_pred)
|
| 99 |
-
fig, ax = plt.subplots(figsize=(6, 5))
|
| 100 |
-
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
|
| 101 |
-
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
|
| 102 |
-
plt.tight_layout()
|
| 103 |
|
| 104 |
-
|
| 105 |
-
plt.savefig(buf, format="png")
|
| 106 |
-
plt.close(fig)
|
| 107 |
-
img_html = f'<img src="data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}" alt="Confusion Matrix"/>'
|
| 108 |
|
| 109 |
-
help_text = generate_help_text(report)
|
| 110 |
-
return report, img_html, help_text
|
| 111 |
-
|
| 112 |
-
# Step 4: Auto-generate explanation of metrics
|
| 113 |
-
def generate_help_text(report_text):
|
| 114 |
-
try:
|
| 115 |
-
macro = re.search(r'macro avg\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)', report_text)
|
| 116 |
-
if macro:
|
| 117 |
-
precision = float(macro.group(1))
|
| 118 |
-
recall = float(macro.group(2))
|
| 119 |
-
f1 = float(macro.group(3))
|
| 120 |
-
text = (
|
| 121 |
-
f"### Performance Insights\n"
|
| 122 |
-
f"- **Precision (~{precision:.2f})**: Accuracy of positive predictions.\n"
|
| 123 |
-
f"- **Recall (~{recall:.2f})**: Coverage of actual positives.\n"
|
| 124 |
-
f"- **F1-score (~{f1:.2f})**: Balance between precision and recall.\n\n"
|
| 125 |
-
)
|
| 126 |
-
if precision < 0.5: text += "β οΈ Low precision: many false positives.\n"
|
| 127 |
-
if recall < 0.5: text += "β οΈ Low recall: many false negatives.\n"
|
| 128 |
-
if precision > 0.8 and recall > 0.8: text += "β
Strong performance across both metrics.\n"
|
| 129 |
-
return text + "\nReview the confusion matrix for misclassifications."
|
| 130 |
-
except Exception:
|
| 131 |
-
pass
|
| 132 |
-
return "Help will appear after training."
|
| 133 |
-
|
| 134 |
-
# Step 5: When file is uploaded, load, preprocess and update all UI elements
|
| 135 |
-
def on_file_change(file, quantile_binning, count_words):
|
| 136 |
-
df, columns, preview, summary_md, error = load_data(file)
|
| 137 |
-
if df is None:
|
| 138 |
-
return None, gr.update(choices=[], value=None), gr.update(choices=[], value=[]), pd.DataFrame(), "", "", "", error
|
| 139 |
-
df_processed = preprocess_dataframe(df, quantile_binning, count_words)
|
| 140 |
-
return (
|
| 141 |
-
df_processed, # Store processed dataframe in state
|
| 142 |
-
gr.update(choices=list(df_processed.columns)), # Update target dropdown
|
| 143 |
-
gr.update(choices=list(df_processed.columns)), # Update feature checkboxes
|
| 144 |
-
preview, # Show raw preview
|
| 145 |
-
summary_md, # Show summary
|
| 146 |
-
df_processed.head(100), # Show processed preview
|
| 147 |
-
"", # Clear classification report
|
| 148 |
-
"", # Clear help text
|
| 149 |
-
)
|
| 150 |
-
|
| 151 |
-
# Step 6: Build the Gradio interface
|
| 152 |
with gr.Blocks() as demo:
|
| 153 |
-
gr.Markdown("#
|
| 154 |
|
| 155 |
-
|
| 156 |
df_state = gr.State(None)
|
| 157 |
|
| 158 |
-
|
|
|
|
|
|
|
| 159 |
|
| 160 |
-
gr.
|
| 161 |
-
|
| 162 |
-
|
| 163 |
|
| 164 |
-
gr.Markdown("### Step 3: Preview the original and processed data")
|
| 165 |
with gr.Row():
|
| 166 |
-
|
| 167 |
-
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
| 171 |
|
| 172 |
-
|
| 173 |
with gr.Row():
|
| 174 |
-
|
| 175 |
-
feature_cols = gr.CheckboxGroup(label="Select Feature Columns (used to make predictions)")
|
| 176 |
|
| 177 |
-
gr.
|
| 178 |
-
|
|
|
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
# Trigger when file is uploaded or options changed
|
| 186 |
file_input.change(
|
| 187 |
fn=on_file_change,
|
| 188 |
inputs=[file_input, quantile_option, wordcount_option],
|
|
@@ -194,17 +185,32 @@ with gr.Blocks() as demo:
|
|
| 194 |
data_summary,
|
| 195 |
processed_preview,
|
| 196 |
output,
|
| 197 |
-
help_box
|
| 198 |
]
|
| 199 |
)
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
fn=train_model,
|
| 204 |
inputs=[df_state, target_col, feature_cols],
|
| 205 |
-
outputs=
|
| 206 |
)
|
| 207 |
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
if __name__ == "__main__":
|
| 210 |
demo.launch(share=True)
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
from sklearn.model_selection import train_test_split
|
| 5 |
from sklearn.ensemble import RandomForestClassifier
|
| 6 |
+
from sklearn.preprocessing import LabelEncoder
|
| 7 |
+
from sklearn.metrics import classification_report
|
| 8 |
+
|
| 9 |
+
# ----------- Helper Functions -----------
|
| 10 |
+
|
| 11 |
+
def load_data(file):
    """Load an uploaded CSV or Excel file and build a quick summary.

    Parameters
    ----------
    file : object
        An uploaded-file handle exposing a ``.name`` path (e.g. a
        gradio ``File`` value).

    Returns
    -------
    tuple
        ``(df, column_names, preview, summary_markdown, error_message)``.
        On failure ``df`` is ``None``, the data fields are empty, and
        ``error_message`` describes the problem.
    """
    try:
        # Case-insensitive extension check so "DATA.CSV" is not
        # mistakenly routed to the Excel reader.
        if file.name.lower().endswith(".csv"):
            df = pd.read_csv(file.name)
        else:
            df = pd.read_excel(file.name)

        # First 5 rows are enough for a visual sanity check in the UI.
        preview = df.head(5)

        # Per-column dtype and missing-value percentage summary.
        summary = pd.DataFrame({
            "Column": df.columns,
            "Data Type": [df[col].dtype for col in df.columns],
            "Missing (%)": [df[col].isnull().mean() * 100 for col in df.columns]
        })

        return df, df.columns.tolist(), preview, summary.to_markdown(), ""
    except Exception as e:
        # Broad catch is deliberate: this is the UI error boundary and
        # any failure is surfaced as a message instead of a traceback.
        return None, [], pd.DataFrame(), "", f"❌ Error loading file: {e}"
|
| 31 |
+
|
| 32 |
def preprocess_dataframe(df, quantile_binning=False, count_words=False):
    """Derive optional helper columns from *df* without mutating it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; never modified in place.
    quantile_binning : bool
        When True, add a ``<col>_qbin`` quartile label ("Q1".."Q4") for
        every int64/float64 column.
    count_words : bool
        When True, add a ``<col>_wordcount`` whitespace word count for
        every object (text) column.

    Returns
    -------
    pandas.DataFrame
        A copy of *df* with any derived columns appended.
    """
    result = df.copy()

    if quantile_binning:
        # Quartile-bin each numeric column; pd.qcut raises when a
        # column has too few distinct values, which we only report.
        for column in result.select_dtypes(include=["int64", "float64"]).columns:
            try:
                result[column + "_qbin"] = pd.qcut(
                    result[column], q=4, labels=["Q1", "Q2", "Q3", "Q4"]
                )
            except Exception as err:
                print(f"Warning (qbin failed for {column}):", err)

    if count_words:
        # Whitespace-split word counts for each text column.
        for column in result.select_dtypes(include=["object"]).columns:
            try:
                result[column + "_wordcount"] = (
                    result[column].astype(str).apply(lambda cell: len(cell.split()))
                )
            except Exception as err:
                print(f"Warning (wordcount failed for {column}):", err)

    return result
|
| 55 |
|
| 56 |
+
def train_model(df, target_column, feature_columns):
    """Train a RandomForest classifier and return its classification report.

    Parameters
    ----------
    df : pandas.DataFrame
        The (already preprocessed) dataset.
    target_column : str
        Name of the column to predict.
    feature_columns : list[str]
        Names of the columns used as predictors.

    Returns
    -------
    str
        A scikit-learn ``classification_report``, or a human-readable
        message when no trainable rows remain.
    """
    # Drop rows whose target is missing; we cannot learn from
    # unlabeled rows.
    df = df.dropna(subset=[target_column])
    if df.empty:
        return "No rows left after dropping missing target values."

    # Work on an explicit copy: df[feature_columns] is a slice, and
    # assigning into it triggers pandas' SettingWithCopyWarning (and
    # is silently ineffective under copy-on-write semantics).
    X = df[feature_columns].copy()
    y = df[target_column]

    # Fill missing feature values: a sentinel for text columns, the
    # median for numeric ones.
    for col in feature_columns:
        if X[col].dtype == "O":
            X[col] = X[col].fillna("missing")
        else:
            X[col] = X[col].fillna(X[col].median())

    # Encode categorical features as integer codes.
    for col in X.select_dtypes(include=["object"]).columns:
        X[col] = LabelEncoder().fit_transform(X[col])

    # Encode the target too if it is categorical.
    if y.dtype == "O":
        y = LabelEncoder().fit_transform(y)

    # Fixed random_state keeps both the split and the forest
    # reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    return classification_report(y_test, y_pred)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
+
# ----------- Gradio Interface Setup -----------
|
|
|
|
|
|
|
|
|
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
with gr.Blocks() as demo:
|
| 94 |
+
gr.Markdown("# π§ CSV/XLSX Classifier with Auto Summary and Visualization")
|
| 95 |
|
| 96 |
+
# Store the DataFrame in memory
|
| 97 |
df_state = gr.State(None)
|
| 98 |
|
| 99 |
+
# Upload section
|
| 100 |
+
with gr.Row():
|
| 101 |
+
file_input = gr.File(label="π Upload CSV or Excel File", file_types=[".csv", ".xlsx", ".xls"])
|
| 102 |
|
| 103 |
+
with gr.Row():
|
| 104 |
+
quantile_option = gr.Checkbox(label="π Discretize Numeric Columns into Quartiles")
|
| 105 |
+
wordcount_option = gr.Checkbox(label="π Count Words in Text Columns")
|
| 106 |
|
|
|
|
| 107 |
with gr.Row():
|
| 108 |
+
target_col = gr.Dropdown(label="π― Target Column (What you want to predict)", choices=[])
|
| 109 |
+
feature_cols = gr.CheckboxGroup(label="π§Ύ Feature Columns (Used to predict target)", choices=[])
|
| 110 |
|
| 111 |
+
# Buttons
|
| 112 |
+
with gr.Row():
|
| 113 |
+
train_button = gr.Button("π Train Model")
|
| 114 |
+
clear_button = gr.Button("π Clear All")
|
| 115 |
|
| 116 |
+
# Outputs
|
| 117 |
with gr.Row():
|
| 118 |
+
output = gr.Textbox(label="π Model Output (Classification Report)", lines=10)
|
|
|
|
| 119 |
|
| 120 |
+
with gr.Row():
|
| 121 |
+
data_summary = gr.Textbox(label="π Data Summary", lines=10)
|
| 122 |
+
help_box = gr.Textbox(label="π‘ Help", lines=5, value="βοΈ Upload a dataset, choose preprocessing options, then train.")
|
| 123 |
|
| 124 |
+
# Data Previews
|
| 125 |
+
with gr.Row():
|
| 126 |
+
table_preview = gr.DataFrame(label="π Original Data Preview")
|
| 127 |
+
processed_preview = gr.DataFrame(label="π§ͺ Processed Data Preview (with new columns)")
|
| 128 |
+
|
| 129 |
+
# ----------- Define App Logic -----------
|
| 130 |
+
|
| 131 |
+
# Handle file upload and update column options
|
| 132 |
+
def on_file_change(file, quantile_binning, count_words):
|
| 133 |
+
df, _, preview, summary_md, error = load_data(file)
|
| 134 |
+
if df is None:
|
| 135 |
+
return None, gr.update(choices=[], value=None), gr.update(choices=[], value=[]), pd.DataFrame(), "", "", "", error
|
| 136 |
+
|
| 137 |
+
# Apply preprocessing
|
| 138 |
+
df_processed = preprocess_dataframe(df, quantile_binning, count_words)
|
| 139 |
+
|
| 140 |
+
# Update selectors with new columns from the processed DataFrame
|
| 141 |
+
columns = list(df_processed.columns)
|
| 142 |
+
|
| 143 |
+
return (
|
| 144 |
+
df_processed, # Store processed df in state
|
| 145 |
+
gr.update(choices=columns, value=None),
|
| 146 |
+
gr.update(choices=columns, value=[]),
|
| 147 |
+
preview,
|
| 148 |
+
summary_md,
|
| 149 |
+
df_processed.head(100), # Show processed preview
|
| 150 |
+
"", # Clear model output
|
| 151 |
+
"" # Clear help
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
# Handle training the model
|
| 155 |
+
def on_train(df, target, features):
|
| 156 |
+
if df is None:
|
| 157 |
+
return "β οΈ Please upload a file first."
|
| 158 |
+
if target is None or not features:
|
| 159 |
+
return "β οΈ Please select target and feature columns."
|
| 160 |
+
return train_model(df, target, features)
|
| 161 |
+
|
| 162 |
+
# Clear all interface elements
|
| 163 |
+
def on_clear():
|
| 164 |
+
return (
|
| 165 |
+
None, # df_state
|
| 166 |
+
None, # target_col
|
| 167 |
+
[], # feature_cols
|
| 168 |
+
pd.DataFrame(),
|
| 169 |
+
"",
|
| 170 |
+
pd.DataFrame(),
|
| 171 |
+
"",
|
| 172 |
+
"βοΈ Upload a dataset, choose preprocessing options, then train."
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
# ----------- Connect Actions to Widgets -----------
|
| 176 |
|
|
|
|
| 177 |
file_input.change(
|
| 178 |
fn=on_file_change,
|
| 179 |
inputs=[file_input, quantile_option, wordcount_option],
|
|
|
|
| 185 |
data_summary,
|
| 186 |
processed_preview,
|
| 187 |
output,
|
| 188 |
+
help_box
|
| 189 |
]
|
| 190 |
)
|
| 191 |
|
| 192 |
+
train_button.click(
|
| 193 |
+
fn=on_train,
|
|
|
|
| 194 |
inputs=[df_state, target_col, feature_cols],
|
| 195 |
+
outputs=output
|
| 196 |
)
|
| 197 |
|
| 198 |
+
clear_button.click(
|
| 199 |
+
fn=on_clear,
|
| 200 |
+
inputs=[],
|
| 201 |
+
outputs=[
|
| 202 |
+
df_state,
|
| 203 |
+
target_col,
|
| 204 |
+
feature_cols,
|
| 205 |
+
table_preview,
|
| 206 |
+
data_summary,
|
| 207 |
+
processed_preview,
|
| 208 |
+
output,
|
| 209 |
+
help_box
|
| 210 |
+
]
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
# ----------- Launch the App -----------
|
| 214 |
+
|
| 215 |
if __name__ == "__main__":
|
| 216 |
demo.launch(share=True)
|