Update app.py
Browse files
app.py
CHANGED
|
@@ -17,6 +17,7 @@ from huggingface_hub import login
|
|
| 17 |
from sklearn.ensemble import RandomForestClassifier
|
| 18 |
from sklearn.model_selection import train_test_split, cross_val_score
|
| 19 |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
|
|
|
| 20 |
from sklearn.preprocessing import LabelEncoder
|
| 21 |
from PIL import Image
|
| 22 |
|
|
@@ -151,6 +152,35 @@ def analyze_data(csv_file, additional_notes=""):
|
|
| 151 |
run.finish()
|
| 152 |
return format_analysis_report(analysis_result, visuals)
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
|
| 156 |
|
|
@@ -171,6 +201,61 @@ def train_model(_):
|
|
| 171 |
|
| 172 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
def objective(trial):
|
| 175 |
params = {
|
| 176 |
"n_estimators": trial.suggest_int("n_estimators", 50, 200),
|
|
|
|
| 17 |
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
|
| 23 |
|
|
|
|
| 152 |
run.finish()
|
| 153 |
return format_analysis_report(analysis_result, visuals)
|
| 154 |
|
| 155 |
+
def compare_models(cv=5):
    """Cross-validate a fixed set of classifiers on the globally loaded dataset.

    Treats the last column of the module-level ``df_global`` as the target,
    label-encodes it when it is an object dtype, and scores RandomForest,
    LogisticRegression and SVC with k-fold cross-validation.

    Args:
        cv: Number of cross-validation folds (default 5, matching the
            original hard-coded value).

    Returns:
        A pandas DataFrame with one row per model (columns: "Model",
        "CV Mean Accuracy", "CV Std Dev"), or a user-facing string when no
        dataset has been loaded yet.
    """
    if df_global is None:
        return "Please upload and preprocess a dataset first."

    # Convention in this app: the last column is the prediction target.
    target = df_global.columns[-1]
    X = df_global.drop(target, axis=1)
    y = df_global[target]

    # String labels must be encoded for the sklearn estimators.
    # NOTE(review): pandas "category" dtype targets would slip past this
    # check — confirm upstream preprocessing never produces them.
    if y.dtype == 'object':
        y = LabelEncoder().fit_transform(y)

    models = {
        "RandomForest": RandomForestClassifier(),
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "SVC": SVC(),
    }

    results = []
    for name, model in models.items():
        scores = cross_val_score(model, X, y, cv=cv)
        # Compute the aggregates once; the original recomputed mean/std
        # separately for the results row and the wandb payload.
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        results.append({
            "Model": name,
            "CV Mean Accuracy": mean_score,
            "CV Std Dev": std_score,
        })
        # NOTE(review): assumes an active wandb run — confirm a run is
        # started before this handler fires from the UI.
        wandb.log({f"{name}_cv_mean": mean_score, f"{name}_cv_std": std_score})

    return pd.DataFrame(results)
|
| 184 |
|
| 185 |
|
| 186 |
|
|
|
|
| 201 |
|
| 202 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
|
| 203 |
|
| 204 |
+
# Error analysis
# Build a per-row error table on the held-out test split so that the most
# frequent (actual, predicted) confusion pairs can be reported and logged.
# NOTE(review): in this hunk these lines are inserted directly after
# train_test_split and before any model is fit, yet they read `y_pred`
# (and the later confusion-matrix code reads `best_model`) — confirm
# `y_pred` is assigned before this point in the final file, otherwise this
# raises NameError at runtime.
error_df = X_test.copy()
error_df["actual"] = y_test
error_df["predicted"] = y_pred
# Boolean mask: True on rows the model got wrong.
error_df["error"] = error_df["actual"] != error_df["predicted"]
# Count misclassifications per (actual, predicted) pair; `count` holds the
# frequency of each confusion.
common_errors = error_df[error_df["error"]].groupby(["actual", "predicted"]).size().reset_index(name='count')
|
| 210 |
+
|
| 211 |
+
def generate_report(metrics_df, trials_df, common_errors_df):
    """Write a markdown training report to ``model_report.md``.

    Renders three tables (metrics, top Optuna trials, common
    misclassifications) via ``DataFrame.to_markdown`` plus a generation
    timestamp, and saves the result in the working directory.

    Args:
        metrics_df: DataFrame of final model metrics.
        trials_df: DataFrame of the top hyperparameter-search trials.
        common_errors_df: DataFrame of frequent (actual, predicted) errors.

    Returns:
        A user-facing status string naming the saved file.
    """
    report = f"""
# Model Training Report

## Metrics
{metrics_df.to_markdown(index=False)}

## Top Trials
{trials_df.to_markdown(index=False)}

## Common Errors
{common_errors_df.to_markdown(index=False)}

_Generated on {time.strftime('%Y-%m-%d %H:%M:%S')}_
"""
    # Explicit utf-8: the default platform encoding (e.g. cp1252 on
    # Windows) can fail on non-ASCII values coming from the dataset.
    with open("model_report.md", "w", encoding="utf-8") as f:
        f.write(report)
    return "Report saved to model_report.md"
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# Plot and log the confusion matrix for the tuned model.
# NOTE(review): `best_model` is not defined anywhere in this hunk — the
# surrounding code (inserted before `objective` is even defined) suggests
# this runs before training; confirm `best_model` exists at this point.
fig, ax = plt.subplots(figsize=(6, 4))
ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test, ax=ax)
plt.savefig("confusion_matrix.png")
wandb.log({"confusion_matrix": wandb.Image("confusion_matrix.png")})


# Inside your layout:
# NOTE(review): the comment above reads like pasted snippet instructions —
# these Gradio widgets must be created inside the app's gr.Blocks/layout
# context, not in the middle of the training function; verify placement.
compare_button = gr.Button("Compare Models")
compare_output = gr.Dataframe()

compare_button.click(fn=compare_models, outputs=compare_output)

report_button = gr.Button("Generate Report")
report_status = gr.Textbox()

# NOTE(review): the lambda captures `metrics_df` and `trials_df`, neither
# of which is defined in this hunk — confirm they exist in the scope where
# this handler is registered, otherwise clicking raises NameError.
report_button.click(
    fn=lambda: generate_report(metrics_df, trials_df, common_errors),
    outputs=report_status
)


# Log common misclassifications to wandb
wandb.log({"common_errors": wandb.Table(dataframe=common_errors)})
|
| 257 |
+
|
| 258 |
+
|
| 259 |
def objective(trial):
|
| 260 |
params = {
|
| 261 |
"n_estimators": trial.suggest_int("n_estimators", 50, 200),
|