# smart_xlsx / app.py
# Author: clementBE (Hugging Face Space) β€” "Update app.py", commit e2cdf36 (verified)
import gradio as gr
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from PIL import Image
# Module-level state shared across the Gradio callbacks below.
original_df = None  # raw DataFrame exactly as uploaded by the user
processed_df = None  # DataFrame after discretization / word-count feature engineering
trained_model = None  # last fitted RandomForestClassifier (kept for potential reuse)
processed_X_columns = None # Keep processed features list for importances
def load_data(file):
    """Load an uploaded CSV or Excel file into the global ``original_df``.

    Parameters
    ----------
    file : file-like object with a ``.name`` attribute (as supplied by gr.File),
        or None when the upload component is cleared.

    Returns
    -------
    tuple
        (preview of the first 10 rows, status message, step-1 help text).
        On failure the preview is an empty DataFrame.
    """
    global original_df
    # Gradio fires the change event with None when the file is removed;
    # give a clear message instead of an AttributeError dressed as an error.
    if file is None:
        return pd.DataFrame(), "❌ Error loading file: no file provided.", "Please upload a valid CSV or Excel file."
    try:
        # Lower-case the name so 'DATA.CSV' is not mistakenly sent to read_excel.
        if file.name.lower().endswith('.csv'):
            original_df = pd.read_csv(file)
        else:
            original_df = pd.read_excel(file)
        help_text = (
            "Step 1: Data loaded successfully!\n"
            "- Preview shows first 10 rows.\n"
            "- Next: Click 'Process Data' to discretize numeric columns and add word counts for text."
        )
        return original_df.head(10), "βœ… File loaded successfully.", help_text
    except Exception as e:
        # Broad catch on purpose: pandas raises many parser/engine exceptions
        # and the UI should always get a readable status string back.
        return pd.DataFrame(), f"❌ Error loading file: {e}", "Please upload a valid CSV or Excel file."
def process_data():
    """Feature-engineer the loaded dataset in ``original_df``.

    Numeric columns get quartile (``*_qbin``) and decile (``*_decil``)
    discretizations; text columns get a ``*_wordcount`` column. The result
    is stored in the global ``processed_df``.

    Returns
    -------
    tuple
        A 5-tuple matching the Gradio outputs: (preview DataFrame,
        target-dropdown update, feature-checkbox update, status, help text).
    """
    global original_df, processed_df
    if original_df is None:
        return pd.DataFrame(), gr.update(choices=[]), gr.update(choices=[]), "⚠️ Please load a dataset first.", ""
    df = original_df.copy()
    # Snapshot the ORIGINAL numeric columns once. The previous code re-scanned
    # the frame for the decile pass, so the freshly added numeric *_qbin
    # columns were themselves discretized again, producing spurious
    # *_qbin_decil features.
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    # Quartiles discretization
    for col in numeric_cols:
        try:
            df[col + "_qbin"] = pd.qcut(df[col], 4, labels=False, duplicates='drop')
        except Exception:
            # qcut raises on constant columns / too few distinct values; such
            # columns simply get no binned counterpart.
            pass
    # Deciles discretization
    for col in numeric_cols:
        try:
            df[col + "_decil"] = pd.qcut(df[col], 10, labels=False, duplicates='drop')
        except Exception:
            pass
    # Word counts for text columns (the bin columns above are numeric, so the
    # object-dtype scan here still sees only the original text columns).
    for col in df.select_dtypes(include='object').columns:
        df[col + "_wordcount"] = df[col].astype(str).apply(lambda x: len(x.split()))
    processed_df = df.copy()
    all_columns = df.columns.tolist()
    help_text = (
        "Step 2: Data processed!\n"
        "- Numeric columns discretized into quartiles and deciles.\n"
        "- Word counts added for text columns.\n"
        "- You can now select your target and feature columns."
    )
    return df.head(10), gr.update(choices=all_columns), gr.update(choices=all_columns), "βœ… Data processed.", help_text
def train_model(target_col, feature_cols):
    """Train a RandomForestClassifier on the processed dataset.

    Parameters
    ----------
    target_col : str
        Column of ``processed_df`` to predict.
    feature_cols : list[str]
        Columns of ``processed_df`` to use as model inputs.

    Returns
    -------
    tuple
        (classification report text, PIL image of the importance heatmap
        or None on failure, help text explaining the results).
    """
    global processed_df, trained_model, processed_X_columns
    if processed_df is None:
        return "⚠️ Please process your data first.", None, ""
    if not target_col or not feature_cols:
        return "⚠️ Please select a target and at least one feature.", None, ""
    try:
        # Guard against target leakage: if the target is also selected as a
        # feature, the model trivially memorizes it and the metrics lie.
        safe_features = [c for c in feature_cols if c != target_col]
        if not safe_features:
            return "⚠️ Please select at least one feature other than the target.", None, ""
        X = processed_df[safe_features]
        y = processed_df[target_col]
        # One-hot encoding categorical features if any
        X = pd.get_dummies(X)
        processed_X_columns = X.columns.tolist()
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Train Random Forest Classifier
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_train, y_train)
        trained_model = clf
        # Predict & evaluate
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # zero_division=0 silences sklearn's warning for classes that get no
        # predictions; the reported numbers are unchanged.
        report = classification_report(y_test, y_pred, zero_division=0)
        # Feature importances: top 20, most important first.
        fi_df = pd.DataFrame({'Feature': processed_X_columns,
                              'Importance': clf.feature_importances_})
        fi_df = fi_df.sort_values(by='Importance', ascending=False).head(20)
        fig = plt.figure(figsize=(10, 6))
        try:
            sns.heatmap(fi_df.set_index('Feature').T, annot=True, cmap="YlGnBu", cbar_kws={'label': 'Feature Importance'})
            plt.title("Feature Importances Heatmap (Top 20)")
            plt.tight_layout()
            buf = BytesIO()
            fig.savefig(buf, format="png")
        finally:
            # Always release the figure β€” even when plotting raises β€” so
            # repeated trainings don't leak matplotlib state.
            plt.close(fig)
        buf.seek(0)
        img = Image.open(buf)
        # Detailed help text
        help_text = (
            f"πŸ“Š Model type: Random Forest Classifier\n"
            f"🎯 Target: '{target_col}'\n"
            f"πŸ§ͺ Features used: {len(safe_features)}\n"
            f"βœ… Accuracy on test set: {accuracy:.2%}\n\n"
            "πŸ“‹ Classification Report Explanation:\n"
            "- Precision: Of predicted positives, how many are correct?\n"
            "- Recall: Of actual positives, how many were found?\n"
            "- F1-Score: Harmonic mean of precision & recall.\n\n"
            "🌑️ Heatmap Explanation:\n"
            "- Shows top 20 most important features by model.\n"
            "- Darker cells = higher influence on predictions.\n"
            "- Use this to understand which variables drive decisions."
        )
        return report, img, help_text
    except Exception as e:
        return f"❌ Model training failed: {e}", None, ""
# --- Gradio UI: three-step workflow (load file -> process data -> train model) ---
with gr.Blocks(title="Step-by-Step Model Trainer with Help and Heatmap") as app:
    gr.Markdown("## 🧠 Step-by-Step Model Trainer\nUpload your data, process it, train a model, and get help at each step!")
    # Step 1 widgets: upload + load status side by side, then preview and help.
    with gr.Row():
        file_input = gr.File(label="πŸ“ Upload CSV or Excel file")
        load_status = gr.Textbox(label="ℹ️ File Load Status", interactive=False)
    original_preview = gr.DataFrame(label="πŸ” Original Data Preview (first 10 rows)")
    load_help = gr.Textbox(label="πŸ“– Step 1 Help", interactive=False)
    # Step 2 widgets: trigger feature engineering and show the processed preview.
    process_button = gr.Button("βš™οΈ Process Data")
    processed_preview = gr.DataFrame(label="πŸ”¬ Processed Data Preview (first 10 rows)")
    process_status = gr.Textbox(label="ℹ️ Process Status", interactive=False)
    process_help = gr.Textbox(label="πŸ“– Step 2 Help", interactive=False)
    # Step 3 widgets: column selectors (choices are filled in by process_data),
    # training trigger, and result displays.
    target_selector = gr.Dropdown(label="🎯 Select Target Column", choices=[])
    feature_selector = gr.CheckboxGroup(label="πŸ“Š Select Feature Columns", choices=[])
    train_button = gr.Button("πŸš€ Train Model")
    train_output = gr.Textbox(label="πŸ“ˆ Classification Report", lines=15)
    heatmap_output = gr.Image(label="🌑️ Feature Importance Heatmap")
    train_help = gr.Textbox(label="πŸ“ Help to read results", interactive=False, lines=12)
    # Callbacks
    # Loading runs automatically whenever the file input changes.
    file_input.change(
        fn=load_data,
        inputs=[file_input],
        outputs=[original_preview, load_status, load_help]
    )
    # process_data takes no inputs: it reads the module-level original_df.
    process_button.click(
        fn=process_data,
        inputs=[],
        outputs=[processed_preview, target_selector, feature_selector, process_status, process_help]
    )
    train_button.click(
        fn=train_model,
        inputs=[target_selector, feature_selector],
        outputs=[train_output, heatmap_output, train_help]
    )
# Start the app (blocking call) when the script is run.
app.launch()