Spaces:

clementBE
/

Trainer4Xlsx

Paused

App Files Files Community

Trainer4Xlsx / app.py

clementBE

Update app.py

16b89ff verified 7 months ago

raw

history blame contribute delete

8.55 kB

	import gradio as gr
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import classification_report, accuracy_score, precision_score

	# Shared application state, rebound by the callbacks below via `global`.
	df_train = None  # training DataFrame, set by load_training_file
	model = None  # fitted LogisticRegression, set by train_model
	vectorizer = None  # fitted TfidfVectorizer, set by train_model
	test_metrics = None  # text summary of the most recent hold-out evaluation

	df_predict = None  # DataFrame loaded for batch prediction
	df_predict_results = None  # batch prediction output, kept for export

	def load_training_file(file):
	    """Load the uploaded training workbook and fill the column dropdowns.

	    Args:
	        file: Value of the training file component. ``None`` when nothing
	            was uploaded; otherwise an object exposing the uploaded file's
	            path as ``.name`` (a plain path string is accepted as well).

	    Returns:
	        A 3-tuple ``(status, text_dropdown_update, target_dropdown_update)``.
	        On success the text dropdown preselects the first column and the
	        target dropdown the last one. On any failure both dropdowns are
	        emptied and the previously loaded training data is left untouched.
	    """
	    global df_train

	    def failure(message):
	        # Every error path clears both dropdowns in the same way.
	        return (
	            message,
	            gr.update(choices=[], value=None),
	            gr.update(choices=[], value=None),
	        )

	    if file is None:
	        return failure("❌ Please upload a file.")

	    path = getattr(file, "name", file)
	    try:
	        loaded = pd.read_excel(path)
	    except Exception as exc:  # UI boundary: report the problem, don't crash
	        return failure(f"❌ Could not read the Excel file: {exc}")

	    col_names = list(loaded.columns)
	    if not col_names:
	        # Without this guard col_names[0] below raises IndexError.
	        return failure("❌ The uploaded file contains no columns.")

	    # Commit to the shared state only once the file is known to be usable.
	    df_train = loaded
	    return (
	        f"✅ Loaded training file with {len(df_train)} rows",
	        gr.update(choices=col_names, value=col_names[0]),
	        gr.update(choices=col_names, value=col_names[-1]),
	    )

	def interpret_score(score):
	    """Translate an accuracy score into a short, colour-coded verdict.

	    Scores below 0.6 are reported as low, scores below 0.8 as moderate,
	    and everything else as strong.
	    """
	    # Exclusive upper bounds, checked in ascending order.
	    bands = (
	        (0.6, "🔴 The model performance is LOW. Consider improving your data or features."),
	        (0.8, "🟠 The model performance is MODERATE. It may work but could be improved."),
	    )
	    for upper_bound, verdict in bands:
	        if score < upper_bound:
	            return verdict
	    return "🟢 The model performance is STRONG. The model is reliable."

	def train_model(text_column, target_column):
	    """Train a TF-IDF + logistic-regression classifier on the loaded data.

	    Rows with a missing text or target value are dropped, 20% of the
	    remaining rows are held out (``random_state=42``) and the fitted model
	    is evaluated on that hold-out set.

	    Args:
	        text_column: Name of the column in ``df_train`` holding the texts.
	        target_column: Name of the column in ``df_train`` holding the labels.

	    Returns:
	        A status string: either an error message starting with "❌" or a
	        success message followed by the hold-out metrics.

	    Side effects:
	        On success only, rebinds the globals ``model``, ``vectorizer`` and
	        ``test_metrics``. A failed run leaves the previous model usable.
	    """
	    global model, vectorizer, test_metrics

	    if df_train is None:
	        return "❌ No training data loaded."

	    if text_column not in df_train.columns or target_column not in df_train.columns:
	        return "❌ Invalid column selection."

	    df_filtered = df_train.dropna(subset=[text_column, target_column])

	    if len(df_filtered) < 10:
	        return "❌ Not enough data after filtering for training. Need at least 10 samples."

	    # Excel cells may hold numbers or dates; the vectorizer needs strings.
	    texts = df_filtered[text_column].astype(str)
	    labels = df_filtered[target_column]

	    if labels.nunique() < 2:
	        # LogisticRegression cannot be fitted on a single class.
	        return "❌ Target column must contain at least two distinct labels."

	    X_train, X_test, y_train, y_test = train_test_split(
	        texts, labels, test_size=0.2, random_state=42
	    )

	    # Fit into locals first so a failure cannot leave the global vectorizer
	    # and model out of sync with each other.
	    new_vectorizer = TfidfVectorizer()
	    new_model = LogisticRegression(max_iter=1000)
	    try:
	        X_train_vec = new_vectorizer.fit_transform(X_train)
	        new_model.fit(X_train_vec, y_train)
	    except (ValueError, TypeError) as exc:
	        # e.g. empty vocabulary, one class left after the split, mixed labels
	        return f"❌ Training failed: {exc}"

	    y_pred = new_model.predict(new_vectorizer.transform(X_test))

	    accuracy = accuracy_score(y_test, y_pred)
	    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
	    report = classification_report(y_test, y_pred, zero_division=0)

	    performance_msg = interpret_score(accuracy)

	    metrics_text = (
	        f"Accuracy: {accuracy:.2%}\n"
	        f"Precision (weighted): {precision:.2%}\n\n"
	        f"{performance_msg}\n\n"
	        f"Classification Report:\n{report}"
	    )

	    model, vectorizer, test_metrics = new_model, new_vectorizer, metrics_text

	    return f"✅ Model trained on {len(df_filtered)} examples.\n\nTest set evaluation:\n{test_metrics}"

	def predict_label(text_input):
	    """Classify a single text with the trained model.

	    Args:
	        text_input: The text typed into the single-prediction textbox.

	    Returns:
	        A status string: the predicted label with the highest class
	        probability as confidence, or an error message starting with "❌"
	        when no model is trained or the input is empty.
	    """
	    if model is None or vectorizer is None:
	        return "❌ Model is not trained yet."

	    # None crashes the vectorizer; blank text would be classified from an
	    # all-zero feature vector, which says nothing about the input.
	    if text_input is None or not str(text_input).strip():
	        return "❌ Please enter some text to classify."

	    X = vectorizer.transform([str(text_input)])
	    prediction = model.predict(X)[0]
	    proba = model.predict_proba(X).max()

	    return f"🔮 Prediction: {prediction} (confidence: {proba:.2%})"

	def load_prediction_file(file):
	    """Load the workbook to run batch predictions on.

	    Args:
	        file: Value of the prediction file component. ``None`` when nothing
	            was uploaded; otherwise an object exposing the uploaded file's
	            path as ``.name`` (a plain path string is accepted as well).

	    Returns:
	        A 2-tuple ``(status, text_dropdown_update)``. On success the
	        dropdown lists the file's columns with the first one preselected.
	        On any failure the dropdown is emptied and the previously loaded
	        prediction data is left untouched.
	    """
	    global df_predict

	    def failure(message):
	        return message, gr.update(choices=[], value=None)

	    if file is None:
	        return failure("❌ Please upload a prediction file.")

	    path = getattr(file, "name", file)
	    try:
	        loaded = pd.read_excel(path)
	    except Exception as exc:  # UI boundary: report the problem, don't crash
	        return failure(f"❌ Could not read the Excel file: {exc}")

	    col_names = list(loaded.columns)
	    if not col_names:
	        # Without this guard col_names[0] below raises IndexError.
	        return failure("❌ The uploaded file contains no columns.")

	    # Commit to the shared state only once the file is known to be usable.
	    df_predict = loaded
	    return (
	        f"✅ Loaded prediction file with {len(df_predict)} rows",
	        gr.update(choices=col_names, value=col_names[0]),
	    )

	def run_batch_prediction(text_column):
	    """Predict a label for every non-empty row of the prediction file.

	    Args:
	        text_column: Name of the column in ``df_predict`` holding the texts.

	    Returns:
	        A 2-tuple ``(status, preview)``. ``preview`` is the first 10 result
	        rows, including the added "Prediction" and "Confidence" columns, or
	        ``None`` when the prediction could not be run.

	    Side effects:
	        On success, stores the full result table in the global
	        ``df_predict_results`` for later export.
	    """
	    global df_predict_results

	    if model is None or vectorizer is None:
	        return "❌ Model is not trained yet.", None
	    if df_predict is None:
	        return "❌ No prediction file loaded.", None
	    if text_column not in df_predict.columns:
	        return "❌ Invalid text column selected.", None

	    # Copy so the added columns never leak into the loaded input frame.
	    df_filtered = df_predict.dropna(subset=[text_column]).copy()
	    if df_filtered.empty:
	        # Nothing to classify; don't hand the model zero samples.
	        return "❌ The selected column contains no text to classify.", None

	    # Excel cells may hold numbers or dates; the vectorizer needs strings.
	    X = vectorizer.transform(df_filtered[text_column].astype(str))
	    preds = model.predict(X)
	    probs = model.predict_proba(X).max(axis=1)

	    df_filtered["Prediction"] = preds
	    df_filtered["Confidence"] = probs

	    df_predict_results = df_filtered  # kept for export_predictions

	    # Only the first 10 rows are shown in the UI; the export has them all.
	    return f"✅ Batch prediction completed on {len(df_filtered)} rows.", df_filtered.head(10)

	def export_predictions():
	    """Write the latest batch prediction results to an Excel file.

	    Returns:
	        The path of the written ``predictions_output.xlsx`` file, or
	        ``None`` when no batch prediction has been run yet.

	    Raises:
	        OSError: If the temporary directory or the file cannot be written.
	    """
	    import os
	    import tempfile

	    if df_predict_results is None:
	        return None

	    # A fresh directory under the system temp dir always exists and is
	    # writable, unlike a hard-coded location, and keeps concurrent exports
	    # from overwriting each other.
	    export_dir = tempfile.mkdtemp(prefix="predictions_")
	    export_path = os.path.join(export_dir, "predictions_output.xlsx")
	    df_predict_results.to_excel(export_path, index=False)
	    return export_path

	# Build the UI. Components appear on the page in the order they are created,
	# so statement order in this block is the page layout.
	# NOTE(review): row membership below was reconstructed from the blank-line
	# grouping of the statements - confirm against the deployed layout.
	with gr.Blocks() as demo:
	    gr.Markdown("# 🧠 Text Classification App")

	    # Introductory explanation shown above the workflow steps.
	    gr.Markdown(
	        """
	### How does this model work?
	This app uses a Logistic Regression model trained on your text data.
	- Text data is transformed into numbers using TF-IDF vectorization, which converts text into features based on word importance.
	- The model learns patterns from labeled examples you provide.
	- After training, it can predict the label/category of new text inputs.
	\n
	Note: Model performance depends heavily on quality and quantity of your data.
	"""
	    )

	    # --- Step 1: load training data, pick columns, train ---
	    gr.Markdown(
	        "### Step 1: Upload your training data\n"
	        "Upload an Excel file (`.xlsx`) containing your texts and corresponding labels."
	    )

	    with gr.Row():
	        file_input = gr.File(label="Upload Training Excel File (.xlsx)", file_types=[".xlsx"],
	                             interactive=True)
	        load_button = gr.Button("📂 Load Training File")

	    # Receives the status message returned by load_training_file.
	    status_output = gr.Markdown()

	    gr.Markdown(
	        "After loading, select the text and target columns for training."
	    )

	    # Both dropdowns start without choices; load_training_file fills them.
	    with gr.Row():
	        text_column_dropdown = gr.Dropdown(label="Text column",
	                                           interactive=True,
	                                           info="Select the column that contains the text data.")
	        target_column_dropdown = gr.Dropdown(label="Target column",
	                                             interactive=True,
	                                             info="Select the column that contains the labels to predict.")

	    train_button = gr.Button("🚀 Train Model")
	    # Receives the training result / metrics text returned by train_model.
	    training_status = gr.Markdown()

	    # --- Step 2: classify one text ---
	    gr.Markdown(
	        "### Step 2: Predict on single texts\n"
	        "Enter a text below to get the model's predicted label."
	    )

	    with gr.Row():
	        input_text = gr.Textbox(label="Enter text to classify", placeholder="Type some text here...")
	        predict_button = gr.Button("🔍 Predict Single")

	    prediction_output = gr.Markdown()

	    # --- Step 3: classify a whole file and export the result ---
	    gr.Markdown(
	        "### Step 3: Batch prediction\n"
	        "Upload a new Excel file with texts to predict multiple labels at once."
	    )

	    with gr.Row():
	        pred_file_input = gr.File(label="Upload Prediction Excel File (.xlsx)", file_types=[".xlsx"])
	        load_pred_button = gr.Button("📂 Load Prediction File")

	    pred_status = gr.Markdown()

	    # Starts without choices; load_prediction_file fills it.
	    pred_text_column_dropdown = gr.Dropdown(label="Text column for Prediction",
	                                            info="Select the column in your prediction file containing text to classify.")

	    batch_pred_button = gr.Button("⚡ Run Batch Prediction")
	    batch_pred_status = gr.Markdown()
	    # Read-only table for the preview rows returned by run_batch_prediction.
	    batch_pred_preview = gr.Dataframe(headers=None, interactive=False)

	    export_button = gr.Button("⬇️ Export Predictions")
	    gr.Markdown(
	        "Click Export Predictions to download the batch prediction results as an Excel file."
	    )

	    # Event wiring: each button calls one callback; the number and order of
	    # `outputs` must match the values that callback returns.
	    load_button.click(
	        fn=load_training_file,
	        inputs=file_input,
	        outputs=[status_output, text_column_dropdown, target_column_dropdown]
	    )

	    train_button.click(
	        fn=train_model,
	        inputs=[text_column_dropdown, target_column_dropdown],
	        outputs=training_status
	    )

	    predict_button.click(
	        fn=predict_label,
	        inputs=input_text,
	        outputs=prediction_output
	    )

	    load_pred_button.click(
	        fn=load_prediction_file,
	        inputs=pred_file_input,
	        outputs=[pred_status, pred_text_column_dropdown]
	    )

	    batch_pred_button.click(
	        fn=run_batch_prediction,
	        inputs=pred_text_column_dropdown,
	        outputs=[batch_pred_status, batch_pred_preview]
	    )

	    # The output File component is created inline here, i.e. after every
	    # component declared above; it receives the path from export_predictions.
	    export_button.click(
	        fn=export_predictions,
	        inputs=[],
	        outputs=gr.File(file_types=[".xlsx"])
	    )

	# Launch the app only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()