# Trainer4Xlsx / app.py
# Author: clementBE
# Last commit: "Update app.py" (16b89ff, verified)
import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score
# Shared application state. Everything starts out unset and is filled in by
# the functions defined later in this module.

# Training side: the uploaded training sheet, the fitted classifier, its
# vectorizer, and the text summary of the most recent evaluation.
df_train = model = vectorizer = test_metrics = None

# Batch-prediction side: the uploaded sheet to score, and the scored copy
# kept around so it can be exported afterwards.
df_predict = df_predict_results = None
def load_training_file(file):
    """Read the uploaded training workbook and populate the column selectors.

    On success the parsed sheet is stored in the module-level ``df_train``.
    On any failure ``df_train`` is left untouched.

    Args:
        file: Value handed over by the ``gr.File`` input: ``None`` when
            nothing was uploaded, otherwise a filesystem path or an object
            exposing the path as ``.name``.

    Returns:
        tuple: ``(status, text_update, target_update)``. ``status`` is a
        message string; the two updates are built with ``gr.update`` and
        refresh the text and target dropdowns. The first column is
        preselected as text and the last one as target. Both dropdowns are
        emptied when loading fails.
    """
    global df_train

    def failure(message):
        # Build a fresh update per dropdown so the two outputs never share
        # one object.
        return (
            message,
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=None),
        )

    if file is None:
        return failure("❌ Please upload a file.")

    # Accept both a plain path and a wrapper object carrying ``.name``.
    path = getattr(file, "name", file)
    try:
        loaded = pd.read_excel(path)
    except Exception as exc:  # UI boundary: report unreadable uploads
        return failure(f"❌ Could not read the Excel file: {exc}")

    col_names = list(loaded.columns)
    if not col_names:
        # Preselecting col_names[0] / col_names[-1] would raise IndexError.
        return failure("❌ The uploaded file has no columns.")

    df_train = loaded
    return (
        f"✅ Loaded training file with {len(df_train)} rows",
        gr.update(choices=col_names, value=col_names[0]),
        gr.update(choices=col_names, value=col_names[-1]),
    )
def interpret_score(score):
    """Translate an accuracy value into a short, human-readable verdict.

    Args:
        score: Accuracy as a fraction (the thresholds used are 0.6 and 0.8).

    Returns:
        str: A one-line message rating the performance as LOW (below 0.6),
        MODERATE (from 0.6 up to, but excluding, 0.8) or STRONG (0.8 and
        above). A NaN score is rated LOW.
    """
    # Comparisons are written as ">=" so that NaN, which fails every
    # comparison, lands in the pessimistic branch instead of "STRONG".
    if score >= 0.8:
        return "🟢 The model performance is STRONG. The model is reliable."
    if score >= 0.6:
        return "🟠 The model performance is MODERATE. It may work but could be improved."
    return "🔴 The model performance is LOW. Consider improving your data or features."
def train_model(text_column, target_column):
    """Fit a TF-IDF + logistic-regression classifier on the loaded training data.

    Rows with a missing text or label are dropped, the remainder is split
    80/20 (``random_state=42``), the classifier is fitted on the larger part
    and evaluated on the smaller one. On success the module-level ``model``,
    ``vectorizer`` and ``test_metrics`` are replaced together; on any failure
    they keep their previous values.

    Args:
        text_column: Name of the ``df_train`` column holding the texts.
        target_column: Name of the ``df_train`` column holding the labels.

    Returns:
        str: A status message; on success it includes accuracy, weighted
        precision, a qualitative verdict and the classification report.
    """
    global model, vectorizer, test_metrics

    if df_train is None:
        return "❌ No training data loaded."
    if text_column not in df_train.columns or target_column not in df_train.columns:
        return "❌ Invalid column selection."

    labelled = df_train.dropna(subset=[text_column, target_column])
    if len(labelled) < 10:
        return "❌ Not enough data after filtering for training. Need at least 10 samples."

    # Spreadsheet cells may be parsed as numbers or dates; the vectorizer
    # only accepts strings.
    texts = labelled[text_column].astype(str)
    labels = labelled[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    if y_train.nunique() < 2:
        return "❌ Training needs at least two different labels in the target column."

    # Fit into locals first so a failure cannot leave the globals holding a
    # vectorizer and a model that do not belong together.
    new_vectorizer = TfidfVectorizer()
    new_model = LogisticRegression(max_iter=1000)
    try:
        X_train_vec = new_vectorizer.fit_transform(X_train)
        new_model.fit(X_train_vec, y_train)
    except ValueError as exc:
        return f"❌ Training failed: {exc}"

    y_pred = new_model.predict(new_vectorizer.transform(X_test))
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)
    performance_msg = interpret_score(accuracy)

    vectorizer, model = new_vectorizer, new_model
    test_metrics = (
        f"Accuracy: {accuracy:.2%}\n"
        f"Precision (weighted): {precision:.2%}\n\n"
        f"{performance_msg}\n\n"
        f"Classification Report:\n{report}"
    )
    return f"✅ Model trained on {len(labelled)} examples.\n\nTest set evaluation:\n{test_metrics}"
def predict_label(text_input):
    """Classify a single text with the trained model.

    The input is validated before the model is consulted, so an empty
    request is rejected even when no model has been trained yet.

    Args:
        text_input: The text to classify. ``None``, an empty string or a
            whitespace-only string is rejected.

    Returns:
        str: The predicted label with the highest class probability as
        "confidence", or an error message when the input is empty or the
        model has not been trained.
    """
    if text_input is None or not str(text_input).strip():
        return "❌ Please enter some text to classify."
    if model is None or vectorizer is None:
        return "❌ Model is not trained yet."

    features = vectorizer.transform([str(text_input)])
    prediction = model.predict(features)[0]
    proba = model.predict_proba(features).max()
    return f"🔮 Prediction: {prediction} (confidence: {proba:.2%})"
def load_prediction_file(file):
    """Read the workbook to be scored and populate its text-column selector.

    On success the parsed sheet is stored in the module-level ``df_predict``.
    On any failure ``df_predict`` is left untouched.

    Args:
        file: Value handed over by the ``gr.File`` input: ``None`` when
            nothing was uploaded, otherwise a filesystem path or an object
            exposing the path as ``.name``.

    Returns:
        tuple: ``(status, column_update)``. ``status`` is a message string
        and ``column_update`` (built with ``gr.update``) refreshes the
        dropdown, preselecting the first column. The dropdown is emptied
        when loading fails.
    """
    global df_predict

    if file is None:
        return "❌ Please upload a prediction file.", gr.update(choices=[], value=None)

    # Accept both a plain path and a wrapper object carrying ``.name``.
    path = getattr(file, "name", file)
    try:
        loaded = pd.read_excel(path)
    except Exception as exc:  # UI boundary: report unreadable uploads
        return (
            f"❌ Could not read the Excel file: {exc}",
            gr.update(choices=[], value=None),
        )

    col_names = list(loaded.columns)
    if not col_names:
        # Preselecting col_names[0] would raise IndexError.
        return "❌ The uploaded file has no columns.", gr.update(choices=[], value=None)

    df_predict = loaded
    return (
        f"✅ Loaded prediction file with {len(df_predict)} rows",
        gr.update(choices=col_names, value=col_names[0]),
    )
def run_batch_prediction(text_column):
    """Score every non-empty row of the loaded prediction sheet.

    Rows whose text cell is missing are skipped. The remaining rows are
    copied and extended with a ``Prediction`` column (predicted label) and a
    ``Confidence`` column (highest class probability). The scored copy is
    stored in the module-level ``df_predict_results`` for later export.

    Args:
        text_column: Name of the ``df_predict`` column holding the texts.

    Returns:
        tuple: ``(status, preview)``. ``preview`` is the first 10 scored
        rows, or ``None`` when nothing could be scored.
    """
    global df_predict_results

    if model is None or vectorizer is None:
        return "❌ Model is not trained yet.", None
    if df_predict is None:
        return "❌ No prediction file loaded.", None
    if text_column not in df_predict.columns:
        return "❌ Invalid text column selected.", None

    scored = df_predict.dropna(subset=[text_column]).copy()
    if scored.empty:
        # Nothing left to score; calling the model with zero rows would fail.
        return "❌ The selected column contains no text to classify.", None

    # Spreadsheet cells may be parsed as numbers or dates; the vectorizer
    # only accepts strings.
    features = vectorizer.transform(scored[text_column].astype(str))
    scored["Prediction"] = model.predict(features)
    scored["Confidence"] = model.predict_proba(features).max(axis=1)

    df_predict_results = scored  # kept for export
    return f"✅ Batch prediction completed on {len(scored)} rows.", scored.head(10)
def export_predictions():
    """Write the latest batch-prediction results to an Excel file.

    Returns:
        str | None: Path of the written ``predictions_output.xlsx`` file, or
        ``None`` when no batch prediction has been run yet.
    """
    import os
    import tempfile

    if df_predict_results is None:
        return None

    # A fixed absolute directory is not guaranteed to exist on the host, so
    # write into a fresh temporary directory while keeping the friendly
    # file name for the download.
    export_dir = tempfile.mkdtemp(prefix="predictions_")
    export_path = os.path.join(export_dir, "predictions_output.xlsx")
    df_predict_results.to_excel(export_path, index=False)
    return export_path
# User interface: layout first, then the event wiring that connects each
# button to its callback.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Text Classification App")
    gr.Markdown(
        "\n"
        "### How does this model work?\n"
        "This app uses a **Logistic Regression** model trained on your text data.\n"
        "- Text data is transformed into numbers using **TF-IDF vectorization**, "
        "which converts text into features based on word importance.\n"
        "- The model learns patterns from labeled examples you provide.\n"
        "- After training, it can predict the label/category of new text inputs.\n"
        "\n\n"
        "**Note:** Model performance depends heavily on quality and quantity of your data.\n"
    )

    # --- Step 1: training data -------------------------------------------
    gr.Markdown(
        "### Step 1: Upload your training data\n"
        "Upload an Excel file (`.xlsx`) containing your texts and corresponding labels."
    )
    with gr.Row():
        file_input = gr.File(
            label="Upload Training Excel File (.xlsx)",
            file_types=[".xlsx"],
            interactive=True,
        )
        load_button = gr.Button("📂 Load Training File")
    status_output = gr.Markdown()

    gr.Markdown("After loading, select the text and target columns for training.")
    with gr.Row():
        text_column_dropdown = gr.Dropdown(
            label="Text column",
            interactive=True,
            info="Select the column that contains the text data.",
        )
        target_column_dropdown = gr.Dropdown(
            label="Target column",
            interactive=True,
            info="Select the column that contains the labels to predict.",
        )
    train_button = gr.Button("🚀 Train Model")
    training_status = gr.Markdown()

    # --- Step 2: single prediction ---------------------------------------
    gr.Markdown(
        "### Step 2: Predict on single texts\n"
        "Enter a text below to get the model's predicted label."
    )
    with gr.Row():
        input_text = gr.Textbox(
            label="Enter text to classify",
            placeholder="Type some text here...",
        )
        predict_button = gr.Button("🔍 Predict Single")
    prediction_output = gr.Markdown()

    # --- Step 3: batch prediction ----------------------------------------
    gr.Markdown(
        "### Step 3: Batch prediction\n"
        "Upload a new Excel file with texts to predict multiple labels at once."
    )
    with gr.Row():
        pred_file_input = gr.File(
            label="Upload Prediction Excel File (.xlsx)",
            file_types=[".xlsx"],
        )
        load_pred_button = gr.Button("📂 Load Prediction File")
    pred_status = gr.Markdown()
    pred_text_column_dropdown = gr.Dropdown(
        label="Text column for Prediction",
        info="Select the column in your prediction file containing text to classify.",
    )
    batch_pred_button = gr.Button("⚡ Run Batch Prediction")
    batch_pred_status = gr.Markdown()
    batch_pred_preview = gr.Dataframe(headers=None, interactive=False)

    export_button = gr.Button("⬇️ Export Predictions")
    gr.Markdown(
        "Click **Export Predictions** to download the batch prediction results as an Excel file."
    )
    # Declared explicitly (instead of inline in the click wiring) so the
    # download slot has a label and a defined place in the layout.
    export_file = gr.File(label="Download predictions (.xlsx)", interactive=False)

    # --- Button connections ----------------------------------------------
    load_button.click(
        fn=load_training_file,
        inputs=file_input,
        outputs=[status_output, text_column_dropdown, target_column_dropdown],
    )
    train_button.click(
        fn=train_model,
        inputs=[text_column_dropdown, target_column_dropdown],
        outputs=training_status,
    )
    predict_button.click(
        fn=predict_label,
        inputs=input_text,
        outputs=prediction_output,
    )
    load_pred_button.click(
        fn=load_prediction_file,
        inputs=pred_file_input,
        outputs=[pred_status, pred_text_column_dropdown],
    )
    batch_pred_button.click(
        fn=run_batch_prediction,
        inputs=pred_text_column_dropdown,
        outputs=[batch_pred_status, batch_pred_preview],
    )
    export_button.click(
        fn=export_predictions,
        inputs=[],
        outputs=export_file,
    )

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()