"""Gradio app that trains a TF-IDF + logistic-regression text classifier.

Workflow exposed in the UI:

1. Upload an Excel file with labelled texts and train a model on it.
2. Classify a single piece of text.
3. Upload a second Excel file, classify every row, and export the results.

State is kept in module-level globals, so it is shared by every user of the
running app.
"""

import os
import tempfile

import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score
from sklearn.model_selection import train_test_split

# Minimum number of usable rows required before training is attempted.
MIN_TRAINING_ROWS = 10
# File name used for the exported batch predictions (placed in the temp dir).
EXPORT_FILENAME = "predictions_output.xlsx"

df_train = None  # training DataFrame loaded by load_training_file
model = None  # fitted LogisticRegression, set by train_model
vectorizer = None  # fitted TfidfVectorizer, set by train_model
test_metrics = None  # text summary of the last hold-out evaluation
df_predict = None  # for batch prediction file
df_predict_results = None  # to store batch prediction results for export


def _read_excel(file):
    """Read an uploaded Excel file into a DataFrame.

    ``file`` may be a plain path string or an object exposing the path as
    ``.name``; both forms are accepted.
    """
    path = getattr(file, "name", file)
    return pd.read_excel(path)


def _empty_dropdown():
    """Return an update that clears a column dropdown."""
    return gr.update(choices=[], value=None)


def load_training_file(file):
    """Load the training spreadsheet and populate the column dropdowns.

    Returns a ``(status message, text-column update, target-column update)``
    tuple. The first column is preselected as text and the last as target.
    On any failure the previously loaded training data is left untouched.
    """
    global df_train
    if file is None:
        return "❌ Please upload a file.", _empty_dropdown(), _empty_dropdown()

    try:
        loaded = _read_excel(file)
    except Exception as exc:  # UI boundary: report the problem instead of crashing
        return f"❌ Could not read the file: {exc}", _empty_dropdown(), _empty_dropdown()

    col_names = list(loaded.columns)
    if not col_names:
        # Indexing col_names[0] below would fail on a sheet without columns.
        return "❌ The file contains no columns.", _empty_dropdown(), _empty_dropdown()

    df_train = loaded
    return (
        f"✅ Loaded training file with {len(df_train)} rows",
        gr.update(choices=col_names, value=col_names[0]),
        gr.update(choices=col_names, value=col_names[-1]),
    )


def interpret_score(score):
    """Translate an accuracy score in [0, 1] into a human-readable verdict."""
    if score < 0.6:
        return "🔴 The model performance is LOW. Consider improving your data or features."
    if score < 0.8:
        return "🟠 The model performance is MODERATE. It may work but could be improved."
    return "🟢 The model performance is STRONG. The model is reliable."


def train_model(text_column, target_column):
    """Train the classifier on the loaded data and evaluate on a 20% hold-out.

    Rows with a missing text or label are dropped first. On success the
    fitted ``model`` and ``vectorizer`` globals are replaced and a status
    message including accuracy, weighted precision and the classification
    report is returned; on failure an error message is returned and the
    previous model (if any) is kept.
    """
    global model, vectorizer, test_metrics
    if df_train is None:
        return "❌ No training data loaded."
    if text_column not in df_train.columns or target_column not in df_train.columns:
        return "❌ Invalid column selection."

    df_filtered = df_train.dropna(subset=[text_column, target_column])
    if len(df_filtered) < MIN_TRAINING_ROWS:
        return (
            "❌ Not enough data after filtering for training. "
            f"Need at least {MIN_TRAINING_ROWS} samples."
        )

    X_train, X_test, y_train, y_test = train_test_split(
        # TfidfVectorizer needs strings; Excel cells may be numbers or dates.
        df_filtered[text_column].astype(str),
        df_filtered[target_column],
        test_size=0.2,
        random_state=42,
    )
    if y_train.nunique() < 2:
        # LogisticRegression cannot be fitted on a single class.
        return "❌ Training data must contain at least 2 different labels."

    # Fit into locals so a failed fit never leaves half-updated globals.
    new_vectorizer = TfidfVectorizer()
    new_model = LogisticRegression(max_iter=1000)
    try:
        X_train_vec = new_vectorizer.fit_transform(X_train)
        new_model.fit(X_train_vec, y_train)
    except ValueError as exc:  # e.g. empty vocabulary
        return f"❌ Training failed: {exc}"

    y_pred = new_model.predict(new_vectorizer.transform(X_test))
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)
    performance_msg = interpret_score(accuracy)

    model = new_model
    vectorizer = new_vectorizer
    test_metrics = (
        f"Accuracy: {accuracy:.2%}\n"
        f"Precision (weighted): {precision:.2%}\n\n"
        f"{performance_msg}\n\n"
        f"Classification Report:\n{report}"
    )
    return (
        f"✅ Model trained on {len(df_filtered)} examples.\n\n"
        f"Test set evaluation:\n{test_metrics}"
    )


def predict_label(text_input):
    """Classify one text and return the label with its top class probability."""
    if model is None or vectorizer is None:
        return "❌ Model is not trained yet."
    if not text_input or not text_input.strip():
        return "❌ Please enter some text to classify."

    X = vectorizer.transform([text_input])
    prediction = model.predict(X)[0]
    proba = model.predict_proba(X).max()
    return f"🔮 Prediction: {prediction} (confidence: {proba:.2%})"


def load_prediction_file(file):
    """Load the batch-prediction spreadsheet and populate its column dropdown.

    Returns a ``(status message, text-column update)`` tuple. On any failure
    the previously loaded prediction data is left untouched.
    """
    global df_predict
    if file is None:
        return "❌ Please upload a prediction file.", _empty_dropdown()

    try:
        loaded = _read_excel(file)
    except Exception as exc:  # UI boundary: report the problem instead of crashing
        return f"❌ Could not read the file: {exc}", _empty_dropdown()

    col_names = list(loaded.columns)
    if not col_names:
        return "❌ The file contains no columns.", _empty_dropdown()

    df_predict = loaded
    return (
        f"✅ Loaded prediction file with {len(df_predict)} rows",
        gr.update(choices=col_names, value=col_names[0]),
    )


def run_batch_prediction(text_column):
    """Classify every non-empty row of the loaded prediction file.

    Adds ``Prediction`` and ``Confidence`` columns, stores the full result
    in ``df_predict_results`` for export, and returns a
    ``(status message, preview DataFrame or None)`` tuple where the preview
    holds the first 10 rows.
    """
    global df_predict_results
    if model is None or vectorizer is None:
        return "❌ Model is not trained yet.", None
    if df_predict is None:
        return "❌ No prediction file loaded.", None
    if text_column not in df_predict.columns:
        return "❌ Invalid text column selected.", None

    df_filtered = df_predict.dropna(subset=[text_column]).copy()
    if df_filtered.empty:
        # The model cannot predict on zero samples.
        return "❌ No rows with text to predict on.", None

    # Same string cast as in training so numeric/date cells do not crash TF-IDF.
    X = vectorizer.transform(df_filtered[text_column].astype(str))
    df_filtered["Prediction"] = model.predict(X)
    df_filtered["Confidence"] = model.predict_proba(X).max(axis=1)
    df_predict_results = df_filtered  # save for export

    # Show preview of first 10 rows
    return (
        f"✅ Batch prediction completed on {len(df_filtered)} rows.",
        df_filtered.head(10),
    )


def export_predictions():
    """Write the last batch results to an Excel file and return its path.

    Returns ``None`` when no batch prediction has been run yet. The file is
    written to the system temp directory, which exists on every platform,
    rather than to a host-specific hard-coded path.
    """
    if df_predict_results is None:
        return None
    export_path = os.path.join(tempfile.gettempdir(), EXPORT_FILENAME)
    df_predict_results.to_excel(export_path, index=False)
    return export_path


with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Text Classification App")
    gr.Markdown(
        """
        ### How does this model work?
        This app uses a **Logistic Regression** model trained on your text data.
        - Text data is transformed into numbers using **TF-IDF vectorization**, which converts text into features based on word importance.
        - The model learns patterns from labeled examples you provide.
        - After training, it can predict the label/category of new text inputs.
        \n
        **Note:** Model performance depends heavily on quality and quantity of your data.
        """
    )

    gr.Markdown(
        "### Step 1: Upload your training data\n"
        "Upload an Excel file (`.xlsx`) containing your texts and corresponding labels."
    )
    with gr.Row():
        file_input = gr.File(
            label="Upload Training Excel File (.xlsx)",
            file_types=[".xlsx"],
            interactive=True,
        )
        load_button = gr.Button("📂 Load Training File")
    status_output = gr.Markdown()

    gr.Markdown(
        "After loading, select the text and target columns for training."
    )
    with gr.Row():
        text_column_dropdown = gr.Dropdown(
            label="Text column",
            interactive=True,
            info="Select the column that contains the text data.",
        )
        target_column_dropdown = gr.Dropdown(
            label="Target column",
            interactive=True,
            info="Select the column that contains the labels to predict.",
        )
    train_button = gr.Button("🚀 Train Model")
    training_status = gr.Markdown()

    gr.Markdown(
        "### Step 2: Predict on single texts\n"
        "Enter a text below to get the model's predicted label."
    )
    with gr.Row():
        input_text = gr.Textbox(
            label="Enter text to classify",
            placeholder="Type some text here...",
        )
        predict_button = gr.Button("🔍 Predict Single")
    prediction_output = gr.Markdown()

    gr.Markdown(
        "### Step 3: Batch prediction\n"
        "Upload a new Excel file with texts to predict multiple labels at once."
    )
    with gr.Row():
        pred_file_input = gr.File(
            label="Upload Prediction Excel File (.xlsx)",
            file_types=[".xlsx"],
        )
        load_pred_button = gr.Button("📂 Load Prediction File")
    pred_status = gr.Markdown()
    pred_text_column_dropdown = gr.Dropdown(
        label="Text column for Prediction",
        info="Select the column in your prediction file containing text to classify.",
    )
    batch_pred_button = gr.Button("⚡ Run Batch Prediction")
    batch_pred_status = gr.Markdown()
    batch_pred_preview = gr.Dataframe(headers=None, interactive=False)

    export_button = gr.Button("⬇️ Export Predictions")
    gr.Markdown(
        "Click **Export Predictions** to download the batch prediction results as an Excel file."
    )
    # Download slot for the exported workbook; named so the wiring below is explicit.
    export_file = gr.File(file_types=[".xlsx"])

    # Button connections
    load_button.click(
        fn=load_training_file,
        inputs=file_input,
        outputs=[status_output, text_column_dropdown, target_column_dropdown],
    )
    train_button.click(
        fn=train_model,
        inputs=[text_column_dropdown, target_column_dropdown],
        outputs=training_status,
    )
    predict_button.click(
        fn=predict_label,
        inputs=input_text,
        outputs=prediction_output,
    )
    load_pred_button.click(
        fn=load_prediction_file,
        inputs=pred_file_input,
        outputs=[pred_status, pred_text_column_dropdown],
    )
    batch_pred_button.click(
        fn=run_batch_prediction,
        inputs=pred_text_column_dropdown,
        outputs=[batch_pred_status, batch_pred_preview],
    )
    export_button.click(
        fn=export_predictions,
        inputs=[],
        outputs=export_file,
    )

if __name__ == "__main__":
    demo.launch()