Spaces:

clementBE
/

Trainer4Xlsx

Paused

File size: 8,548 Bytes

ea1fb77
 
 
98af5f3
95316bb
 
ea1fb77
aa123f2
6c51406
b412fe9
6849a4f
 
 
94e6de7
98af5f3
6c51406
aa123f2
6c51406
 
 
b412fe9
6c51406
b412fe9
6849a4f
6c51406
16b89ff
 
 
 
 
 
 
 
 
6c51406
95316bb
b412fe9
6c51406
b412fe9
98af5f3
6c51406
b412fe9
6c51406
 
 
16b89ff
 
 
95316bb
 
 
 
b412fe9
95316bb
 
6c51406
b412fe9
95316bb
 
 
 
 
6849a4f
95316bb
 
16b89ff
 
 
 
 
 
 
 
98af5f3
95316bb
b412fe9
 
 
 
 
 
 
6849a4f
def006a
 
ea1fb77
6849a4f
 
 
 
 
 
 
 
 
94e6de7
6849a4f
94e6de7
6849a4f
94e6de7
6849a4f
94e6de7
6849a4f
 
 
 
 
 
 
 
 
94e6de7
 
 
 
 
 
 
 
 
16b89ff
d3db3d3
 
6849a4f
ea1fb77
b412fe9
 
16b89ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b8217b
16b89ff
 
6849a4f
ea1fb77
b412fe9
16b89ff
 
 
 
 
98af5f3
16b89ff
 
 
 
 
 
b412fe9
 
 
aa123f2
16b89ff
 
 
 
 
6c51406
16b89ff
6849a4f
b412fe9
 
 
16b89ff
 
 
 
 
6849a4f
 
 
 
 
16b89ff
 
 
6849a4f
 
94e6de7
 
6849a4f
94e6de7
16b89ff
 
 
94e6de7
 
b412fe9
 
 
 
 
 
 
 
 
 
 
98af5f3
b412fe9
 
 
 
 
ea1fb77
6849a4f
 
 
 
 
 
 
 
 
94e6de7
 
 
 
 
 
 
6849a4f
 
b412fe9

import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score

# --- Shared application state (rebound by the handler functions below) ---

# Training side: the loaded training DataFrame, the fitted classifier, the
# fitted TF-IDF vectorizer, and the text summary of the last evaluation.
df_train = model = vectorizer = test_metrics = None

# Batch-prediction side: the loaded prediction DataFrame and the most recent
# prediction results kept around for export.
df_predict = df_predict_results = None

def load_training_file(file):
    """Load the uploaded training workbook into the module-level ``df_train``.

    Args:
        file: Value received from the upload component: ``None`` when nothing
            was uploaded, otherwise either a path string or an object whose
            ``name`` attribute holds the path of the uploaded file.

    Returns:
        A 3-tuple ``(status, text_update, target_update)``: a status message
        plus two ``gr.update`` payloads for the text- and target-column
        dropdowns. On success both dropdowns list every column, with the
        first column pre-selected as text and the last one as target. On any
        failure both dropdowns are cleared and ``df_train`` is left unchanged.
    """
    global df_train

    if file is None:
        return (
            "❌ Please upload a file.",
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=None),
        )

    # Accept both a plain path string and a file-like object exposing `.name`.
    path = getattr(file, "name", file)

    try:
        loaded = pd.read_excel(path)
    except Exception as exc:
        # UI boundary: a corrupt or unreadable workbook should produce a
        # readable status message rather than an unhandled exception.
        return (
            f"❌ Could not read the Excel file: {exc}",
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=None),
        )

    col_names = list(loaded.columns)
    if not col_names:
        # Without this guard, indexing col_names[0] below raises IndexError.
        return (
            "❌ The uploaded file contains no columns.",
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=None),
        )

    # Only replace the shared training data once the file is known to be usable.
    df_train = loaded

    return (
        f"✅ Loaded training file with {len(df_train)} rows",
        gr.update(choices=col_names, value=col_names[0]),
        gr.update(choices=col_names, value=col_names[-1]),
    )

def interpret_score(score):
    """Turn an accuracy score into a colour-coded, human-readable verdict.

    Args:
        score: Accuracy value to grade; it is compared against the 0.6 and
            0.8 thresholds.

    Returns:
        The LOW message when ``score < 0.6``, the MODERATE message when
        ``score < 0.8``, and the STRONG message otherwise.
    """
    # Exclusive upper bounds in ascending order; the first bound the score
    # falls under decides the message.
    graded_messages = (
        (0.6, "🔴 The model performance is LOW. Consider improving your data or features."),
        (0.8, "🟠 The model performance is MODERATE. It may work but could be improved."),
    )
    for upper_bound, message in graded_messages:
        if score < upper_bound:
            return message
    return "🟢 The model performance is STRONG. The model is reliable."

def train_model(text_column, target_column):
    """Train a TF-IDF + logistic-regression classifier on the loaded data.

    Rows with a missing value in either selected column are dropped, the rest
    is split 80/20 (``random_state=42``) into train and test sets, and the
    model is evaluated on the test set.

    The module-level ``model``, ``vectorizer`` and ``test_metrics`` are only
    replaced after training and evaluation have fully succeeded, so a failed
    run never leaves a new vectorizer paired with an old model.

    Args:
        text_column: Name of the ``df_train`` column holding the texts.
        target_column: Name of the ``df_train`` column holding the labels.

    Returns:
        A status string: either an error message (prefixed with ❌) or a
        success message followed by the test-set evaluation summary.
    """
    global model, vectorizer, test_metrics

    if df_train is None:
        return "❌ No training data loaded."

    if text_column not in df_train.columns or target_column not in df_train.columns:
        return "❌ Invalid column selection."

    df_filtered = df_train.dropna(subset=[text_column, target_column])

    if len(df_filtered) < 10:
        return "❌ Not enough data after filtering for training. Need at least 10 samples."

    # Excel cells may hold numbers or dates; the vectorizer needs strings.
    texts = df_filtered[text_column].astype(str)
    labels = df_filtered[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # The classifier cannot be fitted on a single class.
    if y_train.nunique() < 2:
        return "❌ Training requires at least 2 distinct labels in the target column."

    # Fit into locals first; the shared globals are updated only on success.
    new_vectorizer = TfidfVectorizer()
    new_model = LogisticRegression(max_iter=1000)
    try:
        X_train_vec = new_vectorizer.fit_transform(X_train)
        X_test_vec = new_vectorizer.transform(X_test)
        new_model.fit(X_train_vec, y_train)
        y_pred = new_model.predict(X_test_vec)
    except (ValueError, TypeError) as exc:
        # e.g. an empty vocabulary or labels of mixed, incomparable types.
        return f"❌ Training failed: {exc}"

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    performance_msg = interpret_score(accuracy)

    new_metrics = (
        f"Accuracy: {accuracy:.2%}\n"
        f"Precision (weighted): {precision:.2%}\n\n"
        f"{performance_msg}\n\n"
        f"Classification Report:\n{report}"
    )

    vectorizer = new_vectorizer
    model = new_model
    test_metrics = new_metrics

    return f"✅ Model trained on {len(df_filtered)} examples.\n\nTest set evaluation:\n{test_metrics}"

def predict_label(text_input):
    """Classify a single text with the trained model.

    Args:
        text_input: The text to classify. ``None`` and blank
            (empty / whitespace-only) input is rejected with a message.

    Returns:
        A status string: an error message (prefixed with ❌) when the input
        is blank or no model has been trained, otherwise the predicted label
        together with the highest class probability.
    """
    # Reject missing or blank input up front: None would crash the vectorizer
    # and an empty string would yield a prediction based on no features.
    if text_input is None or not str(text_input).strip():
        return "❌ Please enter some text to classify."

    if model is None or vectorizer is None:
        return "❌ Model is not trained yet."

    X = vectorizer.transform([str(text_input)])
    prediction = model.predict(X)[0]
    # Confidence is the largest class probability for this single sample.
    proba = model.predict_proba(X).max()

    return f"🔮 Prediction: {prediction} (confidence: {proba:.2%})"

def load_prediction_file(file):
    """Load the uploaded prediction workbook into the module-level ``df_predict``.

    Args:
        file: Value received from the upload component: ``None`` when nothing
            was uploaded, otherwise either a path string or an object whose
            ``name`` attribute holds the path of the uploaded file.

    Returns:
        A 2-tuple ``(status, column_update)``: a status message plus a
        ``gr.update`` payload for the prediction text-column dropdown. On
        success the dropdown lists every column with the first pre-selected.
        On any failure it is cleared and ``df_predict`` is left unchanged.
    """
    global df_predict

    if file is None:
        return "❌ Please upload a prediction file.", gr.update(choices=[], value=None)

    # Accept both a plain path string and a file-like object exposing `.name`.
    path = getattr(file, "name", file)

    try:
        loaded = pd.read_excel(path)
    except Exception as exc:
        # UI boundary: a corrupt or unreadable workbook should produce a
        # readable status message rather than an unhandled exception.
        return f"❌ Could not read the Excel file: {exc}", gr.update(choices=[], value=None)

    col_names = list(loaded.columns)
    if not col_names:
        # Without this guard, indexing col_names[0] below raises IndexError.
        return "❌ The uploaded file contains no columns.", gr.update(choices=[], value=None)

    # Only replace the shared prediction data once the file is known to be usable.
    df_predict = loaded

    return (
        f"✅ Loaded prediction file with {len(df_predict)} rows",
        gr.update(choices=col_names, value=col_names[0]),
    )

def run_batch_prediction(text_column):
    """Predict a label for every non-missing row of the loaded prediction file.

    The results (the filtered rows plus ``Prediction`` and ``Confidence``
    columns) are stored in the module-level ``df_predict_results`` for export.

    Args:
        text_column: Name of the ``df_predict`` column holding the texts.

    Returns:
        A 2-tuple ``(status, preview)``: a status message and either the
        first 10 result rows as a DataFrame or ``None`` on failure.
    """
    global df_predict_results

    if model is None or vectorizer is None:
        return "❌ Model is not trained yet.", None
    if df_predict is None:
        return "❌ No prediction file loaded.", None
    if text_column not in df_predict.columns:
        return "❌ Invalid text column selected.", None

    # Copy so the new columns are added to the results, not to df_predict.
    df_filtered = df_predict.dropna(subset=[text_column]).copy()
    if df_filtered.empty:
        # Predicting on zero samples would raise inside the model.
        return "❌ No rows with text to predict in the selected column.", None

    # Excel cells may hold numbers or dates; the vectorizer needs strings.
    X = vectorizer.transform(df_filtered[text_column].astype(str))
    preds = model.predict(X)
    # Per-row confidence is the largest class probability.
    probs = model.predict_proba(X).max(axis=1)

    df_filtered["Prediction"] = preds
    df_filtered["Confidence"] = probs

    df_predict_results = df_filtered  # save for export

    # Show preview of first 10 rows
    return f"✅ Batch prediction completed on {len(df_filtered)} rows.", df_filtered.head(10)

def export_predictions():
    """Write the latest batch-prediction results to an Excel file.

    Returns:
        The path of the written ``predictions_output.xlsx`` file as a string,
        or ``None`` when no batch prediction has been run yet.
    """
    # Function-scope imports keep this block self-contained.
    import tempfile
    from pathlib import Path

    if df_predict_results is None:
        return None

    # Write into a fresh temporary directory instead of a hard-coded
    # environment-specific path: the directory is guaranteed to exist and be
    # writable, and concurrent exports cannot overwrite each other's file.
    export_dir = Path(tempfile.mkdtemp(prefix="predictions_"))
    export_path = str(export_dir / "predictions_output.xlsx")
    df_predict_results.to_excel(export_path, index=False)
    return export_path

# User interface definition: the components are declared first, grouped by
# workflow step, and the button click handlers are wired up at the end.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Text Classification App")

    # Introductory explanation of the modelling approach shown to the user.
    gr.Markdown(
        """
        ### How does this model work?
        This app uses a **Logistic Regression** model trained on your text data.
        - Text data is transformed into numbers using **TF-IDF vectorization**, which converts text into features based on word importance.
        - The model learns patterns from labeled examples you provide.
        - After training, it can predict the label/category of new text inputs.
        \n
        **Note:** Model performance depends heavily on quality and quantity of your data.
        """
    )

    # --- Step 1: training file upload, column selection and training ---
    gr.Markdown(
        "### Step 1: Upload your training data\n"
        "Upload an Excel file (`.xlsx`) containing your texts and corresponding labels."
    )

    with gr.Row():
        file_input = gr.File(label="Upload Training Excel File (.xlsx)", file_types=[".xlsx"],
                             interactive=True)
        load_button = gr.Button("📂 Load Training File")

    # Receives the status message returned by load_training_file.
    status_output = gr.Markdown()

    gr.Markdown(
        "After loading, select the text and target columns for training."
    )

    # Both dropdowns start without choices; load_training_file fills them in.
    with gr.Row():
        text_column_dropdown = gr.Dropdown(label="Text column",
                                           interactive=True,
                                           info="Select the column that contains the text data.")
        target_column_dropdown = gr.Dropdown(label="Target column",
                                             interactive=True,
                                             info="Select the column that contains the labels to predict.")

    train_button = gr.Button("🚀 Train Model")
    # Receives the status / evaluation text returned by train_model.
    training_status = gr.Markdown()

    # --- Step 2: single-text prediction ---
    gr.Markdown(
        "### Step 2: Predict on single texts\n"
        "Enter a text below to get the model's predicted label."
    )

    with gr.Row():
        input_text = gr.Textbox(label="Enter text to classify", placeholder="Type some text here...")
        predict_button = gr.Button("🔍 Predict Single")

    # Receives the message returned by predict_label.
    prediction_output = gr.Markdown()

    # --- Step 3: batch prediction from a second Excel file ---
    gr.Markdown(
        "### Step 3: Batch prediction\n"
        "Upload a new Excel file with texts to predict multiple labels at once."
    )

    with gr.Row():
        pred_file_input = gr.File(label="Upload Prediction Excel File (.xlsx)", file_types=[".xlsx"])
        load_pred_button = gr.Button("📂 Load Prediction File")

    # Receives the status message returned by load_prediction_file.
    pred_status = gr.Markdown()

    # Starts without choices; load_prediction_file fills it in.
    pred_text_column_dropdown = gr.Dropdown(label="Text column for Prediction",
                                            info="Select the column in your prediction file containing text to classify.")

    batch_pred_button = gr.Button("⚡ Run Batch Prediction")
    # Status message and result preview returned by run_batch_prediction.
    batch_pred_status = gr.Markdown()
    batch_pred_preview = gr.Dataframe(headers=None, interactive=False)

    export_button = gr.Button("⬇️ Export Predictions")
    gr.Markdown(
        "Click **Export Predictions** to download the batch prediction results as an Excel file."
    )

    # Button connections
    # Each handler's return values map positionally onto its `outputs`.
    load_button.click(
        fn=load_training_file,
        inputs=file_input,
        outputs=[status_output, text_column_dropdown, target_column_dropdown]
    )

    train_button.click(
        fn=train_model,
        inputs=[text_column_dropdown, target_column_dropdown],
        outputs=training_status
    )

    predict_button.click(
        fn=predict_label,
        inputs=input_text,
        outputs=prediction_output
    )

    load_pred_button.click(
        fn=load_prediction_file,
        inputs=pred_file_input,
        outputs=[pred_status, pred_text_column_dropdown]
    )

    batch_pred_button.click(
        fn=run_batch_prediction,
        inputs=pred_text_column_dropdown,
        outputs=[batch_pred_status, batch_pred_preview]
    )

    # The output File component is created inline here rather than bound to
    # a name above. NOTE(review): presumably it is rendered at this point in
    # the layout (after the export hint) — confirm in the running app.
    export_button.click(
        fn=export_predictions,
        inputs=[],
        outputs=gr.File(file_types=[".xlsx"])
    )

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()