File size: 8,548 Bytes
ea1fb77
 
 
98af5f3
95316bb
 
ea1fb77
aa123f2
6c51406
b412fe9
6849a4f
 
 
94e6de7
98af5f3
6c51406
aa123f2
6c51406
 
 
b412fe9
6c51406
b412fe9
6849a4f
6c51406
16b89ff
 
 
 
 
 
 
 
 
6c51406
95316bb
b412fe9
6c51406
b412fe9
98af5f3
6c51406
b412fe9
6c51406
 
 
16b89ff
 
 
95316bb
 
 
 
b412fe9
95316bb
 
6c51406
b412fe9
95316bb
 
 
 
 
6849a4f
95316bb
 
16b89ff
 
 
 
 
 
 
 
98af5f3
95316bb
b412fe9
 
 
 
 
 
 
6849a4f
def006a
 
ea1fb77
6849a4f
 
 
 
 
 
 
 
 
94e6de7
6849a4f
94e6de7
6849a4f
94e6de7
6849a4f
94e6de7
6849a4f
 
 
 
 
 
 
 
 
94e6de7
 
 
 
 
 
 
 
 
16b89ff
d3db3d3
 
6849a4f
ea1fb77
b412fe9
 
16b89ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b8217b
16b89ff
 
6849a4f
ea1fb77
b412fe9
16b89ff
 
 
 
 
98af5f3
16b89ff
 
 
 
 
 
b412fe9
 
 
aa123f2
16b89ff
 
 
 
 
6c51406
16b89ff
6849a4f
b412fe9
 
 
16b89ff
 
 
 
 
6849a4f
 
 
 
 
16b89ff
 
 
6849a4f
 
94e6de7
 
6849a4f
94e6de7
16b89ff
 
 
94e6de7
 
b412fe9
 
 
 
 
 
 
 
 
 
 
98af5f3
b412fe9
 
 
 
 
ea1fb77
6849a4f
 
 
 
 
 
 
 
 
94e6de7
 
 
 
 
 
 
6849a4f
 
b412fe9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import os
import tempfile

import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score

# Training-side state shared by the callbacks below: the uploaded training
# frame, the fitted classifier, the fitted vectorizer, and the text summary
# of the most recent hold-out evaluation. All start out unset.
df_train = model = vectorizer = test_metrics = None

# Batch-prediction state: the uploaded prediction frame and the scored copy
# that is kept around so it can be exported.
df_predict = df_predict_results = None

def load_training_file(file):
    """Read the uploaded training workbook into the module-level ``df_train``.

    Args:
        file: Value delivered by the training file component: either an
            object exposing the path as ``.name`` or a plain path string.
            ``None`` when nothing has been uploaded.

    Returns:
        A 3-tuple ``(status, text_dropdown_update, target_dropdown_update)``.
        On success both dropdowns list the workbook's columns, preselecting
        the first column for the text and the last one for the target.
        On failure both dropdowns are cleared and ``df_train`` keeps its
        previous value.
    """
    global df_train

    def failure(message):
        # Fresh update objects per output so the two dropdowns never share one.
        return (
            message,
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=None),
        )

    # Emoji are written as escapes ("\u274c" = cross mark, "\u2705" = check
    # mark) so the source stays ASCII and cannot be garbled by a wrong-encoding
    # save.
    if file is None:
        return failure("\u274c Please upload a file.")

    path = getattr(file, "name", file)
    try:
        loaded = pd.read_excel(path)
    except Exception as exc:  # UI boundary: report the problem instead of crashing
        return failure(f"\u274c Could not read the Excel file: {exc}")

    col_names = list(loaded.columns)
    if not col_names:
        # Without this guard col_names[0] below would raise IndexError.
        return failure("\u274c The uploaded file has no columns.")

    # Only replace the shared frame once the upload is known to be usable.
    df_train = loaded
    return (
        f"\u2705 Loaded training file with {len(df_train)} rows",
        gr.update(choices=col_names, value=col_names[0]),
        gr.update(choices=col_names, value=col_names[-1]),
    )

def interpret_score(score):
    """Translate an accuracy value into a short human-readable verdict.

    Args:
        score: Accuracy as a fraction (the caller passes the result of
            ``accuracy_score``).

    Returns:
        A one-line message prefixed with a coloured-circle emoji:
        red/LOW below 0.6, orange/MODERATE below 0.8, green/STRONG otherwise.
    """
    # Emoji are spelled as escapes so the source stays ASCII and the markers
    # cannot be garbled by a wrong-encoding save.
    if score < 0.6:
        # U+1F534 large red circle
        return "\U0001F534 The model performance is LOW. Consider improving your data or features."
    if score < 0.8:
        # U+1F7E0 large orange circle
        return "\U0001F7E0 The model performance is MODERATE. It may work but could be improved."
    # U+1F7E2 large green circle
    return "\U0001F7E2 The model performance is STRONG. The model is reliable."

def train_model(text_column, target_column):
    """Fit a TF-IDF + logistic-regression classifier on the loaded training data.

    Rows with a missing value in either selected column are dropped. The
    remainder is split 80/20 with a fixed seed (42); the model is fitted on
    the larger part and evaluated on the held-out part.

    Args:
        text_column: Name of the ``df_train`` column holding the input text.
        target_column: Name of the ``df_train`` column holding the labels.

    Returns:
        A status string: either an error message, or a success message
        followed by accuracy, weighted precision, a plain-language verdict
        and the classification report.

    Side effects:
        On success the module-level ``model``, ``vectorizer`` and
        ``test_metrics`` are replaced. On any failure they keep their
        previous values, so an earlier trained model stays usable.
    """
    global model, vectorizer, test_metrics

    # "\u274c" = cross mark, "\u2705" = check mark (escapes keep the source ASCII).
    if df_train is None:
        return "\u274c No training data loaded."

    if text_column not in df_train.columns or target_column not in df_train.columns:
        return "\u274c Invalid column selection."

    df_filtered = df_train.dropna(subset=[text_column, target_column])

    if len(df_filtered) < 10:
        return "\u274c Not enough data after filtering for training. Need at least 10 samples."

    # Excel cells may come back as numbers or dates; the vectorizer only
    # accepts strings, so normalise the text column up front.
    texts = df_filtered[text_column].astype(str)
    labels = df_filtered[target_column]

    if labels.nunique() < 2:
        return "\u274c The target column must contain at least two different labels."

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Fit into locals first: the shared globals are only replaced once the
    # whole pipeline has been fitted, never left half-updated.
    try:
        fitted_vectorizer = TfidfVectorizer()
        X_train_vec = fitted_vectorizer.fit_transform(X_train)
        X_test_vec = fitted_vectorizer.transform(X_test)

        fitted_model = LogisticRegression(max_iter=1000)
        fitted_model.fit(X_train_vec, y_train)
    except ValueError as exc:
        # E.g. an empty vocabulary, or a training split that ended up with a
        # single class.
        return f"\u274c Training failed: {exc}"

    y_pred = fitted_model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    report = classification_report(y_test, y_pred, zero_division=0)

    performance_msg = interpret_score(accuracy)

    model = fitted_model
    vectorizer = fitted_vectorizer
    test_metrics = (
        f"Accuracy: {accuracy:.2%}\n"
        f"Precision (weighted): {precision:.2%}\n\n"
        f"{performance_msg}\n\n"
        f"Classification Report:\n{report}"
    )

    return f"\u2705 Model trained on {len(df_filtered)} examples.\n\nTest set evaluation:\n{test_metrics}"

def predict_label(text_input):
    """Classify a single piece of text with the trained model.

    Args:
        text_input: The text typed by the user.

    Returns:
        A status string: an error when no model has been trained or the
        input is empty/whitespace, otherwise the predicted label together
        with the highest class probability formatted as a percentage.
    """
    # "\u274c" = cross mark, "\U0001F52E" = crystal ball (escapes keep the
    # source ASCII).
    if model is None or vectorizer is None:
        return "\u274c Model is not trained yet."

    # An empty string would vectorize to all zeros and still yield a
    # "prediction", which is meaningless; ask for input instead.
    if text_input is None or not str(text_input).strip():
        return "\u274c Please enter some text to classify."

    X = vectorizer.transform([str(text_input)])
    prediction = model.predict(X)[0]
    proba = model.predict_proba(X).max()

    return f"\U0001F52E Prediction: {prediction} (confidence: {proba:.2%})"

def load_prediction_file(file):
    """Read the uploaded batch-prediction workbook into ``df_predict``.

    Args:
        file: Value delivered by the prediction file component: either an
            object exposing the path as ``.name`` or a plain path string.
            ``None`` when nothing has been uploaded.

    Returns:
        A 2-tuple ``(status, text_dropdown_update)``. On success the dropdown
        lists the workbook's columns with the first one preselected. On
        failure it is cleared and ``df_predict`` keeps its previous value.
    """
    global df_predict

    # "\u274c" = cross mark, "\u2705" = check mark (escapes keep the source ASCII).
    if file is None:
        return "\u274c Please upload a prediction file.", gr.update(choices=[], value=None)

    path = getattr(file, "name", file)
    try:
        loaded = pd.read_excel(path)
    except Exception as exc:  # UI boundary: report the problem instead of crashing
        return f"\u274c Could not read the Excel file: {exc}", gr.update(choices=[], value=None)

    col_names = list(loaded.columns)
    if not col_names:
        # Without this guard col_names[0] below would raise IndexError.
        return "\u274c The uploaded file has no columns.", gr.update(choices=[], value=None)

    # Only replace the shared frame once the upload is known to be usable.
    df_predict = loaded
    return (
        f"\u2705 Loaded prediction file with {len(df_predict)} rows",
        gr.update(choices=col_names, value=col_names[0]),
    )

def run_batch_prediction(text_column):
    """Score every row of the loaded prediction file with the trained model.

    Rows whose text cell is missing are skipped. The scored frame gains a
    ``Prediction`` column and a ``Confidence`` column (highest class
    probability per row).

    Args:
        text_column: Name of the ``df_predict`` column holding the text.

    Returns:
        A 2-tuple ``(status, preview)`` where ``preview`` is the first ten
        scored rows, or ``None`` when nothing could be scored.

    Side effects:
        On success the full scored frame is stored in the module-level
        ``df_predict_results`` for later export.
    """
    global df_predict_results

    # "\u274c" = cross mark, "\u2705" = check mark (escapes keep the source ASCII).
    if model is None or vectorizer is None:
        return "\u274c Model is not trained yet.", None
    if df_predict is None:
        return "\u274c No prediction file loaded.", None
    if text_column not in df_predict.columns:
        return "\u274c Invalid text column selected.", None

    df_filtered = df_predict.dropna(subset=[text_column]).copy()
    if df_filtered.empty:
        # Predicting on zero samples raises inside scikit-learn.
        return "\u274c The selected column contains no text to classify.", None

    # Excel cells may be numeric; the vectorizer only accepts strings.
    X = vectorizer.transform(df_filtered[text_column].astype(str))
    preds = model.predict(X)
    probs = model.predict_proba(X).max(axis=1)

    df_filtered["Prediction"] = preds
    df_filtered["Confidence"] = probs

    df_predict_results = df_filtered  # save for export

    # Show preview of first 10 rows
    return f"\u2705 Batch prediction completed on {len(df_filtered)} rows.", df_filtered.head(10)

def export_predictions():
    """Write the last batch-prediction results to an Excel file for download.

    Returns:
        The path of the written ``predictions_output.xlsx`` file, or ``None``
        when no batch prediction has been run yet.
    """
    if df_predict_results is None:
        return None

    # A fresh directory under the system temp location is writable on any
    # host (a fixed absolute path is not) and keeps a stable, readable
    # download name. Each export gets its own directory so concurrent
    # exports cannot overwrite one another.
    export_dir = tempfile.mkdtemp(prefix="predictions_")
    export_path = os.path.join(export_dir, "predictions_output.xlsx")
    df_predict_results.to_excel(export_path, index=False)
    return export_path

# UI layout and event wiring. Emoji in labels are written as escapes so the
# source stays ASCII and cannot be garbled by a wrong-encoding save.
with gr.Blocks() as demo:
    gr.Markdown("# \U0001F9E0 Text Classification App")  # brain emoji

    gr.Markdown(
        """
        ### How does this model work?
        This app uses a **Logistic Regression** model trained on your text data.
        - Text data is transformed into numbers using **TF-IDF vectorization**, which converts text into features based on word importance.
        - The model learns patterns from labeled examples you provide.
        - After training, it can predict the label/category of new text inputs.
        \n
        **Note:** Model performance depends heavily on quality and quantity of your data.
        """
    )

    # --- Step 1: training data upload and column selection ---
    gr.Markdown(
        "### Step 1: Upload your training data\n"
        "Upload an Excel file (`.xlsx`) containing your texts and corresponding labels."
    )

    with gr.Row():
        file_input = gr.File(label="Upload Training Excel File (.xlsx)", file_types=[".xlsx"],
                             interactive=True)
        load_button = gr.Button("\U0001F4C2 Load Training File")  # open-folder emoji

    status_output = gr.Markdown()

    gr.Markdown(
        "After loading, select the text and target columns for training."
    )

    with gr.Row():
        text_column_dropdown = gr.Dropdown(label="Text column",
                                           interactive=True,
                                           info="Select the column that contains the text data.")
        target_column_dropdown = gr.Dropdown(label="Target column",
                                             interactive=True,
                                             info="Select the column that contains the labels to predict.")

    train_button = gr.Button("\U0001F680 Train Model")  # rocket emoji
    training_status = gr.Markdown()

    # --- Step 2: single-text prediction ---
    gr.Markdown(
        "### Step 2: Predict on single texts\n"
        "Enter a text below to get the model's predicted label."
    )

    with gr.Row():
        input_text = gr.Textbox(label="Enter text to classify", placeholder="Type some text here...")
        predict_button = gr.Button("\U0001F50D Predict Single")  # magnifying-glass emoji

    prediction_output = gr.Markdown()

    # --- Step 3: batch prediction and export ---
    gr.Markdown(
        "### Step 3: Batch prediction\n"
        "Upload a new Excel file with texts to predict multiple labels at once."
    )

    with gr.Row():
        pred_file_input = gr.File(label="Upload Prediction Excel File (.xlsx)", file_types=[".xlsx"])
        load_pred_button = gr.Button("\U0001F4C2 Load Prediction File")  # open-folder emoji

    pred_status = gr.Markdown()

    pred_text_column_dropdown = gr.Dropdown(label="Text column for Prediction",
                                            info="Select the column in your prediction file containing text to classify.")

    batch_pred_button = gr.Button("\u26A1 Run Batch Prediction")  # high-voltage emoji
    batch_pred_status = gr.Markdown()
    batch_pred_preview = gr.Dataframe(headers=None, interactive=False)

    export_button = gr.Button("\u2B07\uFE0F Export Predictions")  # down-arrow emoji
    gr.Markdown(
        "Click **Export Predictions** to download the batch prediction results as an Excel file."
    )
    # Named, labelled download slot for the exported workbook. It is an
    # output only, so no upload file-type filter is needed.
    export_file = gr.File(label="Exported predictions (.xlsx)", interactive=False)

    # Button connections
    load_button.click(
        fn=load_training_file,
        inputs=file_input,
        outputs=[status_output, text_column_dropdown, target_column_dropdown]
    )

    train_button.click(
        fn=train_model,
        inputs=[text_column_dropdown, target_column_dropdown],
        outputs=training_status
    )

    predict_button.click(
        fn=predict_label,
        inputs=input_text,
        outputs=prediction_output
    )

    load_pred_button.click(
        fn=load_prediction_file,
        inputs=pred_file_input,
        outputs=[pred_status, pred_text_column_dropdown]
    )

    batch_pred_button.click(
        fn=run_batch_prediction,
        inputs=pred_text_column_dropdown,
        outputs=[batch_pred_status, batch_pred_preview]
    )

    export_button.click(
        fn=export_predictions,
        inputs=None,
        outputs=export_file
    )

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()