File size: 7,052 Bytes
680ff8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0dd1ac9
680ff8e
 
 
bb3f86e
680ff8e
 
 
0dd1ac9
 
 
 
680ff8e
 
bb3f86e
680ff8e
 
 
 
 
 
 
bb3f86e
680ff8e
 
8a99441
680ff8e
8a99441
680ff8e
bb3f86e
680ff8e
 
 
 
161de8d
bb3f86e
680ff8e
 
bb3f86e
680ff8e
 
 
 
bb3f86e
 
680ff8e
db2468f
 
 
 
 
 
 
 
 
 
680ff8e
 
 
 
bb3f86e
56388de
 
9451101
680ff8e
 
bb3f86e
680ff8e
 
0dd1ac9
 
 
 
9451101
 
 
 
680ff8e
 
 
 
 
 
d9801a1
 
680ff8e
bb3f86e
93debb2
 
 
 
680ff8e
 
 
bb3f86e
680ff8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb3f86e
680ff8e
 
 
 
 
 
 
 
8896cb3
680ff8e
 
bb3f86e
680ff8e
 
 
 
 
 
 
bb3f86e
680ff8e
 
 
bb3f86e
680ff8e
64545b8
56388de
680ff8e
 
 
 
 
bb3f86e
680ff8e
 
 
 
 
3439038
680ff8e
 
bb3f86e
680ff8e
56388de
 
3439038
680ff8e
 
 
 
 
 
 
cba7173
 
3439038
680ff8e
 
 
 
 
 
3439038
680ff8e
 
bb3f86e
cba7173
680ff8e
 
 
 
bb3f86e
efbb21f
680ff8e
 
bb3f86e
935c52f
680ff8e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import gradio as gr
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import plotly.express as px
import plotly.graph_objects as go
import warnings
from sklearn.metrics import precision_score, recall_score, f1_score

# Silence library warnings so the console output stays readable.
warnings.filterwarnings("ignore")

# Fetch the tone-classification dataset from the Hugging Face Hub.
print("Loading dataset...")
ds = load_dataset("uhoui/text-tone-classifier")

# Optional (e.g. in Colab): export the training split to CSV.
# df = ds['train'].to_pandas()
# df.to_csv("text_tone_classifier.csv", index=False)

# The rest of the script works on the "train" split as a pandas DataFrame.
df = pd.DataFrame(ds["train"])

# Console summary: dataset size, column names and per-label counts.
print(f"Dataset size: {len(df)} entries")
print(f"Columns: {df.columns}")

label_counts = df["label"].value_counts()
print("\nClass distribution:", label_counts, sep="\n")

# Encode the string labels as integer ids, stored in a new column.
label_encoder = LabelEncoder()
label_encoder.fit(df["label"])
df["label_encoded"] = label_encoder.transform(df["label"])
print(label_encoder)
num_classes = len(label_encoder.classes_)
print(num_classes)

# Train/test split: hold out 20% of the rows for evaluation. The fixed seed
# keeps the split reproducible; the split is not stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label_encoded"], stratify=None, random_state=42, test_size=0.2
)

# TF-IDF feature extraction: the vocabulary (capped at 5000 terms) is learned
# from the training texts only and then applied unchanged to the test texts.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# SMOTE: oversample the minority classes of the training split in TF-IDF
# space. Whenever SMOTE cannot be applied, training falls back to the
# original (unbalanced) training data.
print("Handling class imbalance (via SMOTE)...")
try:
    # Count training samples per encoded label once; label ids that do not
    # occur in the training split have a zero count and are ignored.
    class_counts = np.bincount(y_train)
    smallest_class_size = int(class_counts[class_counts > 0].min())
    # SMOTE interpolates between a sample and its same-class neighbours, so
    # the rarest class caps k at (its size - 1). 5 is the library default.
    k_neighbors = min(5, smallest_class_size - 1)

    if k_neighbors > 0:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
        print(f"After SMOTE: {X_train_resampled.shape}")
    else:
        # The rarest class has a single sample: no neighbours to interpolate with.
        print("Classes too small for SMOTE, using original data.")
        X_train_resampled, y_train_resampled = X_train_tfidf, y_train
except ValueError as e:
    print(f"SMOTE error: {e}. Using original data.")
    X_train_resampled, y_train_resampled = X_train_tfidf, y_train

# Logistic Regression Model
# Raising max_iter beyond 200 did not improve results.
# Keep C high (i.e. weak regularization): values of 100+ work well and 200 performed best.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases — confirm
# against the installed version before upgrading.
model = LogisticRegression(C=200, max_iter=200, n_jobs=-1, solver='lbfgs', multi_class='multinomial')
model.fit(X_train_resampled, y_train_resampled)

# Evaluate Model on the held-out test split (which was not resampled).
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
# 'weighted' averages the per-class scores weighted by each class's support.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print each score as a percentage (higher is better). Printing `1 - score`
# here would report the error rate under the metric's name.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

def predict_tone(text):
    """Classify the tone of ``text`` and build the artefacts shown in the UI.

    Returns a 4-tuple:
        * the predicted tone label,
        * a dict mapping every tone known to the model to its probability,
          ordered from most to least likely,
        * a Plotly horizontal bar chart of the five most likely tones,
        * a list of up to three randomly sampled dataset texts that carry
          the predicted label.
    """
    features = tfidf.transform([text])
    probabilities = model.predict_proba(features)[0]

    # predict_proba columns follow the order of model.classes_.
    known_ids = model.classes_
    best_id = known_ids[np.argmax(probabilities)]
    pred_class = label_encoder.inverse_transform([best_id])[0]

    # Pair each readable label with its probability, most likely first.
    known_names = label_encoder.inverse_transform(known_ids)
    ranked = sorted(
        ((name, float(p)) for name, p in zip(known_names, probabilities)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    sorted_results = dict(ranked)

    top_n = 5  # Top 5, adjust later if needed
    head = ranked[:top_n]
    top_labels = [name for name, _ in head]
    top_probs = [p for _, p in head]

    # More likely tones get a more opaque bar (alpha capped at 1.0).
    colors = [f"rgba(64, 128, 255, {min(1.0, p + 0.3)})" for p in top_probs]

    bars = go.Bar(
        x=top_probs,
        y=top_labels,
        orientation='h',
        marker_color=colors,
        text=[f"{p:.1%}" for p in top_probs],
        textposition='auto',
    )
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title="Emotion Probability",
        xaxis_title="Probability",
        yaxis_title="Emotion",
        xaxis=dict(range=[0, 1]),
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
    )

    # Fetch examples: random dataset texts labelled with the predicted tone.
    same_tone_texts = df.loc[df['label'] == pred_class, 'text']
    example_texts = same_tone_texts.sample(min(3, len(same_tone_texts))).tolist()

    return pred_class, sorted_results, fig, example_texts

def get_tone_examples(tone):
    """Return up to five randomly sampled dataset texts labelled ``tone``."""
    matching_texts = df.loc[df['label'] == tone, 'text']
    sample_size = min(5, len(matching_texts))
    return matching_texts.sample(sample_size).tolist()

# Gradio interface
def analyze_tone(text, selected_tone=None):
    """Callback for the "Analyze Tone" button.

    Args:
        text: Text typed by the user; may be ``None``, empty or whitespace.
        selected_tone: Tone currently chosen in the dropdown, if any.

    Returns:
        A 4-tuple ``(message, probabilities, figure, examples)``:
        * with no usable text and a selected tone: a heading plus random
          example texts for that tone (no probabilities, no figure);
        * with no usable text and no tone: a prompt asking for input;
        * otherwise: the prediction produced by ``predict_tone``.
    """
    has_text = bool(text and text.strip())

    if not has_text:
        # The tone check has to happen inside the "no text" case; testing
        # for empty text first and returning would make it unreachable.
        if selected_tone:
            examples = get_tone_examples(selected_tone)
            return f"Examples of '{selected_tone}' tone:", {}, None, examples
        return "Enter the text to analyze:", {}, None, []

    predicted_tone, all_probs, fig, examples = predict_tone(text)

    message = f"The tone is: **{predicted_tone}**"

    return message, all_probs, fig, examples

# Gradio interface creation.
# Components are laid out in the order they are created inside the nested
# Row/Column contexts, so the statement order below defines the page layout.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    # Page title and a short usage note.
    gr.Markdown("# Text Tone Sentimental Analyzer")
    gr.Markdown("Be mindful of punctuation as it affects results. Slang is unaccounted for due to dataset constraints.")
    
    # Input row: free-text box with the analyze button (left), tone picker (right).
    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Enter your text here",
                placeholder="Example: The satisfaction of completing a difficult puzzle is indescribable.",
                lines=5
            )
            analyze_button = gr.Button("Analyze Tone", variant="primary")
            
        with gr.Column(scale=2):
            # Dropdown listing every distinct label of the dataset, sorted.
            tone_dropdown = gr.Dropdown(
                choices=sorted(df['label'].unique().tolist()),
                label="Select a tone to view an example below."
            )

    # Vertical spacer between the inputs and the results.
    gr.Markdown("<br>", elem_id="line-break-1")
    
    # Result headline (first element returned by analyze_tone).
    with gr.Row():
        with gr.Column(scale=1):
            result_message = gr.Markdown()
        
    # Probability bar chart (left) next to the full probability mapping (right).
    with gr.Row():
        with gr.Column(scale=2):
            plot_output = gr.Plot(label="Tone Probabilities")
        with gr.Column(scale=1):
            all_probs_output = gr.JSON(label="All Probabilities")
            
    # Single-column table that receives the example texts.
    with gr.Row():
        examples_output = gr.Dataframe(
            headers=["Examples of similar texts"],
            datatype=["str"],
            label="Example texts with similar tone"
        )
    
    # Button click: analyze_tone receives (text, selected tone); its four
    # return values are routed to the outputs in the order listed here.
    analyze_button.click(
        fn=analyze_tone,
        inputs=[text_input, tone_dropdown],  
        outputs=[result_message, all_probs_output, plot_output, examples_output]
    )
    
    # Changing the dropdown refreshes only the examples table.
    tone_dropdown.change(
        fn=get_tone_examples,
        inputs=tone_dropdown,  
        outputs=examples_output
    )

# Main: start the Gradio server only when the file is run as a script.
# The dataset loading and model training above are top-level statements, so
# they run on import as well.
if __name__ == "__main__":
    demo.launch()