|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import torch |
|
|
from datasets import load_dataset |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.metrics import accuracy_score, classification_report |
|
|
from sklearn.preprocessing import LabelEncoder |
|
|
import matplotlib.pyplot as plt |
|
|
from imblearn.over_sampling import SMOTE |
|
|
import plotly.express as px |
|
|
import plotly.graph_objects as go |
|
|
import warnings |
|
|
from sklearn.metrics import precision_score, recall_score, f1_score |
|
|
|
|
|
# Silence library deprecation/convergence warnings so console output stays readable.
warnings.filterwarnings("ignore")

# Pull the tone-labelled text corpus from the Hugging Face Hub.
print("Loading dataset...")
ds = load_dataset("uhoui/text-tone-classifier")

# Work with the training split as a pandas DataFrame; the rest of the script
# relies on its 'text' and 'label' columns.
df = pd.DataFrame(ds["train"])

print(f"Dataset size: {len(df)} entries")
print(f"Columns: {df.columns}")

# Inspect the class balance up front — SMOTE is applied later to compensate
# for imbalance between tone classes.
label_counts = df['label'].value_counts()
print("\nClass distribution:")
print(label_counts)

# Map the string tone labels to integer codes for scikit-learn.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
print(label_encoder)

num_classes = len(label_encoder.classes_)
print(num_classes)
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Train/test split, TF-IDF features, imbalance handling, model training and
# evaluation on the held-out split.
# ---------------------------------------------------------------------------

# Hold out 20% for evaluation. Stratify on the encoded label so the test set
# mirrors the (imbalanced) class distribution; fall back to an unstratified
# split when stratification is impossible (a class with < 2 samples).
try:
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'],
        df['label_encoded'],
        test_size=0.2,
        random_state=42,
        stratify=df['label_encoded'],
    )
except ValueError:
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'],
        df['label_encoded'],
        test_size=0.2,
        random_state=42,
        stratify=None,
    )

# Fit the TF-IDF vocabulary on the training split only, so no test-set
# statistics leak into the features.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Oversample minority classes with SMOTE. SMOTE requires
# k_neighbors < smallest class size, so shrink k (default 5) accordingly and
# skip SMOTE entirely when even k=1 is impossible.
print("Handling class imbalance (via SMOTE)...")  # fixed typo: was "SNOTE"
try:
    class_sizes = np.bincount(y_train)  # hoisted: was computed twice
    smallest_class_size = class_sizes[class_sizes > 0].min()
    k_neighbors = min(5, smallest_class_size - 1)

    if k_neighbors > 0:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
        print(f"After SMOTE: {X_train_resampled.shape}")
    else:
        print("Classes too small for SMOTE, using original data.")
        X_train_resampled, y_train_resampled = X_train_tfidf, y_train
except ValueError as e:
    print(f"SMOTE error: {e}. Using original data.")
    X_train_resampled, y_train_resampled = X_train_tfidf, y_train

# Multinomial logistic regression on the (resampled) TF-IDF features.
# NOTE(review): C=200 is very weak regularization — confirm it was tuned.
model = LogisticRegression(C=200, max_iter=200, n_jobs=-1, solver='lbfgs', multi_class='multinomial')
model.fit(X_train_resampled, y_train_resampled)

# Evaluate on the held-out split; weighted averages account for imbalance.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Bug fix: the original printed (1 - metric) * 100 — the ERROR rate — while
# labelling it as the metric itself. Print the actual metric values.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
|
|
|
|
|
def predict_tone(text):
    """Classify *text* and build the presentation artifacts for the UI.

    Returns a 4-tuple:
      * predicted tone (original string label),
      * dict of {label: probability} sorted by descending probability,
      * horizontal Plotly bar chart of the top-5 probabilities,
      * up to 3 dataset texts sharing the predicted tone.
    """
    # Vectorize with the already-fitted TF-IDF model and score every class.
    features = tfidf.transform([text])
    probabilities = model.predict_proba(features)[0]

    # Decode the winning class index back to its original string label.
    best_idx = np.argmax(probabilities)
    pred_class = label_encoder.inverse_transform([model.classes_[best_idx]])[0]

    # Pair every trained class name with its probability, highest first.
    class_names = label_encoder.inverse_transform(model.classes_)
    ranked = sorted(zip(class_names, probabilities), key=lambda pair: pair[1], reverse=True)
    sorted_results = {name: float(p) for name, p in ranked}

    # Keep the five most likely tones for the chart.
    top = ranked[:5]
    top_labels = [name for name, _ in top]
    top_probs = [float(p) for _, p in top]

    # More probable bars render more opaque (alpha capped at 1.0).
    bar_colors = [f"rgba(64, 128, 255, {min(1.0, p + 0.3)})" for p in top_probs]

    fig = go.Figure(
        go.Bar(
            x=top_probs,
            y=top_labels,
            orientation='h',
            marker_color=bar_colors,
            text=[f"{p:.1%}" for p in top_probs],
            textposition='auto',
        )
    )
    fig.update_layout(
        title="Emotion Probability",
        xaxis_title="Probability",
        yaxis_title="Emotion",
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        xaxis=dict(range=[0, 1]),
    )

    # Sample up to three dataset texts carrying the same predicted tone.
    same_tone_texts = df[df['label'] == pred_class]['text']
    example_texts = same_tone_texts.sample(min(3, len(same_tone_texts))).tolist()

    return pred_class, sorted_results, fig, example_texts
|
|
|
|
|
def get_tone_examples(tone):
    """Return up to five randomly sampled dataset texts labelled *tone*."""
    matching = df[df['label'] == tone]['text']
    sample_size = min(5, len(matching))
    return matching.sample(sample_size).tolist()
|
|
|
|
|
|
|
|
def analyze_tone(text, selected_tone=None):
    """Gradio callback: classify *text*, or show examples for *selected_tone*.

    Returns (markdown message, probability dict, plotly figure or None,
    list of example texts) matching the four output components.
    """
    # Bug fix: the original checked `selected_tone and not text` AFTER an
    # unconditional `if not text: return`, making the examples branch
    # unreachable dead code. Handle the tone-only case inside the empty-text
    # guard so selecting a tone without entering text shows examples.
    if not text:
        if selected_tone:
            examples = get_tone_examples(selected_tone)
            return f"Examples of '{selected_tone}' tone:", {}, None, examples
        return "Enter the text to analyze:", {}, None, []

    predicted_tone, all_probs, fig, examples = predict_tone(text)
    message = f"The tone is: **{predicted_tone}**"
    return message, all_probs, fig, examples
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: text input + analyze button on the left, tone browser on the
# right; results (message, probability chart, JSON, example texts) below.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("# Text Tone Sentimental Analyzer")
    gr.Markdown("Be mindful of punctuation as it affects results. Slang is unaccounted for due to dataset constraints.")

    with gr.Row():
        with gr.Column(scale=3):
            # Free-text input to classify.
            text_input = gr.Textbox(
                label="Enter your text here",
                placeholder="Example: The satisfaction of completing a difficult puzzle is indescribable.",
                lines=5
            )
            analyze_button = gr.Button("Analyze Tone", variant="primary")

        with gr.Column(scale=2):
            # Dropdown of every tone present in the dataset; selecting one
            # loads example texts into the table below (see .change handler).
            tone_dropdown = gr.Dropdown(
                choices=sorted(df['label'].unique().tolist()),
                label="Select a tone to view an example below."
            )

    # Visual spacer between the input row and the results area.
    gr.Markdown("<br>", elem_id="line-break-1")

    with gr.Row():
        with gr.Column(scale=1):
            # Markdown area for the predicted-tone message.
            result_message = gr.Markdown()

    with gr.Row():
        with gr.Column(scale=2):
            # Bar chart of the top tone probabilities.
            plot_output = gr.Plot(label="Tone Probabilities")
        with gr.Column(scale=1):
            # Raw probability dict for all trained classes.
            all_probs_output = gr.JSON(label="All Probabilities")

    with gr.Row():
        # Table of sampled dataset texts with the same tone.
        examples_output = gr.Dataframe(
            headers=["Examples of similar texts"],
            datatype=["str"],
            label="Example texts with similar tone"
        )

    # Main pipeline: classify the entered text and fill all four outputs.
    analyze_button.click(
        fn=analyze_tone,
        inputs=[text_input, tone_dropdown],
        outputs=[result_message, all_probs_output, plot_output, examples_output]
    )

    # Browsing a tone refreshes only the examples table.
    tone_dropdown.change(
        fn=get_tone_examples,
        inputs=tone_dropdown,
        outputs=examples_output
    )


# Launch the Gradio server only when run as a script.
if __name__ == "__main__":
    demo.launch()