Spaces:

Agnist
/

Text-Tone-Sentimental-Analysis

Sleeping

File size: 7,052 Bytes

680ff8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0dd1ac9
680ff8e
 
 
bb3f86e
680ff8e
 
 
0dd1ac9
 
 
 
680ff8e
 
bb3f86e
680ff8e
 
 
 
 
 
 
bb3f86e
680ff8e
 
8a99441
680ff8e
8a99441
680ff8e
bb3f86e
680ff8e
 
 
 
161de8d
bb3f86e
680ff8e
 
bb3f86e
680ff8e
 
 
 
bb3f86e
 
680ff8e
db2468f
 
 
 
 
 
 
 
 
 
680ff8e
 
 
 
bb3f86e
56388de
 
9451101
680ff8e
 
bb3f86e
680ff8e
 
0dd1ac9
 
 
 
9451101
 
 
 
680ff8e
 
 
 
 
 
d9801a1
 
680ff8e
bb3f86e
93debb2
 
 
 
680ff8e
 
 
bb3f86e
680ff8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb3f86e
680ff8e
 
 
 
 
 
 
 
8896cb3
680ff8e
 
bb3f86e
680ff8e
 
 
 
 
 
 
bb3f86e
680ff8e
 
 
bb3f86e
680ff8e
64545b8
56388de
680ff8e
 
 
 
 
bb3f86e
680ff8e
 
 
 
 
3439038
680ff8e
 
bb3f86e
680ff8e
56388de
 
3439038
680ff8e
 
 
 
 
 
 
cba7173
 
3439038
680ff8e
 
 
 
 
 
3439038
680ff8e
 
bb3f86e
cba7173
680ff8e
 
 
 
bb3f86e
efbb21f
680ff8e
 
bb3f86e
935c52f
680ff8e

import gradio as gr
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import plotly.express as px
import plotly.graph_objects as go
import warnings
from sklearn.metrics import precision_score, recall_score, f1_score

# Suppress every warning issued through the `warnings` module from here on.
warnings.filterwarnings("ignore")

# Load the tone dataset from the Hugging Face Hub.
print("Loading dataset...")
ds = load_dataset("uhoui/text-tone-classifier")

# Optional: download csv (colab)
# df = ds['train'].to_pandas()
# df.to_csv("text_tone_classifier.csv", index=False)

# Work on the "train" split as a DataFrame; the rest of the script reads its
# 'text' and 'label' columns.
df = pd.DataFrame(ds["train"])

# Log the dataset size, its columns and the number of rows per label.
print(f"Dataset size: {len(df)} entries")
print(f"Columns: {df.columns}")

label_counts = df['label'].value_counts()
print("\nClass distribution:")
print(label_counts)

# Encode the string labels as integers in a new 'label_encoded' column.
# The fitted encoder is kept so predictions can be mapped back to label names.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
# NOTE: this prints the encoder object itself (its repr), not the label names.
print(label_encoder)
# Number of distinct labels seen by the encoder.
num_classes = len(label_encoder.classes_)
print(num_classes)

# Train/test split: hold out 20% of the rows for evaluation. The split is
# seeded so it is reproducible, and it is not stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=None,
)

# TF-IDF feature extraction: the vocabulary (capped at 5000 terms) is learned
# from the training texts only and then reused to vectorize the test texts.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# SMOTE: oversample the minority classes of the training set in TF-IDF space.
# On any failure the script falls back to the original (imbalanced) data.
print("Handling class imbalance (via SMOTE)...")
try:
    # Count training rows per encoded label once; labels that are absent from
    # the training split have a count of 0 and are ignored.
    class_counts = np.bincount(y_train)
    smallest_class_size = int(class_counts[class_counts > 0].min())
    # Cap the neighbour count at one less than the smallest class so SMOTE is
    # never asked for more neighbours than that class can supply.
    k_neighbors = min(5, smallest_class_size - 1)

    if k_neighbors > 0:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
        print(f"After SMOTE: {X_train_resampled.shape}")
    else:
        # At least one class has a single training sample: nothing to interpolate.
        print("Classes too small for SMOTE, using original data.")
        X_train_resampled, y_train_resampled = X_train_tfidf, y_train
except ValueError as e:
    print(f"SMOTE error: {e}. Using original data.")
    X_train_resampled, y_train_resampled = X_train_tfidf, y_train

# Logistic Regression Model
# Author's tuning notes: raising max_iter above 200 did not improve results,
# and a high C (100+) is needed; 200 worked best.
model = LogisticRegression(C=200, max_iter=200, n_jobs=-1, solver='lbfgs', multi_class='multinomial')
model.fit(X_train_resampled, y_train_resampled)

# Evaluate the model on the held-out test split. Precision, recall and F1 are
# averaged over classes, weighted by each class's support in y_test.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Report the scores themselves as percentages. Printing (1 - score) would
# show the error rate under a label that claims to be the score.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

def predict_tone(text, top_n=5):
    """Classify the tone of ``text`` and build the artefacts shown in the UI.

    Args:
        text: The raw text to classify.
        top_n: How many of the most probable tones to plot (default 5).

    Returns:
        A 4-tuple ``(pred_class, sorted_results, fig, example_texts)``:
        the most probable label name; a dict mapping every label the model
        was trained on to its probability, ordered from most to least
        probable; a horizontal Plotly bar chart of the ``top_n`` tones; and
        up to three randomly sampled dataset texts carrying the predicted
        label.
    """
    text_tfidf = tfidf.transform([text])
    probs = model.predict_proba(text_tfidf)[0]

    # predict_proba columns follow model.classes_, so decoding those classes
    # once gives label names aligned with `probs`.
    trained_label_names = label_encoder.inverse_transform(model.classes_)
    pred_class = trained_label_names[np.argmax(probs)]

    results = {label: float(prob) for label, prob in zip(trained_label_names, probs)}
    sorted_results = dict(
        sorted(results.items(), key=lambda item: item[1], reverse=True)
    )

    top_items = list(sorted_results.items())[:top_n]
    top_labels = [label for label, _ in top_items]
    top_probs = [prob for _, prob in top_items]

    # More probable tones get a more opaque bar (alpha = p + 0.3, capped at 1).
    colors = [f"rgba(64, 128, 255, {min(1.0, p + 0.3)})" for p in top_probs]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=top_probs,
        y=top_labels,
        orientation='h',
        marker_color=colors,
        text=[f"{p:.1%}" for p in top_probs],
        textposition='auto'
    ))

    fig.update_layout(
        title="Emotion Probability",
        xaxis_title="Probability",
        yaxis_title="Emotion",
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        xaxis=dict(range=[0, 1])
    )

    # Fetch examples: filter the dataset once, then sample at most three rows.
    matching_texts = df.loc[df['label'] == pred_class, 'text']
    example_texts = matching_texts.sample(min(3, len(matching_texts))).tolist()

    return pred_class, sorted_results, fig, example_texts

def get_tone_examples(tone):
    """Return up to five randomly sampled dataset texts labelled ``tone``."""
    tone_texts = df.loc[df['label'] == tone, 'text']
    sample_size = min(5, len(tone_texts))
    return tone_texts.sample(sample_size).tolist()

# Gradio interface
def analyze_tone(text, selected_tone=None):
    """Gradio callback: analyze ``text``, or show examples when it is empty.

    Args:
        text: Contents of the input textbox; may be ``None`` or empty.
        selected_tone: Tone currently chosen in the dropdown, if any.

    Returns:
        A 4-tuple ``(message, probabilities, figure, examples)`` matching the
        four output components wired to the "Analyze Tone" button.
    """
    # Treat missing, empty and whitespace-only input alike: there is nothing
    # meaningful to classify.
    if not text or not text.strip():
        # The tone check must sit inside the empty-text branch; placed after
        # an early return on empty text it could never run.
        if selected_tone:
            examples = get_tone_examples(selected_tone)
            return f"Examples of '{selected_tone}' tone:", {}, None, examples
        return "Enter the text to analyze:", {}, None, []

    predicted_tone, all_probs, fig, examples = predict_tone(text)

    message = f"The tone is: **{predicted_tone}**"

    return message, all_probs, fig, examples

# Gradio interface creation: lay out the components and wire up the events.
# Components appear in the order they are created inside each Row/Column.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("# Text Tone Sentimental Analyzer")
    gr.Markdown("Be mindful of punctuation as it affects results. Slang is unaccounted for due to dataset constraints.")
    
    # Input row: text box and button on the left, tone picker on the right.
    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Enter your text here",
                placeholder="Example: The satisfaction of completing a difficult puzzle is indescribable.",
                lines=5
            )
            analyze_button = gr.Button("Analyze Tone", variant="primary")
            
        with gr.Column(scale=2):
            # Example Tones Dropdown: one choice per distinct dataset label, sorted.
            tone_dropdown = gr.Dropdown(
                choices=sorted(df['label'].unique().tolist()),
                label="Select a tone to view an example below."
            )

    # Spacer between the input row and the results.
    gr.Markdown("<br>", elem_id="line-break-1")
    
    # Result message (rendered as Markdown).
    with gr.Row():
        with gr.Column(scale=1):
            result_message = gr.Markdown()
        
    # Probability chart next to the full probability table.
    with gr.Row():
        with gr.Column(scale=2):
            plot_output = gr.Plot(label="Tone Probabilities")
        with gr.Column(scale=1):
            all_probs_output = gr.JSON(label="All Probabilities")
            
    # Single-column table of example texts.
    with gr.Row():
        examples_output = gr.Dataframe(
            headers=["Examples of similar texts"],
            datatype=["str"],
            label="Example texts with similar tone"
        )
    
    # Button click: run analyze_tone on the textbox and dropdown values; its
    # four return values fill the four outputs in the order listed.
    analyze_button.click(
        fn=analyze_tone,
        inputs=[text_input, tone_dropdown],  
        outputs=[result_message, all_probs_output, plot_output, examples_output]
    )
    
    # Dropdown change: show sample texts for the newly selected tone.
    tone_dropdown.change(
        fn=get_tone_examples,
        inputs=tone_dropdown,  
        outputs=examples_output
    )

# Main: start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()