"""Gradio app that trains a TF-IDF + logistic-regression tone classifier.

At import time the script downloads the ``uhoui/text-tone-classifier``
dataset, trains the model (oversampling minority classes with SMOTE when
possible), prints evaluation metrics and builds the ``demo`` Gradio UI.
"""

import warnings

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import torch
from datasets import load_dataset
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

# Hugging Face dataset import
print("Loading dataset...")
ds = load_dataset("uhoui/text-tone-classifier")

# Optional: download csv (colab)
# df = ds['train'].to_pandas()
# df.to_csv("text_tone_classifier.csv", index=False)

df = pd.DataFrame(ds["train"])

# Console log of the dataset and its class distribution
print(f"Dataset size: {len(df)} entries")
print(f"Columns: {df.columns}")
label_counts = df['label'].value_counts()
print("\nClass distribution:")
print(label_counts)

# Labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
print(label_encoder)
num_classes = len(label_encoder.classes_)
print(num_classes)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=None,
)

# TF-IDF feature extraction (fit on the training split only)
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# SMOTE
print("Handling class imbalance (via SMOTE)...")
try:
    class_counts = np.bincount(y_train)
    smallest_class_size = min(class_counts[class_counts > 0])
    # SMOTE needs at least k_neighbors + 1 samples in every class, so cap
    # the neighbour count by the smallest class present in the training set.
    k_neighbors = int(min(5, smallest_class_size - 1))
    if k_neighbors > 0:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(
            X_train_tfidf, y_train
        )
        print(f"After SMOTE: {X_train_resampled.shape}")
    else:
        print("Classes too small for SMOTE, using original data.")
        X_train_resampled, y_train_resampled = X_train_tfidf, y_train
except ValueError as e:
    # Best effort: fall back to the unbalanced data rather than aborting.
    print(f"SMOTE error: {e}. Using original data.")
    X_train_resampled, y_train_resampled = X_train_tfidf, y_train

# Logistic Regression Model
# max iter exceeding 200 doesnt improve anything
# Don't set C low, set to 100+ default. 200 works better.
model = LogisticRegression(
    C=200,
    max_iter=200,
    n_jobs=-1,
    solver='lbfgs',
    multi_class='multinomial',
)
model.fit(X_train_resampled, y_train_resampled)

# Evaluate Model
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Each score is a fraction in [0, 1]; report it directly as a percentage
# (printing ``1 - score`` would show the error rate under the metric's name).
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


def _sample_texts(tone, count):
    """Return up to ``count`` randomly sampled dataset texts labelled ``tone``."""
    matching = df.loc[df['label'] == tone, 'text']
    return matching.sample(min(count, len(matching))).tolist()


def _as_rows(texts):
    """Wrap each text in its own row, the 2-D layout ``gr.Dataframe`` expects."""
    return [[text] for text in texts]


def predict_tone(text, top_n=5):
    """Classify ``text`` and build the artefacts shown in the UI.

    Args:
        text: The raw text to classify.
        top_n: How many of the most probable tones to draw in the bar chart.

    Returns:
        A tuple ``(pred_class, sorted_results, fig, example_texts)``: the
        predicted label, a dict mapping every trained label to its
        probability (sorted by descending probability), a Plotly bar chart
        of the ``top_n`` labels, and up to three dataset texts that carry
        the predicted label.
    """
    text_tfidf = tfidf.transform([text])
    probs = model.predict_proba(text_tfidf)[0]

    # predict_proba columns follow model.classes_, so decode those once.
    trained_label_names = label_encoder.inverse_transform(model.classes_)
    pred_class = trained_label_names[np.argmax(probs)]

    ranked = sorted(
        zip(trained_label_names, probs),
        key=lambda item: item[1],
        reverse=True,
    )
    sorted_results = {str(label): float(prob) for label, prob in ranked}

    top_labels = list(sorted_results.keys())[:top_n]
    top_probs = list(sorted_results.values())[:top_n]

    # More probable tones are drawn with a more opaque bar.
    colors = [f"rgba(64, 128, 255, {min(1.0, p + 0.3)})" for p in top_probs]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=top_probs,
        y=top_labels,
        orientation='h',
        marker_color=colors,
        text=[f"{p:.1%}" for p in top_probs],
        textposition='auto',
    ))
    fig.update_layout(
        title="Emotion Probability",
        xaxis_title="Probability",
        yaxis_title="Emotion",
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        xaxis=dict(range=[0, 1]),
    )

    # Fetch examples
    example_texts = _sample_texts(pred_class, 3)

    return pred_class, sorted_results, fig, example_texts


def get_tone_examples(tone):
    """Return up to five randomly sampled dataset texts labelled ``tone``."""
    return _sample_texts(tone, 5)


def show_tone_examples(tone):
    """Dropdown handler: examples for ``tone`` as rows for the examples table."""
    return _as_rows(get_tone_examples(tone))


# Gradio interface
def analyze_tone(text, selected_tone=None):
    """Button handler: classify ``text`` or fall back to tone examples.

    Args:
        text: Text typed by the user; empty or whitespace-only counts as
            missing.
        selected_tone: Tone currently chosen in the dropdown, if any.

    Returns:
        A tuple ``(message, probabilities, figure, example_rows)`` matching
        the four output components wired to the button.
    """
    if not text or not text.strip():
        # Without text there is nothing to classify; show examples of the
        # selected tone when there is one, otherwise prompt for input.
        if selected_tone:
            examples = get_tone_examples(selected_tone)
            return (
                f"Examples of '{selected_tone}' tone:",
                {},
                None,
                _as_rows(examples),
            )
        return "Enter the text to analyze:", {}, None, []

    predicted_tone, all_probs, fig, examples = predict_tone(text)
    message = f"The tone is: **{predicted_tone}**"
    return message, all_probs, fig, _as_rows(examples)


# Gradio interface Creation
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("# Text Tone Sentimental Analyzer")
    gr.Markdown("Be mindful of punctuation as it affects results. Slang is unaccounted for due to dataset constraints.")

    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Enter your text here",
                placeholder="Example: The satisfaction of completing a difficult puzzle is indescribable.",
                lines=5,
            )
            analyze_button = gr.Button("Analyze Tone", variant="primary")

        with gr.Column(scale=2):
            # Example Tones Dropdown
            tone_dropdown = gr.Dropdown(
                choices=sorted(df['label'].unique().tolist()),
                label="Select a tone to view an example below.",
            )

    # NOTE(review): the spacer's string literal was broken across lines in
    # the source; "<br>" is assumed from the elem_id -- confirm.
    gr.Markdown("<br>", elem_id="line-break-1")

    with gr.Row():
        with gr.Column(scale=1):
            result_message = gr.Markdown()

    with gr.Row():
        with gr.Column(scale=2):
            plot_output = gr.Plot(label="Tone Probabilities")
        with gr.Column(scale=1):
            all_probs_output = gr.JSON(label="All Probabilities")

    with gr.Row():
        examples_output = gr.Dataframe(
            headers=["Examples of similar texts"],
            datatype=["str"],
            label="Example texts with similar tone",
        )

    analyze_button.click(
        fn=analyze_tone,
        inputs=[text_input, tone_dropdown],
        outputs=[result_message, all_probs_output, plot_output, examples_output],
    )

    tone_dropdown.change(
        fn=show_tone_examples,
        inputs=tone_dropdown,
        outputs=examples_output,
    )

# Main
if __name__ == "__main__":
    demo.launch()