|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import torch |
|
|
from datasets import load_dataset |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.metrics import accuracy_score, classification_report |
|
|
from sklearn.preprocessing import LabelEncoder |
|
|
import matplotlib.pyplot as plt |
|
|
from imblearn.over_sampling import SMOTE |
|
|
import plotly.express as px |
|
|
import plotly.graph_objects as go |
|
|
import warnings |
|
|
from sklearn.metrics import precision_score, recall_score, f1_score |
|
|
|
|
|
# Silence library deprecation/convergence warnings so console output stays readable.
warnings.filterwarnings("ignore")

# Pull the tone-labelled text corpus from the Hugging Face Hub.
print("Loading dataset...")
ds = load_dataset("uhoui/text-tone-classifier")

# Work with the training split as a pandas DataFrame; the rest of the script
# relies on its 'text' and 'label' columns.
df = pd.DataFrame(ds["train"])

print(f"Dataset size: {len(df)} entries")
print(f"Columns: {df.columns}")

# Inspect the class balance up front — SMOTE is applied later to compensate
# for imbalance between tone classes.
label_counts = df['label'].value_counts()
print("\nClass distribution:")
print(label_counts)

# Map the string tone labels to integer codes for scikit-learn.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
print(label_encoder)

num_classes = len(label_encoder.classes_)
print(num_classes)
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Train/test split, TF-IDF features, imbalance handling, model training and
# evaluation on the held-out split.
# ---------------------------------------------------------------------------

# Hold out 20% for evaluation. Stratify on the encoded label so the test set
# mirrors the (imbalanced) class distribution; fall back to an unstratified
# split when stratification is impossible (a class with < 2 samples).
try:
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'],
        df['label_encoded'],
        test_size=0.2,
        random_state=42,
        stratify=df['label_encoded'],
    )
except ValueError:
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'],
        df['label_encoded'],
        test_size=0.2,
        random_state=42,
        stratify=None,
    )

# Fit the TF-IDF vocabulary on the training split only, so no test-set
# statistics leak into the features.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Oversample minority classes with SMOTE. SMOTE requires
# k_neighbors < smallest class size, so shrink k (default 5) accordingly and
# skip SMOTE entirely when even k=1 is impossible.
print("Handling class imbalance (via SMOTE)...")  # fixed typo: was "SNOTE"
try:
    class_sizes = np.bincount(y_train)  # hoisted: was computed twice
    smallest_class_size = class_sizes[class_sizes > 0].min()
    k_neighbors = min(5, smallest_class_size - 1)

    if k_neighbors > 0:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
        print(f"After SMOTE: {X_train_resampled.shape}")
    else:
        print("Classes too small for SMOTE, using original data.")
        X_train_resampled, y_train_resampled = X_train_tfidf, y_train
except ValueError as e:
    print(f"SMOTE error: {e}. Using original data.")
    X_train_resampled, y_train_resampled = X_train_tfidf, y_train

# Multinomial logistic regression on the (resampled) TF-IDF features.
# NOTE(review): C=200 is very weak regularization — confirm it was tuned.
model = LogisticRegression(C=200, max_iter=200, n_jobs=-1, solver='lbfgs', multi_class='multinomial')
model.fit(X_train_resampled, y_train_resampled)

# Evaluate on the held-out split; weighted averages account for imbalance.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Bug fix: the original printed (1 - metric) * 100 — the ERROR rate — while
# labelling it as the metric itself. Print the actual metric values.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
|
|
|
|
|
def predict_tone(text):
    """Classify *text* and build the presentation artifacts for the UI.

    Returns a 4-tuple:
      * predicted tone (original string label),
      * dict of {label: probability} sorted by descending probability,
      * horizontal Plotly bar chart of the top-5 probabilities,
      * up to 3 dataset texts sharing the predicted tone.
    """
    # Vectorize with the already-fitted TF-IDF model and score every class.
    features = tfidf.transform([text])
    probabilities = model.predict_proba(features)[0]

    # Decode the winning class index back to its original string label.
    best_idx = np.argmax(probabilities)
    pred_class = label_encoder.inverse_transform([model.classes_[best_idx]])[0]

    # Pair every trained class name with its probability, highest first.
    class_names = label_encoder.inverse_transform(model.classes_)
    ranked = sorted(zip(class_names, probabilities), key=lambda pair: pair[1], reverse=True)
    sorted_results = {name: float(p) for name, p in ranked}

    # Keep the five most likely tones for the chart.
    top = ranked[:5]
    top_labels = [name for name, _ in top]
    top_probs = [float(p) for _, p in top]

    # More probable bars render more opaque (alpha capped at 1.0).
    bar_colors = [f"rgba(64, 128, 255, {min(1.0, p + 0.3)})" for p in top_probs]

    fig = go.Figure(
        go.Bar(
            x=top_probs,
            y=top_labels,
            orientation='h',
            marker_color=bar_colors,
            text=[f"{p:.1%}" for p in top_probs],
            textposition='auto',
        )
    )
    fig.update_layout(
        title="Emotion Probability",
        xaxis_title="Probability",
        yaxis_title="Emotion",
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        xaxis=dict(range=[0, 1]),
    )

    # Sample up to three dataset texts carrying the same predicted tone.
    same_tone_texts = df[df['label'] == pred_class]['text']
    example_texts = same_tone_texts.sample(min(3, len(same_tone_texts))).tolist()

    return pred_class, sorted_results, fig, example_texts
|
|
|
|
|
def get_tone_examples(tone):
    """Return up to five randomly sampled dataset texts labelled *tone*."""
    matching = df[df['label'] == tone]['text']
    sample_size = min(5, len(matching))
    return matching.sample(sample_size).tolist()
|
|
|
|
|
|
|
|
def analyze_tone(text, selected_tone=None):
    """Gradio callback: classify *text*, or show examples for *selected_tone*.

    Returns (markdown message, probability dict, plotly figure or None,
    list of example texts) matching the four output components.
    """
    # Bug fix: the original checked `selected_tone and not text` AFTER an
    # unconditional `if not text: return`, making the examples branch
    # unreachable dead code. Handle the tone-only case inside the empty-text
    # guard so selecting a tone without entering text shows examples.
    if not text:
        if selected_tone:
            examples = get_tone_examples(selected_tone)
            return f"Examples of '{selected_tone}' tone:", {}, None, examples
        return "Enter the text to analyze:", {}, None, []

    predicted_tone, all_probs, fig, examples = predict_tone(text)
    message = f"The tone is: **{predicted_tone}**"
    return message, all_probs, fig, examples
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: text input + analyze button on the left, tone browser on the
# right; results (message, probability chart, JSON, example texts) below.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("# Text Tone Sentimental Analyzer")
    gr.Markdown("Be mindful of punctuation as it affects results. Slang is unaccounted for due to dataset constraints.")

    with gr.Row():
        with gr.Column(scale=3):
            # Free-text input to classify.
            text_input = gr.Textbox(
                label="Enter your text here",
                placeholder="Example: The satisfaction of completing a difficult puzzle is indescribable.",
                lines=5
            )
            analyze_button = gr.Button("Analyze Tone", variant="primary")

        with gr.Column(scale=2):
            # Dropdown of every tone present in the dataset; selecting one
            # loads example texts into the table below (see .change handler).
            tone_dropdown = gr.Dropdown(
                choices=sorted(df['label'].unique().tolist()),
                label="Select a tone to view an example below."
            )

    # Visual spacer between the input row and the results area.
    gr.Markdown("<br>", elem_id="line-break-1")

    with gr.Row():
        with gr.Column(scale=1):
            # Markdown area for the predicted-tone message.
            result_message = gr.Markdown()

    with gr.Row():
        with gr.Column(scale=2):
            # Bar chart of the top tone probabilities.
            plot_output = gr.Plot(label="Tone Probabilities")
        with gr.Column(scale=1):
            # Raw probability dict for all trained classes.
            all_probs_output = gr.JSON(label="All Probabilities")

    with gr.Row():
        # Table of sampled dataset texts with the same tone.
        examples_output = gr.Dataframe(
            headers=["Examples of similar texts"],
            datatype=["str"],
            label="Example texts with similar tone"
        )

    # Main pipeline: classify the entered text and fill all four outputs.
    analyze_button.click(
        fn=analyze_tone,
        inputs=[text_input, tone_dropdown],
        outputs=[result_message, all_probs_output, plot_output, examples_output]
    )

    # Browsing a tone refreshes only the examples table.
    tone_dropdown.change(
        fn=get_tone_examples,
        inputs=tone_dropdown,
        outputs=examples_output
    )


# Launch the Gradio server only when run as a script.
if __name__ == "__main__":
    demo.launch()