Spaces:

sdbrgo
/

Peopulse

Sleeping

App Files Files Community

Peopulse / phase /sentiment_modeling.py

sdbrgo

Update phase/sentiment_modeling.py

a4f16dd verified 14 days ago

raw

history blame contribute delete

4.86 kB

	import pandas as pd
	from typing import Tuple
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

	# --- globals ---
	# Cached inference pipelines. Both start as None and are assigned by
	# load_models() the first time it runs.
	clf_pipeline = None
	reg_pipeline = None

	# Checkpoint names handed to from_pretrained() in load_models():
	# one for the label classifier, one for the intensity regressor.
	model_name_cls = "sdbrgo/roberta-tagalog-sentiment-multiclass-classifier"
	model_name_reg = "sdbrgo/roberta-tagalog-sentiment-intensity-regressor"

	def load_models():
	    """Create the two inference pipelines once and cache them in module globals.

	    A pipeline is only built while its global is still ``None``, so calls
	    after the first one do no further loading. Nothing is returned; the
	    results are stored in ``clf_pipeline`` and ``reg_pipeline``.
	    """
	    global clf_pipeline, reg_pipeline

	    def _fetch(checkpoint):
	        # Tokenizer first, then model, both from the same checkpoint name.
	        tok = AutoTokenizer.from_pretrained(checkpoint)
	        net = AutoModelForSequenceClassification.from_pretrained(checkpoint)
	        return tok, net

	    # Classifier: top_k=1 keeps only the best-scoring label per input.
	    if clf_pipeline is None:
	        cls_tok, cls_net = _fetch(model_name_cls)
	        clf_pipeline = pipeline(
	            "text-classification",
	            tokenizer=cls_tok,
	            model=cls_net,
	            device=-1,  # -1 is the CPU setting in the transformers pipeline API
	            top_k=1,
	        )

	    # Regressor: run through the feature-extraction task so the raw model
	    # output is returned instead of a label/score dict.
	    if reg_pipeline is None:
	        reg_tok, reg_net = _fetch(model_name_reg)
	        reg_pipeline = pipeline(
	            "feature-extraction",
	            tokenizer=reg_tok,
	            model=reg_net,
	            device=-1,
	        )

	def transform_sentiments(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
	    """Annotate each row of ``df["text"]`` with a sentiment label and intensity.

	    The input frame is not modified; all work happens on a copy.

	    Args:
	        df: DataFrame that must contain a ``"text"`` column. Missing values
	            are replaced with ``""`` and every value is cast to ``str``.

	    Returns:
	        A ``(processed_df, sorted_sentiments)`` tuple.
	        ``processed_df`` is the copy of ``df`` with ``"label"``,
	        ``"sentiment_confidence"`` and ``"intensity"`` columns added.
	        ``sorted_sentiments`` holds only ``"text"``, ``"label"`` and
	        ``"intensity"``, ordered by intensity from highest to lowest.

	    Raises:
	        ValueError: If ``df`` has no ``"text"`` column.
	    """
	    # Validate first so a bad input fails fast, before the frame is copied
	    # or any model is loaded.
	    if "text" not in df.columns:
	        raise ValueError("Input DataFrame must contain a 'text' column")

	    processed_df = df.copy()

	    # Force plain strings: NaN becomes "", everything else goes through str().
	    processed_df["text"] = processed_df["text"].fillna("").astype(str)
	    texts = processed_df["text"].tolist()

	    if texts:
	        load_models()

	        # --- sentiment classification ---
	        # A prediction may come back wrapped in a one-element list; unwrap
	        # it so every entry is a {"label": ..., "score": ...} dict.
	        cls_outputs = [
	            o[0] if isinstance(o, list) else o for o in clf_pipeline(texts)
	        ]
	        processed_df["label"] = [o["label"] for o in cls_outputs]
	        processed_df["sentiment_confidence"] = [o["score"] for o in cls_outputs]

	        # --- sentiment intensity regression ---
	        # Takes element [0][0] of each output; presumably the single
	        # regression value of the model head -- TODO confirm.
	        processed_df["intensity"] = [float(o[0][0]) for o in reg_pipeline(texts)]
	    else:
	        # Nothing to score: add the output columns with explicit dtypes and
	        # skip model loading and inference entirely.
	        idx = processed_df.index
	        processed_df["label"] = pd.Series(index=idx, dtype="object")
	        processed_df["sentiment_confidence"] = pd.Series(index=idx, dtype="float64")
	        processed_df["intensity"] = pd.Series(index=idx, dtype="float64")

	    # --- sorting sentiments by intensity (DESC) ---
	    sorted_sentiments = processed_df.sort_values("intensity", ascending=False)
	    sorted_sentiments = sorted_sentiments[["text", "label", "intensity"]]

	    return processed_df, sorted_sentiments

	def compute_sentiment_metrics(processed_df, feedback_volume, multiplier_cap=0.7):
	    """Summarise label shares and the positive/negative intensity balance.

	    Args:
	        processed_df: DataFrame with a ``"label"`` column holding
	            ``"LABEL_0"``/``"LABEL_1"``/``"LABEL_2"`` and a numeric
	            ``"intensity"`` column. It is not modified.
	        feedback_volume: Denominator used for the raw label ratios.
	        multiplier_cap: Not used by this function; kept so existing
	            callers that pass it keep working.

	    Returns:
	        A dict with three entries: ``"Raw Sentiment Ratios"`` (share of
	        neg/neu/pos labels, rounded to 2 decimals), ``"Sentiment Intensity
	        Ratios"`` (a sentence giving the positive and negative percentages
	        of summed intensity) and ``"SPA (Sentiment-Participation
	        Assymetry)"`` (a dict with ``"Value"`` and ``"Description"``).
	        When ``feedback_volume`` is 0 the result is instead
	        ``{"w_pos": 0.0, "w_neg": 0.0}``.
	    """
	    if feedback_volume == 0:
	        # NOTE(review): these keys differ from the normal result below;
	        # left unchanged because callers may rely on this shape.
	        return {"w_pos": 0.0, "w_neg": 0.0}

	    label_map = {
	        "LABEL_0": "neg",
	        "LABEL_1": "neu",
	        "LABEL_2": "pos",
	    }
	    # Remap into a separate Series instead of copying the whole frame.
	    # Labels missing from label_map become NaN and drop out of both the
	    # counts and the grouped sums below.
	    labels = processed_df["label"].map(label_map)

	    # ----- 1. share of neg / neu / pos labels -----
	    label_counts = labels.value_counts()
	    raw_sentiment_ratios = {
	        label: round(label_counts.get(label, 0) / feedback_volume, 2)
	        for label in ["neg", "neu", "pos"]
	    }

	    # ----- 2. summed intensity per label (neutral is left out) -----
	    grouped_sums = processed_df["intensity"].groupby(labels).sum().to_dict()
	    intensity_sums = {
	        label: grouped_sums.get(label, 0.0) for label in ["neg", "pos"]
	    }

	    # ----- 3. percentage of total intensity per label -----
	    total_intensity = sum(intensity_sums.values())
	    if total_intensity == 0:
	        # Nothing expressed: avoid dividing by zero.
	        intensity_ratios = {label: 0.0 for label in intensity_sums}
	    else:
	        intensity_ratios = {
	            label: round(intensity_sums[label] / total_intensity * 100, 2)
	            for label in intensity_sums
	        }

	    pos_pct = intensity_ratios["pos"]
	    neg_pct = intensity_ratios["neg"]

	    # ----- 4. SPA (sentiment participation asymmetry), scaled to [-1, 1] -----
	    final_spa = round((pos_pct - neg_pct) / 100, 2)

	    if final_spa > 0:
	        relation, dominant = "> 0", "positive"
	    elif final_spa < 0:
	        relation, dominant = "< 0", "negative"
	    else:
	        relation, dominant = "= 0", "balanced"

	    spa_meta = {
	        "Value": final_spa,
	        "Description": f"SPA {relation} indicates that {dominant} sentiment dominates.",
	    }

	    # The percentages are bound to locals above so this f-string does not
	    # nest double quotes, which is a syntax error before Python 3.12.
	    return {
	        "Raw Sentiment Ratios": raw_sentiment_ratios,
	        "Sentiment Intensity Ratios": f"{pos_pct}% of expressed intensity is positive, {neg_pct}% is negative.",
	        "SPA (Sentiment-Participation Assymetry)": spa_meta,
	    }