# Peopulse / phase/sentiment_modeling.py
# Last commit by sdbrgo: "Update phase/sentiment_modeling.py" (a4f16dd, verified)
import pandas as pd
from typing import Tuple
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# --- globals ---
# Checkpoint identifiers handed to from_pretrained() inside load_models().
model_name_cls = "sdbrgo/roberta-tagalog-sentiment-multiclass-classifier"
model_name_reg = "sdbrgo/roberta-tagalog-sentiment-intensity-regressor"

# Pipeline singletons: both stay None until load_models() populates them.
clf_pipeline = reg_pipeline = None
def _build_pipeline(task, checkpoint, **task_options):
    """Load tokenizer and model for *checkpoint* and wrap them in a pipeline.

    The tokenizer is fetched before the model, and the pipeline is always
    created with ``device=-1`` (CPU in the transformers pipeline API).
    """
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline(
        task,
        model=model,
        tokenizer=tokenizer,
        device=-1,
        **task_options,
    )


def load_models():
    """Create the module-level pipelines the first time they are needed.

    Each pipeline is built only while its global is still ``None``, so
    repeated calls reuse the already-constructed objects.

    * ``clf_pipeline``: ``"text-classification"`` over ``model_name_cls``,
      keeping only the top prediction (``top_k=1``).
    * ``reg_pipeline``: ``"feature-extraction"`` over ``model_name_reg``.
    """
    global clf_pipeline, reg_pipeline

    if clf_pipeline is None:
        clf_pipeline = _build_pipeline(
            "text-classification",
            model_name_cls,
            top_k=1,
        )

    if reg_pipeline is None:
        reg_pipeline = _build_pipeline("feature-extraction", model_name_reg)
def transform_sentiments(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Label every row of *df* with a sentiment class and an intensity score.

    Args:
        df: DataFrame with a ``text`` column. The input is not modified.
            Missing values are treated as empty strings and every value is
            cast to ``str`` before inference.

    Returns:
        A ``(processed_df, sorted_sentiments)`` tuple.

        * ``processed_df``: a copy of *df* with the cleaned ``text`` column
          plus ``label``, ``sentiment_confidence`` and ``intensity``.
        * ``sorted_sentiments``: the ``text``, ``label`` and ``intensity``
          columns of ``processed_df`` ordered by ``intensity``, highest first.

    Raises:
        ValueError: If *df* has no ``text`` column.
    """
    # Validate before copying or loading models so bad input fails fast.
    if "text" not in df.columns:
        raise ValueError("Input DataFrame must contain a 'text' column")

    processed_df = df.copy()
    # Replace NaN with "" and force everything to string for the tokenizers.
    processed_df["text"] = processed_df["text"].fillna("").astype(str)
    texts = processed_df["text"].tolist()

    if texts:
        load_models()

        # --- sentiment classification ---
        # truncation=True asks the tokenizer to clip over-long inputs.
        # NOTE(review): assumes the checkpoints' tokenizers define a maximum
        # length; confirm against the model configs.
        cls_outputs = clf_pipeline(texts, truncation=True)
        # With top_k=1 each result may arrive wrapped in a one-element list.
        top_predictions = [
            o[0] if isinstance(o, list) else o for o in cls_outputs
        ]
        labels = [p["label"] for p in top_predictions]
        confidences = [p["score"] for p in top_predictions]

        # --- sentiment intensity regression ---
        # The first value of the first row of each output is the score.
        reg_outputs = reg_pipeline(texts, truncation=True)
        intensities = [float(o[0][0]) for o in reg_outputs]
    else:
        # Nothing to score: skip model loading and emit empty columns.
        labels, confidences, intensities = [], [], []

    row_index = processed_df.index
    processed_df["label"] = pd.Series(labels, index=row_index, dtype="object")
    processed_df["sentiment_confidence"] = pd.Series(
        confidences, index=row_index, dtype="float64"
    )
    processed_df["intensity"] = pd.Series(
        intensities, index=row_index, dtype="float64"
    )

    # --- sorting sentiments by intensity (DESC) ---
    sorted_sentiments = processed_df.sort_values("intensity", ascending=False)
    sorted_sentiments = sorted_sentiments[["text", "label", "intensity"]]

    return processed_df, sorted_sentiments
def compute_sentiment_metrics(processed_df, feedback_volume, multiplier_cap=0.7):
    """Summarise label shares and the positive/negative intensity balance.

    Args:
        processed_df: DataFrame with a ``label`` column holding the raw
            ``LABEL_0`` / ``LABEL_1`` / ``LABEL_2`` ids and a numeric
            ``intensity`` column. The input is not modified.
        feedback_volume: Denominator for the raw label ratios (presumably the
            number of feedback items; verify against the caller).
        multiplier_cap: Currently unused; kept so existing callers that pass
            it keep working.

    Returns:
        A dict with three entries:

        * ``"Raw Sentiment Ratios"``: share of ``neg`` / ``neu`` / ``pos``
          labels relative to *feedback_volume*, rounded to 2 decimals.
        * ``"Sentiment Intensity Ratios"``: sentence giving the percentage of
          summed ``pos`` + ``neg`` intensity contributed by each side.
        * ``"SPA (Sentiment-Participation Assymetry)"``: dict with ``Value``
          (positive minus negative intensity share, scaled to [-1, 1] when
          intensities are non-negative) and a ``Description``.

        NOTE(review): when *feedback_volume* is 0 the function instead
        returns ``{"w_pos": 0.0, "w_neg": 0.0}``. That shape differs from the
        normal result; it is kept because callers may depend on it.
    """
    if feedback_volume == 0:
        return {"w_pos": 0.0, "w_neg": 0.0}

    LABEL_MAP = {
        "LABEL_0": "neg",
        "LABEL_1": "neu",
        "LABEL_2": "pos",
    }
    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    # Labels missing from LABEL_MAP become NaN and drop out of the counts.
    df = processed_df.assign(label=processed_df["label"].map(LABEL_MAP))

    # ----- 1. get ratio of pos, neg, neu labels -----
    label_counts = df["label"].value_counts()
    # Round first, then cast, so results match the original rounding while
    # the payload carries built-in floats rather than NumPy scalars.
    raw_sentiment_ratios = {
        label: float(round(label_counts.get(label, 0) / feedback_volume, 2))
        for label in ["neg", "neu", "pos"]
    }

    # ----- 2. get sum of intensity scores per label -----
    summed = df.groupby("label")["intensity"].sum().to_dict()
    # Only the polar labels take part; absent ones count as zero.
    intensity_sums = {label: summed.get(label, 0.0) for label in ["neg", "pos"]}

    # ----- 3. compute intensity ratio per label -----
    total_intensity = sum(intensity_sums.values())
    if total_intensity == 0:
        # Nothing expressed on either side.
        intensity_ratios = {label: 0.0 for label in intensity_sums}
    else:
        intensity_ratios = {
            label: round(value / total_intensity * 100, 2)
            for label, value in intensity_sums.items()
        }

    # ----- 4. calculate SPA (sentiment participation asymmetry) -----
    pos_share = intensity_ratios["pos"]
    neg_share = intensity_ratios["neg"]
    final_spa = float(round((pos_share - neg_share) / 100, 2))

    if final_spa > 0:
        relation, dominant = "> 0", "positive"
    elif final_spa < 0:
        relation, dominant = "< 0", "negative"
    else:
        relation, dominant = "= 0", "balanced"

    spa_meta = {
        "Value": final_spa,
        "Description": f"SPA {relation} indicates that {dominant} sentiment dominates.",
    }

    # The shares are bound to locals above because reusing double quotes
    # inside a double-quoted f-string is a SyntaxError before Python 3.12.
    return {
        "Raw Sentiment Ratios": raw_sentiment_ratios,
        "Sentiment Intensity Ratios": (
            f"{pos_share}% of expressed intensity is positive, "
            f"{neg_share}% is negative."
        ),
        "SPA (Sentiment-Participation Assymetry)": spa_meta,
    }