"""Sentiment classification, intensity scoring, and summary metrics for text feedback."""

import logging
from typing import Tuple

import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

logger = logging.getLogger(__name__)

# --- globals ---
# Pipelines are created lazily by load_models() and reused across calls.
clf_pipeline = None
reg_pipeline = None

model_name_cls = "sdbrgo/roberta-tagalog-sentiment-multiclass-classifier"
model_name_reg = "sdbrgo/roberta-tagalog-sentiment-intensity-regressor"

# Maps the classifier's raw output labels to short sentiment names.
# NOTE(review): assumes the classifier emits LABEL_0/1/2 in neg/neu/pos order;
# any other label maps to NaN and is ignored by the metrics — confirm against
# the model config.
_LABEL_MAP = {
    "LABEL_0": "neg",
    "LABEL_1": "neu",
    "LABEL_2": "pos",
}


def load_models() -> None:
    """Create the classification and regression pipelines if not yet created.

    Each pipeline is built at most once and stored in the module-level
    ``clf_pipeline`` / ``reg_pipeline`` globals. Both are created with
    ``device=-1``.
    """
    global clf_pipeline, reg_pipeline

    if clf_pipeline is None:
        tokenizer_cls = AutoTokenizer.from_pretrained(model_name_cls)
        model_cls = AutoModelForSequenceClassification.from_pretrained(model_name_cls)
        clf_pipeline = pipeline(
            "text-classification",
            model=model_cls,
            tokenizer=tokenizer_cls,
            top_k=1,
            device=-1,
        )

    if reg_pipeline is None:
        tokenizer_reg = AutoTokenizer.from_pretrained(model_name_reg)
        model_reg = AutoModelForSequenceClassification.from_pretrained(model_name_reg)
        # The regressor is run through "feature-extraction" so that the raw
        # model output is returned instead of a label/score pair.
        reg_pipeline = pipeline(
            "feature-extraction",
            model=model_reg,
            tokenizer=tokenizer_reg,
            device=-1,
        )


def transform_sentiments(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Classify each text and score its sentiment intensity.

    Args:
        df: Input frame; must contain a ``text`` column. It is not modified.

    Returns:
        A ``(processed_df, sorted_sentiments)`` tuple. ``processed_df`` is a
        copy of ``df`` with ``text`` cast to ``str`` (NaN becomes ``""``) plus
        the columns ``label``, ``sentiment_confidence`` and ``intensity``.
        ``sorted_sentiments`` holds only ``text``, ``label`` and ``intensity``,
        sorted by ``intensity`` in descending order.

    Raises:
        ValueError: If ``df`` has no ``text`` column.
    """
    # Validate before loading models so bad input fails fast and cheaply.
    if "text" not in df.columns:
        raise ValueError("Input DataFrame must contain a 'text' column")

    processed_df = df.copy()

    # Force cast to string and handle NaNs.
    processed_df["text"] = processed_df["text"].fillna("").astype(str)
    texts = processed_df["text"].tolist()

    if texts:
        load_models()

        # --- sentiment classification ---
        cls_outputs = clf_pipeline(texts)
        logger.debug("first classifier output: %s", cls_outputs[0])
        # With top_k=1 each item may be a one-element list; unwrap it.
        top_predictions = [o[0] if isinstance(o, list) else o for o in cls_outputs]
        labels = [o["label"] for o in top_predictions]
        confidences = [o["score"] for o in top_predictions]

        # --- sentiment intensity regression ---
        reg_outputs = reg_pipeline(texts)
        # NOTE(review): assumes each output is nested as [[value, ...]] and the
        # first value is the intensity — confirm against the regressor head.
        intensities = [float(o[0][0]) for o in reg_outputs]
    else:
        # Nothing to score: skip inference (and model loading) entirely.
        labels, confidences, intensities = [], [], []

    processed_df["label"] = labels
    processed_df["sentiment_confidence"] = pd.Series(
        confidences, index=processed_df.index, dtype=float
    )
    processed_df["intensity"] = pd.Series(
        intensities, index=processed_df.index, dtype=float
    )

    # --- sorting sentiments by intensity (DESC) ---
    sorted_sentiments = processed_df.sort_values("intensity", ascending=False)
    sorted_sentiments = sorted_sentiments[["text", "label", "intensity"]]

    logger.debug("input columns: %s", list(df.columns))
    logger.debug("processed columns: %s", list(processed_df.columns))
    logger.debug("sorted columns: %s", list(sorted_sentiments.columns))

    # features of processed_df: "text", "label", "sentiment_confidence", "intensity"
    # features of sorted_sentiments: "text", "label", "intensity"
    return processed_df, sorted_sentiments


def compute_sentiment_metrics(
    processed_df: pd.DataFrame,
    feedback_volume: int,
    multiplier_cap: float = 0.7,
) -> dict:
    """Summarise label ratios, intensity ratios and SPA for scored feedback.

    Args:
        processed_df: Frame with ``label`` (raw classifier labels) and
            ``intensity`` columns. It is not modified.
        feedback_volume: Denominator for the raw label ratios.
        multiplier_cap: Currently unused; kept so existing callers that pass
            it keep working.

    Returns:
        When ``feedback_volume`` is 0: ``{"w_pos": 0.0, "w_neg": 0.0}``
        (note this differs in shape from the normal result). Otherwise a dict
        with the keys ``"Raw Sentiment Ratios"``,
        ``"Sentiment Intensity Ratios"`` and
        ``"SPA (Sentiment-Participation Assymetry)"``.
    """
    if feedback_volume == 0:
        return {"w_pos": 0.0, "w_neg": 0.0}

    df = processed_df.copy()
    df["label"] = df["label"].map(_LABEL_MAP)
    logger.debug("mapped label dtype: %s", df["label"].dtype)

    # ----- 1. get ratio of pos, neg, neu labels -----
    label_counts = df["label"].value_counts()
    raw_sentiment_ratios = {
        label: round(float(label_counts.get(label, 0)) / feedback_volume, 2)
        for label in ("neg", "neu", "pos")
    }

    # ----- 2. get sum of intensity scores per label -----
    sums_by_label = df.groupby("label")["intensity"].sum().to_dict()
    # Only neg/pos take part in the intensity ratios; missing labels count as 0.
    intensity_sums = {
        label: float(sums_by_label.get(label, 0.0)) for label in ("neg", "pos")
    }

    # ----- 3. compute intensity ratio per label (in percent) -----
    total_intensity = sum(intensity_sums.values())
    if total_intensity == 0:
        # Nothing expressed.
        intensity_ratios = {label: 0.0 for label in intensity_sums}
    else:
        intensity_ratios = {
            label: round(value / total_intensity * 100, 2)
            for label, value in intensity_sums.items()
        }

    pos_ratio = intensity_ratios["pos"]
    neg_ratio = intensity_ratios["neg"]

    # ----- 4. calculate SPA (sentiment participation asymmetry) -----
    final_spa = round((pos_ratio - neg_ratio) / 100, 2)
    if final_spa > 0:
        relation, dominant = "> 0", "positive"
    elif final_spa < 0:
        relation, dominant = "< 0", "negative"
    else:
        relation, dominant = "= 0", "balanced"

    spa_meta = {
        "Value": final_spa,
        "Description": f"SPA {relation} indicates that {dominant} sentiment dominates.",
    }

    # Ratios are bound to locals above so the f-string needs no nested quotes,
    # which keeps this valid on Python versions before 3.12.
    return {
        "Raw Sentiment Ratios": raw_sentiment_ratios,
        "Sentiment Intensity Ratios": (
            f"{pos_ratio}% of expressed intensity is positive, "
            f"{neg_ratio}% is negative."
        ),
        "SPA (Sentiment-Participation Assymetry)": spa_meta,
    }