import pandas as pd
from typing import Tuple
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# --- model identifiers handed to `from_pretrained` in load_models() ---
model_name_cls = "sdbrgo/roberta-tagalog-sentiment-multiclass-classifier"
model_name_reg = "sdbrgo/roberta-tagalog-sentiment-intensity-regressor"

# --- pipeline singletons: stay None until load_models() builds them ---
clf_pipeline = None
reg_pipeline = None

def load_models():
    """Build the classification and regression pipelines on first use.

    Each pipeline lives in its module-level global and is constructed only
    while that global is still ``None``; later calls leave an existing
    pipeline untouched.
    """
    global clf_pipeline, reg_pipeline

    def _build(task, repo_id, **extra):
        # Tokenizer is loaded first, then the model, then both are wrapped.
        tok = AutoTokenizer.from_pretrained(repo_id)
        mdl = AutoModelForSequenceClassification.from_pretrained(repo_id)
        # device=-1: CPU, per the transformers pipeline convention.
        return pipeline(task, model=mdl, tokenizer=tok, device=-1, **extra)

    if clf_pipeline is None:
        # top_k=1 keeps only the highest-scoring label per input.
        clf_pipeline = _build("text-classification", model_name_cls, top_k=1)

    if reg_pipeline is None:
        # The regressor is driven through the "feature-extraction" task so the
        # caller receives raw numeric outputs rather than label/score dicts.
        reg_pipeline = _build("feature-extraction", model_name_reg)
        
def transform_sentiments(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Attach a sentiment label, its confidence and an intensity to each row.

    Args:
        df: Input frame. Must contain a ``text`` column; missing values are
            treated as empty strings and everything is cast to ``str``.
            The input frame itself is not modified.

    Returns:
        A tuple ``(processed_df, sorted_sentiments)``:

        * ``processed_df`` - copy of ``df`` with the cleaned ``text`` column
          plus ``label``, ``sentiment_confidence`` and ``intensity``.
        * ``sorted_sentiments`` - the ``text``, ``label`` and ``intensity``
          columns of ``processed_df`` ordered by ``intensity`` descending.

    Raises:
        ValueError: If ``df`` has no ``text`` column.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Validate before loading any model so a malformed frame fails fast.
    if "text" not in df.columns:
        raise ValueError("Input DataFrame must contain a 'text' column")

    processed_df = df.copy()
    # NaN -> "" and force str so the tokenizers only ever see strings.
    processed_df["text"] = processed_df["text"].fillna("").astype(str)
    texts = processed_df["text"].tolist()

    ranked_columns = ["text", "label", "intensity"]

    if not texts:
        # Nothing to score: skip the models and return the usual schema
        # with empty, correctly typed columns.
        idx = processed_df.index
        processed_df["label"] = pd.Series([], index=idx, dtype="object")
        processed_df["sentiment_confidence"] = pd.Series(
            [], index=idx, dtype="float64"
        )
        processed_df["intensity"] = pd.Series([], index=idx, dtype="float64")
        return processed_df, processed_df[ranked_columns]

    load_models()

    # --- sentiment classification ---
    cls_outputs = clf_pipeline(texts)
    logger.debug("first classifier output: %r", cls_outputs[0])

    # Each result may arrive wrapped in a one-element list (the classifier is
    # built with top_k=1); unwrap it to the plain {"label", "score"} dict.
    top_predictions = [o[0] if isinstance(o, list) else o for o in cls_outputs]
    processed_df["label"] = [p["label"] for p in top_predictions]
    processed_df["sentiment_confidence"] = [p["score"] for p in top_predictions]

    # --- sentiment intensity regression ---
    reg_outputs = reg_pipeline(texts)
    # The first value of the first row of each output is used as the score.
    # NOTE(review): presumably the regressor head's single output - confirm.
    processed_df["intensity"] = [float(o[0][0]) for o in reg_outputs]

    # --- strongest sentiments first ---
    sorted_sentiments = processed_df.sort_values("intensity", ascending=False)
    sorted_sentiments = sorted_sentiments[ranked_columns]

    logger.debug("input columns: %s", list(df.columns))
    logger.debug("processed columns: %s", list(processed_df.columns))
    logger.debug("sorted columns: %s", list(sorted_sentiments.columns))

    return processed_df, sorted_sentiments

def compute_sentiment_metrics(processed_df, feedback_volume, multiplier_cap=0.7):
    """Summarise label shares and the positive/negative intensity balance.

    Args:
        processed_df: DataFrame with a ``label`` column holding ``LABEL_0``
            (negative), ``LABEL_1`` (neutral) or ``LABEL_2`` (positive) and a
            numeric ``intensity`` column. Rows with any other label are
            ignored. The frame is not modified.
        feedback_volume: Denominator used for the raw label ratios.
        multiplier_cap: Kept for backward compatibility; the current
            implementation does not use it.

    Returns:
        ``{"w_pos": 0.0, "w_neg": 0.0}`` when ``feedback_volume`` is 0.
        Otherwise a dict with three entries:

        * ``"Raw Sentiment Ratios"`` - share of ``neg`` / ``neu`` / ``pos``
          rows relative to ``feedback_volume``, rounded to 2 decimals.
        * ``"Sentiment Intensity Ratios"`` - sentence giving the percentage
          of summed positive-plus-negative intensity carried by each side.
        * ``"SPA (Sentiment-Participation Assymetry)"`` - dict with the SPA
          ``"Value"`` (positive share minus negative share, as a fraction)
          and a one-line ``"Description"``.
    """
    if feedback_volume == 0:
        return {"w_pos": 0.0, "w_neg": 0.0}

    label_map = {
        "LABEL_0": "neg",
        "LABEL_1": "neu",
        "LABEL_2": "pos",
    }
    # Remap into a separate Series: avoids copying the whole frame and leaves
    # the caller's data untouched. Unknown labels become NaN and drop out of
    # both the counts and the group sums below.
    labels = processed_df["label"].map(label_map)

    # ----- 1. share of each label relative to the feedback volume -----
    label_counts = labels.value_counts()
    raw_sentiment_ratios = {
        label: float(round(label_counts.get(label, 0) / feedback_volume, 2))
        for label in ("neg", "neu", "pos")
    }

    # ----- 2. summed intensity per polar label (neutral is excluded) -----
    summed = processed_df["intensity"].groupby(labels).sum().to_dict()
    intensity_sums = {
        label: float(summed.get(label, 0.0)) for label in ("neg", "pos")
    }

    # ----- 3. each side's percentage of the total polar intensity -----
    total_intensity = sum(intensity_sums.values())
    if total_intensity == 0:
        # Nothing expressed on either side; also avoids dividing by zero.
        intensity_ratios = dict.fromkeys(intensity_sums, 0.0)
    else:
        intensity_ratios = {
            label: round(value / total_intensity * 100, 2)
            for label, value in intensity_sums.items()
        }

    pos_pct = intensity_ratios["pos"]
    neg_pct = intensity_ratios["neg"]

    # ----- 4. SPA (sentiment participation asymmetry) as a fraction -----
    final_spa = round((pos_pct - neg_pct) / 100, 2)

    if final_spa > 0:
        relation, dominant = "> 0", "positive"
    elif final_spa < 0:
        relation, dominant = "< 0", "negative"
    else:
        relation, dominant = "= 0", "balanced"

    spa_meta = {
        "Value": final_spa,
        "Description": f"SPA {relation} indicates that {dominant} sentiment dominates.",
    }

    # Percentages are bound to locals first: reusing the outer quote character
    # inside an f-string expression is a SyntaxError before Python 3.12.
    return {
        "Raw Sentiment Ratios": raw_sentiment_ratios,
        "Sentiment Intensity Ratios": (
            f"{pos_pct}% of expressed intensity is positive, "
            f"{neg_pct}% is negative."
        ),
        "SPA (Sentiment-Participation Assymetry)": spa_meta,
    }