# Spaces: cjen1008 / Patient-Report-Measures-NLP-Sentiments (Running)
# File size: 4,383 Bytes
# d401ec0

"""
Type definitions and model configuration for the sentiment analysis pipeline.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Tuple


class ModelType(str, Enum):
    """Identifiers of the models the sentiment pipeline can select.

    The ``str`` mix-in makes every member compare equal to its plain string
    value (``ModelType.DEFAULT == "default"``), and ``ModelType("default")``
    returns the matching member.
    """

    # DistilBERT fine-tuned on SST-2.
    DEFAULT = "default"

    # NLP Town multilingual BERT; the member is called ROBERTA regardless.
    ROBERTA = "roberta"

    # GoEmotions DistilRoBERTa.
    EMOTION = "emotion"

    # Amazon Reviews DistilBERT.
    AMAZON = "amazon"

    # CardiffNLP Twitter RoBERTa.
    TWITTER = "twitter"

    # BERT base uncased, SST-2.
    SST2 = "sst2"

    # BART Large MNLI, used for zero-shot classification.
    ZEROSHOT = "zeroshot"


# Registry of the selectable models, one entry per ModelType member.
#
# Keys that appear in the entries:
#   "hf_id"            - model identifier string (presumably a Hugging Face
#                        Hub id; the loading code is not in this file)
#   "tokenizer"        - separate tokenizer identifier, only where one is given
#   "labels"           - upper-case label names exposed for the model
#   "label_map"        - raw label string -> entry of "labels", only where given
#   "display"          - short human-readable model name
#   "task"             - short description of what the model outputs
#   "pipeline_task"    - set only on the zero-shot entry
#   "candidate_labels" - set only on the zero-shot entry
SUPPORTED_MODELS: Dict[str, Dict] = {
    ModelType.DEFAULT: {
        "hf_id": "distilbert-base-uncased-finetuned-sst-2-english",
        "labels": ["NEGATIVE", "POSITIVE"],
        "display": "DistilBERT SST-2",
        "task": "POSITIVE / NEGATIVE",
    },
    ModelType.ROBERTA: {
        "hf_id": "nlptown/bert-base-multilingual-uncased-sentiment",
        "labels": ["1 STAR", "2 STARS", "3 STARS", "4 STARS", "5 STARS"],
        "label_map": {
            "1 star": "1 STAR",
            "2 stars": "2 STARS",
            "3 stars": "3 STARS",
            "4 stars": "4 STARS",
            "5 stars": "5 STARS",
        },
        "display": "BERT Multilingual",
        "task": "1–5 star rating",
    },
    ModelType.EMOTION: {
        "hf_id": "j-hartmann/emotion-english-distilroberta-base",
        "labels": [
            "ANGER",
            "DISGUST",
            "FEAR",
            "JOY",
            "NEUTRAL",
            "SADNESS",
            "SURPRISE",
        ],
        "display": "GoEmotions",
        "task": "7-class emotion",
    },
    ModelType.AMAZON: {
        "hf_id": "sohan-ai/sentiment-analysis-model-amazon-reviews",
        "tokenizer": "distilbert-base-uncased",
        "labels": ["NEGATIVE", "POSITIVE"],
        "label_map": {"LABEL_0": "NEGATIVE", "LABEL_1": "POSITIVE"},
        "display": "Amazon Reviews BERT",
        "task": "POSITIVE / NEGATIVE",
    },
    ModelType.TWITTER: {
        "hf_id": "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "labels": ["NEGATIVE", "NEUTRAL", "POSITIVE"],
        "label_map": {
            "Negative": "NEGATIVE",
            "Neutral": "NEUTRAL",
            "Positive": "POSITIVE",
            # NOTE(review): the published config for this checkpoint appears
            # to report lowercase class names, which the capitalised keys
            # above would not match. The lowercase aliases are additive, so
            # either spelling resolves - confirm against the pinned revision.
            "negative": "NEGATIVE",
            "neutral": "NEUTRAL",
            "positive": "POSITIVE",
        },
        "display": "RoBERTa Twitter",
        "task": "NEGATIVE / NEUTRAL / POSITIVE",
    },
    ModelType.SST2: {
        "hf_id": "textattack/bert-base-uncased-SST-2",
        "tokenizer": "bert-base-uncased",
        "labels": ["NEGATIVE", "POSITIVE"],
        "label_map": {"LABEL_0": "NEGATIVE", "LABEL_1": "POSITIVE"},
        "display": "BERT SST-2",
        "task": "POSITIVE / NEGATIVE",
    },
    ModelType.ZEROSHOT: {
        "hf_id": "facebook/bart-large-mnli",
        "pipeline_task": "zero-shot-classification",
        # Lowercase candidates listed in the same order as the upper-case
        # "labels" entry below.
        "candidate_labels": ["positive", "negative", "neutral"],
        "labels": ["POSITIVE", "NEGATIVE", "NEUTRAL"],
        "display": "BART Large MNLI",
        "task": "Zero-shot Sentiment",
    },
}

# Lookup from the human-readable dropdown label to its ModelType member.
# Each label is a display name, two spaces, then a parenthesised description;
# the strings are matched verbatim, so the spacing is significant.
MODEL_LABEL_TO_TYPE: Dict[str, str] = {
    "DistilBERT SST-2  (POSITIVE / NEGATIVE)": ModelType.DEFAULT,
    "BERT Multilingual  (1–5 star rating)": ModelType.ROBERTA,
    "GoEmotions  (7 emotions)": ModelType.EMOTION,
    "Amazon Reviews BERT  (POSITIVE / NEGATIVE)": ModelType.AMAZON,
    "RoBERTa Twitter  (NEGATIVE / NEUTRAL / POSITIVE)": ModelType.TWITTER,
    "BERT SST-2  (POSITIVE / NEGATIVE)": ModelType.SST2,
    "BART Large MNLI  (Zero-shot Sentiment)": ModelType.ZEROSHOT,
}


@dataclass
class PreprocessResult:
    """Bundle of the text forms produced while preprocessing one input.

    The code that fills these fields is not part of this file; the grouping
    below follows the declared types and field names only.
    """

    # Whole-text forms, each a single string.
    original_text: str
    cleaned_text: str
    removed_text: str
    normalized_text: str

    # Lists of strings (presumably one entry per token - confirm with the
    # producing code).
    tokenized_text: List[str]
    stemmed_text: List[str]
    lemmatized_text: List[str]

    # Pairs of strings for named-entity and part-of-speech output; which
    # element is the text and which the tag is not visible here.
    ner: List[Tuple[str, str]]
    pos: List[Tuple[str, str]]


@dataclass
class WordDistribution:
    """Per-label word statistics: a count and a word list for each label."""

    # label -> count
    distribution: Dict[str, int]

    # label -> words
    word_lists: Dict[str, List[str]]


@dataclass
class SentimentResult:
    """Outcome of one sentiment analysis run, with its supporting data."""

    # Resulting sentiment, as a string.
    sentiment: str

    # Float scores; presumably one per entry of `labels`, in the same order -
    # confirm with the code that builds this object.
    probabilities: List[float]

    # Which model produced the result (annotated as a plain string).
    model_type: str

    # Label names associated with the result.
    labels: List[str]

    # Preprocessing output for the analysed text.
    preprocess: PreprocessResult

    # Per-label word counts and word lists.
    word_dist: WordDistribution