File size: 4,383 Bytes
d401ec0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Type definitions and model configuration for the sentiment analysis pipeline.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Tuple


class ModelType(str, Enum):
    """Identifiers for the model configurations listed in SUPPORTED_MODELS.

    Members also subclass ``str``, so each one compares equal to (and hashes
    like) its plain string value, e.g. ``ModelType.DEFAULT == "default"``.
    """

    # DistilBERT SST-2
    DEFAULT = "default"

    # NLP Town BERT Multilingual. Despite the member name, the checkpoint
    # configured for this key in SUPPORTED_MODELS is a BERT model.
    ROBERTA = "roberta"

    # GoEmotions DistilRoBERTa
    EMOTION = "emotion"

    # Amazon Reviews DistilBERT
    AMAZON = "amazon"

    # CardiffNLP Twitter RoBERTa
    TWITTER = "twitter"

    # BERT base uncased SST-2
    SST2 = "sst2"

    # BART Large MNLI (zero-shot)
    ZEROSHOT = "zeroshot"


# Per-model configuration, keyed by ModelType member.
#
# Every entry provides "hf_id", "labels", "display" and "task". The keys
# "label_map", "tokenizer", "pipeline_task" and "candidate_labels" are
# optional and appear only on the entries that declare them below.
#
# NOTE(review): "label_map" presumably translates the label strings emitted by
# the model into the canonical names in "labels"; the consumer is not in this
# module, so confirm the keys match the model output exactly (case included).
SUPPORTED_MODELS: Dict[str, Dict] = {
    ModelType.DEFAULT: {
        "hf_id": "distilbert-base-uncased-finetuned-sst-2-english",
        "labels": ["NEGATIVE", "POSITIVE"],
        "display": "DistilBERT SST-2",
        "task": "POSITIVE / NEGATIVE",
    },
    ModelType.ROBERTA: {
        "hf_id": "nlptown/bert-base-multilingual-uncased-sentiment",
        "labels": ["1 STAR", "2 STARS", "3 STARS", "4 STARS", "5 STARS"],
        "label_map": {
            "1 star": "1 STAR",
            "2 stars": "2 STARS",
            "3 stars": "3 STARS",
            "4 stars": "4 STARS",
            "5 stars": "5 STARS",
        },
        "display": "BERT Multilingual",
        "task": "1–5 star rating",
    },
    ModelType.EMOTION: {
        "hf_id": "j-hartmann/emotion-english-distilroberta-base",
        "labels": [
            "ANGER",
            "DISGUST",
            "FEAR",
            "JOY",
            "NEUTRAL",
            "SADNESS",
            "SURPRISE",
        ],
        "display": "GoEmotions",
        "task": "7-class emotion",
    },
    ModelType.AMAZON: {
        "hf_id": "sohan-ai/sentiment-analysis-model-amazon-reviews",
        "tokenizer": "distilbert-base-uncased",
        "labels": ["NEGATIVE", "POSITIVE"],
        "label_map": {"LABEL_0": "NEGATIVE", "LABEL_1": "POSITIVE"},
        "display": "Amazon Reviews BERT",
        "task": "POSITIVE / NEGATIVE",
    },
    ModelType.TWITTER: {
        "hf_id": "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "labels": ["NEGATIVE", "NEUTRAL", "POSITIVE"],
        "label_map": {
            "Negative": "NEGATIVE",
            "Neutral": "NEUTRAL",
            "Positive": "POSITIVE",
        },
        "display": "RoBERTa Twitter",
        "task": "NEGATIVE / NEUTRAL / POSITIVE",
    },
    ModelType.SST2: {
        "hf_id": "textattack/bert-base-uncased-SST-2",
        "tokenizer": "bert-base-uncased",
        "labels": ["NEGATIVE", "POSITIVE"],
        "label_map": {"LABEL_0": "NEGATIVE", "LABEL_1": "POSITIVE"},
        "display": "BERT SST-2",
        "task": "POSITIVE / NEGATIVE",
    },
    ModelType.ZEROSHOT: {
        "hf_id": "facebook/bart-large-mnli",
        "pipeline_task": "zero-shot-classification",
        "candidate_labels": ["positive", "negative", "neutral"],
        "labels": ["POSITIVE", "NEGATIVE", "NEUTRAL"],
        "display": "BART Large MNLI",
        "task": "Zero-shot Sentiment",
    },
}

# Human-readable dropdown labels -> ModelType.
# The label strings are looked up verbatim, so the double space before each
# parenthesis is part of the key and must not be normalized.
MODEL_LABEL_TO_TYPE: Dict[str, str] = {
    "DistilBERT SST-2  (POSITIVE / NEGATIVE)": ModelType.DEFAULT,
    "BERT Multilingual  (1–5 star rating)": ModelType.ROBERTA,
    "GoEmotions  (7 emotions)": ModelType.EMOTION,
    "Amazon Reviews BERT  (POSITIVE / NEGATIVE)": ModelType.AMAZON,
    "RoBERTa Twitter  (NEGATIVE / NEUTRAL / POSITIVE)": ModelType.TWITTER,
    "BERT SST-2  (POSITIVE / NEGATIVE)": ModelType.SST2,
    "BART Large MNLI  (Zero-shot Sentiment)": ModelType.ZEROSHOT,
}


@dataclass
class PreprocessResult:
    """Bundle of one input text and the outputs of its preprocessing stages.

    The code that fills these fields is not part of this module; the notes on
    individual fields are inferred from their names and types and should be
    confirmed against the producer.
    """

    # String-valued stages.
    original_text: str
    cleaned_text: str
    removed_text: str
    normalized_text: str

    # Token-list stages.
    tokenized_text: List[str]
    stemmed_text: List[str]
    lemmatized_text: List[str]

    # Pairs of strings; presumably (text, entity label) -- TODO confirm.
    ner: List[Tuple[str, str]]
    # Pairs of strings; presumably (token, part-of-speech tag) -- TODO confirm.
    pos: List[Tuple[str, str]]


@dataclass
class WordDistribution:
    """Per-label word statistics: a count and a word list for each label."""

    # label -> count
    distribution: Dict[str, int]
    # label -> words
    word_lists: Dict[str, List[str]]


@dataclass
class SentimentResult:
    """Outcome of one sentiment prediction plus its supporting artefacts.

    How ``probabilities`` and ``labels`` line up is decided by the code that
    builds this object, which is not part of this module.
    """

    # Predicted label.
    sentiment: str
    # Scores; presumably one per entry of ``labels`` -- TODO confirm.
    probabilities: List[float]
    # Identifier of the model used; presumably a ModelType value -- TODO confirm.
    model_type: str
    # Label names associated with the prediction.
    labels: List[str]
    # Preprocessing outputs for the analysed text.
    preprocess: PreprocessResult
    # Per-label word counts and word lists.
    word_dist: WordDistribution