"""
Type definitions and model configuration for the sentiment analysis pipeline.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Tuple


class ModelType(str, Enum):
    """Identifiers of the selectable models.

    Members subclass ``str``, so each one compares equal to (and hashes like)
    its plain string value; the registries below can therefore be indexed with
    either the member or the raw string.
    """

    # distilbert-base-uncased-finetuned-sst-2-english
    DEFAULT = "default"
    # NOTE: despite the member name, the registry maps this to the NLP Town
    # multilingual BERT star-rating checkpoint.
    ROBERTA = "roberta"
    # j-hartmann/emotion-english-distilroberta-base (shown as "GoEmotions")
    EMOTION = "emotion"
    # Amazon Reviews DistilBERT
    AMAZON = "amazon"
    # CardiffNLP Twitter RoBERTa
    TWITTER = "twitter"
    # BERT base uncased SST-2
    SST2 = "sst2"
    # BART Large MNLI (zero-shot)
    ZEROSHOT = "zeroshot"


# Per-model configuration, keyed by ModelType.
#
# Keys present on every entry:
#   "hf_id"   - model identifier string
#   "labels"  - upper-case label names for the model
#   "display" - short human-readable model name
#   "task"    - short description of what the model predicts
# Keys present on some entries only:
#   "label_map"        - presumably raw model output label -> entry in
#                        "labels"; confirm against the consumer
#   "tokenizer"        - presumably a separate tokenizer identifier to load
#                        instead of "hf_id"; confirm against the consumer
#   "pipeline_task" /
#   "candidate_labels" - zero-shot settings (ZEROSHOT entry only)
SUPPORTED_MODELS: Dict[str, Dict] = {
    ModelType.DEFAULT: {
        "hf_id": "distilbert-base-uncased-finetuned-sst-2-english",
        "labels": ["NEGATIVE", "POSITIVE"],
        "display": "DistilBERT SST-2",
        "task": "POSITIVE / NEGATIVE",
    },
    ModelType.ROBERTA: {
        "hf_id": "nlptown/bert-base-multilingual-uncased-sentiment",
        "labels": [
            "1 STAR",
            "2 STARS",
            "3 STARS",
            "4 STARS",
            "5 STARS",
        ],
        "label_map": {
            "1 star": "1 STAR",
            "2 stars": "2 STARS",
            "3 stars": "3 STARS",
            "4 stars": "4 STARS",
            "5 stars": "5 STARS",
        },
        "display": "BERT Multilingual",
        "task": "1–5 star rating",
    },
    ModelType.EMOTION: {
        "hf_id": "j-hartmann/emotion-english-distilroberta-base",
        "labels": [
            "ANGER",
            "DISGUST",
            "FEAR",
            "JOY",
            "NEUTRAL",
            "SADNESS",
            "SURPRISE",
        ],
        "display": "GoEmotions",
        "task": "7-class emotion",
    },
    ModelType.AMAZON: {
        "hf_id": "sohan-ai/sentiment-analysis-model-amazon-reviews",
        "tokenizer": "distilbert-base-uncased",
        "labels": ["NEGATIVE", "POSITIVE"],
        "label_map": {
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "POSITIVE",
        },
        "display": "Amazon Reviews BERT",
        "task": "POSITIVE / NEGATIVE",
    },
    ModelType.TWITTER: {
        "hf_id": "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "labels": ["NEGATIVE", "NEUTRAL", "POSITIVE"],
        "label_map": {
            "Negative": "NEGATIVE",
            "Neutral": "NEUTRAL",
            "Positive": "POSITIVE",
        },
        "display": "RoBERTa Twitter",
        "task": "NEGATIVE / NEUTRAL / POSITIVE",
    },
    ModelType.SST2: {
        "hf_id": "textattack/bert-base-uncased-SST-2",
        "tokenizer": "bert-base-uncased",
        "labels": ["NEGATIVE", "POSITIVE"],
        "label_map": {
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "POSITIVE",
        },
        "display": "BERT SST-2",
        "task": "POSITIVE / NEGATIVE",
    },
    ModelType.ZEROSHOT: {
        "hf_id": "facebook/bart-large-mnli",
        "pipeline_task": "zero-shot-classification",
        "candidate_labels": ["positive", "negative", "neutral"],
        "labels": ["POSITIVE", "NEGATIVE", "NEUTRAL"],
        "display": "BART Large MNLI",
        "task": "Zero-shot Sentiment",
    },
}


# Human-readable dropdown labels → ModelType.
# These strings are maintained by hand; they are not generated from the
# "display" / "task" fields of SUPPORTED_MODELS (the GoEmotions label, for
# one, is worded differently from that entry's "task").
MODEL_LABEL_TO_TYPE: Dict[str, str] = {
    "DistilBERT SST-2 (POSITIVE / NEGATIVE)": ModelType.DEFAULT,
    "BERT Multilingual (1–5 star rating)": ModelType.ROBERTA,
    "GoEmotions (7 emotions)": ModelType.EMOTION,
    "Amazon Reviews BERT (POSITIVE / NEGATIVE)": ModelType.AMAZON,
    "RoBERTa Twitter (NEGATIVE / NEUTRAL / POSITIVE)": ModelType.TWITTER,
    "BERT SST-2 (POSITIVE / NEGATIVE)": ModelType.SST2,
    "BART Large MNLI (Zero-shot Sentiment)": ModelType.ZEROSHOT,
}


@dataclass
class PreprocessResult:
    """Container for the outputs of the text preprocessing stages.

    All fields are required and positional, in the order declared below.
    What each stage does is defined by the code that fills this record, not
    here.
    """

    original_text: str
    cleaned_text: str
    removed_text: str
    normalized_text: str
    tokenized_text: List[str]
    stemmed_text: List[str]
    lemmatized_text: List[str]
    # Pairs of strings; presumably (entity text, entity label) — confirm
    # against the producer.
    ner: List[Tuple[str, str]]
    # Pairs of strings; presumably (token, part-of-speech tag) — confirm
    # against the producer.
    pos: List[Tuple[str, str]]


@dataclass
class WordDistribution:
    """Per-label word statistics."""

    distribution: Dict[str, int]  # label → count
    word_lists: Dict[str, List[str]]  # label → words


@dataclass
class SentimentResult:
    """Result record for one analysed text.

    Bundles the predicted label, the probability list, the model key and
    label names used, and the preprocessing and word-distribution records.
    """

    sentiment: str
    # Presumably aligned index-for-index with ``labels`` — confirm against
    # the producer.
    probabilities: List[float]
    model_type: str
    labels: List[str]
    preprocess: PreprocessResult
    word_dist: WordDistribution