File size: 5,240 Bytes
e0f2d0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from transformers import pipeline, AutoTokenizer
from schemas.text_schemas import EmotionResult, EmotionDetector
from typing import List, Dict

class TransformersEmotionDetector(EmotionDetector):
    """Emotion detector using a lightweight DistilRoBERTa model from Hugging Face.

    Long inputs are split into sentence-aligned chunks that fit the model's
    context window; per-chunk label scores are averaged into a single result.
    """

    def __init__(self, model_name: str = "j-hartmann/emotion-english-distilroberta-base"):
        """Initialize the emotion detection model.

        Args:
            model_name: Pretrained Hugging Face model for emotion detection.
        """
        self.model_name = model_name
        # top_k=None is the documented replacement for the deprecated
        # return_all_scores=True: it returns scores for every label.
        self.pipeline = pipeline(
            "text-classification",
            model=model_name,
            top_k=None,
        )
        # Tokenizer is used only to measure chunk sizes in model tokens.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze(self, text: str) -> EmotionResult:
        """Analyze emotion in the given text, chunking long inputs.

        Args:
            text: The text to analyze. Must contain at least one
                non-whitespace character.

        Returns:
            EmotionResult: Structured result containing the dominant
            emotion, its confidence, and the averaged per-emotion scores.

        Raises:
            ValueError: If *text* yields no analyzable content.
        """
        # Conservative budget leaves headroom for the special tokens the
        # tokenizer adds within the model's 512-token context window.
        max_tokens_per_chunk = 400
        chunks = self._split_text_into_chunks(text, max_tokens_per_chunk)

        # Empty/whitespace-only input would otherwise surface as a cryptic
        # "max() arg is an empty sequence" below — fail early and clearly.
        if not chunks:
            raise ValueError("Cannot analyze emotion of empty text.")

        # A list input always yields one result list per chunk, so a single
        # chunk needs no special-casing. truncation=True guards against a
        # lone sentence that still exceeds the model's limit.
        results: List[List[Dict[str, float]]] = self.pipeline(chunks, truncation=True)

        # Aggregate emotion scores across all chunks.
        aggregated_scores = self._aggregate_emotion_scores(results)

        # Pick the highest-scoring emotion as the dominant one.
        dominant_emotion = max(aggregated_scores, key=aggregated_scores.get)
        confidence = aggregated_scores[dominant_emotion]

        return EmotionResult(
            dominant_emotion=dominant_emotion,
            confidence=confidence,
            all_scores=aggregated_scores,
        )

    def _split_text_into_chunks(self, text: str, max_tokens_per_chunk: int) -> List[str]:
        """Split text into sentence-aligned chunks under a token budget.

        A single sentence longer than the budget still becomes its own
        chunk; the pipeline's call-time truncation handles that overflow.

        Args:
            text: The text to split.
            max_tokens_per_chunk: Maximum model tokens per chunk.

        Returns:
            List of non-empty text chunks; empty list for blank input.
        """
        # Split on sentence boundaries first so chunks read naturally.
        sentences = text.split('. ')
        chunks: List[str] = []
        current_chunk = ""
        current_tokens = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # Restore the period removed by the split.
            if not sentence.endswith('.'):
                sentence += '.'

            sentence_tokens = len(self.tokenizer.tokenize(sentence))

            # Flush the current chunk when the next sentence would overflow it.
            if current_tokens + sentence_tokens > max_tokens_per_chunk and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
                current_tokens = sentence_tokens
            else:
                current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
                current_tokens += sentence_tokens

        # Don't drop the trailing partial chunk.
        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def _aggregate_emotion_scores(self, results: List[List[Dict[str, float]]]) -> Dict[str, float]:
        """Average per-label scores across chunk results.

        Args:
            results: One list of ``{"label", "score"}`` entries per chunk.

        Returns:
            Mapping of emotion label to its mean score over the chunks
            that reported it; empty dict when *results* is empty.
        """
        emotion_totals: Dict[str, float] = {}
        emotion_counts: Dict[str, int] = {}

        for chunk_results in results:
            for entry in chunk_results:
                label = entry["label"]
                emotion_totals[label] = emotion_totals.get(label, 0.0) + entry["score"]
                emotion_counts[label] = emotion_counts.get(label, 0) + 1

        # Plain mean over the chunks that reported each label. Every chunk
        # reports all labels when the pipeline returns full distributions,
        # so this is equivalent to dividing by the number of chunks.
        return {
            label: emotion_totals[label] / emotion_counts[label]
            for label in emotion_totals
        }