from transformers import pipeline, AutoTokenizer
from schemas.text_schemas import EmotionResult, EmotionDetector
from typing import List, Dict
class TransformersEmotionDetector(EmotionDetector):
    """Emotion detector using a lightweight DistilRoBERTa model from Hugging Face.

    Long inputs are split into sentence-aligned chunks that fit within the
    model's context window; per-chunk label scores are averaged into one
    aggregate result.
    """

    def __init__(self, model_name: str = "j-hartmann/emotion-english-distilroberta-base"):
        """
        Initialize the emotion detection model.

        Args:
            model_name: Pretrained Hugging Face model for emotion detection.
        """
        self.model_name = model_name
        # top_k=None returns scores for every label. It replaces the
        # deprecated return_all_scores=True and produces the same
        # List[List[Dict]] output shape, so downstream parsing is unchanged.
        self.pipeline = pipeline(
            "text-classification",
            model=model_name,
            top_k=None
        )
        # Tokenizer is only used to measure sentence lengths when chunking.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze(self, text: str) -> EmotionResult:
        """
        Analyze emotion in the given text using chunked processing for long texts.

        Args:
            text: The text to analyze. Must contain at least one non-empty
                sentence.

        Returns:
            EmotionResult: Structured result containing dominant emotion and
            confidence.

        Raises:
            ValueError: If ``text`` is empty or whitespace-only.
        """
        # Conservative per-chunk limit leaves headroom for special tokens
        # within the model's (typically 512-token) context window.
        max_tokens_per_chunk = 400
        chunks = self._split_text_into_chunks(text, max_tokens_per_chunk)
        if not chunks:
            # Previously an empty input fell through to max() over an empty
            # dict and raised an opaque ValueError; fail fast and clearly.
            raise ValueError("Cannot analyze emotion of empty text.")

        # The pipeline accepts a list of texts, which covers both the
        # single- and multi-chunk case (no branch needed). truncation=True
        # is a safety net for any chunk that still exceeds the model limit,
        # e.g. one extremely long sentence.
        results: List[List[Dict[str, float]]] = self.pipeline(chunks, truncation=True)

        # Average scores per label across chunks, then pick the top label.
        aggregated_scores = self._aggregate_emotion_scores(results)
        dominant_emotion = max(aggregated_scores, key=aggregated_scores.get)
        confidence = aggregated_scores[dominant_emotion]
        return EmotionResult(
            dominant_emotion=dominant_emotion,
            confidence=confidence,
            all_scores=aggregated_scores
        )

    def _split_text_into_chunks(self, text: str, max_tokens_per_chunk: int) -> List[str]:
        """
        Split text into chunks that fit within token limits.

        Sentences (split on '. ') are packed greedily into chunks so that
        chunk boundaries fall between sentences rather than mid-sentence.
        A single sentence longer than the limit becomes its own oversized
        chunk; the pipeline call truncates it.

        Args:
            text: The text to split.
            max_tokens_per_chunk: Maximum tokens per chunk.

        Returns:
            List of text chunks (empty if ``text`` has no sentences).
        """
        sentences = text.split('. ')
        chunks: List[str] = []
        current_chunk = ""
        current_tokens = 0
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            # Restore the period that str.split('. ') removed.
            if not sentence.endswith('.'):
                sentence += '.'
            sentence_tokens = len(self.tokenizer.tokenize(sentence))
            if current_tokens + sentence_tokens > max_tokens_per_chunk and current_chunk:
                # Adding this sentence would overflow the chunk: flush the
                # current chunk and start a fresh one with this sentence.
                chunks.append(current_chunk.strip())
                current_chunk = sentence
                current_tokens = sentence_tokens
            else:
                current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
                current_tokens += sentence_tokens
        # Flush the final partially-filled chunk.
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    def _aggregate_emotion_scores(self, results: List[List[Dict[str, float]]]) -> Dict[str, float]:
        """
        Aggregate emotion scores from multiple chunks.

        Each label's score is the arithmetic mean of that label's score over
        the chunks in which it appears. (The previous version computed an
        unused per-chunk weight; the effective behavior was always this mean.)

        Args:
            results: Per-chunk classification results, one list of
                ``{"label": ..., "score": ...}`` entries per chunk.

        Returns:
            Dictionary mapping emotion label to its mean score; empty if
            ``results`` is empty.
        """
        if not results:
            return {}
        emotion_totals: Dict[str, float] = {}
        emotion_counts: Dict[str, int] = {}
        for chunk_results in results:
            for entry in chunk_results:
                label = entry["label"]
                emotion_totals[label] = emotion_totals.get(label, 0.0) + entry["score"]
                emotion_counts[label] = emotion_counts.get(label, 0) + 1
        return {label: emotion_totals[label] / emotion_counts[label]
                for label in emotion_totals}