from typing import Dict, List

from transformers import AutoTokenizer, pipeline

from schemas.text_schemas import EmotionDetector, EmotionResult


class TransformersEmotionDetector(EmotionDetector):
    """Emotion detector using a lightweight DistilRoBERTa model from Hugging Face."""

    def __init__(self, model_name: str = "j-hartmann/emotion-english-distilroberta-base"):
        """
        Initialize the emotion detection model.

        Args:
            model_name: Pretrained Hugging Face model for emotion detection.
        """
        self.model_name = model_name
        # top_k=None returns scores for every label; it is the documented
        # replacement for the deprecated return_all_scores=True flag.
        self.pipeline = pipeline(
            "text-classification",
            model=model_name,
            top_k=None,
        )
        # Tokenizer is used only to count tokens when chunking long inputs.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze(self, text: str) -> EmotionResult:
        """
        Analyze emotion in the given text using chunked processing for long texts.

        Args:
            text: The text to analyze.

        Returns:
            EmotionResult: Structured result containing dominant emotion and confidence.

        Raises:
            ValueError: If the text is empty or contains only whitespace.
        """
        # Conservative limit to leave room for special tokens ([CLS]/[SEP]).
        max_tokens_per_chunk = 400
        chunks = self._split_text_into_chunks(text, max_tokens_per_chunk)

        # Guard: an empty input would otherwise surface downstream as an
        # opaque ValueError from max() over an empty score dict.
        if not chunks:
            raise ValueError("Cannot analyze empty or whitespace-only text.")

        # The pipeline accepts a list of texts and returns one score list per
        # chunk, so single- and multi-chunk inputs share the same code path.
        # truncation=True is a backstop for a lone sentence that exceeds the
        # model's maximum sequence length despite the chunking above.
        results: List[List[Dict[str, float]]] = self.pipeline(chunks, truncation=True)

        # Aggregate emotion scores across all chunks.
        aggregated_scores = self._aggregate_emotion_scores(results)

        # Pick the emotion with the highest averaged score.
        dominant_emotion = max(aggregated_scores, key=aggregated_scores.get)
        confidence = aggregated_scores[dominant_emotion]

        return EmotionResult(
            dominant_emotion=dominant_emotion,
            confidence=confidence,
            all_scores=aggregated_scores,
        )

    def _split_text_into_chunks(self, text: str, max_tokens_per_chunk: int) -> List[str]:
        """
        Split text into chunks that fit within token limits.

        Splitting happens on sentence boundaries ('. ') so chunks stay
        coherent. NOTE(review): a single sentence longer than
        max_tokens_per_chunk is kept intact and may exceed the budget; the
        pipeline call's truncation is relied on as the backstop.

        Args:
            text: The text to split
            max_tokens_per_chunk: Maximum tokens per chunk

        Returns:
            List of text chunks
        """
        # Split text into sentences first for better chunk boundaries.
        sentences = text.split('. ')
        chunks: List[str] = []
        current_chunk = ""
        current_tokens = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            # Restore the period the split removed.
            if not sentence.endswith('.'):
                sentence += '.'

            sentence_tokens = len(self.tokenizer.tokenize(sentence))

            # If adding this sentence would exceed the limit, start a new chunk.
            if current_tokens + sentence_tokens > max_tokens_per_chunk and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
                current_tokens = sentence_tokens
            else:
                current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
                current_tokens += sentence_tokens

        # Flush the final partially-filled chunk.
        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def _aggregate_emotion_scores(self, results: List[List[Dict[str, float]]]) -> Dict[str, float]:
        """
        Aggregate emotion scores from multiple chunks.

        Each label's score is averaged over the chunks in which it appears.
        (An earlier revision computed a confidence "weight" that was never
        applied; the effective behavior was — and remains — a plain mean.)

        Args:
            results: List of emotion classification results from each chunk

        Returns:
            Dictionary of aggregated emotion scores
        """
        if not results:
            return {}

        score_totals: Dict[str, float] = {}
        label_counts: Dict[str, int] = {}

        for chunk_results in results:
            for entry in chunk_results:
                label = entry["label"]
                score_totals[label] = score_totals.get(label, 0.0) + entry["score"]
                label_counts[label] = label_counts.get(label, 0) + 1

        # Mean score per label across the chunks that reported it.
        return {label: total / label_counts[label] for label, total in score_totals.items()}