File size: 5,240 Bytes
e0f2d0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from transformers import pipeline, AutoTokenizer
from schemas.text_schemas import EmotionResult, EmotionDetector
from typing import List, Dict

class TransformersEmotionDetector(EmotionDetector):
    """Emotion detector using a lightweight DistilRoBERTa model from Hugging Face.

    Long inputs are split into sentence-aligned chunks that fit the model's
    context window; per-chunk label scores are averaged into a single result.
    """

    def __init__(self, model_name: str = "j-hartmann/emotion-english-distilroberta-base"):
        """Initialize the emotion detection model.

        Args:
            model_name: Pretrained Hugging Face model for emotion detection.
        """
        self.model_name = model_name
        # top_k=None is the documented replacement for the deprecated
        # return_all_scores=True: it returns scores for every label.
        self.pipeline = pipeline(
            "text-classification",
            model=model_name,
            top_k=None,
        )
        # Tokenizer is used only to measure chunk sizes in model tokens.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze(self, text: str) -> EmotionResult:
        """Analyze emotion in the given text, chunking long inputs.

        Args:
            text: The text to analyze. Must contain at least one
                non-whitespace character.

        Returns:
            EmotionResult: Structured result containing the dominant
            emotion, its confidence, and the averaged per-emotion scores.

        Raises:
            ValueError: If *text* yields no analyzable content.
        """
        # Conservative budget leaves headroom for the special tokens the
        # tokenizer adds within the model's 512-token context window.
        max_tokens_per_chunk = 400
        chunks = self._split_text_into_chunks(text, max_tokens_per_chunk)

        # Empty/whitespace-only input would otherwise surface as a cryptic
        # "max() arg is an empty sequence" below — fail early and clearly.
        if not chunks:
            raise ValueError("Cannot analyze emotion of empty text.")

        # A list input always yields one result list per chunk, so a single
        # chunk needs no special-casing. truncation=True guards against a
        # lone sentence that still exceeds the model's limit.
        results: List[List[Dict[str, float]]] = self.pipeline(chunks, truncation=True)

        # Aggregate emotion scores across all chunks.
        aggregated_scores = self._aggregate_emotion_scores(results)

        # Pick the highest-scoring emotion as the dominant one.
        dominant_emotion = max(aggregated_scores, key=aggregated_scores.get)
        confidence = aggregated_scores[dominant_emotion]

        return EmotionResult(
            dominant_emotion=dominant_emotion,
            confidence=confidence,
            all_scores=aggregated_scores,
        )

    def _split_text_into_chunks(self, text: str, max_tokens_per_chunk: int) -> List[str]:
        """Split text into sentence-aligned chunks under a token budget.

        A single sentence longer than the budget still becomes its own
        chunk; the pipeline's call-time truncation handles that overflow.

        Args:
            text: The text to split.
            max_tokens_per_chunk: Maximum model tokens per chunk.

        Returns:
            List of non-empty text chunks; empty list for blank input.
        """
        # Split on sentence boundaries first so chunks read naturally.
        sentences = text.split('. ')
        chunks: List[str] = []
        current_chunk = ""
        current_tokens = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # Restore the period removed by the split.
            if not sentence.endswith('.'):
                sentence += '.'

            sentence_tokens = len(self.tokenizer.tokenize(sentence))

            # Flush the current chunk when the next sentence would overflow it.
            if current_tokens + sentence_tokens > max_tokens_per_chunk and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
                current_tokens = sentence_tokens
            else:
                current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
                current_tokens += sentence_tokens

        # Don't drop the trailing partial chunk.
        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def _aggregate_emotion_scores(self, results: List[List[Dict[str, float]]]) -> Dict[str, float]:
        """Average per-label scores across chunk results.

        Args:
            results: One list of ``{"label", "score"}`` entries per chunk.

        Returns:
            Mapping of emotion label to its mean score over the chunks
            that reported it; empty dict when *results* is empty.
        """
        emotion_totals: Dict[str, float] = {}
        emotion_counts: Dict[str, int] = {}

        for chunk_results in results:
            for entry in chunk_results:
                label = entry["label"]
                emotion_totals[label] = emotion_totals.get(label, 0.0) + entry["score"]
                emotion_counts[label] = emotion_counts.get(label, 0) + 1

        # Plain mean over the chunks that reported each label. Every chunk
        # reports all labels when the pipeline returns full distributions,
        # so this is equivalent to dividing by the number of chunks.
        return {
            label: emotion_totals[label] / emotion_counts[label]
            for label in emotion_totals
        }