# Spaces: Runtime error (Hugging Face Spaces status banner captured with the source)
from transformers import pipeline, AutoTokenizer
from schemas.text_schemas import EmotionResult, EmotionDetector
from typing import List, Dict
class TransformersEmotionDetector(EmotionDetector):
    """Emotion detector using a lightweight DistilRoBERTa model from Hugging Face."""

    def __init__(self, model_name: str = "j-hartmann/emotion-english-distilroberta-base"):
        """
        Initialize the emotion detection model.

        Args:
            model_name: Pretrained Hugging Face model for emotion detection.
        """
        self.model_name = model_name
        # The tokenizer mirrors the model so that the token counts used for
        # chunking match what the classification pipeline will actually see.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # NOTE(review): `return_all_scores` is deprecated in newer transformers
        # releases in favor of `top_k=None`; migrating also changes the output
        # shape for single-string inputs, so confirm callers before switching.
        self.pipeline = pipeline(
            "text-classification",
            model=model_name,
            return_all_scores=True,
        )
| def analyze(self, text: str) -> EmotionResult: | |
| """ | |
| Analyze emotion in the given text using chunked processing for long texts. | |
| Args: | |
| text: The text to analyze. | |
| Returns: | |
| EmotionResult: Structured result containing dominant emotion and confidence. | |
| """ | |
| # Split text into chunks if it's too long | |
| max_tokens_per_chunk = 400 # Conservative limit to leave room for special tokens | |
| chunks = self._split_text_into_chunks(text, max_tokens_per_chunk) | |
| if len(chunks) == 1: | |
| # Single chunk - use original logic | |
| results: List[List[Dict[str, float]]] = self.pipeline(chunks[0]) | |
| else: | |
| # Multiple chunks - analyze each and aggregate results | |
| results: List[List[Dict[str, float]]] = self.pipeline(chunks) | |
| # Aggregate emotion scores across all chunks | |
| aggregated_scores = self._aggregate_emotion_scores(results) | |
| # Get the most likely emotion | |
| dominant_emotion = max(aggregated_scores, key=aggregated_scores.get) | |
| confidence = aggregated_scores[dominant_emotion] | |
| return EmotionResult( | |
| dominant_emotion=dominant_emotion, | |
| confidence=confidence, | |
| all_scores=aggregated_scores | |
| ) | |
| def _split_text_into_chunks(self, text: str, max_tokens_per_chunk: int) -> List[str]: | |
| """ | |
| Split text into chunks that fit within token limits. | |
| Args: | |
| text: The text to split | |
| max_tokens_per_chunk: Maximum tokens per chunk | |
| Returns: | |
| List of text chunks | |
| """ | |
| # Split text into sentences first for better chunk boundaries | |
| sentences = text.split('. ') | |
| chunks = [] | |
| current_chunk = "" | |
| current_tokens = 0 | |
| for sentence in sentences: | |
| sentence = sentence.strip() | |
| if not sentence: | |
| continue | |
| # Add period back if it was removed | |
| if not sentence.endswith('.'): | |
| sentence += '.' | |
| sentence_tokens = len(self.tokenizer.tokenize(sentence)) | |
| # If adding this sentence would exceed limit, start new chunk | |
| if current_tokens + sentence_tokens > max_tokens_per_chunk and current_chunk: | |
| chunks.append(current_chunk.strip()) | |
| current_chunk = sentence | |
| current_tokens = sentence_tokens | |
| else: | |
| if current_chunk: | |
| current_chunk += " " + sentence | |
| else: | |
| current_chunk = sentence | |
| current_tokens += sentence_tokens | |
| # Add the last chunk if it exists | |
| if current_chunk: | |
| chunks.append(current_chunk.strip()) | |
| return chunks | |
| def _aggregate_emotion_scores(self, results: List[List[Dict[str, float]]]) -> Dict[str, float]: | |
| """ | |
| Aggregate emotion scores from multiple chunks. | |
| Args: | |
| results: List of emotion classification results from each chunk | |
| Returns: | |
| Dictionary of aggregated emotion scores | |
| """ | |
| if not results: | |
| return {} | |
| # Collect all emotion scores with weights | |
| emotion_totals = {} | |
| emotion_weights = {} | |
| for chunk_results in results: | |
| # Get confidence scores for this chunk | |
| chunk_scores = {entry["label"]: entry["score"] for entry in chunk_results} | |
| # Weight by confidence (more confident predictions get higher weight) | |
| total_confidence = sum(chunk_scores.values()) | |
| for emotion, score in chunk_scores.items(): | |
| weight = score / total_confidence if total_confidence > 0 else 0 | |
| emotion_totals[emotion] = emotion_totals.get(emotion, 0) + score | |
| emotion_weights[emotion] = emotion_weights.get(emotion, 0) + 1 # Simple count for now | |
| # Average the scores across chunks | |
| aggregated_scores = {} | |
| for emotion in emotion_totals: | |
| # Use weighted average based on number of chunks that detected this emotion | |
| aggregated_scores[emotion] = emotion_totals[emotion] / emotion_weights[emotion] | |
| return aggregated_scores | |