# Source: commit 2cb327c by Devang1290
# feat: deploy News Whisper on-demand search API (FastAPI + Docker)
"""
Text Cleaning Utilities
========================
Functions for cleaning and processing article text before summarization.
Cleaning pipeline:
1. Remove URLs (http:// and www.)
2. Strip Unicode emojis
3. Normalize special characters to spaces
4. Collapse whitespace
5. Remove duplicate sentences
Usage:
from backend.summarization.utils import clean_text, should_summarize
cleaned = clean_text(raw_article_content)
if should_summarize(cleaned):
summary = summarizer.summarize(cleaned)
"""
import re
# Pre-compiled patterns for performance (compiled once at import time).
# Matches http(s):// URLs and bare "www." links up to the next whitespace.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")

# Unicode emoji character class. The original ranges covered only the
# Supplementary Multilingual Plane blocks; common BMP emoji (hearts,
# check marks, weather symbols) and flag pairs slipped through, so the
# Miscellaneous Symbols, Dingbats, and Regional Indicator ranges are
# included as well.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002600-\U000026FF"  # miscellaneous symbols (sun, umbrella, ...)
    "\U00002700-\U000027BF"  # dingbats (scissors, hearts, check marks)
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag emoji pairs)
    "]+",
    flags=re.UNICODE,
)
def clean_text(raw_text: str) -> str:
    """Clean and normalize article text by removing URLs, emojis, and duplicates.

    Pipeline: strip URLs, strip emojis, map any remaining special
    characters to spaces, collapse whitespace runs, then drop duplicate
    sentences.

    Args:
        raw_text: The raw article body text.

    Returns:
        Cleaned string with normalized whitespace and no duplicate sentences.
    """
    if not raw_text:
        return ""
    without_urls = URL_PATTERN.sub("", raw_text)
    without_emoji = EMOJI_PATTERN.sub("", without_urls)
    # Keep word characters and basic punctuation; everything else
    # becomes a space so adjacent words don't fuse together.
    normalized = re.sub(r"[^\w\s.,!?']", " ", without_emoji, flags=re.UNICODE)
    collapsed = re.sub(r"\s+", " ", normalized).strip()
    return remove_duplicate_sentences(collapsed)
def remove_duplicate_sentences(text: str) -> str:
    """Remove duplicate sentences while preserving first-seen order.

    Splits on '. ' (period-space), deduplicates by lowercased content,
    and reassembles with '. '. Sentences shorter than 5 characters are
    dropped. The dedup key ignores a trailing period: splitting on
    '. ' leaves the final sentence's period attached, so without this
    normalization "X. X." would never deduplicate ("x" != "x.").

    Args:
        text: Whitespace-normalized article text.

    Returns:
        The text with short and duplicate sentences removed.
    """
    if not text:
        return ""
    sentences = text.split('. ')
    seen = set()
    cleaned = []
    for s in sentences:
        s_clean = s.strip().lower()
        if len(s_clean) < 5:
            continue
        # Strip the trailing '.' (kept on the last split segment) so the
        # final sentence still matches earlier copies of itself.
        key = s_clean.rstrip('.')
        if key not in seen:
            seen.add(key)
            cleaned.append(s.strip())
    return '. '.join(cleaned)
def should_summarize(text: str) -> bool:
    """Decide whether *text* is long enough to benefit from AI summarization.

    Texts of 400+ characters (roughly 60-80 words) get summarized;
    anything shorter is kept as-is, so no AI processing is spent on it.
    """
    minimum_length = 400  # chars; ~60-80 words
    return len(text) >= minimum_length