# Source: commit 2cb327c by Devang1290
# feat: deploy News Whisper on-demand search API (FastAPI + Docker)
"""
Text Cleaning Utilities
========================
Functions for cleaning and processing article text before summarization.
Cleaning pipeline:
1. Remove URLs (http:// and www.)
2. Strip Unicode emojis
3. Normalize special characters to spaces
4. Collapse whitespace
5. Remove duplicate sentences
Usage:
from backend.summarization.utils import clean_text, should_summarize
cleaned = clean_text(raw_article_content)
if should_summarize(cleaned):
summary = summarizer.summarize(cleaned)
"""
import re
# Pre-compiled patterns for performance (compiled once at import time).
# Matches http(s):// URLs and bare "www." links up to the next whitespace.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")

# Unicode emoji character class. The original ranges covered only the
# Supplementary Multilingual Plane blocks; common BMP emoji (hearts,
# check marks, weather symbols) and flag pairs slipped through, so the
# Miscellaneous Symbols, Dingbats, and Regional Indicator ranges are
# included as well.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002600-\U000026FF"  # miscellaneous symbols (sun, umbrella, ...)
    "\U00002700-\U000027BF"  # dingbats (scissors, hearts, check marks)
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag emoji pairs)
    "]+",
    flags=re.UNICODE,
)
def clean_text(raw_text: str) -> str:
    """Clean and normalize article text by removing URLs, emojis, and duplicates.

    Pipeline: strip URLs, strip emojis, map any remaining special
    characters to spaces, collapse whitespace runs, then drop duplicate
    sentences.

    Args:
        raw_text: The raw article body text.

    Returns:
        Cleaned string with normalized whitespace and no duplicate sentences.
    """
    if not raw_text:
        return ""
    without_urls = URL_PATTERN.sub("", raw_text)
    without_emoji = EMOJI_PATTERN.sub("", without_urls)
    # Keep word characters and basic punctuation; everything else
    # becomes a space so adjacent words don't fuse together.
    normalized = re.sub(r"[^\w\s.,!?']", " ", without_emoji, flags=re.UNICODE)
    collapsed = re.sub(r"\s+", " ", normalized).strip()
    return remove_duplicate_sentences(collapsed)
def remove_duplicate_sentences(text: str) -> str:
    """Remove duplicate sentences while preserving first-seen order.

    Splits on '. ' (period-space), deduplicates by lowercased content,
    and reassembles with '. '. Sentences shorter than 5 characters are
    dropped. The dedup key ignores a trailing period: splitting on
    '. ' leaves the final sentence's period attached, so without this
    normalization "X. X." would never deduplicate ("x" != "x.").

    Args:
        text: Whitespace-normalized article text.

    Returns:
        The text with short and duplicate sentences removed.
    """
    if not text:
        return ""
    sentences = text.split('. ')
    seen = set()
    cleaned = []
    for s in sentences:
        s_clean = s.strip().lower()
        if len(s_clean) < 5:
            continue
        # Strip the trailing '.' (kept on the last split segment) so the
        # final sentence still matches earlier copies of itself.
        key = s_clean.rstrip('.')
        if key not in seen:
            seen.add(key)
            cleaned.append(s.strip())
    return '. '.join(cleaned)
def should_summarize(text: str) -> bool:
    """Decide whether *text* is long enough to benefit from AI summarization.

    Texts of 400+ characters (roughly 60-80 words) get summarized;
    anything shorter is kept as-is, so no AI processing is spent on it.
    """
    minimum_length = 400  # chars; ~60-80 words
    return len(text) >= minimum_length