""" Text Cleaning Utilities ======================== Functions for cleaning and processing article text before summarization. Cleaning pipeline: 1. Remove URLs (http:// and www.) 2. Strip Unicode emojis 3. Normalize special characters to spaces 4. Collapse whitespace 5. Remove duplicate sentences Usage: from backend.summarization.utils import clean_text, should_summarize cleaned = clean_text(raw_article_content) if should_summarize(cleaned): summary = summarizer.summarize(cleaned) """ import re # Pre-compiled patterns for performance URL_PATTERN = re.compile(r"https?://\S+|www\.\S+") EMOJI_PATTERN = re.compile( "[" "\U0001F600-\U0001F64F" "\U0001F300-\U0001F5FF" "\U0001F680-\U0001F6FF" "\U0001F700-\U0001F77F" "\U0001F780-\U0001F7FF" "\U0001F800-\U0001F8FF" "\U0001F900-\U0001F9FF" "\U0001FA00-\U0001FAFF" "]+", flags=re.UNICODE ) def clean_text(raw_text: str) -> str: """Clean and normalize article text by removing URLs, emojis, and duplicates. Args: raw_text: The raw article body text. Returns: Cleaned string with normalized whitespace and no duplicate sentences. """ if not raw_text: return "" text = raw_text text = URL_PATTERN.sub("", text) text = EMOJI_PATTERN.sub("", text) text = re.sub(r"[^\w\s.,!?']", " ", text, flags=re.UNICODE) text = re.sub(r"\s+", " ", text).strip() text = remove_duplicate_sentences(text) return text def remove_duplicate_sentences(text: str) -> str: """Remove duplicate sentences while preserving order. Splits on '. ' (period-space), deduplicates by lowered content, and reassembles. Sentences shorter than 5 characters are dropped. """ if not text: return "" sentences = text.split('. ') seen = set() cleaned = [] for s in sentences: s_clean = s.strip().lower() if len(s_clean) < 5: continue if s_clean not in seen: seen.add(s_clean) cleaned.append(s.strip()) return '. '.join(cleaned) def should_summarize(text: str) -> bool: """Check if text is long enough to benefit from AI summarization. Returns True for texts >= 400 characters (~60-80 words). Shorter texts are kept as-is (no AI processing needed). """ return len(text) >= 400