""" Text Utilities Functions for cleaning and processing text """ import re # Patterns for cleaning URL_PATTERN = re.compile(r"https?://\S+|www\.\S+") EMOJI_PATTERN = re.compile( "[" "\U0001F600-\U0001F64F" "\U0001F300-\U0001F5FF" "\U0001F680-\U0001F6FF" "\U0001F700-\U0001F77F" "\U0001F780-\U0001F7FF" "\U0001F800-\U0001F8FF" "\U0001F900-\U0001F9FF" "\U0001FA00-\U0001FAFF" "]+", flags=re.UNICODE ) def clean_text(raw_text: str) -> str: # Clean and normalize text by removing URLs, emojis, and duplicates if not raw_text: return "" text = raw_text text = URL_PATTERN.sub("", text) text = EMOJI_PATTERN.sub("", text) text = re.sub(r"[^\w\s.,!?']", " ", text, flags=re.UNICODE) text = re.sub(r"\s+", " ", text).strip() text = remove_duplicate_sentences(text) return text def remove_duplicate_sentences(text: str) -> str: # Remove duplicate sentences while preserving order if not text: return "" sentences = text.split('. ') seen = set() cleaned = [] for s in sentences: s_clean = s.strip().lower() if len(s_clean) < 5: continue if s_clean not in seen: seen.add(s_clean) cleaned.append(s.strip()) return '. '.join(cleaned) def should_summarize(text: str) -> bool: # Check if text needs summarization based on length return len(text) >= 400 # ~60-80 words