# NOTE(review): the following three lines were Hugging Face Spaces page
# residue ("Spaces: Sleeping") captured by the extractor — not part of the
# module source.
| """ | |
| Text Cleaning Utilities | |
| ======================== | |
| Functions for cleaning and processing article text before summarization. | |
| Cleaning pipeline: | |
| 1. Remove URLs (http:// and www.) | |
| 2. Strip Unicode emojis | |
| 3. Normalize special characters to spaces | |
| 4. Collapse whitespace | |
| 5. Remove duplicate sentences | |
| Usage: | |
| from backend.summarization.utils import clean_text, should_summarize | |
| cleaned = clean_text(raw_article_content) | |
| if should_summarize(cleaned): | |
| summary = summarizer.summarize(cleaned) | |
| """ | |
import re

# Pre-compiled patterns for performance (compiled once at import time).

# Matches http(s):// URLs and bare "www." links up to the next whitespace.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")

# Matches emoji code points.  Extended beyond the original pictograph planes
# to cover common emojis the old pattern missed: misc symbols & dingbats
# (checkmarks, stars, scissors...), regional-indicator flag pairs, and
# variation selector-16 (the invisible char that forces emoji rendering).
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002600-\U000027BF"  # misc symbols & dingbats (e.g. \u2600, \u2702, \u2705)
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag emoji pairs)
    "\U0000FE0F"             # variation selector-16
    "]+",
    flags=re.UNICODE,
)
def clean_text(raw_text: str) -> str:
    """Clean and normalize article text.

    Strips URLs and emojis, replaces special characters with spaces,
    collapses runs of whitespace, and removes duplicate sentences.

    Args:
        raw_text: The raw article body text.

    Returns:
        Cleaned string with normalized whitespace and no duplicate
        sentences. Empty string when the input is empty/falsy.
    """
    if not raw_text:
        return ""

    # Each stage feeds the next; named intermediates make the pipeline explicit.
    without_urls = URL_PATTERN.sub("", raw_text)
    without_emoji = EMOJI_PATTERN.sub("", without_urls)
    # Keep word chars and basic sentence punctuation; everything else -> space.
    normalized = re.sub(r"[^\w\s.,!?']", " ", without_emoji, flags=re.UNICODE)
    collapsed = re.sub(r"\s+", " ", normalized).strip()
    return remove_duplicate_sentences(collapsed)
def remove_duplicate_sentences(text: str) -> str:
    """Remove duplicate sentences while preserving order.

    Splits on '. ' (period-space), deduplicates case-insensitively, and
    reassembles with '. '. Sentences shorter than 5 characters are dropped.

    Fix: the final sentence keeps its closing period after the split
    (e.g. "Hello world. Hello world." -> ["Hello world", "Hello world."]),
    so the old exact-match key never detected that duplicate. The dedup
    key now strips trailing periods; the <5-char length filter still uses
    the un-stripped text so borderline cases behave as before.

    Args:
        text: Whitespace-normalized article text.

    Returns:
        Text with duplicate sentences removed, joined by '. '.
    """
    if not text:
        return ""
    seen = set()
    cleaned = []
    for sentence in text.split('. '):
        stripped = sentence.strip()
        lowered = stripped.lower()
        # Length check on the raw lowered sentence (original semantics).
        if len(lowered) < 5:
            continue
        # Normalize the key so "Hello world." matches "Hello world".
        key = lowered.rstrip('.')
        if key not in seen:
            seen.add(key)
            cleaned.append(stripped)
    return '. '.join(cleaned)
def should_summarize(text: str) -> bool:
    """Decide whether text is long enough to benefit from AI summarization.

    Texts under the threshold (~60-80 words) are kept as-is, avoiding
    unnecessary AI processing.

    Args:
        text: The (cleaned) article text.

    Returns:
        True when the text is at least 400 characters long.
    """
    min_chars = 400
    return len(text) >= min_chars