Spaces:
Sleeping
Sleeping
| """ | |
| Text Utilities | |
| Functions for cleaning and processing text | |
| """ | |
| import re | |
# Pre-compiled cleanup patterns (compiled once at import time).

# Matches http(s) URLs and bare www. links up to the next whitespace.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")

# One-or-more emoji codepoints. Covers the supplementary emoji planes the
# original listed, plus three commonly-hit ranges it missed: regional
# indicators (flag pairs), the BMP misc-symbols/dingbats block, and
# variation selector-16 (otherwise left dangling once its base is removed).
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag emoji) -- added
    "\u2600-\u27BF"          # misc symbols & dingbats (e.g. \u2600 \u2764) -- added
    "\uFE0F"                 # variation selector-16 -- added
    "]+",
    flags=re.UNICODE,
)
def clean_text(raw_text: str) -> str:
    """Normalize *raw_text* for downstream processing.

    Strips URLs and emojis, replaces any character outside word chars /
    whitespace / basic punctuation (``., ! ? '``) with a space, collapses
    runs of whitespace, and finally removes duplicate sentences.

    Args:
        raw_text: Arbitrary input text; may be empty or None-ish falsy.

    Returns:
        The cleaned, single-spaced text ("" for falsy input).
    """
    if not raw_text:
        return ""
    # Pattern-based removals first: URLs, then emoji codepoints.
    stripped = EMOJI_PATTERN.sub("", URL_PATTERN.sub("", raw_text))
    # Replace leftover symbols with spaces, then collapse all whitespace.
    stripped = re.sub(r"[^\w\s.,!?']", " ", stripped, flags=re.UNICODE)
    collapsed = re.sub(r"\s+", " ", stripped).strip()
    return remove_duplicate_sentences(collapsed)
def remove_duplicate_sentences(text: str) -> str:
    """Remove duplicate sentences (case-insensitive), keeping first-seen order.

    Sentences are assumed to be separated by ``". "``. Fragments shorter
    than 5 characters are discarded as noise. The dedup key ignores case
    and trailing periods, so ``"Go home"`` and ``"Go home."`` match.

    Args:
        text: Input text, typically already whitespace-normalized.

    Returns:
        The deduplicated sentences re-joined with ``". "`` ("" for empty input).
    """
    if not text:
        return ""
    seen: set[str] = set()
    cleaned: list[str] = []
    for sentence in text.split('. '):
        stripped = sentence.strip()
        # Same length filter as before the fix: drop tiny fragments.
        if len(stripped) < 5:
            continue
        # Bug fix: the final sentence keeps its trailing '.' after the
        # split, which previously made it never match earlier duplicates.
        # Strip trailing periods from the comparison key only.
        key = stripped.lower().rstrip('.')
        if key not in seen:
            seen.add(key)
            cleaned.append(stripped)
    return '. '.join(cleaned)
def should_summarize(text: str) -> bool:
    """Decide whether *text* is long enough to warrant summarization.

    Uses a simple character-count heuristic: 400 characters is roughly
    60-80 words of typical English prose.

    Args:
        text: Candidate text.

    Returns:
        True when the text is 400 characters or longer.
    """
    threshold = 400
    return len(text) >= threshold