"""
Text Utilities
Functions for cleaning and processing text
"""
import re
# Patterns for cleaning
# Matches http(s):// URLs, and bare "www." links, up to the next whitespace.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
# Matches one or more consecutive codepoints from the emoji-heavy Unicode
# blocks in U+1F300..U+1FAFF (see per-range notes below). NOTE(review):
# earlier symbol blocks (e.g. dingbats U+2700..U+27BF) are not covered —
# confirm that is intentional.
EMOJI_PATTERN = re.compile(
"["
"\U0001F600-\U0001F64F"  # Emoticons
"\U0001F300-\U0001F5FF"  # Misc Symbols and Pictographs
"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
"\U0001F700-\U0001F77F"  # Alchemical Symbols
"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
"\U0001FA00-\U0001FAFF"  # Chess Symbols + Symbols and Pictographs Extended-A
"]+",
flags=re.UNICODE  # redundant for str patterns in Python 3; kept as-is
)
def clean_text(raw_text: str) -> str:
    """Clean and normalize text.

    Strips URLs and emojis, replaces any character that is not a word
    character, whitespace, or basic punctuation (.,!?') with a space,
    collapses whitespace runs, and finally removes duplicate sentences.

    Args:
        raw_text: The text to clean; may be empty or None-ish (falsy).

    Returns:
        The cleaned text, or "" for falsy input.
    """
    if not raw_text:
        return ""
    # Strip URLs first, then emoji runs — same order as the original pipeline.
    stripped = EMOJI_PATTERN.sub("", URL_PATTERN.sub("", raw_text))
    # Replace disallowed characters with spaces, then collapse whitespace.
    stripped = re.sub(r"[^\w\s.,!?']", " ", stripped, flags=re.UNICODE)
    normalized = re.sub(r"\s+", " ", stripped).strip()
    return remove_duplicate_sentences(normalized)
def remove_duplicate_sentences(text: str) -> str:
    """Remove duplicate sentences while preserving order.

    Sentences are delimited by ". ". Comparison is case-insensitive and
    ignores trailing periods, so the final sentence of the text (which
    keeps its "." after the split) still matches an earlier copy.
    Fragments shorter than 5 characters are dropped as noise.

    Args:
        text: Input text, sentences separated by ". ".

    Returns:
        Deduplicated sentences rejoined with ". ", or "" for falsy input.
    """
    if not text:
        return ""
    sentences = text.split('. ')
    seen = set()
    cleaned = []
    for sentence in sentences:
        sentence = sentence.strip()
        # Length filter uses the raw stripped sentence (original behavior).
        if len(sentence) < 5:
            continue
        # BUGFIX: also strip trailing periods from the comparison key —
        # the last sentence retains its "." after split('. '), so
        # "Hello world." must dedupe against an earlier "Hello world".
        key = sentence.lower().rstrip('.')
        if key not in seen:
            seen.add(key)
            cleaned.append(sentence)
    return '. '.join(cleaned)
def should_summarize(text: str) -> bool:
    """Return True if *text* is long enough to be worth summarizing.

    The cutoff is 400 characters, roughly 60-80 words.
    """
    threshold = 400
    return len(text) >= threshold