# Commit 2cb327c by Devang1290 — feat: deploy News Whisper on-demand search API (FastAPI + Docker)
"""
Text Utilities
Functions for cleaning and processing text
"""
import re
# Patterns for cleaning
# Matches http(s):// URLs and bare "www." links up to the next whitespace.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
# One or more characters drawn from the common emoji/pictograph Unicode
# ranges (emoticons, misc. symbols & pictographs, transport & map symbols,
# alchemical, geometric shapes extended, supplemental arrows-C,
# supplemental symbols & pictographs, symbols & pictographs extended-A).
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F700-\U0001F77F"
    "\U0001F780-\U0001F7FF"
    "\U0001F800-\U0001F8FF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FAFF"
    "]+",
    flags=re.UNICODE
)
def clean_text(raw_text: str) -> str:
    """Normalize *raw_text* for downstream processing.

    Strips URLs and emojis, replaces any character outside word chars,
    whitespace, and basic punctuation (.,!?') with a space, collapses
    runs of whitespace, and finally removes duplicate sentences.

    Returns "" for empty or falsy input.
    """
    if not raw_text:
        return ""
    # URL removal first, then emoji removal, mirroring the cleaning order.
    stripped = EMOJI_PATTERN.sub("", URL_PATTERN.sub("", raw_text))
    # Replace any remaining non-word/non-punctuation symbol with a space.
    stripped = re.sub(r"[^\w\s.,!?']", " ", stripped, flags=re.UNICODE)
    # Collapse all whitespace runs and trim the ends.
    collapsed = re.sub(r"\s+", " ", stripped).strip()
    return remove_duplicate_sentences(collapsed)
def remove_duplicate_sentences(text: str) -> str:
    """Drop repeated sentences, keeping the first occurrence of each.

    Sentences are naively delimited by '. '. Comparison is done on the
    stripped, lower-cased form, so duplicates differing only in case or
    surrounding whitespace are removed. Fragments whose normalized form
    is shorter than five characters are discarded as noise.
    """
    if not text:
        return ""
    seen_keys: set = set()
    kept: list = []
    for fragment in text.split('. '):
        trimmed = fragment.strip()
        key = trimmed.lower()
        # Skip too-short fragments and anything already emitted.
        if len(key) < 5 or key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(trimmed)
    return '. '.join(kept)
def should_summarize(text: str) -> bool:
    """Return True when *text* is long enough to warrant summarization.

    The threshold is 400 characters, roughly 60-80 words.
    """
    min_chars = 400
    return len(text) >= min_chars