import re

# PDF extractors emit "(cid:1234)" for glyphs they could not map to Unicode.
_CID_ARTIFACT_RE = re.compile(r"\(cid:\d+\)")

# Any run of whitespace (spaces, tabs, newlines, other Unicode whitespace).
_WHITESPACE_RE = re.compile(r"\s+")

# Code points treated as emoji / pictographs / decorative symbols.
_EMOJI_CLASS = (
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    # Box Drawing through Miscellaneous Symbols and Arrows.  This single range
    # replaces the original overlapping U+2500-2BEF, U+2600-26FF, U+2700-27BF
    # and U+2B00-2BFF entries; the matched set is identical.
    "\U00002500-\U00002BFF"
    "]"
)

# A removed symbol may be followed by a variation selector (U+FE0E / U+FE0F)
# or joined to the next symbol by a zero-width joiner (U+200D).  Those code
# points are consumed together with the symbol so that no invisible residue
# is left behind; they are NOT removed anywhere else in the text.
_EMOJI_RE = re.compile("(?:" + _EMOJI_CLASS + "[\uFE0E\uFE0F\u200D]*)+")


def normalize_text(text: str) -> str:
    """Clean and normalize extracted text from any format (PDF/DOCX/MD/TXT).

    Processing steps, in order:

    1. Remove PDF CID artifacts such as ``(cid:1234)``.
    2. Remove emoji, pictographs and decorative symbols, together with any
       variation selectors / zero-width joiners attached to them.
    3. Collapse every run of whitespace (including newlines and tabs) into a
       single space and strip leading/trailing whitespace.

    Note:
        The symbol range U+2500-U+2BFF is deliberately kept as wide as in the
        original implementation.  Besides dingbats and box-drawing characters
        it also covers Braille patterns, supplemental arrows and supplemental
        mathematical operators, so those characters are removed as well.

    Args:
        text: Raw extracted text.  Any falsy value (``""``, ``None``) yields
            an empty string.

    Returns:
        The cleaned, single-line text.
    """
    if not text:
        return ""

    text = _CID_ARTIFACT_RE.sub("", text)
    text = _EMOJI_RE.sub("", text)
    # Newlines and tabs are whitespace, so one collapse pass handles them;
    # no separate replace step is required.
    return _WHITESPACE_RE.sub(" ", text).strip()