Spaces:
Paused
Paused
| import re | |
# Literal "(cid:<digits>)" markers, e.g. "(cid:1234)", left behind by PDF extraction.
_CID_ARTIFACT_RE = re.compile(r"\(cid:\d+\)")

# Runs of emoji / pictograph / symbol code points.  Each matched base character
# also consumes any ZERO WIDTH JOINER (U+200D) or VARIATION SELECTOR-16
# (U+FE0F) that directly follows it, so emoji sequences such as U+2764 U+FE0F
# or ZWJ-joined family emoji are removed whole instead of leaving invisible
# residue.  A joiner that does not follow one of these characters is untouched.
_EMOJI_RE = re.compile(
    "(?:["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (incl. skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BFF"  # box drawing through misc symbols and arrows
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "]"
    "[\u200D\uFE0F]*"
    ")+"
)

# Any run of whitespace (spaces, tabs, newlines, other Unicode whitespace).
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Clean and normalize extracted text from any format (PDF/DOCX/MD/TXT).

    Processing steps, in order:

    1. Remove ``(cid:NNNN)`` artifacts.
    2. Remove emoji / pictograph characters, together with any zero-width
       joiner or variation selector attached to them.
    3. Collapse every run of whitespace (including newlines and tabs) into a
       single space and strip leading/trailing whitespace.

    Args:
        text: Raw extracted text.  Any falsy value (empty string, ``None``)
            is accepted.

    Returns:
        The cleaned single-line string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""

    cleaned = _CID_ARTIFACT_RE.sub("", text)
    cleaned = _EMOJI_RE.sub("", cleaned)
    # Newlines and tabs count as whitespace, so this one pass both flattens
    # the text onto a single line and squeezes repeated spaces.
    cleaned = _WHITESPACE_RE.sub(" ", cleaned)
    return cleaned.strip()