Spaces:
Paused
Paused
File size: 997 Bytes
1bc3f18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | import re
# Code-point ranges treated as emoji/pictographs and stripped from text.
# The last range is deliberately broad: besides dingbats and miscellaneous
# symbols it also covers box drawing, geometric shapes, arrows, Braille
# patterns and several mathematical-symbol blocks.
_EMOJI_CHARS = (
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (incl. skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002500-\U00002BFF"  # box drawing .. misc symbols & arrows
)

# One or more emoji, each optionally followed by VARIATION SELECTOR-16
# (U+FE0F) and/or ZERO WIDTH JOINER (U+200D).  Those two are invisible and are
# not matched by ``\s``, so they must be consumed together with the emoji they
# belong to; otherwise sequences such as U+2764 U+FE0F leave residue behind.
# They are only removed when attached to an emoji, so a ZWJ used legitimately
# inside ordinary text is preserved.
_EMOJI_RE = re.compile("(?:[" + _EMOJI_CHARS + "][\uFE0F\u200D]*)+")

# PDF extraction artifacts of the form "(cid:1234)".
_CID_RE = re.compile(r"\(cid:\d+\)")

# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Clean and normalize extracted text from any format (PDF/DOCX/MD/TXT).

    Processing steps, in order:

    1. Delete PDF CID artifacts such as ``(cid:1234)``.
    2. Delete emoji/pictograph characters together with any variation
       selector or zero-width joiner directly attached to them.
    3. Collapse every run of whitespace (including newlines and tabs) into a
       single space and strip leading/trailing whitespace.

    Args:
        text: The raw extracted text. Any falsy value (``""`` or ``None``)
            is accepted.

    Returns:
        The cleaned, single-line text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    text = _CID_RE.sub("", text)
    text = _EMOJI_RE.sub("", text)
    # Newlines and tabs are whitespace, so this one substitution both turns
    # them into spaces and collapses repeated spaces.
    text = _WHITESPACE_RE.sub(" ", text)
    return text.strip()
|