EXAM_RAG_API / ingestion /loaders /normalization.py
MinaNasser's picture
1st
1bc3f18
raw
history blame contribute delete
997 Bytes
import re
# PDF extraction artifacts of the form "(cid:1234)".
_CID_PATTERN = re.compile(r"\(cid:\d+\)")

# One or more emoji/pictographic symbols, each optionally followed by the
# variation selectors (U+FE0E / U+FE0F) and zero-width joiners (U+200D) that
# bind emoji sequences together. Consuming those trailing code points with
# the symbol prevents invisible leftovers such as "\ufe0f" from surviving.
# Selectors/joiners that do NOT follow a removed symbol are left untouched.
#
# NOTE(review): the BMP range U+2500-U+2BFF is broad. Besides Miscellaneous
# Symbols and Dingbats it also covers box drawing, geometric shapes, Braille
# and several arrow / mathematical-operator blocks. It is kept as-is to
# preserve existing output; confirm that stripping math symbols is intended.
_EMOJI_PATTERN = re.compile(
    "(?:"
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002500-\U00002BFF"  # union of the original overlapping BMP ranges
    "]"
    "[\uFE0E\uFE0F\u200D]*"  # selectors / joiners attached to the symbol
    ")+"
)

# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_PATTERN = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Clean and normalize extracted text from any format (PDF/DOCX/MD/TXT).

    The following steps are applied in order:

    1. Remove PDF CID artifacts such as ``(cid:1234)``.
    2. Remove emoji and pictographic symbols, together with any variation
       selectors or zero-width joiners attached to them.
    3. Collapse every run of whitespace (including newlines and tabs) into
       a single space and strip leading/trailing whitespace.

    Args:
        text: Raw text produced by a document loader. Falsy values
            (``""`` or ``None``) are accepted.

    Returns:
        The cleaned, single-line text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    text = _CID_PATTERN.sub("", text)

    # Symbols are deleted rather than replaced by a space, so text on either
    # side of an emoji is joined exactly as before.
    text = _EMOJI_PATTERN.sub("", text)

    # "\s+" already matches newlines and tabs, so no separate replacement
    # pass is needed before collapsing.
    text = _WHITESPACE_PATTERN.sub(" ", text)
    return text.strip()