Spaces:

MinaNasser
/

EXAM_RAG_API

Paused

1st

1bc3f18 about 2 months ago

997 Bytes

	import re


	# Code points treated as emoji/pictographs and removed from the text.
	# NOTE(review): the wide BMP range U+2500-U+2BFF also covers box drawing,
	# geometric shapes, arrows and mathematical-operator blocks (for example
	# U+2A00-U+2AFF). It is kept unchanged for backward compatibility; narrow it
	# if such symbols must survive normalization.
	_EMOJI_CLASS = (
	    "["
	    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
	    "\U0001F300-\U0001F5FF"  # symbols & pictographs
	    "\U0001F600-\U0001F64F"  # emoticons
	    "\U0001F680-\U0001F6FF"  # transport & map
	    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
	    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
	    "\U00002500-\U00002BFF"  # box drawing .. misc symbols, dingbats, arrows
	    "]"
	)

	# An emoji may be followed by a variation selector (U+FE0E / U+FE0F) or be
	# glued to the next emoji with a zero-width joiner (U+200D). Those invisible
	# code points are consumed together with the emoji so they do not linger in
	# the output. A joiner that is NOT attached to an emoji is left untouched
	# because some scripts use it in ordinary text.
	_EMOJI_RE = re.compile("(?:" + _EMOJI_CLASS + "[\uFE0E\uFE0F\u200D]*)+")

	# PDF extraction artifacts such as "(cid:1234)".
	_CID_RE = re.compile(r"\(cid:\d+\)")

	# Any run of whitespace, including newlines and tabs.
	_WHITESPACE_RE = re.compile(r"\s+")


	def normalize_text(text: str) -> str:
	    """Clean and normalize extracted text from any format (PDF/DOCX/MD/TXT).

	    Steps, in order:

	    1. Remove PDF CID artifacts of the form ``(cid:<digits>)``.
	    2. Remove emoji/pictograph code points together with any variation
	       selectors or zero-width joiners directly attached to them.
	    3. Collapse every run of whitespace (spaces, tabs, newlines) into a
	       single space and strip leading/trailing whitespace.

	    Args:
	        text: Raw extracted text. Any falsy value (``""``, ``None``) is
	            accepted and yields an empty string.

	    Returns:
	        The cleaned, single-line text.
	    """
	    if not text:
	        return ""

	    text = _CID_RE.sub("", text)
	    text = _EMOJI_RE.sub("", text)

	    # Newlines and tabs are whitespace, so this single pass both flattens
	    # the text onto one line and collapses repeated spaces.
	    return _WHITESPACE_RE.sub(" ", text).strip()