import re
import html
import unicodedata

# Anything between '<' and the next '>'. The negated class also matches
# newlines, so tags that span several lines are removed as well.
_TAG_RE = re.compile(r'<[^>]+>')

# Characters deleted outright: C0 controls except \t, \n and \r, DEL,
# zero-width space/non-joiner/joiner, BOM, soft hyphen, and U+FFFD.
_REMOVED_RE = re.compile(
    r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f'
    r'\u200b\u200c\u200d\ufeff\u00ad'
    r'\ufffd]'
)

# Unicode line/paragraph separators, treated as structural line breaks.
_UNICODE_NEWLINE_RE = re.compile(r'[\u2028\u2029]')

_SPACE_RUN_RE = re.compile(r' +')
_TRAILING_BLANKS_RE = re.compile(r'[ \t]+\n')
_NEWLINE_RUN_RE = re.compile(r'\n{3,}')


def normalization(text: str) -> str:
    """Return a cleaned, NFC-normalized plain-text version of *text*.

    Steps, in order:

    1. Replace every ``<...>`` span with a single space. This is a plain
       pattern match, not an HTML parser: a literal ``>`` inside an
       attribute value ends the match early, and the text content of
       elements such as ``<script>`` is kept.
    2. Decode HTML entities. This runs after tag stripping, so escaped
       markup such as ``&lt;b&gt;`` survives as the literal text ``<b>``.
    3. Delete control characters (except tab, LF, CR), DEL, zero-width
       characters, the BOM, soft hyphens and U+FFFD.
    4. Turn U+2028/U+2029 and ``\\r\\n``/``\\r`` into ``\\n``.
    5. Apply NFC normalization.
    6. Collapse runs of spaces to one space (tabs are left alone), drop
       spaces/tabs at the end of each line, and limit consecutive
       newlines to two.
    7. Strip leading and trailing whitespace from the whole string.

    Args:
        text: The raw input string, possibly containing HTML.

    Returns:
        The normalized string.
    """
    text = _TAG_RE.sub(' ', text)
    text = html.unescape(text)

    text = _REMOVED_RE.sub('', text)
    text = _UNICODE_NEWLINE_RE.sub('\n', text)

    # Order matters: handle '\r\n' first so it yields one newline, not two.
    text = text.replace('\r\n', '\n')
    text = text.replace('\r', '\n')

    # Normalize only after the invisible characters are gone. A deleted
    # character sitting between a base letter and a combining mark would
    # otherwise block composition and leave the result non-NFC.
    text = unicodedata.normalize('NFC', text)

    text = _SPACE_RUN_RE.sub(' ', text)
    text = _TRAILING_BLANKS_RE.sub('\n', text)
    text = _NEWLINE_RUN_RE.sub('\n\n', text)
    return text.strip()