# sllm/tokenizer/normalizer.py
# geeteshcodes's picture
# Initial commit
# 7f974df verified
import re
import html
import unicodedata
# Matches HTML comments and tag-shaped spans only: "<", an optional "/", "!"
# or "?", then a letter. A bare "<" or ">" in running text (e.g. "a < b",
# "x<3 and y>2") is therefore left alone. [^<>] also matches newlines, so
# tags that span several lines are caught.
_TAG_RE = re.compile(r'<!--.*?-->|<[/!?]?[A-Za-z][^<>]*>', re.DOTALL)

# Code points that are deleted outright.
_DELETED_CODEPOINTS = (
    # C0 control characters except \t, \n and \r, plus DEL (\x7f)
    [*range(0x00, 0x09), 0x0B, 0x0C, *range(0x0E, 0x20), 0x7F]
    # Zero-width space/non-joiner/joiner, BOM, soft hyphen
    + [0x200B, 0x200C, 0x200D, 0xFEFF, 0x00AD]
    # Unicode replacement character
    + [0xFFFD]
)

# Single-pass translation table: delete the code points above and turn the
# Unicode line/paragraph separators into "\n" (structural, so not removed).
_CHAR_TABLE = {**dict.fromkeys(_DELETED_CODEPOINTS), 0x2028: '\n', 0x2029: '\n'}

_SPACE_RUN_RE = re.compile(r' +')
_TRAILING_WS_RE = re.compile(r'[ \t]+\n')
_NEWLINE_RUN_RE = re.compile(r'\n{3,}')


def normalization(text: str) -> str:
    """Return a cleaned-up copy of *text*.

    Steps, in order:

    1. Replace HTML comments and tag-shaped spans with a single space.
       Only spans that look like markup are removed; comparison operators
       such as ``a < b and c > d`` are preserved.
    2. Decode HTML entities. This runs after tag stripping, so escaped
       markup like ``&lt;b&gt;`` survives as the literal text ``<b>``.
    3. Apply Unicode NFC normalization.
    4. Delete control characters (keeping tab, newline, carriage return),
       zero-width characters, the soft hyphen and U+FFFD; map U+2028 and
       U+2029 to a newline.
    5. Convert ``\\r\\n`` and lone ``\\r`` to ``\\n``.
    6. Collapse runs of spaces to one space (tabs are not collapsed),
       drop spaces/tabs that directly precede a newline, and collapse
       three or more consecutive newlines to two.
    7. Strip leading and trailing whitespace from the whole text.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Strip markup first, then decode entities, so that decoded "<"/">"
    # characters are never mistaken for tags.
    text = _TAG_RE.sub(' ', text)
    text = html.unescape(text)

    text = unicodedata.normalize('NFC', text)

    # Deletions and separator mapping in one pass.
    text = text.translate(_CHAR_TABLE)

    # Normalize line endings; "\r\n" must be handled before lone "\r".
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Collapse spaces only, so tabs used for indentation are kept.
    text = _SPACE_RUN_RE.sub(' ', text)
    # Trailing whitespace must go before newline runs are collapsed, so
    # whitespace-only lines count as blank lines.
    text = _TRAILING_WS_RE.sub('\n', text)
    text = _NEWLINE_RUN_RE.sub('\n\n', text)

    return text.strip()