geeteshcodes
/

sllm

Text Generation

Model card Files Files and versions

sllm / tokenizer /normalizer.py

geeteshcodes's picture

Initial commit

7f974df verified 4 days ago

history blame contribute delete

1.2 kB

	import re
	import html
	import unicodedata

	def normalization(text):
	    """Return a cleaned, canonical form of raw (possibly HTML-laden) text.

	    The steps run in a fixed order, because several of them depend on the
	    result of an earlier one:

	    1. Replace HTML/XML tags with a single space.
	    2. Decode HTML entities (named and numeric character references).
	    3. Delete C0 control characters and DEL; tab, line feed and carriage
	       return are kept.
	    4. Turn the Unicode line/paragraph separators (U+2028, U+2029) into
	       line feeds.
	    5. Delete zero-width characters, the BOM, the soft hyphen and the
	       replacement character U+FFFD.
	    6. Apply Unicode NFC normalization.
	    7. Convert CRLF and lone CR line endings to LF.
	    8. Collapse runs of spaces to one space (tabs are left alone), drop
	       trailing spaces/tabs before a line feed, and squeeze three or more
	       consecutive line feeds down to two.
	    9. Strip leading and trailing whitespace from the whole string.

	    Note that step 9 also removes any indentation on the very first line;
	    tabs are only preserved on later lines.

	    Tag stripping is a regex heuristic, not an HTML parser: text of the
	    form ``a<b and c>d`` still looks like a tag and is removed, and a
	    ``>`` inside an attribute value or comment ends the match early.

	    Args:
	        text: The string to normalize.

	    Returns:
	        The normalized string (possibly empty).
	    """
	    # A tag must start with a letter, "/" + letter, "!" (comment, doctype)
	    # or "?" (processing instruction) right after "<".  Requiring that
	    # keeps comparisons such as "1 < 2 and 3 > 2" intact.  [^>] matches
	    # newlines too, so tags spanning several lines are removed as well.
	    text = re.sub(r'<(?:/?[A-Za-z]|[!?])[^>]*>', ' ', text)

	    # Decode entities only after tag removal, so an escaped "&lt;b&gt;"
	    # survives as the literal text "<b>" instead of being stripped.
	    text = html.unescape(text)

	    # Control characters, including \x7f (DEL); \t, \n and \r are kept.
	    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)

	    # Unicode line/paragraph separators are structural: map to newline.
	    text = re.sub(r'[\u2028\u2029]', '\n', text)

	    # Zero-width space/non-joiner/joiner, BOM and soft hyphen.
	    text = re.sub(r'[\u200b\u200c\u200d\ufeff\u00ad]', '', text)

	    # Replacement character left behind by earlier decoding errors.
	    text = text.replace('\ufffd', '')

	    # NFC must run after the deletions above: removing an invisible
	    # character can put a base letter next to a combining mark, and that
	    # pair only composes if normalization happens afterwards.
	    text = unicodedata.normalize('NFC', text)

	    # Normalize line endings (CRLF first, so it yields one newline).
	    text = text.replace('\r\n', '\n').replace('\r', '\n')

	    # Collapse runs of spaces only; tabs are deliberately untouched.
	    text = re.sub(r' +', ' ', text)

	    # Trailing spaces/tabs at end of line.
	    text = re.sub(r'[ \t]+\n', '\n', text)

	    # Allow at most one blank line in a row.
	    text = re.sub(r'\n{3,}', '\n\n', text)

	    return text.strip()