import html
import re
import unicodedata


def normalization(text: str) -> str:
    """Return *text* cleaned of markup, invisible characters and stray whitespace.

    The steps run in a fixed order, each operating on the previous result:

    1. Replace every ``<...>`` span with a single space.
    2. Decode HTML character references (``&amp;`` -> ``&``).
    3. Apply Unicode NFC normalization.
    4. Delete C0 control characters (except tab, LF and CR) and DEL.
    5. Turn U+2028 / U+2029 into ``\\n``.
    6. Delete zero-width characters, the BOM and the soft hyphen.
    7. Delete U+FFFD replacement characters.
    8. Normalize ``\\r\\n`` and lone ``\\r`` to ``\\n``.
    9. Collapse runs of spaces, drop trailing blanks before a newline,
       and limit consecutive newlines to two.
    10. Strip leading and trailing whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; an empty string if nothing remains.
    """
    # Tags are stripped *before* entities are decoded, so escaped markup
    # such as "&lt;b&gt;" survives as the literal text "<b>".
    # A space (not '') is substituted so words split only by a tag stay apart.
    # NOTE(review): the pattern matches any "<...>" span, so prose like
    # "a < b and c > d" loses the middle part too.
    text = re.sub(r'<[^>]+>', ' ', text)

    text = html.unescape(text)

    # Compose combining sequences so equal-looking strings compare equal.
    text = unicodedata.normalize('NFC', text)

    # \x09 (tab), \x0a (LF) and \x0d (CR) are deliberately left out of the
    # class; they are handled by the whitespace steps below.
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)

    # Unicode LINE SEPARATOR / PARAGRAPH SEPARATOR become ordinary newlines.
    text = re.sub(r'[\u2028\u2029]', '\n', text)

    # Zero-width space / non-joiner / joiner, BOM, and soft hyphen.
    text = re.sub(r'[\u200b\u200c\u200d\ufeff\u00ad]', '', text)

    # U+FFFD is the replacement character.
    text = text.replace('\ufffd', '')

    # "\r\n" must be handled first, otherwise each pair would yield two
    # newlines once the lone "\r" replacement runs.
    text = text.replace('\r\n', '\n')
    text = text.replace('\r', '\n')

    # Only U+0020 runs are collapsed here; tabs are left untouched unless
    # they trail a line (next step).
    text = re.sub(r' +', ' ', text)

    # Removing trailing blanks first turns whitespace-only lines into empty
    # ones, so the blank-line collapse below can see them.
    text = re.sub(r'[ \t]+\n', '\n', text)

    # Allow at most one empty line between paragraphs.
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()