File size: 1,199 Bytes
7f974df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import re
import html
import unicodedata

# Markup-shaped spans only: '<' must be followed directly by an ASCII letter
# (start tag), '/' (end tag), '!' (comment / doctype) or '?' (processing
# instruction).  A bare '<' used as a comparison ("x < 10 and y > 5") is left
# alone.  [^>] also matches newlines, so tags spanning several lines are caught.
_TAG_RE = re.compile(r'<[A-Za-z/!?][^>]*>')
# C0 control characters plus DEL, excluding \t (\x09), \n (\x0a) and \r (\x0d).
_CONTROL_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')
_UNICODE_LINE_SEP_RE = re.compile(r'[\u2028\u2029]')
# Zero-width space / non-joiner / joiner, BOM, and soft hyphen.
_ZERO_WIDTH_RE = re.compile(r'[\u200b\u200c\u200d\ufeff\u00ad]')
_SPACE_RUN_RE = re.compile(r' +')
_TRAILING_WS_RE = re.compile(r'[ \t]+\n')
_NEWLINE_RUN_RE = re.compile(r'\n{3,}')


def normalization(text: str) -> str:
    """Clean up raw text: strip markup, decode entities, tidy whitespace.

    The steps run in this order:

    1. Replace HTML/XML tags with a single space.
    2. Decode HTML entities.  Because this happens after tag stripping,
       entity-encoded markup such as ``&lt;b&gt;`` survives as literal text.
    3. Apply Unicode NFC normalization.
    4. Remove control characters (tab, newline and carriage return are kept).
    5. Turn U+2028 / U+2029 separators into ``\\n``.
    6. Remove zero-width characters, the soft hyphen and U+FFFD.
    7. Convert ``\\r\\n`` and lone ``\\r`` to ``\\n``.
    8. Collapse runs of spaces to one space (tabs are not collapsed).
    9. Drop spaces/tabs that directly precede a newline.
    10. Collapse three or more consecutive newlines to two.
    11. Strip leading and trailing whitespace from the whole text (this also
        removes any indentation on the very first line).

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Tags become a space so that words separated only by markup do not fuse.
    text = _TAG_RE.sub(' ', text)

    text = html.unescape(text)
    text = unicodedata.normalize('NFC', text)

    # Runs after entity decoding so numeric character references that decode
    # to control characters are removed as well.
    text = _CONTROL_RE.sub('', text)

    # Line/paragraph separators are structural: map them to newlines.
    text = _UNICODE_LINE_SEP_RE.sub('\n', text)

    text = _ZERO_WIDTH_RE.sub('', text)
    text = text.replace('\ufffd', '')

    # Unify line endings; '\r\n' must be handled before the lone '\r'.
    text = text.replace('\r\n', '\n')
    text = text.replace('\r', '\n')

    text = _SPACE_RUN_RE.sub(' ', text)
    text = _TRAILING_WS_RE.sub('\n', text)
    text = _NEWLINE_RUN_RE.sub('\n\n', text)

    return text.strip()