# File size: 926 Bytes
# ea9ca44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import re
import unicodedata
from nltk.corpus import stopwords

# English stopwords from the NLTK corpus, stored as a set so that the
# membership test in clean_text() is a hash lookup rather than a list scan.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# installed/downloaded before this module is imported — confirm against setup.
STOPWORDS = set(stopwords.words("english"))

def postprocess_extracted_text(text: str) -> str:
    """Replace every run of tabs, carriage returns and newlines with one space.

    Ordinary spaces are not touched: consecutive spaces and any leading or
    trailing spaces present after the substitution are kept as they are.
    """
    line_break_run = re.compile(r'[\t\r\n]+')
    return line_break_run.sub(' ', text)

def clean_text(text: str) -> str:
    """Strip non-ASCII characters, URLs, e-mail addresses and stopwords.

    Processing steps, in order:
      1. NFKD-normalize the text and drop everything that cannot be encoded
         as ASCII (an accented letter keeps its base letter, other
         non-ASCII symbols disappear).
      2. Remove URLs: ``http://`` / ``https://`` links and ``www.`` hosts.
      3. Remove e-mail addresses.
      4. Drop whitespace-separated tokens that appear in ``STOPWORDS``,
         compared case-insensitively.

    Args:
        text: Raw input text.

    Returns:
        The remaining tokens joined by single spaces, with no leading or
        trailing whitespace. An empty or all-stopword input yields ``""``.
    """
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")

    # Remove URLs. Requiring "://" after the scheme and "." after "www" keeps
    # ordinary words such as "https", "httpd" or "awww" from being deleted.
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text)

    # Remove emails
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    # Remove stopwords. The NLTK English list is lowercase, so compare the
    # lowercased token; otherwise capitalized stopwords ("The", "And") survive.
    tokens = [word for word in text.split() if word.lower() not in STOPWORDS]

    # split() + join() already collapses every whitespace run to one space and
    # trims both ends, so no further whitespace normalization is needed.
    return " ".join(tokens)