Spaces:
Running
Running
File size: 312 Bytes
4d16182 | 1 2 3 4 5 6 7 8 9 10 11 12 | import re
import unicodedata
# Patterns are compiled once at import time; each one is a single cleaning stage.
_HTML_TAG_RE = re.compile(r"<.*?>")
_URL_RE = re.compile(r"http\S+|www\.\S+")
# \w is already Unicode-aware for str patterns, so the explicit Latin-1
# ranges are redundant; they are kept so the allowed set stays explicit.
_DISALLOWED_CHAR_RE = re.compile(r"[^\wÀ-ÖØ-öø-ÿ?!,. ]")
_WHITESPACE_RE = re.compile(r"\s+")


def preprocess_text(text: str) -> str:
    """Return *text* normalized and stripped of markup, URLs and stray symbols.

    Stages, in order:

    1. Unicode NFKC normalization (folds compatibility forms such as
       ligatures and full-width characters).
    2. HTML-like tags (``<...>`` on a single line) are replaced by a space.
    3. Tokens starting with ``http`` or ``www.`` are removed.
    4. Every character that is not a word character, one of ``? ! , .`` or a
       space is replaced by a space.
    5. Runs of whitespace are collapsed to one space and the ends trimmed.

    Tags are stripped *before* URLs: a URL inside an attribute
    (``<a href="http://x">link</a>``) would otherwise match up to the next
    whitespace, swallowing the closing ``>`` and the link text and leaving
    tag fragments behind. Tags are replaced by a space rather than removed
    so that words separated only by a tag (``Hello<br>World``) stay apart.

    Note: ``.`` does not match a newline here, so a tag that spans several
    lines is not recognized as a tag; its characters are handled by stage 4.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, single-spaced string (possibly empty).
    """
    text = unicodedata.normalize("NFKC", text)
    # Tags first, and as a space: see the docstring for why.
    text = _HTML_TAG_RE.sub(" ", text)
    text = _URL_RE.sub("", text)
    text = _DISALLOWED_CHAR_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()
|