batch-analyse / core /preprocess.py
pjxcharya's picture
initial commit
10dce3c
raw
history blame contribute delete
325 Bytes
import re
def clean_text(text):
    """Normalize a piece of raw text for downstream analysis.

    Steps, in order:
      1. lowercase the text;
      2. remove URLs (``http://...``, ``https://...`` and ``www. ...``);
      3. replace every run of characters other than ASCII letters, digits,
         spaces and ``. , ! ? '`` with a single space;
      4. collapse whitespace runs and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; it may be empty if nothing survives cleaning.
    """
    text = text.lower()
    # Anchor on a word boundary and require the "://" or "." that makes the
    # token a URL, so ordinary words such as "awww" or "httpd" are left intact.
    text = re.sub(r"\b(?:https?://|www\.)\S+", "", text)
    # Anything outside the allowed set (including tabs and newlines) becomes
    # a space; the following substitution collapses the resulting runs.
    text = re.sub(r"[^a-zA-Z0-9 .,!?\']+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
def preprocess_text_list(text_list):
    """Clean every text in ``text_list`` and drop the ones left empty.

    Each entry is passed through ``clean_text``. Entries that are ``None``
    are skipped, and so are entries whose cleaned form is the empty string
    (blank input, or input made up only of URLs / stripped symbols), so the
    returned list never contains empty strings.

    Args:
        text_list: Iterable of strings (``None`` entries are tolerated).

    Returns:
        A list of cleaned, non-empty strings in their original order.
    """
    # Filter on the cleaned value rather than the raw one: a text can be
    # non-blank before cleaning and still clean down to "".
    cleaned_texts = (clean_text(text) for text in text_list if text is not None)
    return [cleaned for cleaned in cleaned_texts if cleaned]