File size: 325 Bytes
10dce3c
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
import re

def clean_text(text):
    """Normalize a piece of free text for downstream processing.

    The steps are applied in this order:

    1. Lowercase the text.
    2. Remove URLs (``http://...``, ``https://...`` and ``www....`` tokens).
    3. Replace every run of characters other than ASCII letters, digits,
       spaces and the punctuation ``. , ! ? '`` with a single space.
    4. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.
    """
    text = text.lower()
    # Anchor on a word boundary and require the "://" or "www." marker so that
    # ordinary words which merely contain these letters ("awwww", "httpd")
    # are not mistaken for URLs and truncated.
    text = re.sub(r"\b(?:https?://|www\.)\S+", "", text)
    # The text is already lowercase, so only a-z needs to be whitelisted.
    text = re.sub(r"[^a-z0-9 .,!?']+", " ", text)
    # Removals above can leave doubled or edge whitespace; normalize it last.
    return re.sub(r"\s+", " ", text).strip()

def preprocess_text_list(text_list):
    """Clean every text in ``text_list`` and drop the ones that end up empty.

    Each entry is passed through ``clean_text``. The emptiness check is done
    on the cleaned result rather than on the raw input, so entries made up
    only of whitespace or of content that cleaning removes do not show up as
    empty strings in the output.

    Args:
        text_list: An iterable of strings.

    Returns:
        A list of non-empty cleaned strings, in their original relative order.
        It may be shorter than the input.
    """
    cleaned_texts = (clean_text(text) for text in text_list)
    return [cleaned for cleaned in cleaned_texts if cleaned]