team-149-project / utils /clean_text.py
knguyen471's picture
Upload 11 files
888aba6 verified
raw
history blame contribute delete
702 Bytes
import re
# Patterns are compiled once at import time so repeated calls (e.g. when
# cleaning a whole dataset row by row) only pay for the substitutions.
_MENTION_OR_HASHTAG_RE = re.compile(r'[@#][\w∆]+')
_PHONE_NUMBER_RE = re.compile(r'\b\d{10}\b')
_REPEATED_PUNCT_RE = re.compile(r'([^\w\s])\1+')
_WHITESPACE_RUN_RE = re.compile(r'\s+')
_ESCAPED_APOSTROPHE_RE = re.compile(r"\\'")


def clean_text(text) -> str:
    """Normalize a raw piece of text into a compact, lowercase form.

    The steps are applied in this order:

    1. Trim surrounding whitespace and lowercase everything.
    2. Remove mentions and hashtags: an ``@`` or ``#`` followed by word
       characters.
    3. Remove standalone runs of exactly ten digits (treated as phone
       numbers).
    4. Collapse runs of one repeated punctuation character to a single
       occurrence (``!!!!`` becomes ``!``).
    5. Collapse every run of whitespace (spaces, tabs, newlines) to a
       single space.
    6. Turn backslash-escaped apostrophes back into plain apostrophes.
    7. Trim again, since removals can leave whitespace at either end.

    Args:
        text: The string to clean. ``None`` is accepted and treated as
            empty text.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None`` or nothing
        survives cleaning.
    """
    if text is None:
        # A missing value has nothing to clean; avoid AttributeError on .strip().
        return ""

    cleaned = text.strip().lower()
    cleaned = _MENTION_OR_HASHTAG_RE.sub('', cleaned)
    cleaned = _PHONE_NUMBER_RE.sub('', cleaned)
    cleaned = _REPEATED_PUNCT_RE.sub(r'\1', cleaned)

    # One whitespace pass is enough: the two removals above never depend on
    # which whitespace character is present or how many, so normalizing here
    # (after they may have left gaps) covers newlines, tabs and double spaces.
    cleaned = _WHITESPACE_RUN_RE.sub(' ', cleaned)

    # A single pass suffices: consecutive backslashes were already collapsed
    # by the repeated-punctuation step, so no new escaped apostrophe can
    # appear after this substitution.
    cleaned = _ESCAPED_APOSTROPHE_RE.sub("'", cleaned)

    return cleaned.strip()