File size: 702 Bytes
888aba6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import re

def clean_text(text: str) -> str:
    r"""Normalize a piece of free text for downstream processing.

    The following steps are applied in order:

    1. Strip surrounding whitespace and lowercase the text.
    2. Remove ``@mention`` and ``#hashtag`` tokens.
    3. Remove runs of exactly ten digits that are not adjacent to other
       word characters (treated as phone numbers).
    4. Collapse a repeated punctuation character into a single one
       (``!!!`` becomes ``!``).
    5. Collapse every run of whitespace (spaces, tabs, newlines) into a
       single space.
    6. Turn backslash-escaped apostrophes (``\'``) into plain ones.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string without leading or trailing
        whitespace.
    """
    text = text.strip().lower()

    # Remove mentions (@username) and hashtags (#tag).
    # NOTE(review): the pattern has no left-boundary check, so the part
    # after "@" in an e-mail address ("user@example.com" -> "user.com") is
    # removed as well. Kept as-is to preserve existing behaviour.
    text = re.sub(r'[@#][\w∆]+', '', text)

    # Remove phone numbers (exactly ten digits between word boundaries).
    text = re.sub(r'\b\d{10}\b', '', text)

    # Collapse repeated punctuation (e.g. "!!!!" -> "!"). This also reduces
    # any run of backslashes to one, which is why a single unescape pass
    # below is sufficient.
    text = re.sub(r'([^\w\s])\1+', r'\1', text)

    # Collapse whitespace once, after all removals: `\s` already covers
    # newlines and tabs, and the gaps left by the removed tokens are merged
    # in the same pass.
    text = re.sub(r'\s+', ' ', text)

    # Fix "\'" like: can\'t, don\'t, etc.
    text = re.sub(r"\\'", "'", text)

    return text.strip()