Spaces:
Runtime error
Runtime error
add clean text func
Browse files- src/utils.py +29 -0
src/utils.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
# Compiled character class matching one or more consecutive emoji/pictograph
# code points. Every range below is a symbol block; none contains letters or
# ideographs. The former catch-all range U+24C2..U+1F251 also covered kana,
# CJK ideographs and Hangul, so substituting with this pattern deleted ordinary
# East-Asian text. It is replaced by the symbol blocks inside that span.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"  # circled latin capital letter M
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE0F"  # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "]+",
    flags=re.UNICODE,
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def clean_text(x: str) -> str:
    """Normalize raw text into lowercase, space-separated ASCII alphanumerics.

    Steps, in order: lowercase; drop every non-ASCII character; remove URLs,
    ``@mentions`` and ``#hashtags`` (the whole token, not only the marker);
    delete apostrophes so contractions stay a single word; replace each
    remaining run of non-alphanumeric characters with one space; strip the
    ends.

    Args:
        x: Text to clean.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    x = x.lower()
    # Emoji are non-ASCII, so this also removes them; no separate emoji pass
    # is needed afterwards.
    x = x.encode("ascii", "ignore").decode("ascii")
    # Require "://" so ordinary words that merely start with "http" are kept.
    x = re.sub(r"https?://\S+", " ", x)  # urls
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    x = x.replace("'", "")  # ticks: "don't" -> "dont"
    # Collapses punctuation and whitespace runs alike, so no dedicated
    # whitespace-squeezing pass is required.
    x = re.sub(r"[^A-Za-z0-9]+", " ", x)  # special characters
    return x.strip()
|