Create preprocess.py
Browse files- preprocess.py +18 -0
preprocess.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re , string
|
| 2 |
+
from textacy.preprocessing.resources import (RE_EMAIL , RE_URL , RE_NUMBER ,
|
| 3 |
+
RE_NUMBER , RE_EMOJI , RE_SHORT_URL , RE_PHONE_NUMBER
|
| 4 |
+
)
|
| 5 |
+
|
| 6 |
+
# Character class matching a single ASCII letter (a-z, A-Z). Despite the
# name, it does not match other non-Arabic characters such as digits,
# punctuation or non-Latin scripts.
NON_ARABIC_RE = re.compile(rf"[{string.ascii_letters}]")
|
| 7 |
+
|
| 8 |
+
def clean_text(text: str) -> str:
    """Strip unwanted tokens from *text* and return the result.

    Every match of each pattern below is replaced with the empty string:
    e-mail addresses, URLs, short URLs, phone numbers, numbers, emoji, and
    finally whatever the module-level ``NON_ARABIC_RE`` matches.

    The patterns are applied from most specific to most general. In
    particular the number pattern runs *after* the phone-number and URL
    patterns; if it ran first it would delete the digit groups inside
    those tokens, so the more specific patterns could no longer match the
    whole token and fragments (e.g. ``--`` from ``555-123-4567``) would
    remain in the output.

    Args:
        text: The raw input string.

    Returns:
        The input with all matched spans removed. Surrounding whitespace is
        left as-is, so removed tokens may leave consecutive spaces behind.
    """
    # Order matters: specific patterns first, generic ones last.
    patterns = (
        RE_EMAIL,
        RE_URL,
        RE_SHORT_URL,
        RE_PHONE_NUMBER,
        RE_NUMBER,
        RE_EMOJI,
        NON_ARABIC_RE,
    )

    for pattern in patterns:
        text = pattern.sub("", text)

    return text
|
| 17 |
+
|
| 18 |
+
|