kkAsmaa committed on
Commit
9a2393f
·
verified ·
1 Parent(s): 317d582

Create preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +20 -0
preprocessing.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from arabert.preprocess import ArabertPreprocessor
3
# Model identifier handed to ArabertPreprocessor below.
model_name = "aubmindlab/bert-base-arabertv02-twitter"

# Module-level preprocessor instance, created once at import time and reused
# by full_preprocess().
arabic_prep = ArabertPreprocessor(model_name=model_name)
5
+
6
# Arabic letters are matched as U+0621 (hamza) through U+064A (yeh).  Starting
# the range at U+0623 would leave out hamza (U+0621) and alef-with-madda
# (U+0622), which then look like "foreign" separators and get deleted from the
# middle of ordinary words.  Note that tatweel (U+0640) lies inside this range.
_URL_MENTION_HASH_RE = re.compile(r'https?://\S+|www\.\S+|@\S+|#')
_INTRUDER_RE = re.compile(
    r'(?<=[\u0621-\u064A])[^\s\u0621-\u064A](?=[\u0621-\u064A])'
)
_SPACED_LETTER_RE = re.compile(
    r'(?<=[\u0621-\u064A])\s(?=[\u0621-\u064A]\s|[\u0621-\u064A]$)'
)
_TATWEEL_RE = re.compile(r'\u0640+')
_REPEATED_CHAR_RE = re.compile(r'(.)\1{2,}')
_SYMBOL_RE = re.compile(r'[^\w\s\.]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_obfuscation(text):
    """Return ``text`` with common character-level obfuscation removed.

    The steps run in a fixed order, each on the output of the previous one:

    1. Delete ``http(s)://`` and ``www.`` URLs, ``@mentions`` and bare ``#``
       signs (the hashtag word itself is kept).
    2. Delete a single non-space, non-Arabic character wedged between two
       Arabic letters (e.g. a dot or digit splitting a word).
    3. Delete whitespace that precedes a lone Arabic letter, re-joining
       letters that were spaced out.  This is a heuristic: a genuine
       one-letter word is also glued to the word before it.
    4. Delete tatweel (kashida) elongation characters.
    5. Collapse any character repeated three or more times down to two.
    6. Replace every character that is not a word character, whitespace or
       ``.`` with a space.
    7. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: Value to clean.  Non-string values are converted with ``str()``;
            ``None`` is treated as empty input.

    Returns:
        The cleaned string (possibly empty).
    """
    if text is None:
        # str(None) would feed the literal word "None" to the model.
        return ''

    cleaned = str(text)
    cleaned = _URL_MENTION_HASH_RE.sub('', cleaned)
    cleaned = _INTRUDER_RE.sub('', cleaned)
    cleaned = _SPACED_LETTER_RE.sub('', cleaned)
    cleaned = _TATWEEL_RE.sub('', cleaned)
    cleaned = _REPEATED_CHAR_RE.sub(r'\1\1', cleaned)
    cleaned = _SYMBOL_RE.sub(' ', cleaned)
    cleaned = _WHITESPACE_RE.sub(' ', cleaned)
    return cleaned.strip()
16
+
17
def full_preprocess(text):
    """Clean ``text`` with clean_obfuscation, then run arabic_prep.preprocess on it.

    Args:
        text: Raw input passed straight to ``clean_obfuscation``.

    Returns:
        Whatever ``arabic_prep.preprocess`` returns for the cleaned text.
    """
    return arabic_prep.preprocess(clean_obfuscation(text))