kkAsmaa committed on
Commit
604575d
·
verified ·
1 Parent(s): 9a2393f

Delete preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +0 -20
preprocessing.py DELETED
@@ -1,20 +0,0 @@
1
- import re
2
- from arabert.preprocess import ArabertPreprocessor
3
- model_name = "aubmindlab/bert-base-arabertv02-twitter"
4
- arabic_prep = ArabertPreprocessor(model_name=model_name)
5
-
6
- def clean_obfuscation(text):
7
- text = str(text)
8
- text = re.sub(r'https?://\S+|www\.\S+|@\S+|#', '', text)
9
- text = re.sub(r'(?<=[أ-ي])[^\sأ-ي](?=[أ-ي])', '', text)
10
- text = re.sub(r'(?<=[أ-ي])\s(?=[أ-ي]\s|[أ-ي]$)', '', text)
11
- text = re.sub(r'ـ+', '', text)
12
- text = re.sub(r'(.)\1{2,}', r'\1\1', text)
13
- text = re.sub(r'[^\w\s\.]', ' ', text)
14
- text = re.sub(r'\s+', ' ', text)
15
- return text.strip()
16
-
17
- def full_preprocess(text):
18
- text_no_trickery = clean_obfuscation(text)
19
- final_text = arabic_prep.preprocess(text_no_trickery)
20
- return final_text