| import re |
| import string |
| import nltk |
# Fetch the NLTK stop-word corpus only when it is not already installed, so
# importing this module does not trigger a download attempt on every run.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Stored as a set so that membership tests against it are constant time.
arabic_stopwords = set(nltk.corpus.stopwords.words("arabic"))
|
|
# Pattern matching any single Arabic diacritic (harakat) or the tatweel
# elongation character.  The code points are written as \u escapes so the
# pattern does not depend on how this file is encoded or re-encoded.
# re.VERBOSE lets each alternative carry its own comment; whitespace inside
# the pattern is ignored.
arabic_diacritics = re.compile(
    """
    \u0651 |  # Tashdid (shadda)
    \u064E |  # Fatha
    \u064B |  # Tanwin Fath
    \u064F |  # Damma
    \u064C |  # Tanwin Damm
    \u0650 |  # Kasra
    \u064D |  # Tanwin Kasr
    \u0652 |  # Sukun
    \u0640    # Tatwil/Kashida
    """,
    re.VERBOSE,
)
|
|
# Arabic and typographic punctuation.  Non-ASCII characters are spelled as
# \u escapes so the literal survives any re-encoding of this file.
arabic_punctuations = (
    "`\u00F7\u00D7\u061B<>_()*&^%]["  # division, multiplication, Arabic semicolon
    "\u0640\u060C/:\"\u061F.,'{}~"  # tatweel, Arabic comma, Arabic question mark
    "\u00A6+|!\u201D\u2026\u201C\u2013\u0640"  # broken bar, curly quotes, ellipsis, en dash
)
english_punctuations = string.punctuation
# Every character treated as punctuation by the cleaning helpers.
punctuations = arabic_punctuations + english_punctuations
|
|
|
|
# Optional http/https scheme, "://", then any run of URL characters.  The set
# includes '-', '#', ':', '~' and '+' so hyphenated hosts, ports and fragments
# are consumed with the rest of the URL.  The trailing \b makes the match end
# on a word character, which leaves sentence punctuation after a URL in place.
_URL_PATTERN = re.compile(r"(?:https?)?://[\w.\-/?=&%#:~+]*\b")


def remove_urls(text):
    """Return *text* with every URL-like span removed.

    A span starts at an optional ``http``/``https`` scheme followed by
    ``://`` and extends over the URL characters that follow.  Because ``\\w``
    is Unicode-aware, letters of any script that are glued directly to the
    URL are removed with it.

    Args:
        text: String to clean.

    Returns:
        The string with each matched span replaced by the empty string.
    """
    return _URL_PATTERN.sub("", text)
|
|
|
|
# local-part "@" domain, where the domain is dot-separated labels.  Each label
# must start after a dot, so a sentence-ending period is not swallowed.
_EMAIL_PATTERN = re.compile(
    r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+"
)


def remove_emails(text):
    """Return *text* with every e-mail address removed.

    Addresses are matched anywhere in the string, not only when an address
    occupies a whole line.

    Args:
        text: String to clean.

    Returns:
        The string with each matched address replaced by the empty string.
    """
    return _EMAIL_PATTERN.sub("", text)
|
|
| |
| |
|
|
# Character class of emoji and pictographic symbols, compiled once at import.
# The enclosed-character entries are listed individually (U+24C2, U+1F170..)
# rather than as one range from U+24C2 to U+1F251: that single range would
# also cover CJK, Hangul and the Arabic Presentation Forms and delete real
# text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
    "\U0001F926-\U0001F937"
    "\U00010000-\U0010FFFF"  # every code point outside the BMP
    "\u2500-\u2BEF"
    "\u2702-\u27B0"  # dingbats
    "\u2600-\u2B55"
    "\u2640-\u2642"
    "\u24C2"  # circled M
    "\u200d"  # zero-width joiner used in emoji sequences
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # emoji variation selector
    "\u3030"
    "\u303D"
    "\u3297"
    "\u3299"
    "]+"
)


def remove_emoji(data):
    """Return *data* with emoji and pictographic symbols removed.

    Args:
        data: String to clean.

    Returns:
        The string with every run of matched characters deleted.
    """
    return _EMOJI_PATTERN.sub('', data)
|
|
# Single-character orthographic folds, applied in one pass by str.translate.
# Code points are written as \u escapes so the table does not depend on the
# encoding of this file.
_NORMALIZATION_TABLE = str.maketrans({
    "\u0625": "\u0627",  # alef with hamza below -> bare alef
    "\u0623": "\u0627",  # alef with hamza above -> bare alef
    "\u0622": "\u0627",  # alef with madda -> bare alef
    "\u0649": "\u064A",  # alef maksura -> yeh
    "\u0624": "\u0621",  # waw with hamza -> hamza
    "\u0626": "\u0621",  # yeh with hamza -> hamza
    "\u0629": "\u0647",  # teh marbuta -> heh
    "\u06AF": "\u0643",  # gaf -> kaf
})


def normalization(text):
    """Return *text* with common Arabic letter variants folded together.

    Alef variants become bare alef, alef maksura becomes yeh, hamza carried
    on waw or yeh becomes a standalone hamza, teh marbuta becomes heh and
    gaf becomes kaf.  All other characters are left untouched.

    Args:
        text: String to normalize.

    Returns:
        The normalized string.
    """
    return text.translate(_NORMALIZATION_TABLE)
|
|
def remove_diacritics(text):
    """Return *text* with every match of ``arabic_diacritics`` deleted."""
    return arabic_diacritics.sub('', text)
|
|
def remove_stopwords(text):
    """Drop whitespace-separated tokens that appear in ``arabic_stopwords``.

    The surviving tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    kept = []
    for token in text.split():
        if token in arabic_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)
|
|
def cleaning_content(line):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: newlines become spaces; e-mail addresses, URLs and
    emoji are removed; tokens containing '@' are dropped; the markers
    ``RT``, ``<LF>``, ``<br />``, ``<url>`` and double quotes are deleted;
    punctuation is replaced by spaces; stop words are removed; letters are
    normalized and diacritics stripped; surrounding whitespace is trimmed.

    Args:
        line: Text to clean, or ``None`` / a float for a missing value.

    Returns:
        The cleaned string, or ``None`` when *line* is ``None`` or a float.
    """
    # NOTE(review): floats are presumably NaN placeholders for missing text
    # (e.g. from a dataframe column) - confirm against the caller.
    if line is None or isinstance(line, float):
        return None

    # str.replace returns a new string; the result must be kept.
    line = line.replace('\n', ' ')
    line = remove_emails(line)
    line = remove_urls(line)
    line = remove_emoji(line)

    # Any token containing '@' is swapped for a marker that is deleted below
    # together with the other markers.
    nline = [w if '@' not in w else 'USERID' for w in line.split()]
    line = ' '.join(nline)
    # NOTE(review): these are plain substring deletions, so 'RT' is also
    # removed from inside longer Latin words.
    line = (
        line.replace('RT', '')
        .replace('<LF>', '')
        .replace('<br />', '')
        .replace('"', '')
        .replace('<url>', '')
        .replace('USERID', '')
    )

    # Turn every punctuation character into a space so the words on either
    # side stay separate; remove_stopwords collapses the extra whitespace.
    line = line.translate(str.maketrans({key: ' ' for key in punctuations}))

    # Stop words are filtered before normalization, so tokens are compared
    # against the stop-word list in their original spelling.
    line = remove_stopwords(line)
    line = remove_diacritics(normalization(line))

    return line.strip()
|
|
def hasDigits(s):
    """Return True when *s* contains an ASCII or Arabic-Indic digit.

    ASCII digits are code points 48-57 ('0'-'9'); Arabic-Indic digits are
    code points 1632-1641 (U+0660-U+0669).
    """
    for ch in s:
        code = ord(ch)
        if 48 <= code <= 57:
            return True
        if 1632 <= code <= 1641:
            return True
    return False