# arabic-misogyny-detector / preprocess.py
import re
import json
import emoji
import pyarabic.araby as araby
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
def load_tokenizer(path='tokenizer.json'):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return tokenizer_from_json(data)
TOKENIZER = load_tokenizer()
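# NOTE: tokenizer_from_json() expects the JSON *string* produced by
# Tokenizer.to_json(), so the json.load() above assumes tokenizer.json was
# written with json.dump(tokenizer.to_json(), f), i.e. the file holds a
# JSON-encoded string rather than a raw JSON object.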
def data_cleaning(text):
    # Lowercase the text (only affects foreign/Latin-script words and links; Arabic has no case)
    text = text.lower()
    # Remove all @mentions
    text = re.sub(r"@\w+", '', text)
    # Remove links (everything from the URL to the end of the line)
    text = re.sub(r'https?://.*[\r\n]*', "", text, flags=re.MULTILINE)
    # A few leftover tokens the rules above miss, removed explicitly
    # ("مستخدم@" is a mention built from the Arabic word for "user";
    # "\ufffd" is the Unicode replacement character)
    for token in ["مستخدم@", "#", "…", "RT", "\ufffd"]:
        text = text.replace(token, "")
    # Convert runs of exclamation points and question marks into word tokens,
    # to give them more weight in the context
    text = re.sub(r'!+', ' [EXCLAMATION] ', text)
    text = re.sub(r'\?+', ' [QUESTION] ', text)
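    # Note: the surrounding brackets are stripped again by the punctuation
    # removal below, so these survive as the bare words EXCLAMATION and QUESTION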
    # Remove remaining punctuation and special characters (Latin and Arabic)
    chars_to_remove = r'[!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~،؛؟ـ٪٫٬«»“”•·…﴾﴿〈〉°±÷ש®™€£¥¢]'
    text = re.sub(chars_to_remove, ' ', text)
    # Collapse newlines so everything stays on one line
    text = re.sub(r'[\r\n]+', ' ', text)
    # Normalize Arabic letter variants so different spellings of the same word match
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    # Collapse extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Split hashtag text into words by replacing underscores with spaces
    text = text.replace("_", " ")
    # Since the dataset is small, convert emojis into text to give the sentences more context
    text = emoji.demojize(text, delimiters=(" ", " "))
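    # e.g. "😂" becomes " face_with_tears_of_joy "; this runs after the
    # underscore replacement above, so emoji names keep their underscores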
    # Strip tashkeel (diacritics) and tatweel (kashida elongation)
    text = araby.strip_tashkeel(text)
    text = araby.strip_tatweel(text)
    text = text.strip()
    return text
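# Worked example of the full pipeline (illustrative input, traced by hand):
#   data_cleaning("Wow!!! Check https://t.co/xyz @user") -> "wow EXCLAMATION check"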
def prepare_input(text, max_len=48):
    # Step 1: Clean
    cleaned_text = data_cleaning(text)
    # Step 2: Tokenize (convert text to a sequence of integers)
    # Wrap cleaned_text in a list because texts_to_sequences expects a list of strings
    sequence = TOKENIZER.texts_to_sequences([cleaned_text])
    # Step 3: Pad to a fixed length for the CNN-BiGRU architecture
    # (pad_sequences pads and truncates at the front by default)
    padded = pad_sequences(sequence, maxlen=max_len)
    return padded
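# Minimal usage sketch: clean a sample tweet and show the padded batch shape
# the model expects. The sample text and the 'model.h5' filename are
# illustrative assumptions, not part of the original repo.
if __name__ == "__main__":
    sample = "RT @someone شوفي هالرابط!! https://example.com 😂"
    print(data_cleaning(sample))
    batch = prepare_input(sample)
    print(batch.shape)  # -> (1, 48)
    # Inference would look roughly like:
    #   model = tf.keras.models.load_model('model.h5')
    #   prob = model.predict(batch)[0][0]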