# arabic_text_cleaner_app / text_preprocessor.py
# Uploaded by fatttty — "Update text_preprocessor.py", commit 8aa048a (verified)
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import qalsadi.analex as qa
import qalsadi.lemmatizer
# Download required NLTK resources at import time:
#   - 'punkt': tokenizer models backing word_tokenize
#   - 'stopwords': corpus providing the Arabic stop-word list used in __init__
# NOTE(review): this runs on every import of the module; downloads are cached
# locally but still emit status output — consider nltk.download(..., quiet=True)
# or a LookupError guard around nltk.data.find().
nltk.download('punkt')
nltk.download('stopwords')
class TextPreprocessor:
    """Pipeline of cleaning steps for Arabic text.

    The steps cover punctuation stripping, removal of embedded English
    words and digits, diacritic removal, whitespace collapsing,
    orthographic normalization (alef / yaa / taa-marbuta / kaf variants)
    and removal of common definite-article prefixes.  ``preprocess``
    chains them in a fixed order; each step is also usable standalone.
    """

    # Standalone runs of ASCII letters (used by remove_english).
    _ENGLISH_RE = re.compile(r'\b[a-zA-Z]+\b')

    # ASCII digits plus the full Arabic-Indic (U+0660-0669) and Extended
    # Arabic-Indic / Persian (U+06F0-06F9) digit ranges.
    # FIX: the previous class listed digits individually and mixed in the
    # Persian '۲' (U+06F2) while omitting the Arabic '٢' (U+0662), so the
    # Arabic digit two was never removed.
    _DIGITS_RE = re.compile(r'[0-9\u0660-\u0669\u06F0-\u06F9]+')

    # Arabic diacritics — shadda, fatha, tanwin fath, damma, tanwin damm,
    # kasra, tanwin kasr, sukun — plus the tatwil/kashida filler (U+0640).
    _DIACRITICS_RE = re.compile(
        r'[\u0651\u064E\u064B\u064F\u064C\u0650\u064D\u0652\u0640]'
    )

    # Hamza-carrying / madda alef forms collapsed to bare alef.
    # FIX: the previous character class accidentally contained the tanwin
    # fath combining mark, which silently turned 'ً' into an extra alef.
    _ALEF_RE = re.compile('[إأآ]')

    # One-to-one character normalizations, applied in a single
    # str.translate pass instead of fourteen chained re.sub calls
    # (the duplicated پ→ب substitution is also gone).
    _CHAR_MAP = str.maketrans({
        'ى': 'ي',
        'ؤ': 'ء', 'ئ': 'ء', '۽': 'ء',
        'ة': 'ه',
        'ڱ': 'ك', 'ګ': 'ك', 'گ': 'ك', 'ݣ': 'ك', 'ڪ': 'ك',
        'ڤ': 'ف',
        'چ': 'ج',
        'ژ': 'ز', 'ڒ': 'ز',
        'ٺ': 'ت',
        'پ': 'ب',
    })

    def __init__(self):
        # Arabic stop-word list from NLTK (requires the 'stopwords' corpus).
        self.stop_words = set(stopwords.words('arabic'))
        # Arabic punctuation not covered by string.punctuation.
        self.arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        self.english_punctuations = string.punctuation
        self.punctuations_list = self.arabic_punctuations + self.english_punctuations

    def remove_punctuations(self, text):
        """Strip every Arabic and English punctuation character."""
        return text.translate(str.maketrans('', '', self.punctuations_list))

    def remove_english(self, text):
        """Delete standalone English (ASCII-letter) words."""
        return self._ENGLISH_RE.sub('', text)

    def remove_digits(self, text):
        """Delete ASCII, Arabic-Indic and Persian digit runs."""
        return self._DIGITS_RE.sub('', text)

    def remove_diacritics(self, text):
        """Delete Arabic diacritical marks and the tatwil filler."""
        return self._DIACRITICS_RE.sub('', text)

    def remove_extra_whitespaces(self, text):
        """Trim the ends and collapse internal whitespace runs to one space."""
        return re.sub(r'\s+', ' ', text.strip())

    def text_normalize(self, text):
        """Collapse orthographic variants to canonical Arabic letters."""
        text = self._ALEF_RE.sub('ا', text)
        return text.translate(self._CHAR_MAP)

    def remove_stop_words(self, text):
        """Drop NLTK Arabic stop words; returns the remaining tokens joined."""
        tokens = word_tokenize(text)
        kept = [tok for tok in tokens if tok.lower() not in self.stop_words]
        return ' '.join(kept)

    def remove_arabic_prefixes(self, text):
        """Strip common article prefixes (ال، وال، لل، بال) at word starts.

        Order matters: 'ال' is removed first; 'وال'/'بال' then match words
        where the leading و/ب prevented a word boundary before 'ال'.
        """
        text = re.sub(r"\bال", '', text)
        text = re.sub(r"\bوال", '', text)
        text = re.sub(r"\bلل", '', text)
        text = re.sub(r"\bبال", '', text)
        # Collapse a leftover internal 'الا' sequence.
        return re.sub("الا", "ا", text)

    def tokenize(self, text):
        """Return NLTK word tokens; coerces non-string input (e.g. NaN) to str."""
        return word_tokenize(str(text))

    def preprocess(self, text):
        """Run the full cleaning pipeline (stop-word removal disabled)."""
        text = self.remove_punctuations(text)
        text = self.remove_english(text)
        text = self.remove_digits(text)
        text = self.remove_diacritics(text)
        text = self.remove_extra_whitespaces(text)
        text = self.text_normalize(text)
        text = self.remove_arabic_prefixes(text)
        # text = self.remove_stop_words(text)  # optionally re-enable
        return text