moabos
chore: setup fastapi with initial routes and hook up traditional models (phase 1) with preprocessing
354c6a0
| import re | |
| from nltk.corpus import stopwords | |
| from nltk.stem.isri import ISRIStemmer | |
# Arabic stopword list from NLTK (requires the "stopwords" corpus to be downloaded).
arabic_stopwords = set(stopwords.words("arabic"))
# ISRI root-based stemmer for Arabic.
stemmer = ISRIStemmer()
# Orthographic normalization table: unify alef variants, alef maqsura -> ya,
# ta marbuta -> ha, hamza carriers -> base letter, and strip tatweel (kashida).
char_map = str.maketrans(
    {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
)
# Arabic diacritic marks (fathatan .. sukun).
diacritics_pattern = re.compile(r"[\u064B-\u0652]")
# Any character that is neither a word character nor whitespace.
# NOTE: \w is Unicode-aware here, so Arabic letters and digits are kept.
punctuation_pattern = re.compile(r"[^\w\s]")
# One or more whitespace characters (used to collapse runs into single spaces).
whitespace_pattern = re.compile(r"\s+")
# A character followed by one or more repetitions of itself.
repeated_char_pattern = re.compile(r"(.)\1+")
def normalize_arabic(text: str) -> str:
    """Map Arabic orthographic variants onto canonical letters via char_map."""
    normalized = text.translate(char_map)
    return normalized
def remove_diacritics(text: str) -> str:
    """Strip Arabic diacritic marks (short vowels, shadda, sukun) from text."""
    return re.sub(diacritics_pattern, "", text)
def remove_punctuation(text: str) -> str:
    """Replace every punctuation character with a single space."""
    return re.sub(punctuation_pattern, " ", text)
def reduce_repeated_characters(text: str) -> str:
    """Collapse runs of the same character down to one occurrence."""
    return repeated_char_pattern.sub(lambda match: match.group(1), text)
def remove_stopwords(tokens: list[str]) -> list[str]:
    """Drop any token that appears in the Arabic stopword set."""
    stop_set = arabic_stopwords  # hoist the global lookup out of the loop
    return [token for token in tokens if token not in stop_set]
def stem_tokens(tokens: list[str]) -> list[str]:
    """Reduce each token to its ISRI stem."""
    return list(map(stemmer.stem, tokens))
def preprocess_for_classification(text: str) -> str:
    """Full preprocessing pipeline for classification.

    Steps: lowercase/strip, orthographic normalization, diacritic removal,
    punctuation removal, repeated-character reduction, whitespace collapse,
    digit removal, tokenization, stopword removal, ISRI stemming.

    Args:
        text: Raw input text.

    Returns:
        Space-joined stemmed tokens; "" for non-string input (same guard as
        preprocess_for_summarization, which previously made the two
        entry points inconsistent — this one raised AttributeError on None).
    """
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()
    text = normalize_arabic(text)
    text = remove_diacritics(text)
    text = remove_punctuation(text)
    text = reduce_repeated_characters(text)
    # Collapse whitespace introduced by punctuation replacement before
    # stripping digits, so tokenization sees clean single-space separators.
    text = whitespace_pattern.sub(" ", text).strip()
    text = re.sub(r"\d+", "", text)
    tokens = remove_stopwords(text.split())
    tokens = stem_tokens(tokens)
    return " ".join(tokens)
def preprocess_for_summarization(text: str) -> str:
    """Lightly clean text for summarization: drop diacritics and digits.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    cleaned = remove_diacritics(text.strip().lower())
    cleaned = whitespace_pattern.sub(" ", cleaned).strip()
    return re.sub(r"\d+", "", cleaned)
class ArabicPreprocessor:
    """Arabic text preprocessor with step-by-step trace and text analysis."""

    def __init__(self):
        # Expose the module-level resources on the instance for callers
        # that want direct access to them.
        self.arabic_stopwords = arabic_stopwords
        self.stemmer = stemmer
        self.char_map = char_map

    def preprocess_for_classification(self, text: str) -> str:
        """Preprocess text for classification (delegates to module function)."""
        return preprocess_for_classification(text)

    def preprocess_for_summarization(self, text: str) -> str:
        """Preprocess text for summarization (delegates to module function)."""
        return preprocess_for_summarization(text)

    def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
        """Return the intermediate result of every preprocessing step.

        Args:
            text: Raw input text.
            task_type: "classification" (full pipeline) or "summarization"
                (light pipeline). Any other value yields only the first
                two entries.

        Returns:
            Dict mapping step name -> intermediate text/tokens, ending with
            "final". Insertion order follows pipeline order.
        """
        current = text.strip().lower()
        steps = {"original": text, "stripped_lowered": current}
        if task_type == "classification":
            # Each step is computed exactly once, recorded, and fed forward
            # (the previous version ran every transformation twice).
            current = steps["normalized"] = normalize_arabic(current)
            current = steps["diacritics_removed"] = remove_diacritics(current)
            current = steps["punctuation_removed"] = remove_punctuation(current)
            current = steps["repeated_chars_reduced"] = reduce_repeated_characters(current)
            current = steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
            current = steps["numbers_removed"] = re.sub(r"\d+", "", current)
            tokens = steps["tokenized"] = current.split()
            tokens = steps["stopwords_removed"] = remove_stopwords(tokens)
            tokens = steps["stemmed"] = stem_tokens(tokens)
            steps["final"] = " ".join(tokens)
        elif task_type == "summarization":
            current = steps["diacritics_removed"] = remove_diacritics(current)
            current = steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
            current = steps["numbers_removed"] = re.sub(r"\d+", "", current)
            steps["final"] = current
        return steps

    def analyze_text(self, text: str) -> dict:
        """Compute surface statistics for *text*.

        Sentences are split on Latin/Arabic sentence-ending punctuation and
        newlines; words are whitespace-separated tokens. All ratio/average
        fields fall back to 0 for empty input to avoid ZeroDivisionError.
        """
        sentences = [s.strip() for s in re.split(r"[.!؟\n]+", text) if s.strip()]
        tokens = text.split()
        # U+0600..U+06FF is the main Arabic Unicode block.
        arabic_chars = len(re.findall(r"[\u0600-\u06FF]", text))
        return {
            "character_count": len(text),
            "word_count": len(tokens),
            "sentence_count": len(sentences),
            "arabic_character_count": arabic_chars,
            "arabic_character_ratio": arabic_chars / len(text) if text else 0,
            "average_word_length": sum(len(w) for w in tokens) / len(tokens) if tokens else 0,
            "average_sentence_length": len(tokens) / len(sentences) if sentences else 0,
            "has_diacritics": bool(re.search(r"[\u064B-\u0652]", text)),
            "punctuation_count": len(re.findall(r"[^\w\s]", text)),
        }