moabos
chore: setup fastapi with initial routes and hook up traditional models (phase 1) with preprocessing
354c6a0
| import re | |
| from nltk.corpus import stopwords | |
| from nltk.stem.isri import ISRIStemmer | |
# Arabic stopword list from NLTK (requires the "stopwords" corpus to be downloaded).
arabic_stopwords = set(stopwords.words("arabic"))
# ISRI root-based stemmer for Arabic.
stemmer = ISRIStemmer()
# Orthographic normalization table: unify alef variants, alef maqsura -> ya,
# ta marbuta -> ha, hamza carriers -> base letter, and strip tatweel (kashida).
char_map = str.maketrans(
    {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
)
# Arabic diacritic marks (fathatan .. sukun).
diacritics_pattern = re.compile(r"[\u064B-\u0652]")
# Any character that is neither a word character nor whitespace.
# NOTE: \w is Unicode-aware here, so Arabic letters and digits are kept.
punctuation_pattern = re.compile(r"[^\w\s]")
# One or more whitespace characters (used to collapse runs into single spaces).
whitespace_pattern = re.compile(r"\s+")
# A character followed by one or more repetitions of itself.
repeated_char_pattern = re.compile(r"(.)\1+")
def normalize_arabic(text: str) -> str:
    """Map Arabic orthographic variants onto canonical letters via char_map."""
    normalized = text.translate(char_map)
    return normalized
def remove_diacritics(text: str) -> str:
    """Strip Arabic diacritic marks (short vowels, shadda, sukun) from text."""
    return re.sub(diacritics_pattern, "", text)
def remove_punctuation(text: str) -> str:
    """Replace every punctuation character with a single space."""
    return re.sub(punctuation_pattern, " ", text)
def reduce_repeated_characters(text: str) -> str:
    """Collapse runs of the same character down to one occurrence."""
    return repeated_char_pattern.sub(lambda match: match.group(1), text)
def remove_stopwords(tokens: list[str]) -> list[str]:
    """Drop any token that appears in the Arabic stopword set."""
    stop_set = arabic_stopwords  # hoist the global lookup out of the loop
    return [token for token in tokens if token not in stop_set]
def stem_tokens(tokens: list[str]) -> list[str]:
    """Reduce each token to its ISRI stem."""
    return list(map(stemmer.stem, tokens))
def preprocess_for_classification(text: str) -> str:
    """Full preprocessing pipeline for classification.

    Steps: lowercase/strip, orthographic normalization, diacritic removal,
    punctuation removal, repeated-character reduction, whitespace collapse,
    digit removal, tokenization, stopword removal, ISRI stemming.

    Args:
        text: Raw input text.

    Returns:
        Space-joined stemmed tokens; "" for non-string input (same guard as
        preprocess_for_summarization, which previously made the two
        entry points inconsistent — this one raised AttributeError on None).
    """
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()
    text = normalize_arabic(text)
    text = remove_diacritics(text)
    text = remove_punctuation(text)
    text = reduce_repeated_characters(text)
    # Collapse whitespace introduced by punctuation replacement before
    # stripping digits, so tokenization sees clean single-space separators.
    text = whitespace_pattern.sub(" ", text).strip()
    text = re.sub(r"\d+", "", text)
    tokens = remove_stopwords(text.split())
    tokens = stem_tokens(tokens)
    return " ".join(tokens)
def preprocess_for_summarization(text: str) -> str:
    """Lightly clean text for summarization: drop diacritics and digits.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    cleaned = remove_diacritics(text.strip().lower())
    cleaned = whitespace_pattern.sub(" ", cleaned).strip()
    return re.sub(r"\d+", "", cleaned)
class ArabicPreprocessor:
    """Arabic text preprocessor with step-by-step trace and text analysis."""

    def __init__(self):
        # Expose the module-level resources on the instance for callers
        # that want direct access to them.
        self.arabic_stopwords = arabic_stopwords
        self.stemmer = stemmer
        self.char_map = char_map

    def preprocess_for_classification(self, text: str) -> str:
        """Preprocess text for classification (delegates to module function)."""
        return preprocess_for_classification(text)

    def preprocess_for_summarization(self, text: str) -> str:
        """Preprocess text for summarization (delegates to module function)."""
        return preprocess_for_summarization(text)

    def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
        """Return the intermediate result of every preprocessing step.

        Args:
            text: Raw input text.
            task_type: "classification" (full pipeline) or "summarization"
                (light pipeline). Any other value yields only the first
                two entries.

        Returns:
            Dict mapping step name -> intermediate text/tokens, ending with
            "final". Insertion order follows pipeline order.
        """
        current = text.strip().lower()
        steps = {"original": text, "stripped_lowered": current}
        if task_type == "classification":
            # Each step is computed exactly once, recorded, and fed forward
            # (the previous version ran every transformation twice).
            current = steps["normalized"] = normalize_arabic(current)
            current = steps["diacritics_removed"] = remove_diacritics(current)
            current = steps["punctuation_removed"] = remove_punctuation(current)
            current = steps["repeated_chars_reduced"] = reduce_repeated_characters(current)
            current = steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
            current = steps["numbers_removed"] = re.sub(r"\d+", "", current)
            tokens = steps["tokenized"] = current.split()
            tokens = steps["stopwords_removed"] = remove_stopwords(tokens)
            tokens = steps["stemmed"] = stem_tokens(tokens)
            steps["final"] = " ".join(tokens)
        elif task_type == "summarization":
            current = steps["diacritics_removed"] = remove_diacritics(current)
            current = steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
            current = steps["numbers_removed"] = re.sub(r"\d+", "", current)
            steps["final"] = current
        return steps

    def analyze_text(self, text: str) -> dict:
        """Compute surface statistics for *text*.

        Sentences are split on Latin/Arabic sentence-ending punctuation and
        newlines; words are whitespace-separated tokens. All ratio/average
        fields fall back to 0 for empty input to avoid ZeroDivisionError.
        """
        sentences = [s.strip() for s in re.split(r"[.!؟\n]+", text) if s.strip()]
        tokens = text.split()
        # U+0600..U+06FF is the main Arabic Unicode block.
        arabic_chars = len(re.findall(r"[\u0600-\u06FF]", text))
        return {
            "character_count": len(text),
            "word_count": len(tokens),
            "sentence_count": len(sentences),
            "arabic_character_count": arabic_chars,
            "arabic_character_ratio": arabic_chars / len(text) if text else 0,
            "average_word_length": sum(len(w) for w in tokens) / len(tokens) if tokens else 0,
            "average_sentence_length": len(tokens) / len(sentences) if sentences else 0,
            "has_diacritics": bool(re.search(r"[\u064B-\u0652]", text)),
            "punctuation_count": len(re.findall(r"[^\w\s]", text)),
        }