# pyaramorph, an Arabic morphological analyzer
# Copyright © 2005 Alexander Lee
# Ported to Python from Tim Buckwalter's aramorph.pl.
import sys
import re
from collections import namedtuple

from . import util

# One candidate split of a surface word into prefix + stem + suffix.
Segment = namedtuple('Segment', 'prefix stem suffix')

# Data file paths
TABLE_AB = "tableAB"
TABLE_BC = "tableBC"
TABLE_AC = "tableAC"
DICT_PREFIXES = "dictPrefixes"
DICT_STEMS = "dictStems"
DICT_SUFFIXES = "dictSuffixes"

# Compiled once at import time; the originals rebuilt these patterns on
# every call.  Token separator: any run of characters that is not a
# strictly Arabic consonant or diacritic -- perhaps include other
# letters at a later time.
_TOKEN_SEP_RE = re.compile(r'[^\u0621-\u0652\u0670-\u0671]+')
# Tatweel (U+0640) plus vowels/diacritics, stripped before analysis.
# FIXME do something about \u0671, ALIF WASLA ?
_DIACRITIC_RE = re.compile(r'[\u0640\u064b-\u0652\u0670]')


def _tokenize(text):
    """ Extract all Arabic words from input and ignore everything else """
    return _TOKEN_SEP_RE.split(text)


def _clean_arabic(text):
    """ Remove any tatweel (تَطوِيل) and vowels/diacritics """
    return _DIACRITIC_RE.sub('', text)


class Analyzer:
    """Arabic morphological analyzer.

    Loads the Buckwalter-style category compatibility tables (AB, BC,
    AC) and the prefix/stem/suffix dictionaries, then analyzes words by
    enumerating segmentations and filtering them through the tables.
    """

    def __init__(self):
        # Category-pair compatibility tables: prefix+stem, stem+suffix,
        # prefix+suffix.
        self.tableAB = util.load_table(TABLE_AB)
        self.tableBC = util.load_table(TABLE_BC)
        self.tableAC = util.load_table(TABLE_AC)
        # Morpheme dictionaries keyed by surface form.
        self.prefixes = util.load_dict(DICT_PREFIXES)
        self.stems = util.load_dict(DICT_STEMS)
        self.suffixes = util.load_dict(DICT_SUFFIXES)

    def analyze_text(self, text):
        """Generate analyses for each word in the given Arabic text.

        Returns a list of per-word analysis lists; words with no
        analyses are omitted.
        """
        results = []
        for token in _tokenize(text.strip()):
            token = _clean_arabic(token)
            analyses = self.analyze_word(token)
            if analyses:
                analyses.insert(0, f"analysis for: {token}")
                results.append(analyses)
        return results

    def analyze_word(self, word):
        """Return all possible analyses for the given word"""
        analyses = []
        for prefix, stem, suffix in self._build_segments(word):
            analyses.extend(self._check_segment(prefix, stem, suffix))
        return analyses

    def _check_segment(self, prefix, stem, suffix):
        """ See if the prefix, stem, and suffix are compatible """
        analyses = []

        # Try every dictionary entry for each morpheme; a segmentation
        # is accepted only when all three pairwise category checks pass.
        for pre_entry in self.prefixes[prefix]:
            (voc_a, cat_a, gloss_a, pos_a) = pre_entry[1:5]

            for stem_entry in self.stems[stem]:
                (voc_b, cat_b, gloss_b, pos_b,
                 lemmaID, spattern, root) = stem_entry[1:]

                # Check the prefix + stem pair
                if f"{cat_a} {cat_b}" not in self.tableAB:
                    continue

                for suf_entry in self.suffixes[suffix]:
                    (voc_c, cat_c, gloss_c, pos_c) = suf_entry[1:5]

                    # Check the prefix + suffix pair
                    if f"{cat_a} {cat_c}" not in self.tableAC:
                        continue
                    # Check the stem + suffix pair
                    if f"{cat_b} {cat_c}" not in self.tableBC:
                        continue

                    # Ok, it passed!
                    univoc = f"{voc_a}{voc_b}{voc_c}"

                    # Prefixes/suffixes may legitimately be empty; show
                    # a placeholder so the columns stay readable.
                    if gloss_a == '':
                        gloss_a = '___'
                    if gloss_c == '':
                        gloss_c = '___'
                    if pos_a == '':
                        pos_a = '___'
                    if pos_c == '':
                        pos_c = '___'

                    analyses.append(
                        f" solution: {univoc}\n"
                        f" lemmaID: {lemmaID}\n"
                        f" root: {root}\n"
                        f" stem_pattern: {spattern}\n"
                        f" pos: {pos_a}+{pos_b}+{pos_c}\n"
                        f" gloss: {gloss_a} + {gloss_b} + {gloss_c}\n")
        return analyses

    def _valid_segment(self, segment):
        """Determines whether the segment is possible."""
        return segment.prefix in self.prefixes \
            and segment.stem in self.stems \
            and segment.suffix in self.suffixes

    def _build_segments(self, word):
        """Returns all possible segmentations of the given word."""
        candidates = (
            Segment(word[:stem_idx], word[stem_idx:suf_idx], word[suf_idx:])
            for stem_idx, suf_idx in util.segment_indexes(len(word))
        )
        return [seg for seg in candidates if self._valid_segment(seg)]