Spaces:

kachaf
/

sarf

Sleeping

App Files Files Community

sarf / pyaramorph /__init__.py

Boulbaba

Upload 17 files

108abd3 verified 8 days ago

raw

history blame contribute delete

5.03 kB

	# pyaramorph, an Arabic morphological analyzer
	# Copyright © 2005 Alexander Lee
	# Ported to Python from Tim Buckwalter's aramorph.pl.

	import sys
	import re
	from collections import namedtuple
	#from . import buckwalter
	from . import util

	Segment = namedtuple('Segment', 'prefix stem suffix')

	# Data file paths
	TABLE_AB = "tableAB"
	TABLE_BC = "tableBC"
	TABLE_AC = "tableAC"
	DICT_PREFIXES = "dictPrefixes"
	DICT_STEMS = "dictStems"
	DICT_SUFFIXES = "dictSuffixes"

	def _tokenize(text):
	""" Extract all Arabic words from input and ignore everything
	else """
	tokens = re.split('[^\u0621-\u0652\u0670-\u0671]+', text)
	# include all strictly Arabic consonants and diacritics --
	# perhaps include other letters at a later time.
	return tokens

	def _clean_arabic(text):
	""" Remove any تَطوِيل and vowels/diacritics """
	return re.sub('[\u0640\u064b-\u0652\u0670]', '', text)
	# FIXME do something about \u0671, ALIF WASLA ?

	class Analyzer:

	def __init__(self):
	self.tableAB = util.load_table(TABLE_AB)
	self.tableBC = util.load_table(TABLE_BC)
	self.tableAC = util.load_table(TABLE_AC)

	self.prefixes = util.load_dict(DICT_PREFIXES)
	self.stems = util.load_dict(DICT_STEMS)
	self.suffixes = util.load_dict(DICT_SUFFIXES)

	def analyze_text(self, text):
	"""Generate analyses for each word in the given Arabic text."""
	results = []
	tokens = _tokenize(text.strip())

	for token in tokens:
	token = _clean_arabic(token)
	#buckword = buckwalter.uni2buck(token)
	#print(token)
	analyses = self.analyze_word(token)
	if len(analyses) > 0:
	analyses.insert(0, "analysis for: %s" % (token))
	results.append(analyses)

	return results

	def analyze_word(self, word):
	"""Return all possible analyses for the given word"""
	analyses = []
	count = 0
	segments = self._build_segments(word)

	for prefix, stem, suffix in segments:
	analyses.extend(self._check_segment(prefix, stem, suffix))

	return analyses

	def _check_segment(self, prefix, stem, suffix):
	""" See if the prefix, stem, and suffix are compatible """
	analyses = []

	# Loop through the possible prefix entries
	for pre_entry in self.prefixes[prefix]:
	(voc_a, cat_a, gloss_a, pos_a) = pre_entry[1:5]

	# Loop through the possible stem entries
	for stem_entry in self.stems[stem]:
	(voc_b, cat_b, gloss_b, pos_b, lemmaID,spattern, root ) = stem_entry[1:]

	# Check the prefix + stem pair
	pairAB = "%s %s" % (cat_a, cat_b)
	if not pairAB in self.tableAB: continue

	# Loop through the possible suffix entries
	for suf_entry in self.suffixes[suffix]:
	(voc_c, cat_c, gloss_c, pos_c) = suf_entry[1:5]

	# Check the prefix + suffix pair
	pairAC = "%s %s" % (cat_a, cat_c)
	if not pairAC in self.tableAC: continue

	# Check the stem + suffix pair
	pairBC = "%s %s" % (cat_b, cat_c)
	if not pairBC in self.tableBC: continue

	# Ok, it passed!
	univoc = "%s%s%s" % (voc_a, voc_b, voc_c)
	#univoc = buckwalter.buck2uni(buckvoc)
	if gloss_a == '': gloss_a = '___'
	if gloss_c == '': gloss_c = '___'
	if pos_a == '': pos_a = '___'
	if pos_c == '': pos_c = '___'
	analyses.append(
	" solution: %s\n"
	" lemmaID: %s\n"
	" root: %s\n"
	" stem_pattern: %s\n"
	" pos: %s+%s+%s\n"
	" gloss: %s + %s + %s\n" % \
	(univoc, \
	lemmaID, \
	root, \
	spattern, \
	pos_a, pos_b, pos_c, \
	gloss_a, gloss_b, gloss_c))

	return analyses

	def _valid_segment(self, segment):
	"""Determines whether the segment is possible."""
	return segment.prefix in self.prefixes \
	and segment.stem in self.stems \
	and segment.suffix in self.suffixes

	def _build_segments(self, word):
	"""Returns all possible segmentations of the given word."""
	segments = []

	for stem_idx, suf_idx in util.segment_indexes(len(word)):
	prefix = word[0:stem_idx]
	stem = word[stem_idx:suf_idx]
	suffix = word[suf_idx:]
	segment = Segment(prefix, stem, suffix)
	if self._valid_segment(segment):

	segments.append(segment)

	return segments