# pyaramorph, an Arabic morphological analyzer
# Copyright © 2005 Alexander Lee
# Ported to Python from Tim Buckwalter's aramorph.pl.
import sys
import re
from collections import namedtuple

from . import util

# One candidate split of a surface word into prefix + stem + suffix.
Segment = namedtuple('Segment', 'prefix stem suffix')

# Data file paths
TABLE_AB = "tableAB"
TABLE_BC = "tableBC"
TABLE_AC = "tableAC"
DICT_PREFIXES = "dictPrefixes"
DICT_STEMS = "dictStems"
DICT_SUFFIXES = "dictSuffixes"

# Compiled once at import time; the originals rebuilt these patterns on
# every call.  Token separator: any run of characters that is not a
# strictly Arabic consonant or diacritic -- perhaps include other
# letters at a later time.
_TOKEN_SEP_RE = re.compile(r'[^\u0621-\u0652\u0670-\u0671]+')
# Tatweel (U+0640) plus vowels/diacritics, stripped before analysis.
# FIXME do something about \u0671, ALIF WASLA ?
_DIACRITIC_RE = re.compile(r'[\u0640\u064b-\u0652\u0670]')


def _tokenize(text):
    """ Extract all Arabic words from input and ignore everything else """
    return _TOKEN_SEP_RE.split(text)


def _clean_arabic(text):
    """ Remove any tatweel (تَطوِيل) and vowels/diacritics """
    return _DIACRITIC_RE.sub('', text)


class Analyzer:
    """Arabic morphological analyzer.

    Loads the Buckwalter-style category compatibility tables (AB, BC,
    AC) and the prefix/stem/suffix dictionaries, then analyzes words by
    enumerating segmentations and filtering them through the tables.
    """

    def __init__(self):
        # Category-pair compatibility tables: prefix+stem, stem+suffix,
        # prefix+suffix.
        self.tableAB = util.load_table(TABLE_AB)
        self.tableBC = util.load_table(TABLE_BC)
        self.tableAC = util.load_table(TABLE_AC)
        # Morpheme dictionaries keyed by surface form.
        self.prefixes = util.load_dict(DICT_PREFIXES)
        self.stems = util.load_dict(DICT_STEMS)
        self.suffixes = util.load_dict(DICT_SUFFIXES)

    def analyze_text(self, text):
        """Generate analyses for each word in the given Arabic text.

        Returns a list of per-word analysis lists; words with no
        analyses are omitted.
        """
        results = []
        for token in _tokenize(text.strip()):
            token = _clean_arabic(token)
            analyses = self.analyze_word(token)
            if analyses:
                analyses.insert(0, f"analysis for: {token}")
                results.append(analyses)
        return results

    def analyze_word(self, word):
        """Return all possible analyses for the given word"""
        analyses = []
        for prefix, stem, suffix in self._build_segments(word):
            analyses.extend(self._check_segment(prefix, stem, suffix))
        return analyses

    def _check_segment(self, prefix, stem, suffix):
        """ See if the prefix, stem, and suffix are compatible """
        analyses = []

        # Try every dictionary entry for each morpheme; a segmentation
        # is accepted only when all three pairwise category checks pass.
        for pre_entry in self.prefixes[prefix]:
            (voc_a, cat_a, gloss_a, pos_a) = pre_entry[1:5]

            for stem_entry in self.stems[stem]:
                (voc_b, cat_b, gloss_b, pos_b,
                 lemmaID, spattern, root) = stem_entry[1:]

                # Check the prefix + stem pair
                if f"{cat_a} {cat_b}" not in self.tableAB:
                    continue

                for suf_entry in self.suffixes[suffix]:
                    (voc_c, cat_c, gloss_c, pos_c) = suf_entry[1:5]

                    # Check the prefix + suffix pair
                    if f"{cat_a} {cat_c}" not in self.tableAC:
                        continue
                    # Check the stem + suffix pair
                    if f"{cat_b} {cat_c}" not in self.tableBC:
                        continue

                    # Ok, it passed!
                    univoc = f"{voc_a}{voc_b}{voc_c}"

                    # Prefixes/suffixes may legitimately be empty; show
                    # a placeholder so the columns stay readable.
                    if gloss_a == '':
                        gloss_a = '___'
                    if gloss_c == '':
                        gloss_c = '___'
                    if pos_a == '':
                        pos_a = '___'
                    if pos_c == '':
                        pos_c = '___'

                    analyses.append(
                        f" solution: {univoc}\n"
                        f" lemmaID: {lemmaID}\n"
                        f" root: {root}\n"
                        f" stem_pattern: {spattern}\n"
                        f" pos: {pos_a}+{pos_b}+{pos_c}\n"
                        f" gloss: {gloss_a} + {gloss_b} + {gloss_c}\n")
        return analyses

    def _valid_segment(self, segment):
        """Determines whether the segment is possible."""
        return segment.prefix in self.prefixes \
            and segment.stem in self.stems \
            and segment.suffix in self.suffixes

    def _build_segments(self, word):
        """Returns all possible segmentations of the given word."""
        candidates = (
            Segment(word[:stem_idx], word[stem_idx:suf_idx], word[suf_idx:])
            for stem_idx, suf_idx in util.segment_indexes(len(word))
        )
        return [seg for seg in candidates if self._valid_segment(seg)]