Spaces:

kachaf
/

qutuf

Running

File size: 7,390 Bytes

21baa2f


'''
Created on ١٠‏/٠٣‏/٢٠١٠

@Created by: Muhammad Altabba
'''

from ...Controllers.Morphology.Entities.SurfaceFormMorphemes import *;
from ...Controllers.Morphology.Entities.GreedyMorphemes import *;
from ...Controllers.Tokenization.TokenType import *;
from ...Controllers.General.ArabicStringUtility import *;
from ...Models.Lexicon.LettersConstants import *;
from ...Controllers.Morphology.Entities.Particle import *;
from ...Controllers.Morphology.Entities.DerivedCliticless import *;

class Word(object):
    """
     # PyUML: Do not remove this line! # XMI_ID:_qyYhVY35Ed-gg8GOK1TmhA
    """
    '''
    Text Word
    '''
    
    OriginalString = '';
    #Original String:
    
    FirstNormalizationForm = '';
    #String after making normalization without loosing information: example removing hyphen (-)
        
    SecondNormalizationForm = '';
    #String after making normalization that make information lost: example removing التشكيل (vowelization)

    String = '';
    #String in manipulation.
    
    TokenType = TokenType();
    #Token Type:
    
    MorphologicalParsingCompleted = False;
    #To be set to True if the word is completely finished Morphological Parsing.
    #For example if it is detected as a compound word and parsed completely at the stage of Compound Parsing.
    
    
    PrematureTags = {};
    #Possible Premature Tags assigned by Premature Tagger. It takes it values from PrematureTagsSet.
    #self.PrematureTags[index] = ['TagName', Certainty]
    
#    أصبح موجود ضمن Morpheme
#    Tags = [];
#    #Possible Tags assigned by Overdue Tagger: It takes it values from TagSet.
#    #self.Tags[index] = ['TagName', Certainty]
#    
    
    GreedyMorphemes = GreedyMorphemes([],None,[]);   
    
    SurfaceFormMorphemes = [];
    #Possible sequences of the analyzed word:
    #This an array of instances of Morphology.Entities.SurfaceFormMorphemes
    
    #Optionally used to expose the lemmas on Word level
    Lemmas = []
       
    def GetAffixationPosibilities(self):
        '''
        Return a list of all possibilities of word segmentation. 
            (That is all possible forms of the word with clitics)
        Number of possibilities = 1 + (Number of Proclitics + 1) * (Number of Enclitics + 1) 
        For example: أوبعلمائكم
            [[], 'أوبعلمائكم', []]  
            [[], 'أوبعلمائك', ['م']], 
            [[], 'أوبعلمائ', ['ك', 'م']], 
            [['أ'], 'وبعلمائكم', []], 
            [['أ'], 'وبعلمائك', ['م']], 
            [['أ'], 'وبعلمائ', ['ك', 'م']], 
            [['أ', 'و'], 'بعلمائكم', []], 
            [['أ', 'و'], 'بعلمائك', ['م']], 
            [['أ', 'و'], 'بعلمائ', ['ك', 'م']],
            [['أ', 'و', 'ب'], 'علمائكم', []], 
            [['أ', 'و', 'ب'], 'علمائك', ['م']], 
            [['أ', 'و', 'ب'], 'علمائ', ['ك', 'م']]

        '''
#        
#        procliticsLen = 0;
#        for k in range(len(self.GreedyMorphemes.Proclitics)):
#            procliticsLen += len(GreedyMorphemes.Proclitics[k][0]);
#        encliticsLen = 0;
#        for k in range(len(GreedyMorphemes.Enclitics)):
#            encliticsLen += len(GreedyMorphemes.Enclitics[k][0]);          
        
        tempList = [];
        tempList.append([[('','c')], self.String, [('','c')]]);
        tempP = [('','c')];
        procliticsCutIndex = 0;
        for i in range(-1,len(self.GreedyMorphemes.Proclitics)):
            tempS = [('','c')];
            if i > -1:            
                tempP = list(tempP);
                tempP.append([x for x in self.GreedyMorphemes.Proclitics[i]]);
                procliticsCutIndex += len(self.GreedyMorphemes.Proclitics[i][0]);
                tempList.append([tempP,self.String[procliticsCutIndex:], tempS]);
            encliticsCutIndex = 0;
            for j in range(len(self.GreedyMorphemes.Enclitics)):
                li = [[x for x in self.GreedyMorphemes.Enclitics[j]]];
                li.extend(tempS);                                
                tempS = li;
                encliticsCutIndex += len(self.GreedyMorphemes.Enclitics[j][0]);
                tempList.append([tempP,self.String[procliticsCutIndex:len(self.String)-(encliticsCutIndex)], tempS]);
        return tempList;
    pass
    
    def __init__(self, string):
        '''
        Constructor
        '''
        self.OriginalString = string
        self.String = string

        self.FirstNormalizationForm = '';
        self.SecondNormalizationForm = '';
        self.TokenType = TokenType();
        self.PrematureTags = {};
        self.Tags = [];
        self.SurfaceFormMorphemes = [];
        self.GreedyMorphemes = GreedyMorphemes([],None,[]);
        self.MorphologicalParsingCompleted = False;
    pass
    
    def __str__(self):
        str = 'Word:';
        str += '\tOriginal:' + self.OriginalString;
        str += '\tFirst Norm. Form:' + self.FirstNormalizationForm;
        str += '\tSecond Norm. Form:' + self.SecondNormalizationForm;
        str += '\tString:' + self.String;
        str += '\tToken Type:' + self.TokenType.__str__();
        str += '\tPre. Tags:' + self.PrematureTags.__str__();
        str += '\tGreedy Morphemes: ' + self.GreedyMorphemes.__str__();
        str += '\tTags:' + self.Tags.__str__();
        str += '\tPos. Morphemes:' + self.SurfaceFormMorphemes.__str__();
        str += '\n';
        return str;
    pass
    
    def ClipString(self, formNumber, lettersCountFromStart, lettersCountFromEnd):
        string = '';
        if formNumber == 0:
            string = self.OriginalString;
        elif formNumber == 1:
            string = self.FirstNormalizationForm;
        elif formNumber == 2:
            string = self.SecondNormalizationForm;
        elif formNumber == None:
            string = self.String;
        
        return ArabicStringUtility.ClipString(ArabicStringUtility, string, lettersCountFromStart, lettersCountFromEnd);   
    
    pass

    def GetDiacratic(self, procliticString, searchFromRight = False):
        
        return ArabicStringUtility.GetDiacratic(ArabicStringUtility, self.FirstNormalizationForm, procliticString, 0, searchFromRight);        
        
    pass
    
    def GetTopPrematureTagsKeys(self, word):
        
        topKeys = [];
        max = -1;
        if (word.PrematureTags != {}):
            for key in ['Noun', 'Verb', 'Particle']:
                if (max == word.PrematureTags[key]):
                    topKeys.append(key)
                elif (max < word.PrematureTags[key]):
                    max = word.PrematureTags[key]
                    topKeys = []
                    topKeys.append(key)
        
        return topKeys
    pass

    def fillLemmas(self):
        if len(self.SurfaceFormMorphemes) == 0:
            self.Lemmas = [self.SecondNormalizationForm]
        else:
            self.Lemmas = []
            for surfaceFormMorphemes in self.SurfaceFormMorphemes:
                if not surfaceFormMorphemes.Cliticless.UnvoweledForm in self.Lemmas:
                    self.Lemmas.append(surfaceFormMorphemes.Cliticless.UnvoweledForm)
                # surfaceFormMorphemes.Proclitics
                # surfaceFormMorphemes.Cliticless
                # surfaceFormMorphemes.Enclitics
    pass