File size: 7,390 Bytes
21baa2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192

'''
Created on 10/03/2010

@Created by: Muhammad Altabba
'''

from ...Controllers.Morphology.Entities.SurfaceFormMorphemes import *;
from ...Controllers.Morphology.Entities.GreedyMorphemes import *;
from ...Controllers.Tokenization.TokenType import *;
from ...Controllers.General.ArabicStringUtility import *;
from ...Models.Lexicon.LettersConstants import *;
from ...Controllers.Morphology.Entities.Particle import *;
from ...Controllers.Morphology.Entities.DerivedCliticless import *;

class Word(object):
    """
     # PyUML: Do not remove this line! # XMI_ID:_qyYhVY35Ed-gg8GOK1TmhA
    """
    # Text word: holds the original string of a token together with its
    # normalized forms, token type, premature tags and morphological analyses.
    #
    # The class-level assignments below are defaults only; __init__ rebinds
    # each of them on the instance so mutable state is not shared between words.

    # The string exactly as it was handed to the constructor.
    OriginalString = ''

    # String after normalization that does not lose information
    # (example: removing a hyphen "-").
    FirstNormalizationForm = ''

    # String after normalization that does lose information
    # (example: removing vowelization / diacritics).
    SecondNormalizationForm = ''

    # The string currently being manipulated.
    String = ''

    # Token type of this word.
    TokenType = TokenType()

    # Set to True once the word has completely finished morphological parsing,
    # e.g. when it was detected as a compound word and fully parsed at the
    # compound-parsing stage.
    MorphologicalParsingCompleted = False

    # Possible premature tags assigned by the premature tagger; values come
    # from PrematureTagsSet. The original note described the layout as
    #     self.PrematureTags[index] = ['TagName', Certainty]
    # NOTE(review): GetTopPrematureTagsKeys indexes this dict by
    # 'Noun' / 'Verb' / 'Particle' and compares the values numerically, which
    # does not match that description - confirm the real layout.
    PrematureTags = {}

    # NOTE: a class-level `Tags` default (possible tags assigned by the overdue
    # tagger, values from TagSet) used to be declared here; per the original
    # author it now lives on Morpheme. Instances still receive `self.Tags`
    # in __init__.

    # Greedy segmentation of the word; GetAffixationPosibilities reads its
    # `Proclitics` and `Enclitics` sequences.
    GreedyMorphemes = GreedyMorphemes([], None, [])

    # Possible analyses of the word: a list of
    # Morphology.Entities.SurfaceFormMorphemes instances.
    SurfaceFormMorphemes = []

    # Optionally used to expose the lemmas on Word level (see fillLemmas).
    Lemmas = []

    def __init__(self, string):
        '''
        Create a word from its raw text.

        `string` is stored both as OriginalString and as the working String.
        All other attributes are reset to fresh, per-instance empty values.
        '''
        self.OriginalString = string
        self.String = string

        self.FirstNormalizationForm = ''
        self.SecondNormalizationForm = ''
        self.TokenType = TokenType()
        self.PrematureTags = {}
        self.Tags = []
        self.SurfaceFormMorphemes = []
        self.GreedyMorphemes = GreedyMorphemes([], None, [])
        self.MorphologicalParsingCompleted = False
        # Previously missing: without this every instance aliased the single
        # class-level list until fillLemmas() happened to rebind it.
        self.Lemmas = []

    def GetAffixationPosibilities(self):
        '''
        Return a list of all possibilities of word segmentation
        (that is, all possible forms of the word with clitics).

        Each entry is a three-element list
            [procliticChain, remainingString, encliticChain]
        where
          * procliticChain starts with the placeholder ('', 'c') followed by a
            list copy of every entry of self.GreedyMorphemes.Proclitics that
            was cut off so far, in order;
          * encliticChain holds list copies of the entries of
            self.GreedyMorphemes.Enclitics that were cut off, each newly cut
            one placed in front of the previous ones, and ends with the
            placeholder ('', 'c');
          * remainingString is self.String with those clitics sliced off. The
            length of the first element of each clitic entry is the number of
            characters removed.

        Number of possibilities =
            (Number of Proclitics + 1) * (Number of Enclitics + 1)

        For example: أوبعلمائكم
        (simplified: only the clitic text is shown and the ('', 'c')
        placeholders are omitted)
            [[], 'أوبعلمائكم', []]
            [[], 'أوبعلمائك', ['م']],
            [[], 'أوبعلمائ', ['ك', 'م']],
            [['أ'], 'وبعلمائكم', []],
            [['أ'], 'وبعلمائك', ['م']],
            [['أ'], 'وبعلمائ', ['ك', 'م']],
            [['أ', 'و'], 'بعلمائكم', []],
            [['أ', 'و'], 'بعلمائك', ['م']],
            [['أ', 'و'], 'بعلمائ', ['ك', 'م']],
            [['أ', 'و', 'ب'], 'علمائكم', []],
            [['أ', 'و', 'ب'], 'علمائك', ['م']],
            [['أ', 'و', 'ب'], 'علمائ', ['ك', 'م']]
        '''
        text = self.String
        proclitics = self.GreedyMorphemes.Proclitics
        enclitics = self.GreedyMorphemes.Enclitics

        # First possibility: the untouched word with no clitics on either side.
        possibilities = [[[('', 'c')], text, [('', 'c')]]]

        prefixChain = [('', 'c')]
        prefixCut = 0
        # Index -1 stands for "no proclitic removed"; 0..n-1 remove one more
        # proclitic each round.
        for i in range(-1, len(proclitics)):
            suffixChain = [('', 'c')]
            if i > -1:
                # Build a new list so entries appended earlier keep their own
                # (shorter) proclitic chain.
                prefixChain = prefixChain + [list(proclitics[i])]
                prefixCut += len(proclitics[i][0])
                possibilities.append([prefixChain, text[prefixCut:], suffixChain])

            suffixCut = 0
            for enclitic in enclitics:
                # Newly removed enclitic goes in front; a fresh list is built
                # each time so earlier entries are not mutated.
                suffixChain = [list(enclitic)] + suffixChain
                suffixCut += len(enclitic[0])
                possibilities.append(
                    [prefixChain, text[prefixCut:len(text) - suffixCut], suffixChain])
        return possibilities

    def __str__(self):
        '''
        Return a one-line, tab-separated dump of the word's fields,
        terminated by a newline.
        '''
        fields = [
            'Word:',
            '\tOriginal:' + self.OriginalString,
            '\tFirst Norm. Form:' + self.FirstNormalizationForm,
            '\tSecond Norm. Form:' + self.SecondNormalizationForm,
            '\tString:' + self.String,
            '\tToken Type:' + str(self.TokenType),
            '\tPre. Tags:' + str(self.PrematureTags),
            '\tGreedy Morphemes: ' + str(self.GreedyMorphemes),
            '\tTags:' + str(self.Tags),
            '\tPos. Morphemes:' + str(self.SurfaceFormMorphemes),
            '\n',
        ]
        return ''.join(fields)

    def ClipString(self, formNumber, lettersCountFromStart, lettersCountFromEnd):
        '''
        Clip one of the word's string forms via ArabicStringUtility.ClipString.

        formNumber selects the form to clip:
            0    -> OriginalString
            1    -> FirstNormalizationForm
            2    -> SecondNormalizationForm
            None -> String (the working string)
        Any other value clips the empty string.

        lettersCountFromStart and lettersCountFromEnd are forwarded unchanged;
        their exact meaning is defined by ArabicStringUtility.ClipString.
        '''
        if formNumber == 0:
            string = self.OriginalString
        elif formNumber == 1:
            string = self.FirstNormalizationForm
        elif formNumber == 2:
            string = self.SecondNormalizationForm
        elif formNumber is None:
            string = self.String
        else:
            string = ''

        # The utility is invoked on the class itself, which is passed
        # explicitly in the `self` position.
        return ArabicStringUtility.ClipString(
            ArabicStringUtility, string, lettersCountFromStart, lettersCountFromEnd)

    def GetDiacratic(self, procliticString, searchFromRight = False):
        '''
        Delegate to ArabicStringUtility.GetDiacratic on FirstNormalizationForm.

        Forwards procliticString, a fixed 0 and searchFromRight; the semantics
        of the result are defined by ArabicStringUtility.GetDiacratic.
        '''
        return ArabicStringUtility.GetDiacratic(
            ArabicStringUtility, self.FirstNormalizationForm, procliticString, 0, searchFromRight)

    def GetTopPrematureTagsKeys(self, word):
        '''
        Return the keys among 'Noun', 'Verb' and 'Particle' that carry the
        highest value in word.PrematureTags.

        Note that the tags are read from the `word` argument, not from self.
        Ties are all returned, in the order Noun, Verb, Particle. An empty
        PrematureTags yields []. Values below -1 are never selected because
        the running best starts at -1. A non-empty PrematureTags that lacks
        one of the three keys raises KeyError.
        '''
        tags = word.PrematureTags
        if not tags:
            return []

        topKeys = []
        bestScore = -1
        for key in ('Noun', 'Verb', 'Particle'):
            score = tags[key]
            if score == bestScore:
                topKeys.append(key)
            elif score > bestScore:
                bestScore = score
                topKeys = [key]
        return topKeys

    def fillLemmas(self):
        '''
        Populate self.Lemmas.

        With no analyses in SurfaceFormMorphemes the single lemma is
        SecondNormalizationForm. Otherwise Lemmas becomes the distinct
        Cliticless.UnvoweledForm values of the analyses, in first-seen order.
        '''
        if not self.SurfaceFormMorphemes:
            self.Lemmas = [self.SecondNormalizationForm]
            return

        lemmas = []
        for analysis in self.SurfaceFormMorphemes:
            unvoweled = analysis.Cliticless.UnvoweledForm
            if unvoweled not in lemmas:
                lemmas.append(unvoweled)
        self.Lemmas = lemmas