# File size: 7,390 Bytes (revision 21baa2f)
'''
Created on ١٠/٠٣/٢٠١٠
@Created by: Muhammad Altabba
'''
from ...Controllers.Morphology.Entities.SurfaceFormMorphemes import *;
from ...Controllers.Morphology.Entities.GreedyMorphemes import *;
from ...Controllers.Tokenization.TokenType import *;
from ...Controllers.General.ArabicStringUtility import *;
from ...Models.Lexicon.LettersConstants import *;
from ...Controllers.Morphology.Entities.Particle import *;
from ...Controllers.Morphology.Entities.DerivedCliticless import *;
class Word(object):
    """
    # PyUML: Do not remove this line! # XMI_ID:_qyYhVY35Ed-gg8GOK1TmhA
    """
    # Text word: one token of the input text together with its normalization
    # forms, its token type, its premature tags and the results of its
    # morphological segmentation.
    #
    # The names below are class-level defaults that document the attribute
    # set. __init__ gives every instance its own value for each of them, so
    # the mutable defaults here must never be modified in place.

    # The string exactly as it was passed to the constructor.
    OriginalString = ''
    # String after normalization that does not lose information
    # (example: removing the hyphen '-').
    FirstNormalizationForm = ''
    # String after normalization that loses information
    # (example: removing the diacritics / vowelization).
    SecondNormalizationForm = ''
    # String currently being manipulated.
    String = ''
    # Token type. A TokenType() object is created per instance in __init__;
    # the class-level default is None so that no shared object is built when
    # the class is defined and the imported TokenType name is not rebound
    # inside the class body.
    TokenType = None
    # To be set to True once the word has completely finished morphological
    # parsing, for example if it is detected as a compound word and parsed
    # completely at the Compound Parsing stage.
    MorphologicalParsingCompleted = False
    # Possible premature tags assigned by the Premature Tagger (values come
    # from PrematureTagsSet). GetTopPrematureTagsKeys reads it as a mapping
    # keyed by 'Noun' / 'Verb' / 'Particle' whose values are compared as
    # scores. An older comment described entries as ['TagName', Certainty].
    # NOTE(review): confirm the actual schema with the Premature Tagger.
    PrematureTags = {}
    # The tags assigned by the Overdue Tagger (values from TagSet) now live
    # within Morpheme; the instance attribute "Tags" is still created in
    # __init__ and printed by __str__.
    #
    # Greedy segmentation of the word; GetAffixationPosibilities reads its
    # Proclitics and Enclitics. A GreedyMorphemes([], None, []) object is
    # created per instance in __init__ (class-level default is None for the
    # same reason as TokenType).
    GreedyMorphemes = None
    # Possible sequences of the analyzed word: a list of instances of
    # Morphology.Entities.SurfaceFormMorphemes.
    SurfaceFormMorphemes = []
    # Optionally used to expose the lemmas on Word level (see fillLemmas).
    Lemmas = []

    def GetAffixationPosibilities(self):
        '''
        Return a list of all possibilities of word segmentation
        (that is, all possible forms of the word with clitics).

        Every entry is a three-item list:
            [proclitics, remaining string, enclitics]
        where "proclitics" and "enclitics" each contain the empty marker
        ('', 'c') plus a list copy of every clitic taken from
        self.GreedyMorphemes. Proclitics are taken in stored order and cut
        from the start of self.String; enclitics are taken in stored order,
        cut from the END of self.String and prepended to the enclitics list.
        The length that is cut is len(clitic[0]).

        Number of possibilities =
            (Number of Proclitics + 1) * (Number of Enclitics + 1)
        The whole word without any clitic is the first entry and appears
        exactly once.
        NOTE(review): an earlier docstring gave the count as one more than
        this, which contradicted its own example; confirm no caller expects
        a duplicated first entry.

        Schematic example (the ('', 'c') markers and the second item of each
        clitic are omitted): أوبعلمائكم
        [[], 'أوبعلمائكم', []]
        [[], 'أوبعلمائك', ['م']],
        [[], 'أوبعلمائ', ['ك', 'م']],
        [['أ'], 'وبعلمائكم', []],
        [['أ'], 'وبعلمائك', ['م']],
        [['أ'], 'وبعلمائ', ['ك', 'م']],
        [['أ', 'و'], 'بعلمائكم', []],
        [['أ', 'و'], 'بعلمائك', ['م']],
        [['أ', 'و'], 'بعلمائ', ['ك', 'م']],
        [['أ', 'و', 'ب'], 'علمائكم', []],
        [['أ', 'و', 'ب'], 'علمائك', ['م']],
        [['أ', 'و', 'ب'], 'علمائ', ['ك', 'م']]
        '''
        emptyClitic = ('', 'c')
        proclitics = self.GreedyMorphemes.Proclitics
        enclitics = self.GreedyMorphemes.Enclitics
        text = self.String

        possibilities = []
        takenProclitics = [emptyClitic]
        startIndex = 0
        # procliticsCount == 0 is the "no proclitic removed" case.
        for procliticsCount in range(len(proclitics) + 1):
            if procliticsCount > 0:
                proclitic = proclitics[procliticsCount - 1]
                # Build a new list so entries already emitted are not changed.
                takenProclitics = takenProclitics + [list(proclitic)]
                startIndex += len(proclitic[0])

            takenEnclitics = [emptyClitic]
            endIndex = len(text)
            # Each entry receives its own copy of the proclitics list so that
            # a caller modifying one entry cannot affect another.
            possibilities.append(
                [list(takenProclitics), text[startIndex:endIndex], takenEnclitics])
            for enclitic in enclitics:
                # Enclitics are cut from the end, so each newly taken one
                # goes in front of those taken before it.
                takenEnclitics = [list(enclitic)] + takenEnclitics
                endIndex -= len(enclitic[0])
                possibilities.append(
                    [list(takenProclitics), text[startIndex:endIndex], takenEnclitics])
        return possibilities

    def __init__(self, string):
        '''
        Constructor.

        string: the text of the word; stored both as OriginalString and as
        the working String. All other attributes start empty.
        '''
        self.OriginalString = string
        self.String = string
        self.FirstNormalizationForm = ''
        self.SecondNormalizationForm = ''
        self.TokenType = TokenType()
        self.PrematureTags = {}
        self.Tags = []
        self.SurfaceFormMorphemes = []
        self.GreedyMorphemes = GreedyMorphemes([], None, [])
        self.MorphologicalParsingCompleted = False
        # Per-instance list: without this, an in-place append on a new word
        # would modify the list shared by the whole class.
        self.Lemmas = []

    def __str__(self):
        '''
        Return a single tab-separated line (ending with a newline) that
        lists the word's strings, token type, tags and morphemes.
        '''
        parts = [
            'Word:',
            '\tOriginal:', self.OriginalString,
            '\tFirst Norm. Form:', self.FirstNormalizationForm,
            '\tSecond Norm. Form:', self.SecondNormalizationForm,
            '\tString:', self.String,
            '\tToken Type:', self.TokenType,
            '\tPre. Tags:', self.PrematureTags,
            '\tGreedy Morphemes: ', self.GreedyMorphemes,
            '\tTags:', self.Tags,
            '\tPos. Morphemes:', self.SurfaceFormMorphemes,
            '\n',
        ]
        return ''.join([str(part) for part in parts])

    def ClipString(self, formNumber, lettersCountFromStart, lettersCountFromEnd):
        '''
        Clip one of the word's strings with ArabicStringUtility.ClipString.

        formNumber selects the string: 0 = OriginalString,
        1 = FirstNormalizationForm, 2 = SecondNormalizationForm,
        None = String. Any other value selects the empty string.
        lettersCountFromStart and lettersCountFromEnd are forwarded
        unchanged (presumably the number of letters to clip from each side;
        confirm against ArabicStringUtility.ClipString).
        '''
        if formNumber is None:
            string = self.String
        elif formNumber == 0:
            string = self.OriginalString
        elif formNumber == 1:
            string = self.FirstNormalizationForm
        elif formNumber == 2:
            string = self.SecondNormalizationForm
        else:
            string = ''
        # The utility class itself is passed as the first argument, exactly
        # as the rest of the project calls it.
        return ArabicStringUtility.ClipString(
            ArabicStringUtility, string, lettersCountFromStart, lettersCountFromEnd)

    def GetDiacratic(self, procliticString, searchFromRight = False):
        '''
        Forward to ArabicStringUtility.GetDiacratic using this word's
        FirstNormalizationForm, the given procliticString, a fixed third
        argument of 0 and the searchFromRight flag; return its result.
        '''
        return ArabicStringUtility.GetDiacratic(
            ArabicStringUtility, self.FirstNormalizationForm, procliticString,
            0, searchFromRight)

    def GetTopPrematureTagsKeys(self, word):
        '''
        Return the keys among 'Noun', 'Verb' and 'Particle' that hold the
        highest value in word.PrematureTags, in that fixed order.

        Only the "word" argument is inspected, not self. Ties are all
        returned. Values below -1 are never selected. An empty PrematureTags
        gives an empty list, and a key missing from PrematureTags is skipped
        instead of raising KeyError.
        '''
        topKeys = []
        prematureTags = word.PrematureTags
        if not prematureTags:
            return topKeys

        highest = -1
        for key in ('Noun', 'Verb', 'Particle'):
            if key not in prematureTags:
                continue
            value = prematureTags[key]
            if value == highest:
                topKeys.append(key)
            elif value > highest:
                highest = value
                topKeys = [key]
        return topKeys

    def fillLemmas(self):
        '''
        Fill self.Lemmas.

        Without any SurfaceFormMorphemes the only lemma is the
        SecondNormalizationForm. Otherwise the lemmas are the distinct
        Cliticless.UnvoweledForm values of the SurfaceFormMorphemes, in
        first-seen order.
        '''
        if not self.SurfaceFormMorphemes:
            self.Lemmas = [self.SecondNormalizationForm]
            return

        lemmas = []
        for surfaceFormMorphemes in self.SurfaceFormMorphemes:
            unvoweledForm = surfaceFormMorphemes.Cliticless.UnvoweledForm
            if unvoweledForm not in lemmas:
                lemmas.append(unvoweledForm)
        self.Lemmas = lemmas