# Boulbaba's picture
# Upload 210 files
# 21baa2f verified
'''
Created on ١٠‏/٠٣‏/٢٠١٠
@Created by: Muhammad Altabba
'''
from ...Controllers.Morphology.Entities.SurfaceFormMorphemes import *;
from ...Controllers.Morphology.Entities.GreedyMorphemes import *;
from ...Controllers.Tokenization.TokenType import *;
from ...Controllers.General.ArabicStringUtility import *;
from ...Models.Lexicon.LettersConstants import *;
from ...Controllers.Morphology.Entities.Particle import *;
from ...Controllers.Morphology.Entities.DerivedCliticless import *;
class Word(object):
    """
    # PyUML: Do not remove this line! # XMI_ID:_qyYhVY35Ed-gg8GOK1TmhA

    A text word together with its normalization forms, token type,
    premature tags and the morphological analyses attached to it.

    The class-level attributes below are kept as defaults for backward
    compatibility; ``__init__`` rebinds every one of them per instance so
    that mutable values are never shared between words.
    """

    # The string exactly as it was given to the constructor.
    OriginalString = ''
    # String after normalization that loses no information
    # (example: removing the hyphen "-").
    FirstNormalizationForm = ''
    # String after normalization that does lose information
    # (example: removing the tashkeel, i.e. the vowelization marks).
    SecondNormalizationForm = ''
    # The string currently being manipulated.
    String = ''
    # Token type of the word.
    TokenType = TokenType()
    # Set to True once the word has completely finished morphological
    # parsing, for example when it is detected as a compound word and fully
    # parsed at the compound-parsing stage.
    MorphologicalParsingCompleted = False
    # Possible premature tags assigned by the premature tagger; values come
    # from PrematureTagsSet. The original note describes entries as
    # self.PrematureTags[index] = ['TagName', Certainty], while
    # GetTopPrematureTagsKeys reads it as a mapping keyed by
    # 'Noun' / 'Verb' / 'Particle' -- TODO confirm the actual layout.
    PrematureTags = {}
    # NOTE: the per-word "Tags" list assigned by the overdue tagger was
    # noted by the original author as having moved to Morpheme; the
    # attribute is still created in __init__ and printed by __str__.
    GreedyMorphemes = GreedyMorphemes([], None, [])
    # Possible analysed sequences of the word: a list of
    # Morphology.Entities.SurfaceFormMorphemes instances.
    SurfaceFormMorphemes = []
    # Optionally used to expose the lemmas at word level (see fillLemmas).
    Lemmas = []

    def GetAffixationPosibilities(self):
        '''
        Return a list of all possibilities of word segmentation
        (that is, all possible forms of the word with its clitics).

        Every entry is ``[proclitics, remaining_string, enclitics]``. Both
        clitic lists always contain the placeholder ``('', 'c')``: it is the
        first element of ``proclitics`` and the last element of
        ``enclitics``. Each real clitic is stored as a list copy of the
        corresponding item of ``self.GreedyMorphemes.Proclitics`` /
        ``self.GreedyMorphemes.Enclitics``; the length of item ``[0]`` is
        used as the number of characters cut from ``self.String``
        (presumably the clitic's surface string -- confirm against
        GreedyMorphemes). Enclitics are consumed from the end of the word
        inwards, in the order they appear in ``Enclitics``.

        Number of possibilities =
            (Number of Proclitics + 1) * (Number of Enclitics + 1)

        Schematic example (placeholders and clitic details omitted)
        for: أوبعلمائكم
            [[], 'أوبعلمائكم', []]
            [[], 'أوبعلمائك', ['م']],
            [[], 'أوبعلمائ', ['ك', 'م']],
            [['أ'], 'وبعلمائكم', []],
            [['أ'], 'وبعلمائك', ['م']],
            [['أ'], 'وبعلمائ', ['ك', 'م']],
            [['أ', 'و'], 'بعلمائكم', []],
            [['أ', 'و'], 'بعلمائك', ['م']],
            [['أ', 'و'], 'بعلمائ', ['ك', 'م']],
            [['أ', 'و', 'ب'], 'علمائكم', []],
            [['أ', 'و', 'ب'], 'علمائك', ['م']],
            [['أ', 'و', 'ب'], 'علمائ', ['ك', 'م']]
        '''
        emptyClitic = ('', 'c')
        proclitics = self.GreedyMorphemes.Proclitics
        enclitics = self.GreedyMorphemes.Enclitics
        text = self.String

        # The untouched word is always the first possibility.
        possibilities = [[[emptyClitic], text, [emptyClitic]]]

        prefix = [emptyClitic]
        prefixLength = 0
        # i == -1 stands for "no proclitic removed"; it only contributes the
        # enclitic variants because the bare word was already added above.
        for i in range(-1, len(proclitics)):
            suffix = [emptyClitic]
            if i > -1:
                # Build a new list so entries appended earlier keep their
                # own (shorter) proclitic sequence.
                prefix = prefix + [list(proclitics[i])]
                prefixLength += len(proclitics[i][0])
                possibilities.append([prefix, text[prefixLength:], suffix])

            suffixLength = 0
            for enclitic in enclitics:
                # Prepend: each further enclitic lies closer to the stem.
                suffix = [list(enclitic)] + suffix
                suffixLength += len(enclitic[0])
                possibilities.append(
                    [prefix, text[prefixLength:len(text) - suffixLength], suffix])

        return possibilities

    def __init__(self, string):
        '''
        Create a word from its raw string.

        ``string`` is stored both as ``OriginalString`` and as the working
        ``String``; every other attribute starts empty.
        '''
        self.OriginalString = string
        self.String = string
        self.FirstNormalizationForm = ''
        self.SecondNormalizationForm = ''
        self.TokenType = TokenType()
        self.PrematureTags = {}
        self.Tags = []
        self.SurfaceFormMorphemes = []
        self.GreedyMorphemes = GreedyMorphemes([], None, [])
        self.MorphologicalParsingCompleted = False
        # Give every instance its own list; otherwise an in-place mutation
        # would modify the list shared through the class attribute.
        self.Lemmas = []

    def __str__(self):
        '''
        Return a single-line, tab-separated dump of the word's state,
        terminated by a newline.
        '''
        parts = [
            'Word:',
            '\tOriginal:', self.OriginalString,
            '\tFirst Norm. Form:', self.FirstNormalizationForm,
            '\tSecond Norm. Form:', self.SecondNormalizationForm,
            '\tString:', self.String,
            '\tToken Type:', str(self.TokenType),
            '\tPre. Tags:', str(self.PrematureTags),
            '\tGreedy Morphemes: ', str(self.GreedyMorphemes),
            '\tTags:', str(self.Tags),
            '\tPos. Morphemes:', str(self.SurfaceFormMorphemes),
            '\n',
        ]
        return ''.join(parts)

    def ClipString(self, formNumber, lettersCountFromStart, lettersCountFromEnd):
        '''
        Clip one of the word's string forms via ArabicStringUtility.ClipString.

        formNumber selects the form:
            0    -> OriginalString
            1    -> FirstNormalizationForm
            2    -> SecondNormalizationForm
            None -> String (the working string)
        Any other value clips the empty string.

        lettersCountFromStart and lettersCountFromEnd are passed through
        unchanged to ArabicStringUtility.ClipString, whose result is returned.
        '''
        if formNumber is None:
            string = self.String
        elif formNumber == 0:
            string = self.OriginalString
        elif formNumber == 1:
            string = self.FirstNormalizationForm
        elif formNumber == 2:
            string = self.SecondNormalizationForm
        else:
            string = ''
        # The utility class itself is passed in the "self" position, as the
        # original call did.
        return ArabicStringUtility.ClipString(
            ArabicStringUtility, string, lettersCountFromStart, lettersCountFromEnd)

    def GetDiacratic(self, procliticString, searchFromRight = False):
        '''
        Delegate to ArabicStringUtility.GetDiacratic on FirstNormalizationForm.

        procliticString and searchFromRight are forwarded unchanged, together
        with a constant 0 as the fourth argument; the utility's result is
        returned as is.
        '''
        return ArabicStringUtility.GetDiacratic(
            ArabicStringUtility, self.FirstNormalizationForm, procliticString,
            0, searchFromRight)

    def GetTopPrematureTagsKeys(self, word):
        '''
        Return the keys among 'Noun', 'Verb' and 'Particle' that have the
        highest value in ``word.PrematureTags``.

        Note that the tags are read from the ``word`` argument, not from
        ``self``. Ties are all returned, in the order Noun, Verb, Particle.
        An empty ``PrematureTags`` yields an empty list. A non-empty
        ``PrematureTags`` that lacks one of the three keys raises KeyError.
        '''
        topKeys = []
        if not word.PrematureTags:
            return topKeys

        bestScore = -1
        for key in ('Noun', 'Verb', 'Particle'):
            score = word.PrematureTags[key]
            if score == bestScore:
                topKeys.append(key)
            elif score > bestScore:
                # Strictly better: restart the list of winners.
                bestScore = score
                topKeys = [key]
        return topKeys

    def fillLemmas(self):
        '''
        Populate ``self.Lemmas``.

        Without any surface-form analyses the only lemma is
        SecondNormalizationForm. Otherwise the lemmas are the distinct
        ``Cliticless.UnvoweledForm`` values of the analyses, in order of
        first appearance.
        '''
        if not self.SurfaceFormMorphemes:
            self.Lemmas = [self.SecondNormalizationForm]
            return

        lemmas = []
        for surfaceFormMorphemes in self.SurfaceFormMorphemes:
            unvoweledForm = surfaceFormMorphemes.Cliticless.UnvoweledForm
            if unvoweledForm not in lemmas:
                lemmas.append(unvoweledForm)
        self.Lemmas = lemmas