Spaces:

kachaf
/

qutuf

Running

App Files Files Community

qutuf / SourceCode /Controllers /TextEntities /Word.py

Boulbaba

Upload 210 files

21baa2f verified 18 days ago

raw

history blame contribute delete

7.39 kB


	'''
	Created on 10/03/2010

	@Created by: Muhammad Altabba
	'''

	from ...Controllers.Morphology.Entities.SurfaceFormMorphemes import *;
	from ...Controllers.Morphology.Entities.GreedyMorphemes import *;
	from ...Controllers.Tokenization.TokenType import *;
	from ...Controllers.General.ArabicStringUtility import *;
	from ...Models.Lexicon.LettersConstants import *;
	from ...Controllers.Morphology.Entities.Particle import *;
	from ...Controllers.Morphology.Entities.DerivedCliticless import *;

	class Word(object):
	    """
	    # PyUML: Do not remove this line! # XMI_ID:_qyYhVY35Ed-gg8GOK1TmhA
	    """
	    # Text word: holds one token of the input text together with its
	    # normalized forms, its token type, and the results of the tagging and
	    # morphological-analysis stages.
	    #
	    # The class-level attributes below are defaults only; __init__ gives
	    # every instance its own fresh copy, so the mutable defaults declared
	    # here are never shared between instances.

	    # Original string, exactly as passed to the constructor.
	    OriginalString = ''

	    # String after normalization that does not lose information
	    # (example: removing a hyphen "-").
	    FirstNormalizationForm = ''

	    # String after normalization that loses information
	    # (example: removing the vowelization / diacritics).
	    SecondNormalizationForm = ''

	    # String currently being manipulated.
	    String = ''

	    # Token type of the word.
	    TokenType = TokenType()

	    # Set to True once the word has completely finished morphological
	    # parsing, for example when it is detected as a compound word and fully
	    # parsed at the compound-parsing stage.
	    MorphologicalParsingCompleted = False

	    # Possible premature tags assigned by the Premature Tagger; values come
	    # from PrematureTagsSet.
	    # NOTE(review): an older comment described the layout as
	    # PrematureTags[index] = ['TagName', Certainty], but
	    # GetTopPrematureTagsKeys reads PrematureTags['Noun' | 'Verb' |
	    # 'Particle'] and compares the values numerically -- confirm which
	    # layout the tagger really produces.
	    PrematureTags = {}

	    # Word-level tags were moved to the Morpheme level; only the instance
	    # attribute `Tags` created in __init__ remains on the word.

	    # Greedy segmentation of the word; GetAffixationPosibilities reads its
	    # `Proclitics` and `Enclitics` sequences.
	    GreedyMorphemes = GreedyMorphemes([], None, [])

	    # Possible analyses of the word: a list of
	    # Morphology.Entities.SurfaceFormMorphemes instances.
	    SurfaceFormMorphemes = []

	    # Optionally used to expose the lemmas on the Word level (see fillLemmas).
	    Lemmas = []

	    def __init__(self, string):
	        '''
	        Create a word from `string`.

	        `string` is stored both as the original string and as the working
	        string; every other field is reset to an empty per-instance value.
	        '''
	        self.OriginalString = string
	        self.String = string

	        self.FirstNormalizationForm = ''
	        self.SecondNormalizationForm = ''
	        self.TokenType = TokenType()
	        self.PrematureTags = {}
	        self.Tags = []
	        self.SurfaceFormMorphemes = []
	        self.GreedyMorphemes = GreedyMorphemes([], None, [])
	        self.MorphologicalParsingCompleted = False
	        # Fix: Lemmas used to exist only as the class-level list, so every
	        # instance shared one list object until fillLemmas() was called.
	        self.Lemmas = []

	    def GetAffixationPosibilities(self):
	        '''
	        Return a list of all possibilities of word segmentation
	        (that is, all possible forms of the word with clitics).

	        Every entry is a three-element list
	        [proclitics, remaining string, enclitics]:

	        * proclitics starts with the placeholder ('', 'c') followed by a list
	          copy of each greedy proclitic consumed so far;
	        * enclitics holds a list copy of each greedy enclitic consumed so
	          far and ends with the placeholder ('', 'c');
	        * the remaining string is self.String with the consumed proclitic
	          and enclitic letters cut off (the first element of each clitic
	          gives the number of letters to cut).

	        Number of possibilities = 1 + (Number of Proclitics + 1) * (Number of Enclitics + 1)
	        The leading "1" is the unsegmented word, which the loop then emits
	        once more as its own first entry.

	        Simplified example (placeholders omitted) for: أوبعلمائكم
	        [[], 'أوبعلمائكم', []]
	        [[], 'أوبعلمائك', ['م']],
	        [[], 'أوبعلمائ', ['ك', 'م']],
	        [['أ'], 'وبعلمائكم', []],
	        [['أ'], 'وبعلمائك', ['م']],
	        [['أ'], 'وبعلمائ', ['ك', 'م']],
	        [['أ', 'و'], 'بعلمائكم', []],
	        [['أ', 'و'], 'بعلمائك', ['م']],
	        [['أ', 'و'], 'بعلمائ', ['ك', 'م']],
	        [['أ', 'و', 'ب'], 'علمائكم', []],
	        [['أ', 'و', 'ب'], 'علمائك', ['م']],
	        [['أ', 'و', 'ب'], 'علمائ', ['ك', 'م']]
	        '''
	        proclitics = self.GreedyMorphemes.Proclitics
	        enclitics = self.GreedyMorphemes.Enclitics
	        word = self.String

	        possibilities = [[[('', 'c')], word, [('', 'c')]]]
	        prefix = [('', 'c')]
	        prefixLength = 0
	        # i == -1 stands for "no proclitic removed".
	        for i in range(-1, len(proclitics)):
	            suffix = [('', 'c')]
	            if i > -1:
	                # Copy first so entries already emitted keep their own
	                # (shorter) proclitic list.
	                prefix = list(prefix)
	                prefix.append(list(proclitics[i]))
	                prefixLength += len(proclitics[i][0])
	            possibilities.append([prefix, word[prefixLength:], suffix])

	            suffixLength = 0
	            for enclitic in enclitics:
	                # Each further enclitic is placed in front of the ones
	                # already collected; a new list is built every time.
	                suffix = [list(enclitic)] + suffix
	                suffixLength += len(enclitic[0])
	                possibilities.append(
	                    [prefix, word[prefixLength:len(word) - suffixLength], suffix])
	        return possibilities

	    def __str__(self):
	        '''
	        Return a single tab-separated line (terminated by a newline) that
	        lists every field of the word.
	        '''
	        parts = [
	            'Word:',
	            '\tOriginal:' + self.OriginalString,
	            '\tFirst Norm. Form:' + self.FirstNormalizationForm,
	            '\tSecond Norm. Form:' + self.SecondNormalizationForm,
	            '\tString:' + self.String,
	            '\tToken Type:' + str(self.TokenType),
	            '\tPre. Tags:' + str(self.PrematureTags),
	            '\tGreedy Morphemes: ' + str(self.GreedyMorphemes),
	            '\tTags:' + str(self.Tags),
	            '\tPos. Morphemes:' + str(self.SurfaceFormMorphemes),
	            '\n',
	        ]
	        return ''.join(parts)

	    def ClipString(self, formNumber, lettersCountFromStart, lettersCountFromEnd):
	        '''
	        Clip one of the word's string forms via ArabicStringUtility.ClipString.

	        formNumber selects the form to clip:
	            0    -> OriginalString
	            1    -> FirstNormalizationForm
	            2    -> SecondNormalizationForm
	            None -> String (the working string)
	        Any other value clips the empty string.

	        lettersCountFromStart and lettersCountFromEnd are forwarded
	        unchanged; the result is whatever ArabicStringUtility.ClipString
	        returns.
	        '''
	        string = ''
	        if formNumber == 0:
	            string = self.OriginalString
	        elif formNumber == 1:
	            string = self.FirstNormalizationForm
	        elif formNumber == 2:
	            string = self.SecondNormalizationForm
	        elif formNumber is None:
	            string = self.String

	        # The utility class itself is passed as the first positional
	        # argument, exactly as the original call did.
	        return ArabicStringUtility.ClipString(
	            ArabicStringUtility, string, lettersCountFromStart, lettersCountFromEnd)

	    def GetDiacratic(self, procliticString, searchFromRight=False):
	        '''
	        Delegate to ArabicStringUtility.GetDiacratic using the word's
	        FirstNormalizationForm.

	        procliticString and searchFromRight are forwarded unchanged.
	        NOTE(review): the literal 0 is presumably a start index -- confirm
	        against ArabicStringUtility.GetDiacratic.
	        '''
	        return ArabicStringUtility.GetDiacratic(
	            ArabicStringUtility, self.FirstNormalizationForm, procliticString, 0,
	            searchFromRight)

	    def GetTopPrematureTagsKeys(self, word):
	        '''
	        Return the keys among 'Noun', 'Verb' and 'Particle' that have the
	        highest value in word.PrematureTags.

	        Ties are all returned, in the order Noun, Verb, Particle. An empty
	        PrematureTags gives an empty list. Only values greater than -1 can
	        be selected. Note that the tags are read from the `word` argument,
	        not from self.
	        '''
	        topKeys = []
	        best = -1
	        if word.PrematureTags:
	            for key in ['Noun', 'Verb', 'Particle']:
	                score = word.PrematureTags[key]
	                if best == score:
	                    topKeys.append(key)
	                elif best < score:
	                    # Strictly better: restart the list with this key.
	                    best = score
	                    topKeys = [key]
	        return topKeys

	    def fillLemmas(self):
	        '''
	        Populate self.Lemmas.

	        Without any surface-form analyses the only lemma is the
	        SecondNormalizationForm; otherwise it is the list of distinct
	        Cliticless.UnvoweledForm values, in first-seen order.
	        '''
	        if not self.SurfaceFormMorphemes:
	            self.Lemmas = [self.SecondNormalizationForm]
	            return

	        self.Lemmas = []
	        for surfaceFormMorphemes in self.SurfaceFormMorphemes:
	            unvoweledForm = surfaceFormMorphemes.Cliticless.UnvoweledForm
	            if unvoweledForm not in self.Lemmas:
	                self.Lemmas.append(unvoweledForm)