Spaces:

kachaf
/

qutuf

Running

App Files Files Community

qutuf / SourceCode /Controllers /Tokenization /Tokenizer.py

Boulbaba

Upload 210 files

21baa2f verified 18 days ago

raw

history blame contribute delete

7.87 kB


	'''
	Created on ١٠‏/٠٣‏/٢٠١٠

	@Created by: Muhammad Altabba
	'''

	from ...Controllers.TextEntities.Sentence import *;
	from ...Controllers.TextEntities.Word import *;
	from ...Controllers.Tokenization.TokenType import TokenType;
	from ...Models.Lexicon import *;
	from ...Models.Lexicon.LettersConstants import ArabicLetters;
	from ...Models.Tokenization.SentenceSeperatorsList import *;
	from ...Models.Tokenization.TokenizerConstants import *;
	import re;

	class Tokenizer(object):
	    """
	    # PyUML: Do not remove this line! # XMI_ID:_hUBlVI34Ed-gg8GOK1TmhA
	    """
	    '''
	    Text Tokenizer

	    Splits a string into Sentence objects, each holding a list of Word
	    tokens tagged with a TokenType.
	    '''
	    # Per-character type codes computed for the most recently tokenized
	    # string: 'l' (letter), 'd' (digit) or 's' (separator).
	    FinalCharsType = []
	    # Sentences produced by the most recent Tokenize call; read by __str__.
	    Sentences = []

	    def __init__(self):
	        '''
	        Constructor
	        '''
	        self.FinalCharsType = []
	        self.Sentences = []

	    def Tokenize(self, string):
	        '''
	        Split `string` into sentences made of typed word tokens.

	        A '.' is appended when the last character is not a sentence
	        separator, so the text always ends by closing a sentence.

	        Returns the list of Sentence objects (also kept in self.Sentences).
	        An empty string yields an empty list.
	        '''
	        if not string:
	            self.FinalCharsType = []
	            self.Sentences = []
	            return []

	        if not self.isSentenceSeperator(string[-1]):
	            string += '.'

	        self.__FillFinalCharsTypeList(string)
	        charTypes = self.FinalCharsType
	        length = len(string)

	        wordStart = 0
	        sentenceStart = 0
	        wordsList = []
	        sentences = []
	        i = 0
	        while i < length:
	            # A non-separator token ends where the next character is typed
	            # 's'. The end of the string counts as a boundary too, so
	            # charTypes[i + 1] is never read out of range.
	            endsToken = i + 1 == length or charTypes[i + 1] == 's'
	            if charTypes[i] != 's' and endsToken:
	                token = string[wordStart:i + 1]
	                # NOTE(review): this tests the type code ('l'/'d'), not the
	                # character, against WhiteSpacesList -- kept as in the
	                # original; confirm whether string[i] was intended.
	                if charTypes[i] in WhiteSpacesList:
	                    wordsList.append(self.__MakeWord(token, 2))  # (2, "White Space")
	                elif charTypes[i] == 'l':
	                    wordsList.extend(self.__SeparateByLanguage(token))
	                elif charTypes[i] == 'd':
	                    wordsList.append(self.__MakeWord(token, 1))  # (1, "Numbers")
	                wordStart = i + 1

	            if charTypes[i] == 's':
	                isEndOfSentence = False
	                sentenceEnd = i + 1
	                # Extend the separator token over a repeated character, a
	                # following space, or a run of sentence separators.
	                while i + 1 < length and (
	                        string[i] == string[i + 1]
	                        or string[i + 1] == ' '
	                        or (self.isSentenceSeperator(string[i])
	                            and self.isSentenceSeperator(string[i + 1]))):
	                    if self.isSentenceSeperator(string[i]):
	                        isEndOfSentence = True
	                        # The sentence text also covers the character that
	                        # follows the separator.
	                        sentenceEnd = i + 2
	                    i += 1

	                # The token type is decided by the last absorbed character.
	                if string[i] in WhiteSpacesList:
	                    typeId = 2  # (2, "White Space")
	                else:
	                    typeId = 3  # (3, "Punctuation")
	                wordsList.append(self.__MakeWord(string[wordStart:i + 1], typeId))
	                wordStart = i + 1

	                if self.isSentenceSeperator(string[i]) or isEndOfSentence:
	                    sentence = Sentence(string[sentenceStart:sentenceEnd])
	                    sentence.Words = wordsList
	                    sentences.append(sentence)
	                    sentenceStart = i + 1
	                    wordsList = []

	            i += 1

	        # Only reachable when the final character was not typed 's': keep
	        # the words collected so far instead of dropping them.
	        if wordsList:
	            sentence = Sentence(string[sentenceStart:])
	            sentence.Words = wordsList
	            sentences.append(sentence)

	        self.Sentences = sentences
	        return sentences

	    def __MakeWord(self, text, typeId):
	        '''
	        Build a Word for `text` whose TokenType is TokenType(typeId).
	        '''
	        word = Word(text)
	        word.TokenType = TokenType(typeId)
	        return word

	    def __LanguageOf(self, char):
	        '''
	        Return the TokenType id for `char`: ArabicText when it is an Arabic
	        letter or diacritic, OtherText otherwise.
	        '''
	        if char in ArabicLetters.AllLetters or char in DiacriticsConstants.AllDiacritics:
	            return TokenType.Constants.Id.ArabicText
	        return TokenType.Constants.Id.OtherText

	    def __SeparateByLanguage(self, string):
	        '''
	        Split a letters-only token into runs of Arabic and non-Arabic text.

	        Returns a list of Word objects, one per run, each tagged with the
	        TokenType of its language. An empty string yields an empty list.
	        '''
	        if not string:
	            return []

	        words = []
	        wordStart = 0
	        previousLanguage = self.__LanguageOf(string[0])
	        for i in range(1, len(string)):
	            currentLanguage = self.__LanguageOf(string[i])
	            if currentLanguage != previousLanguage:
	                # The language changed: close the run that ended at i - 1.
	                words.append(self.__MakeWord(string[wordStart:i], previousLanguage))
	                wordStart = i
	                previousLanguage = currentLanguage

	        # The last run (or the whole string when it is one language).
	        words.append(self.__MakeWord(string[wordStart:], previousLanguage))
	        return words

	    def isSentenceSeperator(self, sepChar):
	        '''
	        Return True when `sepChar` is listed in SentenceSeperatorsList.
	        '''
	        return sepChar in SentenceSeperatorsList

	    def __IsBreakingChar(self, char):
	        '''
	        Return True when `char` is in isAmbiguousA, isAmbiguousB,
	        isAmbiguousC or isSep (isAmbiguousD is not consulted here).
	        '''
	        return (char in isAmbiguousA or char in isAmbiguousB
	                or char in isAmbiguousC or char in isSep)

	    def __IsBetweenDigits(self, string, i):
	        '''
	        Return True when both neighbours of string[i] are in isDigit.
	        Callers must ensure 0 < i < len(string) - 1.
	        '''
	        return string[i - 1] in isDigit and string[i + 1] in isDigit

	    def __ClassifyChar(self, string, i):
	        '''
	        Return the type code of string[i]: 'l', 'd' or 's'.

	        Ambiguous characters are resolved by looking at their neighbours;
	        at either end of the string they are always 's'.
	        '''
	        char = string[i]
	        isAmbiguous = (char in isAmbiguousA or char in isAmbiguousB
	                       or char in isAmbiguousC or char in isAmbiguousD)

	        # No neighbour on one side: an ambiguous character is a separator.
	        if isAmbiguous and (i == 0 or i + 1 == len(string)):
	            return 's'
	        if char in isDigit:
	            return 'd'
	        if not isAmbiguous and char not in isSep:
	            return 'l'

	        # From here on an ambiguous character has both neighbours.
	        if char in isAmbiguousA:
	            if (self.__IsBreakingChar(string[i - 1])
	                    or self.__IsBreakingChar(string[i + 1])):
	                return 's'
	            # Inside a token: part of a number between two digits,
	            # otherwise part of a word.
	            return 'd' if self.__IsBetweenDigits(string, i) else 'l'
	        if char in isSep:
	            return 's'
	        if char in isAmbiguousB:
	            return 'd' if self.__IsBetweenDigits(string, i) else 's'
	        if char in isAmbiguousC:
	            # Between two numbers this counts as a letter.
	            return 'l' if self.__IsBetweenDigits(string, i) else 's'
	        return 's'

	    def __FillFinalCharsTypeList(self, string):
	        '''
	        Rebuild self.FinalCharsType with one type code per character of
	        `string`.
	        '''
	        self.FinalCharsType = [self.__ClassifyChar(string, i)
	                               for i in range(len(string))]

	    def __str__(self):
	        '''
	        Return the concatenated string forms of the sentences produced by
	        the most recent Tokenize call ('' before any call).
	        '''
	        return ''.join(str(sentence) for sentence in self.Sentences)