|
|
| ''' |
| Created on ١٠/٠٣/٢٠١٠ |
| |
| @Created by: Muhammad Altabba |
| ''' |
|
|
| from ...Controllers.TextEntities.Sentence import *; |
| from ...Controllers.TextEntities.Word import *; |
| from ...Controllers.Tokenization.TokenType import TokenType; |
| from ...Models.Lexicon import *; |
| from ...Models.Lexicon.LettersConstants import ArabicLetters; |
| from ...Models.Tokenization.SentenceSeperatorsList import *; |
| from ...Models.Tokenization.TokenizerConstants import *; |
| import re; |
|
|
class Tokenizer(object):
    """
    # PyUML: Do not remove this line! # XMI_ID:_hUBlVI34Ed-gg8GOK1TmhA
    """
    # Text tokenizer.
    #
    # Splits a string into Sentence objects, each carrying a list of Word
    # objects.  Every character is first classified as 's' (separator),
    # 'd' (digit) or 'l' (letter); runs of characters are then grouped into
    # words, and words into sentences.

    # Per-character type codes ('s', 'd' or 'l') of the last processed string.
    # Shadowed by an instance attribute in __init__.
    FinalCharsType = []

    def __init__(self):
        """Create a tokenizer with no classified characters and no sentences."""
        self.FinalCharsType = []
        # Result of the most recent Tokenize() call; read by __str__.
        self.Sentences = []

    def Tokenize(self, string):
        """Split *string* into sentences made of typed words.

        A '.' is appended when the text does not already end with a sentence
        separator, so the last sentence is always closed.

        Args:
            string: The text to tokenize.

        Returns:
            A list of Sentence objects whose ``Words`` attribute holds the
            Word objects found in that sentence.  An empty input yields an
            empty list.  The same list is stored in ``self.Sentences``.
        """
        if not string:
            # Nothing to tokenize; the original indexing would raise here.
            self.Sentences = []
            return []

        if not self.isSentenceSeperator(string[-1]):
            string += '.'

        self.__FillFinalCharsTypeList(string)
        charTypes = self.FinalCharsType
        length = len(string)

        wordStart = 0
        sentenceStart = 0
        wordsList = []
        sentences = []
        i = 0
        while i < length:
            # A non-separator run ends when the next character is a separator.
            # The end of the string is treated as a boundary too, so that
            # charTypes[i + 1] is never read past the last index.
            nextIsBoundary = i + 1 == length or charTypes[i + 1] == 's'
            if charTypes[i] != 's' and nextIsBoundary:
                # NOTE(review): this tests the type code ('l'/'d'), not the
                # character itself, so it looks unreachable unless
                # WhiteSpacesList contains those codes -- confirm intent.
                if charTypes[i] in WhiteSpacesList:
                    word = Word(string[wordStart:i + 1])
                    word.TokenType = TokenType(2)
                    wordsList.append(word)
                elif charTypes[i] == 'l':
                    # A letter run may mix languages; split it further.
                    words = self.__SeparateByLanguage(string[wordStart:i + 1])
                    wordsList.extend(words)
                elif charTypes[i] == 'd':
                    word = Word(string[wordStart:i + 1])
                    word.TokenType = TokenType(1)
                    wordsList.append(word)
                wordStart = i + 1

            if charTypes[i] == 's':
                isEndOfSentence = False
                sentenceEnd = i + 1
                # Absorb the following characters into the same separator
                # token while they repeat the current character, are a
                # space, or continue a run of sentence separators.
                while i + 1 < length and (
                        string[i] == string[i + 1]
                        or string[i + 1] == ' '
                        or (self.isSentenceSeperator(string[i])
                            and self.isSentenceSeperator(string[i + 1]))):
                    if self.isSentenceSeperator(string[i]):
                        isEndOfSentence = True
                        # NOTE(review): the source this was restored from had
                        # lost its indentation; this update is assumed to
                        # belong to the separator check -- confirm.
                        sentenceEnd = i + 2
                    i += 1

                word = Word(string[wordStart:i + 1])
                if string[i] in WhiteSpacesList:
                    word.TokenType = TokenType(2)
                else:
                    word.TokenType = TokenType(3)
                wordsList.append(word)
                wordStart = i + 1

                if self.isSentenceSeperator(string[i]) or isEndOfSentence:
                    sentence = Sentence(string[sentenceStart:sentenceEnd])
                    sentence.Words = wordsList
                    sentences.append(sentence)
                    sentenceStart = i + 1
                    wordsList = []

            i += 1

        if wordsList:
            # Only reachable when the final character was not classified as a
            # separator; keep the trailing words instead of dropping them.
            sentence = Sentence(string[sentenceStart:])
            sentence.Words = wordsList
            sentences.append(sentence)

        self.Sentences = sentences
        return sentences

    def __LanguageOf(self, char):
        """Return the TokenType id (ArabicText or OtherText) for *char*."""
        if (char in ArabicLetters.AllLetters
                or char in DiacriticsConstants.AllDiacritics):
            return TokenType.Constants.Id.ArabicText
        return TokenType.Constants.Id.OtherText

    def __SeparateByLanguage(self, string):
        """Split a letter run into words at every Arabic/other switch.

        Arabic letters and diacritics count as Arabic text; every other
        character counts as other text.

        Args:
            string: A run of characters without separators.

        Returns:
            A list of Word objects, each tagged with the TokenType of its
            language.  An empty input yields an empty list.
        """
        if not string:
            return []

        words = []
        wordStart = 0
        previousLanguage = self.__LanguageOf(string[0])
        for i in range(1, len(string)):
            currentLanguage = self.__LanguageOf(string[i])
            if previousLanguage != currentLanguage:
                # Language changed: close the word that ends before index i.
                word = Word(string[wordStart:i])
                word.TokenType = TokenType(previousLanguage)
                words.append(word)
                wordStart = i
                previousLanguage = currentLanguage

        # The last run (the whole string when no switch occurred).
        word = Word(string[wordStart:])
        word.TokenType = TokenType(previousLanguage)
        words.append(word)
        return words

    def isSentenceSeperator(self, sepChar):
        """Return True when *sepChar* is listed in SentenceSeperatorsList."""
        return sepChar in SentenceSeperatorsList

    def __FillFinalCharsTypeList(self, string):
        """Classify every character of *string* into ``self.FinalCharsType``.

        Each entry is 's' (separator), 'd' (digit) or 'l' (letter).
        Characters from the isAmbiguous* sets are resolved by looking at
        their neighbours; at either end of the string they are always 's'.

        Args:
            string: The text whose characters are classified.
        """
        self.FinalCharsType = []
        lastIndex = len(string) - 1

        for i, char in enumerate(string):
            inA = char in isAmbiguousA
            inB = char in isAmbiguousB
            inC = char in isAmbiguousC
            inD = char in isAmbiguousD
            isAmbiguous = inA or inB or inC or inD

            if isAmbiguous and (i == lastIndex or i == 0):
                # No neighbour on one side, so it cannot be part of a token.
                charType = 's'
            elif char in isDigit:
                charType = 'd'
            elif not isAmbiguous and char not in isSep:
                charType = 'l'
            elif inA:
                # Neighbours exist here: edge positions were handled above.
                previous = string[i - 1]
                following = string[i + 1]
                neighboursArePlain = (
                    previous not in isAmbiguousA
                    and previous not in isAmbiguousB
                    and previous not in isAmbiguousC
                    and previous not in isSep
                    and following not in isAmbiguousA
                    and following not in isAmbiguousB
                    and following not in isAmbiguousC
                    and following not in isSep)
                if not neighboursArePlain:
                    charType = 's'
                elif previous in isDigit and following in isDigit:
                    charType = 'd'
                else:
                    charType = 'l'
            elif char in isSep:
                charType = 's'
            elif inB:
                # Between two digits it is part of the number.
                if string[i - 1] in isDigit and string[i + 1] in isDigit:
                    charType = 'd'
                else:
                    charType = 's'
            elif inC:
                # Between two digits it is typed as a letter.
                if string[i - 1] in isDigit and string[i + 1] in isDigit:
                    charType = 'l'
                else:
                    charType = 's'
            else:
                charType = 's'

            self.FinalCharsType.append(charType)

    def __str__(self):
        """Return the concatenated text of the last tokenized sentences."""
        # getattr keeps this safe for instances built without __init__.
        sentences = getattr(self, 'Sentences', [])
        return ''.join(str(sentence) for sentence in sentences)
| |
|
|
| |
|
|