Spaces:

kachaf
/

qutuf

Running

File size: 7,868 Bytes

21baa2f


'''
Created on ١٠‏/٠٣‏/٢٠١٠

@Created by: Muhammad Altabba
'''

from ...Controllers.TextEntities.Sentence import *;
from ...Controllers.TextEntities.Word import *;
from ...Controllers.Tokenization.TokenType import TokenType;
from ...Models.Lexicon import *;
from ...Models.Lexicon.LettersConstants import ArabicLetters;
from ...Models.Tokenization.SentenceSeperatorsList import *;
from ...Models.Tokenization.TokenizerConstants import *;
import re;

class Tokenizer(object):
    """
     # PyUML: Do not remove this line! # XMI_ID:_hUBlVI34Ed-gg8GOK1TmhA

    Text tokenizer.

    Splits a string into ``Sentence`` objects, each carrying a list of
    ``Word`` tokens tagged with a ``TokenType``.  Every character is first
    classified as ``'s'`` (separator), ``'d'`` (digit) or ``'l'`` (letter);
    the resulting list is kept in ``FinalCharsType`` and drives the split.
    """

    # Class-level default; every instance rebinds its own list in __init__.
    FinalCharsType = []

    def __init__(self):
        '''
        Constructor
        '''
        # One type code ('s', 'd' or 'l') per character of the last string
        # passed to Tokenize.
        self.FinalCharsType = []
        # Sentences produced by the most recent Tokenize call (read by
        # __str__).
        self.Sentences = []

    def Tokenize(self, string):
        """
        Split *string* into sentences made of typed word tokens.

        If the last character is not a sentence separator a '.' is appended
        before processing.  The result is also stored in ``self.Sentences``.

        :param string: the text to tokenize.
        :return: list of ``Sentence`` objects, each with its ``Words`` list
                 set; an empty list for empty input.
        """
        self.Sentences = []
        if not string:
            # Nothing to tokenize (indexing the last character would fail).
            return self.Sentences

        if not self.isSentenceSeperator(string[-1]):
            string += '.'

        self.__FillFinalCharsTypeList(string)
        charTypes = self.FinalCharsType
        length = len(string)

        wordStart = 0
        sentenceStart = 0
        wordsList = []
        sentences = []
        i = 0
        while i < length:
            charType = charTypes[i]

            # A non-separator run ends when the next character is a
            # separator.  End-of-string also counts as a boundary so that
            # charTypes[i + 1] is never read out of range.
            if charType != 's' and (i + 1 == length or charTypes[i + 1] == 's'):
                # NOTE(review): this tests the type code, not the character,
                # against WhiteSpacesList; kept as in the original - confirm
                # whether string[i] was intended.
                if charType in WhiteSpacesList:
                    word = Word(string[wordStart:i + 1])
                    word.TokenType = TokenType(2)  # (2, "White Space")
                    wordsList.append(word)
                elif charType == 'l':
                    wordsList.extend(
                        self.__SeparateByLanguage(string[wordStart:i + 1]))
                elif charType == 'd':
                    word = Word(string[wordStart:i + 1])
                    word.TokenType = TokenType(1)  # (1, "Numbers")
                    wordsList.append(word)
                wordStart = i + 1

            if charType == 's':
                isEndOfSentence = False
                sentenceEnd = i + 1
                # Absorb into the same token: a repeated character, a
                # following space, or consecutive sentence separators.
                while i + 1 < length \
                and (string[i] == string[i + 1]
                     or string[i + 1] == ' '
                     or (self.isSentenceSeperator(string[i])
                         and self.isSentenceSeperator(string[i + 1]))):
                    if self.isSentenceSeperator(string[i]):
                        isEndOfSentence = True
                        sentenceEnd = i + 2
                    i += 1

                word = Word(string[wordStart:i + 1])
                if string[i] in WhiteSpacesList:
                    word.TokenType = TokenType(2)  # (2, "White Space")
                else:
                    word.TokenType = TokenType(3)  # (3, "Punctuation")
                wordsList.append(word)
                wordStart = i + 1

                if self.isSentenceSeperator(string[i]) or isEndOfSentence:
                    sentence = Sentence(string[sentenceStart:sentenceEnd])
                    sentence.Words = wordsList
                    sentences.append(sentence)
                    sentenceStart = i + 1
                    wordsList = []

            i += 1

        if wordsList:
            # Tokens collected after the last sentence boundary (possible
            # when the final character was not classified as a separator):
            # keep them as a closing sentence instead of dropping them.
            sentence = Sentence(string[sentenceStart:])
            sentence.Words = wordsList
            sentences.append(sentence)

        self.Sentences = sentences
        return sentences

    def __LanguageOf(self, char):
        """
        Return the TokenType id for *char*: ``ArabicText`` when it is an
        Arabic letter or diacritic, ``OtherText`` otherwise.
        """
        if char in ArabicLetters.AllLetters \
        or char in DiacriticsConstants.AllDiacritics:
            return TokenType.Constants.Id.ArabicText
        return TokenType.Constants.Id.OtherText

    def __SeparateByLanguage(self, string):
        """
        Split a run of letters wherever it switches between Arabic and
        non-Arabic characters.

        :param string: a run of characters classified as letters.
        :return: list of ``Word`` objects, one per same-language stretch,
                 each tagged with the matching ``TokenType``; empty list for
                 empty input.
        """
        if not string:
            return []

        words = []
        wordStart = 0
        previousLanguage = self.__LanguageOf(string[0])
        for i in range(1, len(string)):
            currentLanguage = self.__LanguageOf(string[i])
            if currentLanguage != previousLanguage:
                # Language changed: close the stretch that ended at i - 1.
                word = Word(string[wordStart:i])
                word.TokenType = TokenType(previousLanguage)  # (0 or 4, "Text")
                words.append(word)
                wordStart = i
                previousLanguage = currentLanguage

        # The last (or only) stretch runs to the end of the string.
        word = Word(string[wordStart:])
        word.TokenType = TokenType(previousLanguage)
        words.append(word)
        return words

    def isSentenceSeperator(self, sepChar):
        """
        Return True when *sepChar* is listed in ``SentenceSeperatorsList``.
        """
        return sepChar in SentenceSeperatorsList

    def __IsPlainNeighbour(self, char):
        """
        Return True when *char* is in none of isAmbiguousA, isAmbiguousB,
        isAmbiguousC or isSep (isAmbiguousD is deliberately not consulted,
        as in the original rule).
        """
        return not (char in isAmbiguousA
                    or char in isAmbiguousB
                    or char in isAmbiguousC
                    or char in isSep)

    def __ClassifyChar(self, string, i):
        """
        Return the type code of ``string[i]``: 's', 'd' or 'l'.

        The checks run in a fixed order; the first one that matches wins.
        """
        char = string[i]
        isAmbiguous = (char in isAmbiguousA
                       or char in isAmbiguousB
                       or char in isAmbiguousC
                       or char in isAmbiguousD)

        # An ambiguous character at either end of the string has no two
        # neighbours to disambiguate it, so it is a separator.
        if isAmbiguous and (i == 0 or i + 1 == len(string)):
            return 's'
        if char in isDigit:
            return 'd'
        if not isAmbiguous and char not in isSep:
            return 'l'

        if char in isAmbiguousA:
            # Ambiguous characters reaching this point are interior, so both
            # neighbours exist.  (The original comment names the dot here.)
            before = string[i - 1]
            after = string[i + 1]
            if self.__IsPlainNeighbour(before) \
            and self.__IsPlainNeighbour(after):
                # Between two digits it is part of the number; otherwise it
                # is part of the surrounding word.
                if before in isDigit and after in isDigit:
                    return 'd'
                return 'l'
            return 's'

        if char in isSep:
            return 's'

        if char in isAmbiguousB:
            # Digit only when sitting between two digits.
            if string[i - 1] in isDigit and string[i + 1] in isDigit:
                return 'd'
            return 's'

        if char in isAmbiguousC:
            # Letter only when sitting between two digits.
            if string[i - 1] in isDigit and string[i + 1] in isDigit:
                return 'l'
            return 's'

        # Remaining case: an interior isAmbiguousD character.
        return 's'

    def __FillFinalCharsTypeList(self, string):
        """
        Rebuild ``self.FinalCharsType`` with one type code per character of
        *string* ('s' separator, 'd' digit, 'l' letter).
        """
        self.FinalCharsType = [
            self.__ClassifyChar(string, i) for i in range(len(string))
        ]

    def __str__(self):
        """
        Return the concatenated string forms of the sentences produced by
        the most recent ``Tokenize`` call ('' if there are none).
        """
        return ''.join(str(sentence) for sentence in self.Sentences)