# Loading of SKT Pickles
from romtoslp import rom_slp
from json import *
import pprint
from utilities import *


class word_new():
    def __init__(self, names):
        self.lemmas = []
        self.names = names
        self.urls = []
        self.forms = []


class chunks:
    def __init__(self, chunk_name):
        self.chunk_name = chunk_name
        self.chunk_words = {}


class sentences:
    def __init__(self, sent_id, sentence):
        self.sent_id = sent_id
        self.sentence = sentence
        self.chunk = []
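# Illustrative sketch of how the pickled objects above nest (the layout is
# inferred from the traversal code later in this file):
#
#   sentences.chunk            -> list of `chunks`
#   chunks.chunk_words[pos]    -> list of `word_new` senses at that chunk position
#   word_new.lemmas / .forms   -> parallel lists of candidate lemmas and their form dicts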
# def getCNGs(formsDict):
#     l = []
#     if type(formsDict) == int or type(formsDict) == str:
#         return [int(formsDict)]
#     else:
#         for form, configs in formsDict.items():
#             for c in configs:
#                 if(form == 'verbform'):
#                     continue
#                 else:
#                     l.append(wtc_recursive(form, configs))
#         return list(set(l))
class SentenceError(Exception):
    def __init__(self, message):
        # Call the base class constructor with the parameters it needs
        super(SentenceError, self).__init__(message)
def SeeSentence(sentenceObj):
    print('SKT ANALYZE')
    print('-'*15)
    print(sentenceObj.sentence)
    zz = 0
    # (chunkDict, lemmaList, wordList, revMap2Chunk, qu, cngList, verbs, tuplesMain) = SentencePreprocess(sentenceObj)
    # for cid in chunkDict.keys():
    #     print('Analyzing:', rom_slp(sentenceObj.chunk[cid].chunk_name))
    #     for pos in chunkDict[cid].keys():
    #         tupIds = chunkDict[cid][pos]
    #         for ti in tupIds:
    #             print('%d :' % (pos, ), end=' ')
    #             print(tuplesMain[ti][0][1], end=' ')
    #             for tup in tuplesMain[ti]:
    #                 print([zz, tup[2], tup[3]], end=' ')
    #                 zz += 1
    #             print('')
    #     print('-'*25)
    for chunk in sentenceObj.chunk:
        print("Analyzing ", rom_slp(chunk.chunk_name))
        for pos in chunk.chunk_words.keys():
            for word_sense in chunk.chunk_words[pos]:
                word_sense = fix_w_new(word_sense)
                print(pos, ": ", rom_slp(word_sense.names), word_sense.lemmas, word_sense.forms)
                # for formsDict in word_sense.forms:
                #     print(getCNGs(formsDict))
        print()
def getWord(sentenceObj, cid, pos, kii):
    ch = sentenceObj.chunk[cid]
    word = ch.chunk_words[pos][kii]
    return {'lemmas': word.lemmas, 'forms': word.forms, 'names': word.names}
# ---------------------------------------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------------------------------------
from wordTypeCheckFunction import *
import pickle
import numpy as np  # used by the query fallback at the end of SentencePreprocess
"""
SentencePreprocess:
-------------------
Read a sentence obj and create + return the following objects
-> chunkDict: chunk_id -> position -> index in lemmaList (nested dictionary)
-> lemmaList: list of possible words as a result of word segmentation
-> revMap2Chunk: Map word in wordList to (cid, position) in chunkDict
-> qu: Possible query nodes
"""
v2t = pickle.load(open('verbs_vs_cngs_matrix_countonly.p', 'rb'), encoding=u'utf8')
def wtc_recursive(form, c):
    if type(c) == list:
        for cc in c:
            return wtc_recursive(form, cc)
    else:
        return wordTypeCheck(form, c)
def CanBeQuery(chunk):
    allLemmas = []
    for pos, words in chunk.chunk_words.items():
        for word in words:
            for lemma in word.lemmas:
                if lemma != '':
                    allLemmas.append(lemma)
    if(len(allLemmas) == 1):
        return True
    return False
def Get_QCs(tuplesMain, chunkDict):
    # Form NON-competitor dictionary - Query - Candidate Pairs
    qc_pairs = {}
    nodeList = [t for ts in tuplesMain for t in ts]
    for ni in range(len(nodeList)):
        qc_pairs[ni] = set(range(len(nodeList))) - set([ni])
    for cid in chunkDict.keys():
        # Neighbours
        for pos1 in chunkDict[cid].keys():
            for pos2 in chunkDict[cid].keys():
                if pos1 <= pos2:
                    nList1 = []
                    for ti1 in chunkDict[cid][pos1]:
                        for tup1 in tuplesMain[ti1]:
                            nList1.append(tup1[0])
                    nList2 = []
                    for ti2 in chunkDict[cid][pos2]:
                        for tup2 in tuplesMain[ti2]:
                            nList2.append(tup2[0])
                    nList1 = set(nList1)
                    nList2 = set(nList2)
                    for n1 in nList1:
                        qc_pairs[n1] = qc_pairs[n1] - nList1
                    for n2 in nList2:
                        qc_pairs[n2] = qc_pairs[n2] - nList2
                    if pos1 < pos2:
                        for n1 in nList1:
                            for n2 in nList2:
                                if not CanCoExist_sandhi(pos1, pos2, nodeList[n1][1], nodeList[n2][1]):
                                    qc_pairs[n1] = qc_pairs[n1] - set([n2])
                                    qc_pairs[n2] = qc_pairs[n2] - set([n1])
    return qc_pairs
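# Minimal sketch of calling Get_QCs on hand-built inputs (the tuples below are
# hypothetical; only the shapes match what SentencePreprocess constructs).
# Nodes in different chunks never compete here, so each keeps the other in its
# set of admissible query/candidate partners:
#
#   toy_tuples = [[(0, 'rAmaH', 'rAma', 71)], [(1, 'gacCati', 'gam', 91)]]
#   toy_chunks = {0: {0: [0]}, 1: {0: [1]}}
#   Get_QCs(toy_tuples, toy_chunks)   # -> {0: {1}, 1: {0}}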
'''
===================
SentencePreprocess
===================
forceQuery: Setting it to True is meant to make the longest available word a
query when no other query is available. (In the current code the fallback at
the end of the function is applied whenever no query is found.)
'''
def SentencePreprocess(sentenceObj, forceQuery=False):
    """
    Considering word names only
    ***{Word forms or cngs can also be used}
    """
    def getCNGs(formsDict):
        if type(formsDict) == int or type(formsDict) == str:
            return [int(formsDict)]
        else:
            l = []
            for form, configs in formsDict.items():
                for c in configs:
                    if(form == 'verbform'):
                        continue
                    else:
                        l.append(wtc_recursive(form, c))
            return list(set(l))
    chunkDict = {}
    lemmaList = []
    wordList = []
    cngList = []
    revMap2Chunk = []
    qu = []
    tuplesMain = []
    cid = -1
    tidExclusive = 0
    ## Traverse sentence and form data-structures
    for chunk in sentenceObj.chunk:
        # print(chunk.chunk_name)
        cid = cid + 1
        chunkDict[cid] = {}
        for pos in chunk.chunk_words.keys():
            tupleSet = {}
            chunkDict[cid][pos] = []
            for word_sense in chunk.chunk_words[pos]:
                # word_sense = fix_w_new(word_sense)
                nama = rom_slp(word_sense.names)
                if nama == '':
                    raise SentenceError('Empty Name Detected')
                if(len(word_sense.lemmas) > 0 and len(word_sense.forms) > 0):
                    tuples = []
                    for lemmaI in range(len(word_sense.lemmas)):
                        # lemma = rom_slp(word_sense.lemmas[lemmaI].split('_')[0]) # NOT REQUIRED - DONE IN FIX_W_NEW
                        lemma = word_sense.lemmas[lemmaI]
                        if lemma == '':
                            continue
                        tempCNGs = getCNGs(word_sense.forms[lemmaI])
                        for cng in tempCNGs:
                            # UPDATE LISTS
                            newT_Key = (lemma, cng)
                            newT = (tidExclusive, nama, lemma, cng)
                            if(newT_Key not in tupleSet):
                                tupleSet[newT_Key] = 1
                                tuples.append(newT)  # Remember the order
                                lemmaList.append(lemma)
                                wordList.append(nama)
                                cngList.append(cng)
                                revMap2Chunk.append((cid, pos, len(tuplesMain)))
                                tidExclusive += 1
                    if(len(tuples) > 0):
                        # print(tuples)
                        k = len(tuplesMain)
                        chunkDict[cid][pos].append(k)
                        tuplesMain.append(tuples)
    ## Find QUERY nodes now
    for cid in chunkDict.keys():
        tuples = []
        for pos in chunkDict[cid].keys():
            tupIds = chunkDict[cid][pos]
            for tupId in tupIds:
                [tuples.append((pos, tup[0], tup[1])) for tup in tuplesMain[tupId]]
        for u in range(len(tuples)):
            tup1 = tuples[u]
            quFlag = True
            for v in range(len(tuples)):
                if(u == v):
                    continue
                tup2 = tuples[v]
                # '''
                # FIXME: REMOVE TRY CATCH
                # '''
                # try:
                if(tup1[0] < tup2[0]):
                    if not CanCoExist_sandhi(tup1[0], tup2[0], tup1[2], tup2[2]):
                        ## Found a competing node - hence can't be a query
                        quFlag = False
                        break
                elif(tup1[0] > tup2[0]):
                    if not CanCoExist_sandhi(tup2[0], tup1[0], tup2[2], tup1[2]):
                        ## Found a competing node - hence can't be a query
                        quFlag = False
                        break
                else:
                    quFlag = False
                    break
                # except IndexError:
                #     print('From SentencePreprocess IndexError:', sentenceObj.sent_id)
                #     raise IndexError
            if quFlag:
                qu.append(tup1[1])
    # if len(qu) == 0:
    #     print('No query available')
    #     maxI = 0
    #     for i in range(len(wordList)):
    #         if len(wordList[i]) > len(wordList[maxI]):
    #             maxI = i
    #         elif len(wordList[i]) == len(wordList[maxI]):
    #             # Check the competitor count
    #     print(wordList[maxI], 'is forced query')
    verbs = []
    i = -1
    for w in lemmaList:
        i += 1
        if w in list(v2t.keys()):
            verbs.append(i)
    # pprint.pprint(tuplesMain)
    # pprint.pprint(chunkDict)
    # pprint.pprint(revMap2Chunk)
    qc_pairs = Get_QCs(tuplesMain, chunkDict)
    '''
    qu = [] # Have to remove it later
    '''
    # print(chunkDict)
    if len(qu) == 0 and len(lemmaList) > 0:
        lens = np.array([len(t[1]) for ts in tuplesMain for t in ts])
        cw = [(t[0], t[1]) for ts in tuplesMain for t in ts]
        round1 = np.where(lens == np.max(lens))[0]
        hits = [len(qc_pairs[r]) for r in round1]
        finalist = round1[np.where(hits == np.min(hits))][0]
        qu.append(finalist)
    return (chunkDict, lemmaList, wordList, revMap2Chunk, qu, cngList, verbs, tuplesMain, qc_pairs)
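

# A minimal usage sketch (not part of the original pipeline): load one pickled
# `sentences` object and run the preprocessing on it. The pickle file name
# below is a hypothetical placeholder.
if __name__ == '__main__':
    with open('sample_sentence.p', 'rb') as f:  # hypothetical path to a pickled `sentences` object
        sentenceObj = pickle.load(f, encoding='utf8')
    SeeSentence(sentenceObj)
    (chunkDict, lemmaList, wordList, revMap2Chunk,
     qu, cngList, verbs, tuplesMain, qc_pairs) = SentencePreprocess(sentenceObj)
    print('Query nodes:', qu)
    print('Verb node indices:', verbs)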