File size: 11,163 Bytes
382124c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
#Loading of SKT Pickles
from romtoslp import rom_slp
from json import *
import pprint
from utilities import *
class word_new():
    def __init__(self,names):
        self.lemmas=[]
        self.names=names
        self.urls=[]
        self.forms=[]

class chunks:
    def __init__(self,chunk_name):
        self.chunk_name=chunk_name
        self.chunk_words={}

class sentences:
    def __init__(self,sent_id,sentence):
        self.sent_id=sent_id
        self.sentence=sentence
        self.chunk=[]

# def getCNGs(formsDict):
#         l = []
#         if type(formsDict) == int or type(formsDict) == str:
#             return [int(formsDict)]
#         else:
#             for form, configs in formsDict.items():
#                 for c in configs:
#                     if(form == 'verbform'):                
#                         continue
#                     else:
#                         l.append(wtc_recursive(form, configs))
#             return list(set(l))

class SentenceError(Exception):
    def __init__(self, message):

        # Call the base class constructor with the parameters it needs
        super(SentenceError, self).__init__(message)

def SeeSentence(sentenceObj):
    print('SKT ANALYZE')
    print('-'*15)
    print(sentenceObj.sentence)
    zz = 0
    # (chunkDict, lemmaList, wordList, revMap2Chunk, qu, cngList, verbs, tuplesMain) = SentencePreprocess(sentenceObj)
    # for cid in chunkDict.keys():
    #     print('Analyzing:', rom_slp(sentenceObj.chunk[cid].chunk_name))
    #     for pos in chunkDict[cid].keys():
    #         tupIds = chunkDict[cid][pos]
    #         for ti in tupIds:
    #             print('%d :' % (pos, ), end = ' ')
    #             print(tuplesMain[ti][0][1], end=' ')
    #             for tup in tuplesMain[ti]:
    #                 print([zz, tup[2], tup[3]], end=' ')
    #                 zz += 1
    #             print('')
    #     print('-'*25)

    for chunk in sentenceObj.chunk:
        print("Analyzing ", rom_slp(chunk.chunk_name))
        for pos in chunk.chunk_words.keys():
            for word_sense in chunk.chunk_words[pos]:
                word_sense = fix_w_new(word_sense)
                print(pos, ": ", rom_slp(word_sense.names), word_sense.lemmas, word_sense.forms)
                # for formsDict in word_sense.forms:
                #     print(getCNGs(formsDict))
    print()

def getWord(sentenceObj, cid, pos,kii):
    ch = sentenceObj.chunk[cid]
    word = ch.chunk_words[pos][kii]
    return {'lemmas': word.lemmas, 'forms':word.forms, 'names':word.names}

# ---------------------------------------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------------------------------------

# ---------------------------------------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------------------------------------
from wordTypeCheckFunction import *
import pickle

"""
SentencePreprocess:
-------------------
    Read a sentence obj and create + return the following objects 

    -> chunkDict: chunk_id -> position -> index in lemmaList (nested dictionary)
    -> lemmaList: list of possible words as a result of word segmentation
    -> revMap2Chunk: Map word in wordList to (cid, position) in chunkDict
    -> qu: Possible query nodes
"""
v2t = pickle.load(open('verbs_vs_cngs_matrix_countonly.p', 'rb'), encoding=u'utf8')
def wtc_recursive(form, c):
    if type(c) ==list:
        for cc in c:
            return wtc_recursive(form, cc)
    else:
        return wordTypeCheck(form, c)

def CanBeQuery(chunk):
    allLemmas = []
    for pos, words in chunk.chunk_words.items():
        for word in words:
            for lemma in word.lemmas:
                if lemma != '':
                    allLemmas.append(lemma)
    if(len(allLemmas) == 1):
        return True

def Get_QCs(tuplesMain, chunkDict):
    # Form NON-competitor dictionary - Query - Candidate Pairs
    qc_pairs = {}
    nodeList = [t for ts in tuplesMain for t in ts]
    
    for ni in range(len(nodeList)):
        qc_pairs[ni] = set(range(len(nodeList))) - set([ni])

    for cid in chunkDict.keys():
        # Neighbours
        for pos1 in chunkDict[cid].keys():
            for pos2 in chunkDict[cid].keys():
                if pos1 <= pos2:
                    nList1 = []
                    for ti1 in chunkDict[cid][pos1]:
                        for tup1 in tuplesMain[ti1]:
                            nList1.append(tup1[0])
                    nList2 = []
                    for ti2 in chunkDict[cid][pos2]:
                        for tup2 in tuplesMain[ti2]:
                            nList2.append(tup2[0])
                    nList1 = set(nList1)
                    nList2 = set(nList2)
                    for n1 in nList1:
                        qc_pairs[n1] = qc_pairs[n1] - nList1

                    for n2 in nList2:
                        qc_pairs[n2] = qc_pairs[n2] - nList2

                    if pos1 < pos2:
                        for n1 in nList1:
                            for n2 in nList2:
                                if not CanCoExist_sandhi(pos1, pos2, nodeList[n1][1], nodeList[n2][1]):
                                    qc_pairs[n1] = qc_pairs[n1] - set([n2])
                                    qc_pairs[n2] = qc_pairs[n2] - set([n1])
                                    
    return qc_pairs

'''
===================
SentencePreprocess
===================
forceQuery: Setting it true will make the longest word available a query if no
            other query is available
'''
def SentencePreprocess(sentenceObj, forceQuery = False):
    """
    Considering word names only
    ***{Word forms or cngs can also be used}
    """
    def getCNGs(formsDict):
        if type(formsDict) == int or type(formsDict) == str:
            return [int(formsDict)]
        else:
            l = []
            for form, configs in formsDict.items():
                for c in configs:
                    if(form == 'verbform'):
                        continue
                    else:
                        l.append(wtc_recursive(form, c))
            return list(set(l))

    chunkDict = {}
    lemmaList = []
    wordList = []
    cngList = []
    revMap2Chunk = []
    qu = []
    tuplesMain = []

    cid = -1
    tidExclusive = 0

    ## Traverse sentence and form data-structures
    for chunk in sentenceObj.chunk:
        # print(chunk.chunk_name)
        cid = cid+1
        chunkDict[cid] = {}
        for pos in chunk.chunk_words.keys():
            tupleSet = {}
            chunkDict[cid][pos] = []
            for word_sense in chunk.chunk_words[pos]:
                # word_sense = fix_w_new(word_sense)
                nama = rom_slp(word_sense.names)
                if nama == '':
                    raise SentenceError('Empty Name Detected')
                if(len(word_sense.lemmas) > 0 and len(word_sense.forms) > 0):
                    tuples = []
                    for lemmaI in range(len(word_sense.lemmas)):
                        # lemma = rom_slp(word_sense.lemmas[lemmaI].split('_')[0]) # NOT REQUIRED - DONE IN FIX_W_NEW
                        lemma = word_sense.lemmas[lemmaI]
                        if lemma == '':
                            continue
                        tempCNGs = getCNGs(word_sense.forms[lemmaI])
                        for cng in tempCNGs:
                            # UPDATE LISTS
                            newT_Key = (lemma, cng)
                            newT = (tidExclusive, nama, lemma, cng)
                            if(newT_Key not in tupleSet):
                                tupleSet[newT_Key] = 1
                                tuples.append(newT) # Remember the order
                                lemmaList.append(lemma)
                                wordList.append(nama)
                                cngList.append(cng)
                                revMap2Chunk.append((cid, pos, len(tuplesMain)))
                                tidExclusive += 1

                    if(len(tuples) > 0):
                        # print(tuples)
                        k = len(tuplesMain)
                        chunkDict[cid][pos].append(k)
                        tuplesMain.append(tuples)

    ## Find QUERY nodes now
    for cid in chunkDict.keys():
        tuples = []
        for pos in chunkDict[cid].keys():
            tupIds = chunkDict[cid][pos]
            for tupId in tupIds:
                [tuples.append((pos, tup[0], tup[1])) for tup in tuplesMain[tupId]]
        for u in range(len(tuples)):
            tup1 = tuples[u]
            quFlag = True
            for v in range(len(tuples)):
                if(u == v):
                    continue
                tup2 = tuples[v]
                
                # '''
                # FIXME: REMOVE TRY CATCH
                # '''
                # try:
                if(tup1[0] < tup2[0]):
                    if not CanCoExist_sandhi(tup1[0], tup2[0], tup1[2], tup2[2]):
                        ## Found a competing node - hence can't be a query
                        quFlag = False
                        break
                elif(tup1[0] > tup2[0]):
                    if not CanCoExist_sandhi(tup2[0], tup1[0], tup2[2], tup1[2]):
                        ## Found a competing node - hence can't be a query
                        quFlag = False
                        break
                else:
                    quFlag = False
                    break

                # except IndexError:
                #     print('From SentencePreprocess IndexError:', sentenceObj.sent_id)
                #     raise IndexError

            if quFlag:
                qu.append(tup1[1])

    # if len(qu) == 0:
    #     print('No query available')
    #     maxI = 0
    #     for i in range(len(wordList)):
    #         if len(wordList[i]) > len(wordList[maxI]):
    #             maxI = i
    #         elif len(wordList[i]) == len(wordList[maxI]):
    #             # Check the competitor count

    #     print(wordList[maxI], 'is forced query')

    verbs = []
    i = -1
    for w in lemmaList:
        i += 1
        if w in list(v2t.keys()):
            verbs.append(i)


    # pprint.pprint(tuplesMain)
    # pprint.pprint(chunkDict)
    # pprint.pprint(revMap2Chunk)
    
    qc_pairs = Get_QCs(tuplesMain, chunkDict)
    
    '''
    qu = [] # Have to remove it later
    '''
    # print(chunkDict)
    if len(qu) == 0 and len(lemmaList) > 0:
        lens = np.array([len(t[1]) for ts in tuplesMain for t in ts])
        cw = [(t[0], t[1]) for ts in tuplesMain for t in ts]
        round1 = np.where(lens == np.max(lens))[0]
        hits = [len(qc_pairs[r]) for r in round1]
        finalist = round1[np.where(hits == np.min(hits))][0]
        qu.append(finalist)
    
    return (chunkDict, lemmaList, wordList, revMap2Chunk, qu, cngList, verbs, tuplesMain, qc_pairs)