|
|
|
|
|
from romtoslp import rom_slp |
|
|
from json import * |
|
|
import pprint |
|
|
from utilities import * |
|
|
class word_new():
    """Container for one analyzed word: the surface name plus the
    candidate lemmas, URLs and morphological forms filled in later."""

    def __init__(self, names):
        # Surface form of the word as supplied by the analyzer.
        self.names = names
        # Parallel candidate lists, populated after construction.
        self.lemmas = []
        self.urls = []
        self.forms = []
|
|
|
|
|
class chunks:
    """One chunk of a sentence: its name plus a mapping from position
    to the word analyses found at that position."""

    def __init__(self, chunk_name):
        # Raw chunk text/name.
        self.chunk_name = chunk_name
        # position -> list of word senses; filled in after construction.
        self.chunk_words = {}
|
|
|
|
|
class sentences:
    """A sentence under analysis: its identifier, the raw sentence
    text, and the list of chunk objects built from it."""

    def __init__(self, sent_id, sentence):
        self.sent_id = sent_id
        self.sentence = sentence
        # Chunk objects are appended after construction.
        self.chunk = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SentenceError(Exception):
    """Raised when a sentence object is malformed (e.g. a word sense
    with an empty name)."""

    def __init__(self, message):
        # Delegate message storage to the base Exception.
        super(SentenceError, self).__init__(message)
|
|
|
|
|
def SeeSentence(sentenceObj):
    """Pretty-print the segmentation analysis of a sentence.

    Prints a header and the raw sentence, then for each chunk prints
    the chunk name and, per position, every word sense (name, lemmas,
    forms) after normalizing it with fix_w_new().

    The original body carried an unused local (`zz = 0`); it has been
    removed.
    """
    print('SKT ANALYZE')
    print('-'*15)
    print(sentenceObj.sentence)
    for chunk in sentenceObj.chunk:
        print("Analyzing ", rom_slp(chunk.chunk_name))
        for pos in chunk.chunk_words.keys():
            for word_sense in chunk.chunk_words[pos]:
                # Normalize the sense before display (fix_w_new comes
                # from the utilities wildcard import).
                word_sense = fix_w_new(word_sense)
                print(pos, ": ", rom_slp(word_sense.names), word_sense.lemmas, word_sense.forms)
    print()
|
|
|
|
|
def getWord(sentenceObj, cid, pos, kii):
    """Return the word sense at chunk *cid*, position *pos*, index
    *kii* as a plain dict of its lemmas, forms and names."""
    sense = sentenceObj.chunk[cid].chunk_words[pos][kii]
    return {
        'lemmas': sense.lemmas,
        'forms': sense.forms,
        'names': sense.names,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from wordTypeCheckFunction import * |
|
|
import pickle |
|
|
|
|
|
""" |
|
|
SentencePreprocess: |
|
|
------------------- |
|
|
Read a sentence obj and create + return the following objects |
|
|
|
|
|
-> chunkDict: chunk_id -> position -> index in lemmaList (nested dictionary) |
|
|
-> lemmaList: list of possible words as a result of word segmentation |
|
|
-> revMap2Chunk: Map each generated (lemma, cng) entry back to its
   (chunk_id, position, tuple-group index) in chunkDict
|
|
-> qu: Possible query nodes |
|
|
""" |
|
|
v2t = pickle.load(open('verbs_vs_cngs_matrix_countonly.p', 'rb'), encoding=u'utf8') |
|
|
def wtc_recursive(form, c):
    """Resolve a (form, config) pair to a CNG code via wordTypeCheck.

    If *c* is a list of configs, recurse into it.
    NOTE(review): the list branch returns on the FIRST element only,
    so any remaining configs in *c* are silently ignored — confirm
    this is intended before relying on it.
    """
    if type(c) ==list:
        for cc in c:
            return wtc_recursive(form, cc)
    else:
        # wordTypeCheck comes from the wordTypeCheckFunction wildcard import.
        return wordTypeCheck(form, c)
|
|
|
|
|
def CanBeQuery(chunk):
    """Return True when the chunk carries exactly one non-empty lemma
    across all positions and word senses, i.e. it is an unambiguous
    candidate query node.

    The original returned True or implicitly None; this returns an
    explicit bool, which is backward-compatible under truth testing.
    """
    allLemmas = []
    for pos, words in chunk.chunk_words.items():
        for word in words:
            for lemma in word.lemmas:
                if lemma != '':
                    allLemmas.append(lemma)
    return len(allLemmas) == 1
|
|
|
|
|
def Get_QCs(tuplesMain, chunkDict):
    """Build, for every node, the set of other nodes it may co-exist with.

    Parameters:
        tuplesMain: list of tuple-groups; each node tuple's [0] element
            is its global node id and [1] its word name (see the tuples
            built in SentencePreprocess).
        chunkDict: nested dict chunk_id -> position -> list of indices
            into tuplesMain.

    Returns:
        dict mapping node id -> set of compatible node ids.
    """
    qc_pairs = {}
    # Flatten all tuple-groups; node ids index into this list.
    nodeList = [t for ts in tuplesMain for t in ts]

    # Start fully connected: every node compatible with all but itself.
    for ni in range(len(nodeList)):
        qc_pairs[ni] = set(range(len(nodeList))) - set([ni])

    for cid in chunkDict.keys():
        # Examine each ordered pair of positions within the same chunk.
        for pos1 in chunkDict[cid].keys():
            for pos2 in chunkDict[cid].keys():
                if pos1 <= pos2:
                    # Node ids present at pos1 ...
                    nList1 = []
                    for ti1 in chunkDict[cid][pos1]:
                        for tup1 in tuplesMain[ti1]:
                            nList1.append(tup1[0])
                    # ... and at pos2.
                    nList2 = []
                    for ti2 in chunkDict[cid][pos2]:
                        for tup2 in tuplesMain[ti2]:
                            nList2.append(tup2[0])
                    nList1 = set(nList1)
                    nList2 = set(nList2)
                    # Nodes sharing one position are mutually exclusive
                    # alternatives: strip same-position peers.
                    for n1 in nList1:
                        qc_pairs[n1] = qc_pairs[n1] - nList1

                    for n2 in nList2:
                        qc_pairs[n2] = qc_pairs[n2] - nList2

                    if pos1 < pos2:
                        # Across two distinct positions, drop pairs whose
                        # word names fail the co-existence check.
                        # NOTE(review): CanCoExist_sandhi is defined
                        # elsewhere (wildcard import); presumably a sandhi
                        # compatibility test — confirm its contract there.
                        for n1 in nList1:
                            for n2 in nList2:
                                if not CanCoExist_sandhi(pos1, pos2, nodeList[n1][1], nodeList[n2][1]):
                                    qc_pairs[n1] = qc_pairs[n1] - set([n2])
                                    qc_pairs[n2] = qc_pairs[n2] - set([n1])

    return qc_pairs
|
|
|
|
|
''' |
|
|
=================== |
|
|
SentencePreprocess |
|
|
=================== |
|
|
forceQuery: Setting it true will make the longest word available a query if no |
|
|
other query is available |
|
|
''' |
|
|
def SentencePreprocess(sentenceObj, forceQuery = False):
    """
    Considering word names only
    ***{Word forms or cngs can also be used}

    Builds the segmentation data structures for one sentence object and
    returns the tuple (chunkDict, lemmaList, wordList, revMap2Chunk, qu,
    cngList, verbs, tuplesMain, qc_pairs).

    Raises SentenceError when a word sense has an empty name.

    NOTE(review): the forceQuery parameter is never read in this body —
    the longest-word fallback at the end runs whenever qu is empty,
    regardless of forceQuery. Confirm against the module comment above.
    """
    def getCNGs(formsDict):
        # A bare int/str form is already a CNG code.
        if type(formsDict) == int or type(formsDict) == str:
            return [int(formsDict)]
        else:
            l = []
            for form, configs in formsDict.items():
                for c in configs:
                    # Verb forms are skipped; everything else is mapped
                    # to a CNG code via wtc_recursive.
                    if(form == 'verbform'):
                        continue
                    else:
                        l.append(wtc_recursive(form, c))
            # Deduplicate (order is not preserved).
            return list(set(l))

    chunkDict = {}       # chunk_id -> position -> list of tuplesMain indices
    lemmaList = []       # flat list of lemmas, indexed by node id
    wordList = []        # flat list of word names, parallel to lemmaList
    cngList = []         # flat list of CNG codes, parallel to lemmaList
    revMap2Chunk = []    # node id -> (cid, pos, tuplesMain index)
    qu = []              # candidate query node ids
    tuplesMain = []      # list of tuple-groups, one group per (cid, pos, sense)

    cid = -1
    tidExclusive = 0     # global node id counter

    # ---- Pass 1: flatten every chunk/position/word-sense into node tuples.
    for chunk in sentenceObj.chunk:
        cid = cid+1
        chunkDict[cid] = {}
        for pos in chunk.chunk_words.keys():
            tupleSet = {}          # dedup of (lemma, cng) within this position
            chunkDict[cid][pos] = []
            for word_sense in chunk.chunk_words[pos]:
                nama = rom_slp(word_sense.names)
                if nama == '':
                    raise SentenceError('Empty Name Detected')
                # Only senses with both lemmas and forms contribute nodes.
                if(len(word_sense.lemmas) > 0 and len(word_sense.forms) > 0):
                    tuples = []
                    for lemmaI in range(len(word_sense.lemmas)):
                        lemma = word_sense.lemmas[lemmaI]
                        if lemma == '':
                            continue
                        tempCNGs = getCNGs(word_sense.forms[lemmaI])
                        for cng in tempCNGs:
                            newT_Key = (lemma, cng)
                            # Node tuple: (id, name, lemma, cng).
                            newT = (tidExclusive, nama, lemma, cng)
                            if(newT_Key not in tupleSet):
                                tupleSet[newT_Key] = 1
                                tuples.append(newT)
                                # Keep the flat lists parallel to node ids.
                                lemmaList.append(lemma)
                                wordList.append(nama)
                                cngList.append(cng)
                                revMap2Chunk.append((cid, pos, len(tuplesMain)))
                                tidExclusive += 1

                    if(len(tuples) > 0):
                        # Register this tuple-group under its chunk/position.
                        k = len(tuplesMain)
                        chunkDict[cid][pos].append(k)
                        tuplesMain.append(tuples)

    # ---- Pass 2: a node is a query candidate (qu) iff its name can
    # co-exist with some node at EVERY other position of its chunk,
    # and it has no competing node at any other position it clashes with.
    for cid in chunkDict.keys():
        tuples = []
        for pos in chunkDict[cid].keys():
            tupIds = chunkDict[cid][pos]
            for tupId in tupIds:
                # (position, node id, name) triples for this chunk.
                [tuples.append((pos, tup[0], tup[1])) for tup in tuplesMain[tupId]]
        for u in range(len(tuples)):
            tup1 = tuples[u]
            quFlag = True
            for v in range(len(tuples)):
                if(u == v):
                    continue
                tup2 = tuples[v]
                if(tup1[0] < tup2[0]):
                    # CanCoExist_sandhi expects (smaller pos, larger pos,
                    # earlier name, later name).
                    if not CanCoExist_sandhi(tup1[0], tup2[0], tup1[2], tup2[2]):
                        quFlag = False
                        break
                elif(tup1[0] > tup2[0]):
                    if not CanCoExist_sandhi(tup2[0], tup1[0], tup2[2], tup1[2]):
                        quFlag = False
                        break
                else:
                    # Same position => competing alternatives: not a query.
                    quFlag = False
                    break

            if quFlag:
                qu.append(tup1[1])

    # ---- Mark node ids whose lemma appears in the verb matrix v2t.
    verbs = []
    i = -1
    for w in lemmaList:
        i += 1
        if w in list(v2t.keys()):
            verbs.append(i)

    qc_pairs = Get_QCs(tuplesMain, chunkDict)

    '''
    qu = [] # Have to remove it later
    '''

    # ---- Fallback: with no query found, pick the node with the longest
    # name; ties broken by the fewest compatible partners in qc_pairs.
    # NOTE(review): np is presumably provided by the utilities wildcard
    # import — confirm. The local `cw` below is unused.
    if len(qu) == 0 and len(lemmaList) > 0:
        lens = np.array([len(t[1]) for ts in tuplesMain for t in ts])
        cw = [(t[0], t[1]) for ts in tuplesMain for t in ts]
        round1 = np.where(lens == np.max(lens))[0]
        hits = [len(qc_pairs[r]) for r in round1]
        finalist = round1[np.where(hits == np.min(hits))][0]
        qu.append(finalist)

    return (chunkDict, lemmaList, wordList, revMap2Chunk, qu, cngList, verbs, tuplesMain, qc_pairs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|