Add files using upload-large-folder tool

382124c verified 5 months ago

11.2 kB

	#Loading of SKT Pickles
	from romtoslp import rom_slp
	from json import *
	import pprint
	from utilities import *
	class word_new():
	def __init__(self,names):
	self.lemmas=[]
	self.names=names
	self.urls=[]
	self.forms=[]

	class chunks:
	def __init__(self,chunk_name):
	self.chunk_name=chunk_name
	self.chunk_words={}

	class sentences:
	def __init__(self,sent_id,sentence):
	self.sent_id=sent_id
	self.sentence=sentence
	self.chunk=[]

	# def getCNGs(formsDict):
	# l = []
	# if type(formsDict) == int or type(formsDict) == str:
	# return [int(formsDict)]
	# else:
	# for form, configs in formsDict.items():
	# for c in configs:
	# if(form == 'verbform'):
	# continue
	# else:
	# l.append(wtc_recursive(form, configs))
	# return list(set(l))

	class SentenceError(Exception):
	def __init__(self, message):

	# Call the base class constructor with the parameters it needs
	super(SentenceError, self).__init__(message)

	def SeeSentence(sentenceObj):
	print('SKT ANALYZE')
	print('-'*15)
	print(sentenceObj.sentence)
	zz = 0
	# (chunkDict, lemmaList, wordList, revMap2Chunk, qu, cngList, verbs, tuplesMain) = SentencePreprocess(sentenceObj)
	# for cid in chunkDict.keys():
	# print('Analyzing:', rom_slp(sentenceObj.chunk[cid].chunk_name))
	# for pos in chunkDict[cid].keys():
	# tupIds = chunkDict[cid][pos]
	# for ti in tupIds:
	# print('%d :' % (pos, ), end = ' ')
	# print(tuplesMain[ti][0][1], end=' ')
	# for tup in tuplesMain[ti]:
	# print([zz, tup[2], tup[3]], end=' ')
	# zz += 1
	# print('')
	# print('-'*25)

	for chunk in sentenceObj.chunk:
	print("Analyzing ", rom_slp(chunk.chunk_name))
	for pos in chunk.chunk_words.keys():
	for word_sense in chunk.chunk_words[pos]:
	word_sense = fix_w_new(word_sense)
	print(pos, ": ", rom_slp(word_sense.names), word_sense.lemmas, word_sense.forms)
	# for formsDict in word_sense.forms:
	# print(getCNGs(formsDict))
	print()

	def getWord(sentenceObj, cid, pos,kii):
	ch = sentenceObj.chunk[cid]
	word = ch.chunk_words[pos][kii]
	return {'lemmas': word.lemmas, 'forms':word.forms, 'names':word.names}

	# ---------------------------------------------------------------------------------------------------------------------
	# ---------------------------------------------------------------------------------------------------------------------

	# ---------------------------------------------------------------------------------------------------------------------
	# ---------------------------------------------------------------------------------------------------------------------
	from wordTypeCheckFunction import *
	import pickle

	"""
	SentencePreprocess:
	-------------------
	Read a sentence obj and create + return the following objects

	-> chunkDict: chunk_id -> position -> index in lemmaList (nested dictionary)
	-> lemmaList: list of possible words as a result of word segmentation
	-> revMap2Chunk: Map word in wordList to (cid, position) in chunkDict
	-> qu: Possible query nodes
	"""
	v2t = pickle.load(open('verbs_vs_cngs_matrix_countonly.p', 'rb'), encoding=u'utf8')
	def wtc_recursive(form, c):
	if type(c) ==list:
	for cc in c:
	return wtc_recursive(form, cc)
	else:
	return wordTypeCheck(form, c)

	def CanBeQuery(chunk):
	allLemmas = []
	for pos, words in chunk.chunk_words.items():
	for word in words:
	for lemma in word.lemmas:
	if lemma != '':
	allLemmas.append(lemma)
	if(len(allLemmas) == 1):
	return True

	def Get_QCs(tuplesMain, chunkDict):
	# Form NON-competitor dictionary - Query - Candidate Pairs
	qc_pairs = {}
	nodeList = [t for ts in tuplesMain for t in ts]

	for ni in range(len(nodeList)):
	qc_pairs[ni] = set(range(len(nodeList))) - set([ni])

	for cid in chunkDict.keys():
	# Neighbours
	for pos1 in chunkDict[cid].keys():
	for pos2 in chunkDict[cid].keys():
	if pos1 <= pos2:
	nList1 = []
	for ti1 in chunkDict[cid][pos1]:
	for tup1 in tuplesMain[ti1]:
	nList1.append(tup1[0])
	nList2 = []
	for ti2 in chunkDict[cid][pos2]:
	for tup2 in tuplesMain[ti2]:
	nList2.append(tup2[0])
	nList1 = set(nList1)
	nList2 = set(nList2)
	for n1 in nList1:
	qc_pairs[n1] = qc_pairs[n1] - nList1

	for n2 in nList2:
	qc_pairs[n2] = qc_pairs[n2] - nList2

	if pos1 < pos2:
	for n1 in nList1:
	for n2 in nList2:
	if not CanCoExist_sandhi(pos1, pos2, nodeList[n1][1], nodeList[n2][1]):
	qc_pairs[n1] = qc_pairs[n1] - set([n2])
	qc_pairs[n2] = qc_pairs[n2] - set([n1])

	return qc_pairs

	'''
	===================
	SentencePreprocess
	===================
	forceQuery: Setting it true will make the longest word available a query if no
	other query is available
	'''
	def SentencePreprocess(sentenceObj, forceQuery = False):
	"""
	Considering word names only
	***{Word forms or cngs can also be used}
	"""
	def getCNGs(formsDict):
	if type(formsDict) == int or type(formsDict) == str:
	return [int(formsDict)]
	else:
	l = []
	for form, configs in formsDict.items():
	for c in configs:
	if(form == 'verbform'):
	continue
	else:
	l.append(wtc_recursive(form, c))
	return list(set(l))

	chunkDict = {}
	lemmaList = []
	wordList = []
	cngList = []
	revMap2Chunk = []
	qu = []
	tuplesMain = []

	cid = -1
	tidExclusive = 0

	## Traverse sentence and form data-structures
	for chunk in sentenceObj.chunk:
	# print(chunk.chunk_name)
	cid = cid+1
	chunkDict[cid] = {}
	for pos in chunk.chunk_words.keys():
	tupleSet = {}
	chunkDict[cid][pos] = []
	for word_sense in chunk.chunk_words[pos]:
	# word_sense = fix_w_new(word_sense)
	nama = rom_slp(word_sense.names)
	if nama == '':
	raise SentenceError('Empty Name Detected')
	if(len(word_sense.lemmas) > 0 and len(word_sense.forms) > 0):
	tuples = []
	for lemmaI in range(len(word_sense.lemmas)):
	# lemma = rom_slp(word_sense.lemmas[lemmaI].split('_')[0]) # NOT REQUIRED - DONE IN FIX_W_NEW
	lemma = word_sense.lemmas[lemmaI]
	if lemma == '':
	continue
	tempCNGs = getCNGs(word_sense.forms[lemmaI])
	for cng in tempCNGs:
	# UPDATE LISTS
	newT_Key = (lemma, cng)
	newT = (tidExclusive, nama, lemma, cng)
	if(newT_Key not in tupleSet):
	tupleSet[newT_Key] = 1
	tuples.append(newT) # Remember the order
	lemmaList.append(lemma)
	wordList.append(nama)
	cngList.append(cng)
	revMap2Chunk.append((cid, pos, len(tuplesMain)))
	tidExclusive += 1

	if(len(tuples) > 0):
	# print(tuples)
	k = len(tuplesMain)
	chunkDict[cid][pos].append(k)
	tuplesMain.append(tuples)

	## Find QUERY nodes now
	for cid in chunkDict.keys():
	tuples = []
	for pos in chunkDict[cid].keys():
	tupIds = chunkDict[cid][pos]
	for tupId in tupIds:
	[tuples.append((pos, tup[0], tup[1])) for tup in tuplesMain[tupId]]
	for u in range(len(tuples)):
	tup1 = tuples[u]
	quFlag = True
	for v in range(len(tuples)):
	if(u == v):
	continue
	tup2 = tuples[v]

	# '''
	# FIXME: REMOVE TRY CATCH
	# '''
	# try:
	if(tup1[0] < tup2[0]):
	if not CanCoExist_sandhi(tup1[0], tup2[0], tup1[2], tup2[2]):
	## Found a competing node - hence can't be a query
	quFlag = False
	break
	elif(tup1[0] > tup2[0]):
	if not CanCoExist_sandhi(tup2[0], tup1[0], tup2[2], tup1[2]):
	## Found a competing node - hence can't be a query
	quFlag = False
	break
	else:
	quFlag = False
	break

	# except IndexError:
	# print('From SentencePreprocess IndexError:', sentenceObj.sent_id)
	# raise IndexError

	if quFlag:
	qu.append(tup1[1])

	# if len(qu) == 0:
	# print('No query available')
	# maxI = 0
	# for i in range(len(wordList)):
	# if len(wordList[i]) > len(wordList[maxI]):
	# maxI = i
	# elif len(wordList[i]) == len(wordList[maxI]):
	# # Check the competitor count

	# print(wordList[maxI], 'is forced query')

	verbs = []
	i = -1
	for w in lemmaList:
	i += 1
	if w in list(v2t.keys()):
	verbs.append(i)


	# pprint.pprint(tuplesMain)
	# pprint.pprint(chunkDict)
	# pprint.pprint(revMap2Chunk)

	qc_pairs = Get_QCs(tuplesMain, chunkDict)

	'''
	qu = [] # Have to remove it later
	'''
	# print(chunkDict)
	if len(qu) == 0 and len(lemmaList) > 0:
	lens = np.array([len(t[1]) for ts in tuplesMain for t in ts])
	cw = [(t[0], t[1]) for ts in tuplesMain for t in ts]
	round1 = np.where(lens == np.max(lens))[0]
	hits = [len(qc_pairs[r]) for r in round1]
	finalist = round1[np.where(hits == np.min(hits))][0]
	qu.append(finalist)

	return (chunkDict, lemmaList, wordList, revMap2Chunk, qu, cngList, verbs, tuplesMain, qc_pairs)