THUIAR
/

TEXTOIR

Model card Files Files and versions

TEXTOIR / TEXTOIR-main /open_intent_discovery /backbones /glove.py

THU-IAR's picture

Upload 198 files

2d06dcc verified over 1 year ago

history blame contribute delete

1.91 kB

	import numpy as np
	from collections import defaultdict

	class GloVeEmbeddingVectorizer(object):
	    """Turn sequences of word indices into fixed-size sentence embeddings.

	    Each sentence embedding is a weighted average of the rows of
	    ``embedding_matrix`` selected by the sentence's word indices. The weights
	    are uniform (``method='mean'``) or IDF-like (``method='idf'``).

	    NOTE(review): the IDF weights use the number of rows of the embedding
	    matrix as numerator and the total occurrence count of an index as
	    denominator. This differs from the textbook definition (number of
	    documents / document frequency); it is kept as-is so existing results do
	    not change.
	    """

	    def __init__(self, embedding_matrix, index_word, X=None):
	        """Store the embedding table and optionally precompute IDF weights.

	        Args:
	            embedding_matrix: 2-D array; row ``i`` is the vector of word
	                index ``i``.
	            index_word: container supporting ``idx in index_word``
	                (presumably an index -> word mapping; verify against caller).
	            X: optional corpus used to compute IDF weights, see ``get_idf``.
	                When omitted, ``idf`` is empty and ``method='idf'`` falls
	                back to the unseen-word weight for every index.
	        """
	        self.embedding_matrix = embedding_matrix
	        self.dim = embedding_matrix.shape[1]
	        # Always set these so that transform() works even without a corpus;
	        # previously they were only defined when X was given.
	        self.index_word = index_word
	        self.D = embedding_matrix.shape[0]
	        self.idf = self.get_idf(X) if X is not None else {}

	    def get_idf(self, X):
	        """Return a dict mapping word index -> ``log(self.D / count)``.

	        Args:
	            X: either one collection of index sequences, or a ``list`` of
	                such collections (e.g. several data splits). A ``list`` is
	                always interpreted as the latter.

	        Returns:
	            dict whose keys are the indices seen in ``X`` and whose values
	            are ``np.log(self.D / n)``, with ``n`` the number of occurrences
	            of the index in ``X``.
	        """
	        counts = defaultdict(int)
	        # Normalise to "a list of corpora" so one loop handles both inputs.
	        corpora = X if isinstance(X, list) else [X]
	        for corpus in corpora:
	            for word_indices in corpus:
	                for idx in word_indices:
	                    counts[idx] += 1
	        return {idx: np.log(self.D / count) for idx, count in counts.items()}

	    def transform(self, X, method='mean'):
	        """Embed every index sequence in ``X`` as one vector.

	        Index 0 and indices not contained in ``index_word`` are ignored
	        (0 is presumably the padding index; confirm with the tokenizer).

	        Args:
	            X: iterable of word-index sequences.
	            method: ``'mean'`` for a plain average, ``'idf'`` for an
	                IDF-weighted average. Indices without a stored IDF value
	                get the weight ``log(self.D)``.

	        Returns:
	            ``np.ndarray`` with one row of length ``self.dim`` per sequence;
	            a sequence without any usable index yields a zero vector.

	        Raises:
	            ValueError: if ``method`` is neither ``'mean'`` nor ``'idf'``.
	        """
	        if method not in ('mean', 'idf'):
	            # Previously an unknown method left ``weight`` unbound.
	            raise ValueError(
	                "method must be 'mean' or 'idf', got {!r}".format(method)
	            )

	        use_idf = method == 'idf'
	        # Weight of an index that never occurred in the IDF corpus, i.e.
	        # the weight it would get with a single occurrence.
	        unseen_weight = np.log(self.D / 1) if use_idf else None

	        sentence_embs = []
	        for word_indices in X:
	            word_embs = []
	            dividend = 0
	            for idx in word_indices:
	                if idx == 0 or idx not in self.index_word:
	                    continue
	                if use_idf:
	                    weight = self.idf.get(idx, unseen_weight)
	                else:
	                    weight = 1
	                word_embs.append(self.embedding_matrix[idx] * weight)
	                dividend += weight
	            # No usable words found (or all weights are zero).
	            if dividend == 0:
	                sentence_emb = np.zeros(self.dim)
	            else:
	                sentence_emb = np.divide(np.sum(word_embs, axis=0), dividend)
	            sentence_embs.append(sentence_emb)
	        return np.array(sentence_embs)