SanDP / utils /load_word_embeddings.py

Upload folder using huggingface_hub

a7b3936 verified 5 months ago

4.55 kB

	import pickle
	import numpy as np
	from gensim.models import KeyedVectors
	import gzip
	import io
	import os

	def calc_mean_vec_for_lower_mapping(embedd_dict):
	lower_counts = {}
	for word in embedd_dict:
	word_lower = word.lower()
	if word_lower not in lower_counts:
	lower_counts[word_lower] = [word]
	else:
	lower_counts[word_lower] = lower_counts[word_lower] + [word]
	# calculating mean vector for all words that have the same mapping after performing lower()
	for word in lower_counts:
	embedd_dict[word] = np.mean([embedd_dict[word_] for word_ in lower_counts[word]])
	return embedd_dict

	def load_embedding_dict(embedding, embedding_path, lower_case=False):
	"""
	load word embeddings from file
	:param embedding:
	:param embedding_path:
	:return: embedding dict, embedding dimention, caseless
	"""
	print("loading embedding: %s from %s" % (embedding, embedding_path))
	if lower_case:
	pkl_path = embedding_path + '_lower' + '.pkl'
	else:
	pkl_path = embedding_path + '.pkl'
	if os.path.isfile(pkl_path):
	# load dict and dim from a pickle file
	with open(pkl_path, 'rb') as f:
	embedd_dict, embedd_dim = pickle.load(f)
	print("num dimensions of word embeddings:", embedd_dim)
	return embedd_dict, embedd_dim

	if embedding == 'glove':
	# loading GloVe
	embedd_dict = {}
	word = None
	with io.open(embedding_path, 'r', encoding='utf-8') as f:
	for line in f:
	word, vec = line.split(' ', 1)
	embedd_dict[word] = np.fromstring(vec, sep=' ')
	embedd_dim = len(embedd_dict[word])
	if lower_case:
	embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
	for k, v in embedd_dict.items():
	if len(v) != embedd_dim:
	print(len(v),embedd_dim)

	elif embedding == 'fasttext':
	# loading GloVe
	embedd_dict = {}
	word = None
	with io.open(embedding_path, 'r', encoding='utf-8') as f:
	# skip first line
	for i, line in enumerate(f):
	if i == 0:
	continue
	word, vec = line.split(' ', 1)
	embedd_dict[word] = np.fromstring(vec, sep=' ')
	embedd_dim = len(embedd_dict[word])
	if lower_case:
	embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
	for k, v in embedd_dict.items():
	if len(v) != embedd_dim:
	print(len(v),embedd_dim)

	elif embedding == 'hellwig':
	# loading hellwig
	embedd_dict = {}
	word = None
	with io.open(embedding_path, 'r', encoding='utf-8') as f:
	# skip first line
	for i, line in enumerate(f):
	if i == 0:
	continue
	word, vec = line.split(' ', 1)
	embedd_dict[word] = np.fromstring(vec, sep=' ')
	embedd_dim = len(embedd_dict[word])
	if lower_case:
	embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
	for k, v in embedd_dict.items():
	if len(v) != embedd_dim:
	print(len(v),embedd_dim)

	elif embedding == 'one_hot':
	# loading hellwig
	embedd_dict = {}
	word = None
	with io.open(embedding_path, 'r', encoding='utf-8') as f:
	# skip first line
	for i, line in enumerate(f):
	if i == 0:
	continue
	word, vec = line.split('@', 1)
	embedd_dict[word] = np.fromstring(vec, sep=' ')
	embedd_dim = len(embedd_dict[word])
	if lower_case:
	embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
	for k, v in embedd_dict.items():
	if len(v) != embedd_dim:
	print(len(v),embedd_dim)

	elif embedding == 'word2vec':
	# loading word2vec
	embedd_dict = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
	if lower_case:
	embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
	embedd_dim = embedd_dict.vector_size

	else:
	raise ValueError("embedding should choose from [fasttext, glove, word2vec]")

	print("num dimensions of word embeddings:", embedd_dim)
	# save dict and dim to a pickle file
	with open(pkl_path, 'wb') as f:
	pickle.dump([embedd_dict, embedd_dim], f, pickle.HIGHEST_PROTOCOL)
	return embedd_dict, embedd_dim