import pickle
import io
import os

import numpy as np
from gensim.models import KeyedVectors


def calc_mean_vec_for_lower_mapping(embedd_dict):
    """Map each lowercased word to the mean vector of all its cased variants."""
    # group the cased variants of every word under its lowercased form
    lower_variants = {}
    for word in embedd_dict:
        lower_variants.setdefault(word.lower(), []).append(word)

    # average the variant vectors element-wise; axis=0 keeps the vector shape
    # (without it np.mean collapses the list of vectors to a single scalar)
    for word_lower, variants in lower_variants.items():
        embedd_dict[word_lower] = np.mean([embedd_dict[w] for w in variants], axis=0)
    return embedd_dict
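
# A minimal usage sketch for the mapping above, using two hypothetical cased
# variants; 'the' ends up holding the mean of both vectors:
#
#     vecs = {'The': np.array([1.0, 0.0]), 'the': np.array([0.0, 1.0])}
#     vecs = calc_mean_vec_for_lower_mapping(vecs)
#     vecs['the']  # -> array([0.5, 0.5])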


def load_embedding_dict(embedding, embedding_path, lower_case=False):
    """
    Load word embeddings from file.

    :param embedding: embedding type: 'glove', 'fasttext', 'hellwig', 'one_hot' or 'word2vec'
    :param embedding_path: path to the embedding file
    :param lower_case: if True, map each lowercased word to the mean vector of its cased variants
    :return: embedding dict, embedding dimension
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))

    # parsed embeddings are cached as a pickle next to the source file,
    # so repeated loads skip the slow text parsing below
    if lower_case:
        pkl_path = embedding_path + '_lower' + '.pkl'
    else:
        pkl_path = embedding_path + '.pkl'
    if os.path.isfile(pkl_path):
        with open(pkl_path, 'rb') as f:
            embedd_dict, embedd_dim = pickle.load(f)
        print("num dimensions of word embeddings:", embedd_dim)
        return embedd_dict, embedd_dim

    if embedding == 'glove':
        # GloVe text format: one "word v1 v2 ... vd" entry per line, no header
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for line in f:
                word, vec = line.rstrip().split(' ', 1)
                # np.fromstring(vec, sep=' ') is deprecated; parse explicitly
                embedd_dict[word] = np.array(vec.split(), dtype=np.float64)
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        # sanity check: report any vector whose length differs from embedd_dim
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print(len(v), embedd_dim)

    elif embedding == 'fasttext':
        # fastText .vec format: header line "num_words dim", then word + vector
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i == 0:
                    continue  # skip the header line
                word, vec = line.rstrip().split(' ', 1)
                embedd_dict[word] = np.array(vec.split(), dtype=np.float64)
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        # sanity check: report any vector whose length differs from embedd_dim
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print(len(v), embedd_dim)

    elif embedding == 'hellwig':
        # same text layout as the fastText branch: header line, then word + vector
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i == 0:
                    continue  # skip the header line
                word, vec = line.rstrip().split(' ', 1)
                embedd_dict[word] = np.array(vec.split(), dtype=np.float64)
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        # sanity check: report any vector whose length differs from embedd_dim
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print(len(v), embedd_dim)

    elif embedding == 'one_hot':
        # same layout again, but the word and its vector are separated by '@'
        # instead of a space
        embedd_dict = {}
        word = None
        with io.open(embedding_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i == 0:
                    continue  # skip the header line
                word, vec = line.rstrip().split('@', 1)
                embedd_dict[word] = np.array(vec.split(), dtype=np.float64)
        embedd_dim = len(embedd_dict[word])
        if lower_case:
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        # sanity check: report any vector whose length differs from embedd_dim
        for k, v in embedd_dict.items():
            if len(v) != embedd_dim:
                print(len(v), embedd_dim)

    elif embedding == 'word2vec':
        # binary word2vec format, parsed by gensim
        embedd_dict = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
        if lower_case:
            # note: this assumes the KeyedVectors object supports dict-style
            # iteration and item assignment, which depends on the gensim version
            embedd_dict = calc_mean_vec_for_lower_mapping(embedd_dict)
        embedd_dim = embedd_dict.vector_size

    else:
        raise ValueError("embedding should be one of "
                         "[glove, fasttext, hellwig, one_hot, word2vec]")

    print("num dimensions of word embeddings:", embedd_dim)

    # cache the parsed embeddings so subsequent loads hit the pickle fast path
    with open(pkl_path, 'wb') as f:
        pickle.dump([embedd_dict, embedd_dim], f, pickle.HIGHEST_PROTOCOL)
    return embedd_dict, embedd_dim
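

if __name__ == '__main__':
    # Minimal usage sketch, assuming a local GloVe text file; the path below
    # is hypothetical and not shipped with this module.
    embedd_dict, embedd_dim = load_embedding_dict(
        'glove', 'glove.6B.100d.txt', lower_case=True)
    print("loaded %d vectors of dimension %d" % (len(embedd_dict), embedd_dim))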