cap / features.py
vikranth1111's picture
Upload 18 files
9e2ba5f
raw
history blame contribute delete
676 Bytes
import numpy as np
import gensim.downloader as api
import config
def get_word2vec_enc(corpus: list, gensim_pretrained_emb: str) -> np.ndarray:
    """
    Build an embedding matrix from a pretrained gensim word-vector model.

    :param corpus: Iterable of ``(word, index)`` pairs; ``index`` is the row
        of the returned matrix that receives the vector for ``word``.
    :param gensim_pretrained_emb: Name of the pretrained model, passed
        unchanged to ``gensim.downloader.load``.
    :return: Array of shape ``(config.VOCAB_SIZE, config.EMBED_SIZE)``. Rows
        for words missing from the pretrained model, and rows that no pair
        refers to, stay all-zero.
    """
    word_vecs = api.load(gensim_pretrained_emb)
    embedding_weights = np.zeros((config.VOCAB_SIZE, config.EMBED_SIZE))
    vocab_size = embedding_weights.shape[0]
    for word, i in corpus:
        # Skip indices that fall outside the matrix: an index >= VOCAB_SIZE
        # would raise IndexError, and a negative one would silently wrap
        # around and overwrite a row belonging to another word.
        if not 0 <= i < vocab_size:
            continue
        if word in word_vecs:
            # NOTE(review): assumes the pretrained vectors have exactly
            # config.EMBED_SIZE dimensions -- confirm against the model used.
            embedding_weights[i] = word_vecs[word]
    return embedding_weights