File size: 1,069 Bytes

0240c6e

import pickle
import numpy as np
from tqdm import tqdm
from utils import CONFIG

# Settings read once from the shared project configuration.
CHARS = CONFIG['chars']                    # symbols that receive their own index
END_TOKEN = CONFIG['end_token']            # fill value for unused vector positions
MAX_WORD_SIZE = CONFIG['max_word_size']    # length of every produced word vector
WORDS_PATH = CONFIG['dataset_words_path']  # pickle file the words are read from
VECT_PATH = CONFIG['vect_words_path']      # pickle file the vectors are written to

# Reverse lookup: symbol -> its position inside CHARS.
CHARS_INDEXES = {
    symbol: position
    for position, symbol in enumerate(CHARS)
}


def vectorize_text(text):
    """Encode ``text`` as a fixed-length vector of character indexes.

    Each character is replaced by its index in ``CHARS_INDEXES``; characters
    without an entry get the index registered under the ``"UNDEFINED"`` key.
    Positions past the end of ``text`` keep the ``END_TOKEN`` fill value.

    Args:
        text: Sized sequence of characters (normally a ``str``) no longer
            than ``MAX_WORD_SIZE``.

    Returns:
        A ``(word_vect, seq_len)`` tuple where ``word_vect`` is an ``int32``
        numpy array of shape ``(MAX_WORD_SIZE,)`` and ``seq_len`` is
        ``len(text)``.

    Raises:
        IndexError: If ``text`` is longer than ``MAX_WORD_SIZE``.
        KeyError: If an unknown character is met and ``CHARS_INDEXES`` has
            no ``"UNDEFINED"`` entry.
    """
    seq_len = len(text)
    if seq_len > MAX_WORD_SIZE:
        # Writing past the buffer would raise IndexError deep inside numpy;
        # fail up front with the same exception type and a useful message.
        raise IndexError(
            "text of length {} does not fit into MAX_WORD_SIZE={}".format(
                seq_len, MAX_WORD_SIZE
            )
        )

    word_vect = np.full((MAX_WORD_SIZE,), END_TOKEN, dtype=np.int32)
    for index, c in enumerate(text):
        # One O(1) dict lookup instead of scanning CHARS and then indexing.
        char_index = CHARS_INDEXES.get(c)
        if char_index is None:
            # Looked up lazily so a missing "UNDEFINED" entry only matters
            # when an unknown character is actually encountered.
            char_index = CHARS_INDEXES["UNDEFINED"]
        word_vect[index] = char_index

    return word_vect, seq_len


def vectorize_words(words_dic):
    """Build the vectorized counterpart of ``words_dic``.

    Iterates over the keys of ``words_dic`` (showing a tqdm progress bar) and
    returns a new dict with the same keys. Every value is a dict holding:

    * ``'vect'``  - the result of ``vectorize_text`` for the key, i.e. the
      ``(vector, length)`` tuple that function returns;
    * ``'forms'`` - the original value stored in ``words_dic`` for that key.
    """
    progress = tqdm(words_dic, desc="Vectorizing words")
    return {
        entry: {
            'vect': vectorize_text(entry),
            'forms': words_dic[entry],
        }
        for entry in progress
    }


# Script body: these statements run whenever the module is executed or
# imported, since there is no ``__main__`` guard.

# Load the pickled word dictionary from WORDS_PATH.
# NOTE(review): pickle.load can execute arbitrary code during unpickling, so
# this file must come from a trusted source - confirm where it is produced.
with open(WORDS_PATH, 'rb') as f:
    words_dic = pickle.load(f)

# Convert every word into its index vector, keeping the original forms.
vec_words = vectorize_words(words_dic)

# Persist the vectorized dictionary to VECT_PATH ('wb+' truncates or creates
# the file before writing).
with open(VECT_PATH, 'wb+') as f:
    pickle.dump(vec_words, f)