"""Convert a pickled word dictionary into fixed-width character-index vectors.

Reads the dictionary stored at ``CONFIG['dataset_words_path']``, vectorizes
every key with :func:`vectorize_text`, and pickles the result to
``CONFIG['vect_words_path']``.
"""
import pickle

import numpy as np
from tqdm import tqdm

from utils import CONFIG

CHARS = CONFIG['chars']
END_TOKEN = CONFIG['end_token']
MAX_WORD_SIZE = CONFIG['max_word_size']
WORDS_PATH = CONFIG['dataset_words_path']
VECT_PATH = CONFIG['vect_words_path']

# Maps each entry of CHARS to its position in CHARS.
CHARS_INDEXES = {c: index for index, c in enumerate(CHARS)}


def vectorize_text(text):
    """Encode *text* as a fixed-length vector of character indexes.

    The vector has ``MAX_WORD_SIZE`` int32 entries, pre-filled with
    ``END_TOKEN``. Each character of *text* is replaced by its index in
    ``CHARS_INDEXES``; characters that are not present there are encoded with
    the index of the ``"UNDEFINED"`` entry.

    Text longer than ``MAX_WORD_SIZE`` is truncated to the first
    ``MAX_WORD_SIZE`` characters instead of raising ``IndexError``.

    Args:
        text: Sized iterable of characters (typically a ``str``).

    Returns:
        Tuple ``(word_vect, seq_len)`` where ``word_vect`` is the encoded
        array and ``seq_len`` is the number of positions actually filled,
        i.e. ``min(len(text), MAX_WORD_SIZE)``.

    Raises:
        KeyError: If an unknown character is met and ``CHARS_INDEXES`` has no
            ``"UNDEFINED"`` entry.
    """
    word_vect = np.full((MAX_WORD_SIZE,), END_TOKEN, dtype=np.int32)

    # zip() with the range stops after MAX_WORD_SIZE characters, so an
    # over-long text can never index past the end of the vector.
    for position, char in zip(range(MAX_WORD_SIZE), text):
        char_index = CHARS_INDEXES.get(char)
        if char_index is None:
            # Looked up lazily so a missing "UNDEFINED" entry only matters
            # when an unknown character is actually encountered.
            char_index = CHARS_INDEXES["UNDEFINED"]
        word_vect[position] = char_index

    seq_len = min(len(text), MAX_WORD_SIZE)
    return word_vect, seq_len


def vectorize_words(words_dic):
    """Vectorize every key of *words_dic*.

    Args:
        words_dic: Mapping from a word to its associated forms.

    Returns:
        Dict mapping each word to ``{'vect': vectorize_text(word),
        'forms': <original value>}``.
    """
    vect_dic = {}
    for word, forms in tqdm(words_dic.items(), desc="Vectorizing words"):
        vect_dic[word] = {
            'vect': vectorize_text(word),
            'forms': forms,
        }
    return vect_dic


# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# WORDS_PATH must point to a trusted, locally produced artefact.
with open(WORDS_PATH, 'rb') as f:
    words_dic = pickle.load(f)

vec_words = vectorize_words(words_dic)

# Plain write mode is sufficient: the file is only written, never read back.
with open(VECT_PATH, 'wb') as f:
    pickle.dump(vec_words, f)