File size: 1,069 Bytes
0240c6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pickle
import numpy as np
from tqdm import tqdm
from utils import CONFIG

# Settings read once from the shared project configuration.
CHARS = CONFIG['chars']  # symbols that receive an index (enumerated below)
END_TOKEN = CONFIG['end_token']  # fill value for unused vector positions
MAX_WORD_SIZE = CONFIG['max_word_size']  # length of every word vector
WORDS_PATH = CONFIG['dataset_words_path']  # pickle that is read as input
VECT_PATH = CONFIG['vect_words_path']  # pickle that is written as output

# Reverse lookup: symbol -> its position in CHARS.
# vectorize_text also looks up the "UNDEFINED" key here for unknown symbols.
CHARS_INDEXES = {symbol: position for position, symbol in enumerate(CHARS)}


def vectorize_text(text):
    """Encode ``text`` as a fixed-length vector of character indexes.

    Every character is replaced by its index in ``CHARS_INDEXES``; a
    character without an entry is replaced by the index stored under the
    ``"UNDEFINED"`` key. Positions past the end of ``text`` keep the
    ``END_TOKEN`` fill value.

    Args:
        text: Sized sequence of characters (e.g. a ``str``) with at most
            ``MAX_WORD_SIZE`` items.

    Returns:
        A ``(word_vect, seq_len)`` tuple, where ``word_vect`` is an
        ``int32`` array of shape ``(MAX_WORD_SIZE,)`` and ``seq_len`` is
        ``len(text)``.

    Raises:
        IndexError: If ``text`` has more than ``MAX_WORD_SIZE`` items.
        KeyError: If an unknown character is met and ``CHARS_INDEXES`` has
            no ``"UNDEFINED"`` entry.
    """
    seq_len = len(text)
    if seq_len > MAX_WORD_SIZE:
        # An out-of-range NumPy write raises IndexError as well, so callers
        # see the same exception type, but with an actionable message and
        # before any work is done.
        raise IndexError(
            "text of length {} does not fit in MAX_WORD_SIZE={}".format(
                seq_len, MAX_WORD_SIZE
            )
        )

    word_vect = np.full((MAX_WORD_SIZE,), END_TOKEN, dtype=np.int32)
    for index, c in enumerate(text):
        # One dict lookup instead of scanning CHARS for membership and then
        # looking the character up again.
        char_index = CHARS_INDEXES.get(c)
        if char_index is None:
            # Resolved lazily: only needed when an unknown character occurs.
            char_index = CHARS_INDEXES["UNDEFINED"]
        word_vect[index] = char_index

    return word_vect, seq_len


def vectorize_words(words_dic):
    """Build a vectorized copy of ``words_dic``.

    Args:
        words_dic: Mapping whose keys are the words to vectorize; each
            value is carried over unchanged under the ``'forms'`` key.

    Returns:
        A dict with the same keys as ``words_dic``. Each value is a dict
        holding ``'vect'`` (the result of ``vectorize_text`` for the key)
        and ``'forms'`` (the original value for the key).
    """
    # tqdm wraps the iteration only to display a progress bar.
    progress = tqdm(words_dic, desc="Vectorizing words")
    return {
        entry: {'vect': vectorize_text(entry), 'forms': words_dic[entry]}
        for entry in progress
    }


# Script body: load the word dictionary, vectorize it, and store the result.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also runs the whole pipeline -- confirm that this is intended.

# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# point WORDS_PATH at a file that comes from a trusted source.
with open(WORDS_PATH, 'rb') as f:
    words_dic = pickle.load(f)

vec_words = vectorize_words(words_dic)

# Write-only binary mode is sufficient: pickle.dump never reads the file back.
with open(VECT_PATH, 'wb') as f:
    pickle.dump(vec_words, f)