File size: 2,179 Bytes
0240c6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pickle
from tqdm import tqdm
from collections import defaultdict
from utils import CONFIG, save_dataset, save_dictionary_items


# Settings pulled from the shared project CONFIG mapping.
# Minimum length a word and its lemma must both have to be processed by generate().
MIN_WORD_SIZE = CONFIG['min_word_size']
# Number of leading characters compared between a word and its lemma in generate().
PREFIX_FILTER_LENGTH = CONFIG['prefix_filter_length']
# Path of the pickle file loaded at the bottom of this script as the word table.
VECT_PATH = CONFIG['vect_words_path']
# Path of the pickle file loaded at the bottom of this script as the class lookup.
CLS_CLASSES_PATH = CONFIG['cls_classes_path']
# NOTE(review): not referenced anywhere in the visible part of this file.
DICT_WORDS_PATH = CONFIG['dics_path']


def generate(vec_words, main_cls_dic):
    """Build the lemma training samples and the lemma dictionary, then save both.

    Every form of every word in ``vec_words`` that carries a ``'lemma'`` key
    is examined. Pairs are skipped when the lemma is not itself a key of
    ``vec_words`` or when either the word or the lemma is shorter than
    ``MIN_WORD_SIZE``.

    A pair whose first ``PREFIX_FILTER_LENGTH`` characters differ (also after
    folding 'ё' to 'е') and whose ``'post'`` is not ``'comp'`` goes to the
    dictionary list. Every other surviving pair becomes a sample grouped by
    its main class.

    Args:
        vec_words: mapping of word -> record with a ``'vect'`` entry (indexed
            at 0 and 1 here) and a ``'forms'`` iterable of form dicts with
            ``'main'``, ``'post'``, ``'inflect_id'`` and optionally ``'lemma'``.
        main_cls_dic: mapping from a form's ``'main'`` value to its class value.

    Returns:
        None. Results are handed to ``save_dataset`` and
        ``save_dictionary_items`` under the name ``'lemma'``.
    """
    dictionary_entries = []
    samples_by_cls = defaultdict(list)

    for src_word in tqdm(vec_words, desc="Generating lemma dataset"):
        record = vec_words[src_word]
        src_vec = record['vect']

        for form in record['forms']:
            cls_value = main_cls_dic[form['main']]

            # Forms without a lemma contribute nothing.
            if 'lemma' not in form:
                continue
            lemma = form['lemma']

            # Keep only pairs whose lemma is a known word and where both
            # sides reach the minimum length.
            keep = (
                lemma in vec_words
                and MIN_WORD_SIZE <= len(lemma)
                and MIN_WORD_SIZE <= len(src_word)
            )
            if not keep:
                continue

            lemma_head = lemma[:PREFIX_FILTER_LENGTH]
            word_head = src_word[:PREFIX_FILTER_LENGTH]
            heads_differ = (
                lemma_head != word_head
                and lemma_head.replace('ё', 'е') != word_head.replace('ё', 'е')
            )

            # Pairs that do not share a prefix are stored as dictionary
            # items instead of samples, unless the form is tagged 'comp'.
            if heads_differ and form['post'] != 'comp':
                dictionary_entries.append({
                    'text': src_word,
                    'text_y': lemma,
                    'main': cls_value,
                    'id': form['inflect_id'],
                })
                continue

            lemma_vec = vec_words[lemma]['vect']
            samples_by_cls[cls_value].append({
                'id': form['inflect_id'],
                'x_src': src_word,
                'x': src_vec[0],
                'x_len': src_vec[1],
                'y_src': lemma,
                'y': lemma_vec[0],
                'y_len': lemma_vec[1],
                'main_cls': cls_value
            })

    save_dataset(samples_by_cls, 'lemma')
    save_dictionary_items(dictionary_entries, 'lemma')


# Script driver: runs at import/execution time, loads both pickled inputs
# and generates the lemma dataset from them.
# NOTE(review): pickle.load executes arbitrary code from the file; these
# paths are assumed to point at trusted, locally produced artifacts — confirm.
with open(VECT_PATH, 'rb') as f:
    vwords = pickle.load(f)  # passed to generate() as vec_words

with open(CLS_CLASSES_PATH, 'rb') as f:
    cls_dic = pickle.load(f)  # passed to generate() as main_cls_dic

generate(vwords, cls_dic)