|
|
import pickle |
|
|
from tqdm import tqdm |
|
|
from collections import defaultdict |
|
|
from utils import CONFIG, save_dataset, save_dictionary_items |
|
|
|
|
|
|
|
|
# --- Settings pulled from the project-wide CONFIG object ---------------------

# Lower bound on the length of both the word and its lemma; generate() skips
# any pair where either one is shorter than this.
MIN_WORD_SIZE = CONFIG['min_word_size']

# How many leading characters of the word and of the lemma generate() compares
# when deciding whether a pair goes to the dictionary list instead of the
# dataset.
PREFIX_FILTER_LENGTH = CONFIG['prefix_filter_length']

# Path of the pickle file that is loaded into `vwords` at module level.
VECT_PATH = CONFIG['vect_words_path']

# Path of the pickle file that is loaded into `cls_dic` at module level.
CLS_CLASSES_PATH = CONFIG['cls_classes_path']

# NOTE(review): not referenced anywhere in the visible part of this file.
DICT_WORDS_PATH = CONFIG['dics_path']
|
|
|
|
|
|
|
|
def generate(vec_words, main_cls_dic):
    """Collect lemma training samples and dictionary-only pairs, then save both.

    Every (word, form) pair found in ``vec_words`` ends up in one of three
    places:

    * skipped - the form has no ``'lemma'`` key, the lemma is not itself a key
      of ``vec_words``, or the word / lemma is shorter than ``MIN_WORD_SIZE``;
    * the dictionary list - the first ``PREFIX_FILTER_LENGTH`` characters of
      the word and the lemma differ (even after replacing 'ё' with 'е') and
      the form's ``'post'`` value is not ``'comp'``;
    * the dataset - everything else, grouped by the form's main class.

    Args:
        vec_words: mapping ``word -> entry``. Each entry is indexed with
            ``'vect'`` (a sequence whose items 0 and 1 are stored as the
            sample's ``x`` / ``x_len``) and ``'forms'`` (an iterable of form
            dicts read via ``'main'``, ``'inflect_id'``, optional ``'lemma'``
            and ``'post'``).
        main_cls_dic: mapping indexed by ``form['main']``; the looked-up
            value is used as the group key and stored in every record.

    Returns:
        None. The results are handed to ``save_dataset(..., 'lemma')`` and
        ``save_dictionary_items(..., 'lemma')``.
    """
    dictionary_pairs = []
    samples_by_cls = defaultdict(list)

    for word in tqdm(vec_words, desc="Generating lemma dataset"):
        entry = vec_words[word]
        x_vec = entry['vect']

        for form in entry['forms']:
            # Looked up before any filtering, so an unknown main class raises
            # KeyError even for forms that would be skipped below.
            main_cls = main_cls_dic[form['main']]

            if 'lemma' not in form:
                continue
            word_y = form['lemma']

            # Keep only pairs whose lemma is a known word and where both
            # strings reach the minimum length.
            if not (
                word_y in vec_words
                and MIN_WORD_SIZE <= len(word_y)
                and MIN_WORD_SIZE <= len(word)
            ):
                continue

            lemma_prefix = word_y[:PREFIX_FILTER_LENGTH]
            word_prefix = word[:PREFIX_FILTER_LENGTH]
            prefixes_differ = (
                lemma_prefix != word_prefix
                and lemma_prefix.replace('ё', 'е') != word_prefix.replace('ё', 'е')
            )

            # form['post'] is only read when the prefixes really differ.
            if prefixes_differ and form['post'] != 'comp':
                # Lemma does not share the word's beginning: record the pair
                # in the dictionary list instead of the dataset.
                dictionary_pairs.append({
                    'text': word,
                    'text_y': word_y,
                    'main': main_cls,
                    'id': form['inflect_id'],
                })
                continue

            y_vec = vec_words[word_y]['vect']
            samples_by_cls[main_cls].append({
                'id': form['inflect_id'],
                'x_src': word,
                'x': x_vec[0],
                'x_len': x_vec[1],
                'y_src': word_y,
                'y': y_vec[0],
                'y_len': y_vec[1],
                'main_cls': main_cls,
            })

    save_dataset(samples_by_cls, 'lemma')
    save_dictionary_items(dictionary_pairs, 'lemma')
|
|
|
|
|
|
|
|
# Script body: load the two pickled inputs and build the lemma dataset.
# NOTE(review): pickle.load can execute arbitrary code from a crafted file;
# both paths come from CONFIG and are presumably project-generated - confirm.

# Unpickled object passed to generate() as `vec_words`.
with open(VECT_PATH, 'rb') as vect_file:
    vwords = pickle.load(vect_file)

# Unpickled object passed to generate() as `main_cls_dic`.
with open(CLS_CLASSES_PATH, 'rb') as cls_file:
    cls_dic = pickle.load(cls_file)

generate(vwords, cls_dic)
|
|
|