# DeepMorphy / 4_lemma_dataset.py
# niobures's picture
# DeepMorphy
# 0240c6e verified
import pickle
from tqdm import tqdm
from collections import defaultdict
from utils import CONFIG, save_dataset, save_dictionary_items
# Settings pulled from the shared project CONFIG mapping.

# Threshold used by generate(): pairs where the word or its lemma is shorter
# than this many characters are skipped.
MIN_WORD_SIZE = CONFIG['min_word_size']
# Number of leading characters generate() compares between a word and its lemma.
PREFIX_FILTER_LENGTH = CONFIG['prefix_filter_length']
# Path of the pickle file loaded at the bottom of this script and passed to
# generate() as vec_words.
VECT_PATH = CONFIG['vect_words_path']
# Path of the pickle file loaded at the bottom of this script and passed to
# generate() as main_cls_dic.
CLS_CLASSES_PATH = CONFIG['cls_classes_path']
# NOTE(review): not referenced anywhere in the visible part of this file.
DICT_WORDS_PATH = CONFIG['dics_path']
def generate(vec_words, main_cls_dic):
    """Build and save the lemma dataset and the lemma dictionary items.

    Walks every word in ``vec_words`` and every entry of its ``'forms'``
    list. A form is used only when it has a ``'lemma'`` key, the lemma is
    itself a key of ``vec_words``, and neither the word nor the lemma is
    shorter than ``MIN_WORD_SIZE``.

    Each remaining (word, lemma) pair goes to one of two outputs:

    * If the first ``PREFIX_FILTER_LENGTH`` characters of word and lemma
      differ (both as-is and after replacing 'ё' with 'е') and
      ``form['post']`` is not ``'comp'``, the pair is collected as a
      dictionary item.
    * Otherwise a training sample is appended to the list for the form's
      main class.

    At the end the per-class samples are passed to
    ``save_dataset(..., 'lemma')`` and the dictionary items to
    ``save_dictionary_items(..., 'lemma')``.

    Args:
        vec_words: Mapping from a word to a dict with a ``'vect'`` entry
            (indexed here at positions 0 and 1) and a ``'forms'`` iterable
            of form dicts.
        main_cls_dic: Mapping indexed by ``form['main']`` that yields the
            main class value stored with each sample.

    Returns:
        None. Results are handed to the two save helpers.
    """
    dictionary_items = []
    samples_by_cls = defaultdict(list)

    for word in tqdm(vec_words, desc="Generating lemma dataset"):
        entry = vec_words[word]
        x_vec = entry['vect']

        for form in entry['forms']:
            # Looked up before the lemma check, so every form must have a
            # 'main' value known to main_cls_dic.
            main_cls = main_cls_dic[form['main']]

            if 'lemma' not in form:
                continue
            word_y = form['lemma']

            if (
                word_y not in vec_words
                or len(word_y) < MIN_WORD_SIZE
                or len(word) < MIN_WORD_SIZE
            ):
                continue

            word_prefix = word[:PREFIX_FILTER_LENGTH]
            lemma_prefix = word_y[:PREFIX_FILTER_LENGTH]
            prefixes_differ = (
                lemma_prefix != word_prefix
                and lemma_prefix.replace('ё', 'е') != word_prefix.replace('ё', 'е')
            )

            # form['post'] is read only when the prefixes really differ.
            # NOTE(review): 'comp' presumably marks comparative forms - confirm.
            if prefixes_differ and form['post'] != 'comp':
                dictionary_items.append({
                    'text': word,
                    'text_y': word_y,
                    'main': main_cls,
                    'id': form['inflect_id'],
                })
                continue

            y_vec = vec_words[word_y]['vect']
            # NOTE(review): vect[0] / vect[1] are presumably the encoded
            # sequence and its length - confirm against the code that
            # produces the vect_words pickle.
            samples_by_cls[main_cls].append({
                'id': form['inflect_id'],
                'x_src': word,
                'x': x_vec[0],
                'x_len': x_vec[1],
                'y_src': word_y,
                'y': y_vec[0],
                'y_len': y_vec[1],
                'main_cls': main_cls,
            })

    save_dataset(samples_by_cls, 'lemma')
    save_dictionary_items(dictionary_items, 'lemma')
# Script entry: load the two pickled inputs, then build the lemma dataset.
# pickle.load can execute arbitrary code, so both files must be trusted.
with open(VECT_PATH, 'rb') as vect_file:
    vwords = pickle.load(vect_file)

with open(CLS_CLASSES_PATH, 'rb') as classes_file:
    cls_dic = pickle.load(classes_file)

generate(vwords, cls_dic)