"""Build and publish the gzipped dictionary and correction-dictionary files.

Running this module loads the project configuration and several pickled
artefacts, then writes two pairs of files into every directory listed in
``CONFIG['publish_dictionary_paths']``:

* ``<name>_index.txt.gz`` -- one ``text:id1,id2,...`` line per word form;
* ``<name>.txt.gz``       -- one ``id<TAB>text:cls[!],cls;text:cls...`` line
  per lexeme, where ``!`` marks a class whose ``replace_other`` flag is set.

NOTE(review): every input is read with ``pickle.load``; the files are assumed
to be artefacts produced by this project, never untrusted data -- confirm.
"""
import re
import os
import gzip
import pickle

from utils import CONFIG, get_dict_path, load_datasets

# Raw string: "\d" in a plain literal is an invalid escape sequence.
NAR_REG = re.compile(r"\d+-.*")
RANDOM_SEED = 1917
VECT_PATH = CONFIG['vect_words_path']
DATASET_PATH = CONFIG['dataset_path']
REZ_PATHS = CONFIG['publish_dictionary_paths']
DICT_WORDS_PATH = CONFIG['dict_words_path']
NOT_DICT_WORDS_PATH = CONFIG['dataset_words_path']
MAX_WORD_SIZE = CONFIG['max_word_size']
DICT_POST_TYPES = CONFIG['dict_post_types']
GRAMMEMES_TYPES = CONFIG['grammemes_types']
IGNORE_AD_TAGS = CONFIG['dict_ignore_tags']
REPLACE_WORD_DICT_ID = 1


def _load_pickle(path):
    """Return the object unpickled from the file at *path*."""
    with open(path, 'rb') as f:
        return pickle.load(f)


# Class name -> configured index, visiting grammeme groups in 'index' order.
CLASSES_INDEX_DICT = {
    cls: GRAMMEMES_TYPES[gram]['classes'][cls]['index']
    for gram in sorted(GRAMMEMES_TYPES, key=lambda x: GRAMMEMES_TYPES[x]['index'])
    for cls in GRAMMEMES_TYPES[gram]['classes']
}

# 'power' of every configured post type and post class; 1 when not configured.
# Entries from the grammeme 'post' classes override same-named post types.
POST_POWER_DICT = {}
for key in DICT_POST_TYPES:
    POST_POWER_DICT[key] = DICT_POST_TYPES[key].get('power', 1)

p_dic = GRAMMEMES_TYPES['post']['classes']
for key in p_dic:
    POST_POWER_DICT[key] = p_dic[key].get('power', 1)

inflect_templates = _load_pickle(CONFIG['inflect_templates_path'])
tpl_cls_dict = _load_pickle(CONFIG['tags_path'])

# Class id of each template listed under a lemma template -> that lemma
# template's class id.
lemma_cls_dict = {}
for lemma_tpl in inflect_templates:
    lemma_id = tpl_cls_dict[lemma_tpl]['i']
    for tpl in inflect_templates[lemma_tpl]:
        lemma_cls_dict[tpl_cls_dict[tpl]['i']] = lemma_id

# Lexeme id -> (x_src, x_cls) of the first dataset item seen for that id.
lemma_dict = {}
for item in load_datasets('inflect', 'test', 'train', 'valid'):
    if item['id'] not in lemma_dict:
        lemma_dict[item['id']] = (item['x_src'], item['x_cls'])

# Lexeme id -> set of the 'ad_tags' values found on its forms.
ad_tags_dict = {}
vec_words = _load_pickle(VECT_PATH)
for word in vec_words:
    for form in vec_words[word]['forms']:
        # The id is resolved before the 'ad_tags' check, as in the original,
        # so a form lacking both id keys still raises KeyError.
        lexeme_id = form['inflect_id'] if 'inflect_id' in form else form['id']
        if 'ad_tags' not in form:
            continue
        ad_tags_dict.setdefault(lexeme_id, set()).add(form['ad_tags'])


def build_index(words_dics):
    """Return the text -> lexeme-ids index as a newline-joined string.

    Args:
        words_dics: mapping of lexeme id to a list of form dicts; each form
            dict must provide ``'text'`` and ``'id'``.

    Returns:
        One ``text:id1,id2,...`` line per distinct text, in first-seen order.
        An id is listed once for every form that carries the text.
    """
    ids_by_text = {}
    for forms in words_dics.values():
        for form in forms:
            ids_by_text.setdefault(form['text'], []).append(str(form['id']))

    entries = [f"{text}:{','.join(ids)}" for text, ids in ids_by_text.items()]
    # dict.fromkeys drops duplicate lines while keeping insertion order; the
    # previous list(set(...)) reordered the lines differently on every run.
    return '\n'.join(dict.fromkeys(entries))


def create_dictionary(words_dics):
    """Serialise lexemes into the index text and the lexeme text.

    Args:
        words_dics: mapping of lexeme id to a list of form dicts; each form
            dict must provide ``'id'``, ``'text'`` and ``'main'`` and may
            provide ``'replace_other'``.

    Returns:
        ``(index, lexeme)``: ``index`` is the result of :func:`build_index`;
        ``lexeme`` has one ``id<TAB>text:cls[!],cls;text:cls`` line per
        lexeme, where ``!`` follows a class whose ``replace_other`` is truthy.
    """
    index = build_index(words_dics)

    lines = []
    for lexeme_id, items in words_dics.items():
        # text -> {class: replace_other}; both levels keep first-seen order.
        flags_by_text = {}
        for item in items:
            cls_flags = flags_by_text.setdefault(item['text'], {})
            # A set flag is sticky: only an absent or falsy entry is written.
            # (The original tested the per-text dict here, so a later
            # unflagged duplicate silently erased an earlier flag.)
            if not cls_flags.get(item['main']):
                cls_flags[item['main']] = item.get('replace_other', False)

        parts = []
        for text, cls_flags in flags_by_text.items():
            classes = ','.join(
                f"{cls}!" if flag else str(cls)
                for cls, flag in cls_flags.items()
            )
            parts.append(f"{text}:{classes}")

        if parts:
            lines.append(f"{lexeme_id}\t{';'.join(parts)}")
        else:
            # A lexeme without forms is written as the bare id, no tab.
            lines.append(str(lexeme_id))

    return index, '\n'.join(lines)


def save_dictionary(index, lexeme, paths, file_name):
    """Write the index and lexeme texts, gzip-compressed, into every path.

    Args:
        index: index text, stored as ``<file_name>_index.txt.gz``.
        lexeme: lexeme text, stored as ``<file_name>.txt.gz``.
        paths: iterable of target directories.
        file_name: base name shared by both files.
    """
    payloads = (
        (f"{file_name}_index.txt.gz", index.encode('utf-8')),
        (f"{file_name}.txt.gz", lexeme.encode('utf-8')),
    )
    # All index files are written first, then all lexeme files.
    for target_name, data in payloads:
        for directory in paths:
            with gzip.open(os.path.join(directory, target_name), 'wb') as f:
                f.write(data)


def release_dict_items():
    """Publish the main dictionary (``dict``) built from ``DICT_WORDS_PATH``.

    Words whose ``'post'`` is ``'numb'`` are left out.  Within a lexeme the
    words are ordered by ``'index'``; when a ``'main'`` class occurs more than
    once, the first word carrying it gets ``replace_other = True``.
    """
    words = [w for w in _load_pickle(DICT_WORDS_PATH) if w['post'] != 'numb']

    dict_words = {}
    for word in words:
        dict_words.setdefault(word['id'], []).append(word)

    for lexeme_id in dict_words:
        first_by_main = {}
        ordered = sorted(dict_words[lexeme_id], key=lambda x: x['index'])
        for word in ordered:
            if word['main'] in first_by_main:
                first_by_main[word['main']]['replace_other'] = True
            else:
                first_by_main[word['main']] = word
        dict_words[lexeme_id] = ordered

    index, lexeme = create_dictionary(dict_words)
    save_dictionary(index, lexeme, REZ_PATHS, 'dict')


def _is_skipped_lexeme(lexeme_id):
    """Return True when *lexeme_id* must not enter the correction dictionary.

    A lexeme is skipped when it has no entry in ``lemma_dict`` or when any of
    ``IGNORE_AD_TAGS`` is among the ad tags collected for it.
    """
    if lexeme_id not in lemma_dict:
        return True
    tags = ad_tags_dict.get(lexeme_id)
    return tags is not None and any(key in tags for key in IGNORE_AD_TAGS)


def release_correction_items():
    """Publish the correction dictionary (``dict_correction``).

    Entries come from four pickled sources, processed in this order: the
    'lemma' dictionary, ``bad_lemma.pkl``, ``bad_inflect.pkl`` and
    ``bad_main.pkl``.  Each accepted item contributes its own form plus a
    second form for the same lexeme; every form taken from the ``bad_*``
    files is flagged ``replace_other=True``.
    """
    dict_words = {}
    bad_dir = CONFIG['bad_path']

    # 1. Lemma dictionary: the word as stored, then its lemma form (unflagged).
    for word in _load_pickle(get_dict_path('lemma')):
        lexeme_id = word['id']
        if _is_skipped_lexeme(lexeme_id):
            continue
        forms = dict_words.setdefault(lexeme_id, [])
        forms.append(word)
        forms.append({
            'id': lexeme_id,
            'text': lemma_dict[lexeme_id][0],
            'main': lemma_dict[lexeme_id][1],
        })

    # 2. bad_lemma.pkl: the source form, then the lemma form.
    for record in _load_pickle(os.path.join(bad_dir, "bad_lemma.pkl")):
        word = record[0]
        lexeme_id = word['id']
        if _is_skipped_lexeme(lexeme_id):
            continue
        lemma, lemma_cls = lemma_dict[lexeme_id]
        forms = dict_words.setdefault(lexeme_id, [])
        forms.append(dict(id=lexeme_id, main=word['main_cls'],
                          text=word['x_src'], replace_other=True))
        forms.append(dict(id=lexeme_id, main=lemma_cls, text=lemma,
                          replace_other=True))

    # 3. bad_inflect.pkl: the x form, then the y form.
    for record in _load_pickle(os.path.join(bad_dir, "bad_inflect.pkl")):
        word = record[0]
        lexeme_id = word['id']
        if _is_skipped_lexeme(lexeme_id):
            continue
        forms = dict_words.setdefault(lexeme_id, [])
        forms.append(dict(id=lexeme_id, main=word['x_cls'],
                          text=word['x_src'], replace_other=True))
        forms.append(dict(id=lexeme_id, main=word['y_cls'],
                          text=word['y_src'], replace_other=True))

    # 4. bad_main.pkl: every known form of the text, each with its lemma form.
    for bad_item in _load_pickle(os.path.join(bad_dir, "bad_main.pkl")):
        text = bad_item[0]['src']
        for word in vec_words[text]['forms']:
            lexeme_id = word['id']
            if _is_skipped_lexeme(lexeme_id):
                continue
            cls_id = tpl_cls_dict[word['main']]['i']
            forms = dict_words.setdefault(lexeme_id, [])
            forms.append(dict(id=lexeme_id, main=cls_id, text=text,
                              replace_other=True))
            forms.append(dict(id=lexeme_id, main=lemma_dict[lexeme_id][1],
                              text=lemma_dict[lexeme_id][0],
                              replace_other=True))

    index, lexeme = create_dictionary(dict_words)
    save_dictionary(index, lexeme, REZ_PATHS, 'dict_correction')


release_correction_items()
release_dict_items()