File size: 8,227 Bytes

0240c6e

import re
import os
import gzip
import pickle
from utils import CONFIG, get_dict_path, load_datasets


# Matches strings that begin with one or more digits followed by a hyphen
# (e.g. "12-abc"). Written as a raw string: in a plain literal "\d" is an
# invalid escape sequence, which newer Python versions warn about.
NAR_REG = re.compile(r"\d+-.*")
# NOTE(review): neither NAR_REG nor RANDOM_SEED is referenced in the visible
# part of this file -- confirm they are still needed.
RANDOM_SEED = 1917
# Settings pulled from the project configuration (utils.CONFIG).

# Pickle file that is loaded into `vec_words` further down in this module.
VECT_PATH = CONFIG['vect_words_path']
# NOTE(review): not referenced in the visible part of this file.
DATASET_PATH = CONFIG['dataset_path']
# Directories that receive the generated *.txt.gz files (see save_dictionary).
REZ_PATHS = CONFIG['publish_dictionary_paths']
# Pickle file with the word records consumed by release_dict_items().
DICT_WORDS_PATH = CONFIG['dict_words_path']
# NOTE(review): not referenced in the visible part of this file.
NOT_DICT_WORDS_PATH = CONFIG['dataset_words_path']
# NOTE(review): not referenced in the visible part of this file.
MAX_WORD_SIZE = CONFIG['max_word_size']
# Mapping whose values may carry an optional 'power' entry (see POST_POWER_DICT).
DICT_POST_TYPES = CONFIG['dict_post_types']
# Mapping: grammeme name -> {'index': ..., 'classes': {class name -> {'index': ..., optional 'power'}}}.
GRAMMEMES_TYPES = CONFIG['grammemes_types']
# Lexemes whose collected 'ad_tags' contain any of these values are skipped
# by release_correction_items().
IGNORE_AD_TAGS = CONFIG['dict_ignore_tags']
# NOTE(review): not referenced in the visible part of this file.
REPLACE_WORD_DICT_ID = 1

# Flat lookup: grammeme class name -> that class's 'index' from the config.
# Grammeme types are visited in ascending order of their own 'index', so when
# two types define the same class name the later-visited one is kept.
CLASSES_INDEX_DICT = {
    cls_name: cls_spec['index']
    for gram_name in sorted(GRAMMEMES_TYPES, key=lambda name: GRAMMEMES_TYPES[name]['index'])
    for cls_name, cls_spec in GRAMMEMES_TYPES[gram_name]['classes'].items()
}

# 'power' value per key of DICT_POST_TYPES; 1 when the config gives none.
POST_POWER_DICT = {}
for key in DICT_POST_TYPES:
    POST_POWER_DICT[key] = DICT_POST_TYPES[key].get('power', 1)

# The classes of the 'post' grammeme are applied second, so on a key clash
# their value replaces the one taken from DICT_POST_TYPES.
p_dic = GRAMMEMES_TYPES['post']['classes']
for key in p_dic:
    POST_POWER_DICT[key] = p_dic[key].get('power', 1)

# NOTE: pickle.load can execute arbitrary code while unpickling; these files
# must come from a trusted source.

# Inflection templates: each key is a lemma template and its value is an
# iterable of templates; all of them are keys of `tpl_cls_dict`.
with open(CONFIG['inflect_templates_path'], 'rb') as f:
    inflect_templates = pickle.load(f)

# Tag table: template -> record whose 'i' entry is used as the class id.
with open(CONFIG['tags_path'], 'rb') as f:
    tpl_cls_dict = pickle.load(f)

# Map the class id of every template listed under a lemma template to the
# class id of that lemma template.
# NOTE(review): lemma_cls_dict is not read anywhere in the visible part of this file.
lemma_cls_dict = {}
for lemma_tpl in inflect_templates:
    lemma_id = tpl_cls_dict[lemma_tpl]['i']
    for tpl in inflect_templates[lemma_tpl]:
        lemma_cls_dict[tpl_cls_dict[tpl]['i']] = lemma_id

# Lemma lookup: item id -> (item['x_src'], item['x_cls']). Only the first
# dataset item seen for a given id is recorded; later ones are ignored.
lemma_dict = {}
for item in load_datasets('inflect', 'test', 'train', 'valid'):
    if item['id'] in lemma_dict:
        continue
    lemma_dict[item['id']] = (item['x_src'], item['x_cls'])

# Per lexeme id, the set of 'ad_tags' values found on any of its forms.
# `vec_words` itself stays available at module level; it is also used by
# release_correction_items().
ad_tags_dict = {}
with open(VECT_PATH, 'rb') as f:
    vec_words = pickle.load(f)

for word, item in vec_words.items():
    for form in item['forms']:
        # The inflect id takes precedence over the plain id when both exist.
        lexeme_id_key = 'inflect_id' if 'inflect_id' in form else 'id'
        lexeme_id = form[lexeme_id_key]

        if 'ad_tags' in form:
            ad_tags_dict.setdefault(lexeme_id, set()).add(form['ad_tags'])


def build_index(words_dics):
    """Build the "form text -> lexeme ids" index as a single string.

    Args:
        words_dics: mapping of lexeme id to a list of form dicts. Every form
            dict must provide the keys 'text' and 'id'.

    Returns:
        A newline-separated string with one line per distinct form text,
        formatted as ``<text>:<id>,<id>,...``. Lines appear in the order in
        which each text is first encountered; ids within a line follow the
        order in which the forms were encountered (repeated ids are kept).
        An empty mapping yields an empty string.
    """
    ids_by_text = {}
    for forms in words_dics.values():
        for form in forms:
            ids_by_text.setdefault(form['text'], []).append(str(form['id']))

    lines = [f"{text}:{','.join(ids)}" for text, ids in ids_by_text.items()]

    # Drop duplicate lines while keeping their order. The previous
    # list(set(...)) made the line order depend on string hash randomisation,
    # so the generated file differed from run to run.
    return '\n'.join(dict.fromkeys(lines))


def create_dictionary(words_dics):
    """Serialise lexemes into the index text and the lexeme-table text.

    Args:
        words_dics: mapping of lexeme id to a list of form dicts. Every form
            dict must provide 'text', 'main' (class id) and 'id'; the
            optional 'replace_other' entry (default False) marks the class
            with a trailing '!'.

    Returns:
        A tuple ``(index, lexeme)``:
        * ``index`` is the result of ``build_index(words_dics)``;
        * ``lexeme`` has one line per lexeme, formatted as
          ``<id>\\t<text>:<cls>[!],<cls>[!];<text>:...``. Texts are listed in
          first-appearance order, classes in first-appearance order per text.
          A lexeme with no forms is written as the bare id.
    """
    index = build_index(words_dics)

    lines = []
    for lexeme_id, forms in words_dics.items():
        # text -> {class id -> replace flag}. Dicts keep insertion order, so
        # no separate "order" list is required.
        flags_by_text = {}
        for form in forms:
            cls_flags = flags_by_text.setdefault(form['text'], {})
            # Record the flag when the class is new for this text or is still
            # unflagged; once a class is flagged it stays flagged. (The check
            # used to look the class up in the text-keyed dict, so it was
            # always true and a later unflagged duplicate cleared the flag.)
            if not cls_flags.get(form['main']):
                cls_flags[form['main']] = form.get('replace_other', False)

        if not flags_by_text:
            # Same output as before for a lexeme without forms: just the id.
            lines.append(str(lexeme_id))
            continue

        encoded_forms = []
        for text, cls_flags in flags_by_text.items():
            classes = ','.join(
                f"{cls}{'!' if flag else ''}" for cls, flag in cls_flags.items()
            )
            encoded_forms.append(f"{text}:{classes}")

        # Formatting through an f-string also accepts non-str lexeme ids,
        # which ''.join() rejected with a TypeError.
        lines.append(f"{lexeme_id}\t{';'.join(encoded_forms)}")

    return index, '\n'.join(lines)


def save_dictionary(index, lexeme, paths, file_name):
    """Write the index and the lexeme table as gzip-compressed UTF-8 files.

    For every directory in ``paths`` two files are created or overwritten:
    ``<file_name>_index.txt.gz`` (from ``index``) and ``<file_name>.txt.gz``
    (from ``lexeme``). All index files are written before any lexeme file.

    Args:
        index: index text, as returned by create_dictionary().
        lexeme: lexeme-table text, as returned by create_dictionary().
        paths: iterable of existing directory paths to write into.
        file_name: base name shared by both output files.

    Raises:
        OSError: if a target file cannot be opened or written.
    """
    # Encode each payload once instead of once per target directory.
    outputs = (
        (f"{file_name}_index.txt.gz", index.encode('utf-8')),
        (f"{file_name}.txt.gz", lexeme.encode('utf-8')),
    )
    for out_name, payload in outputs:
        for directory in paths:
            # 'wb' is the documented gzip write mode; the file is only written.
            with gzip.open(os.path.join(directory, out_name), 'wb') as f:
                f.write(payload)


def release_dict_items():
    """Build and publish the main dictionary (files named 'dict*').

    Loads the word records from DICT_WORDS_PATH, drops those whose 'post' is
    'numb', groups the rest by lexeme id and orders each group by 'index'.
    Within a group, when several records share the same 'main' class, the
    first of them (in 'index' order) is marked with ``replace_other = True``.
    The result is serialised with create_dictionary() and written to
    REZ_PATHS by save_dictionary().
    """
    # NOTE: pickle.load can execute arbitrary code; the file must be trusted.
    with open(DICT_WORDS_PATH, 'rb') as f:
        loaded = pickle.load(f)

    grouped = {}
    for word in loaded:
        if word['post'] == 'numb':
            continue
        grouped.setdefault(word['id'], []).append(word)

    for lexeme_id, forms in grouped.items():
        ordered = sorted(forms, key=lambda form: form['index'])
        first_by_cls = {}
        for word in ordered:
            cls_id = word['main']
            if cls_id in first_by_cls:
                # A later record repeats this class: flag the first holder.
                first_by_cls[cls_id]['replace_other'] = True
            else:
                first_by_cls[cls_id] = word
        # Replacing the value of an existing key is safe while iterating.
        grouped[lexeme_id] = ordered

    index, lexeme = create_dictionary(grouped)
    save_dictionary(index, lexeme, REZ_PATHS, 'dict')


def release_correction_items():
    """Build and publish the correction dictionary (files named 'dict_correction*').

    Forms are gathered per lexeme id from four pickled sources, in this order:
    the 'lemma' dictionary, then "bad_lemma.pkl", "bad_inflect.pkl" and
    "bad_main.pkl" from CONFIG['bad_path']. A lexeme is skipped when it has no
    entry in ``lemma_dict`` or when its collected ad-tags contain any value of
    IGNORE_AD_TAGS. Every accepted record contributes two forms; those coming
    from the "bad_*" files are created with ``replace_other=True``.
    The result is serialised with create_dictionary() and written to
    REZ_PATHS by save_dictionary().
    """
    dict_words = {}

    def load_pickle(path):
        # NOTE: pickle.load can execute arbitrary code; files must be trusted.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def is_skipped(lexeme_id):
        # True for lexemes without a known lemma or carrying an ignored tag.
        if lexeme_id not in lemma_dict:
            return True
        if lexeme_id not in ad_tags_dict:
            return False
        return any([key in ad_tags_dict[lexeme_id] for key in IGNORE_AD_TAGS])

    def forced(lexeme_id, cls_id, text):
        # Form entry that is flagged to replace other entries.
        return dict(id=lexeme_id, main=cls_id, text=text, replace_other=True)

    def add_forms(lexeme_id, first, second):
        dict_words.setdefault(lexeme_id, []).extend((first, second))

    bad_dir = CONFIG['bad_path']

    # Source 1: lemma dictionary -- the record itself plus its lemma form.
    for word in load_pickle(get_dict_path('lemma')):
        lexeme_id = word['id']
        if is_skipped(lexeme_id):
            continue
        lemma, lemma_cls = lemma_dict[lexeme_id]
        add_forms(
            lexeme_id,
            word,
            {'id': lexeme_id, 'text': lemma, 'main': lemma_cls},
        )

    # Source 2: bad lemma records -- the source form plus its lemma form.
    for entry in load_pickle(os.path.join(bad_dir, "bad_lemma.pkl")):
        word = entry[0]
        lexeme_id = word['id']
        if is_skipped(lexeme_id):
            continue
        lemma, lemma_cls = lemma_dict[lexeme_id]
        add_forms(
            lexeme_id,
            forced(lexeme_id, word['main_cls'], word['x_src']),
            forced(lexeme_id, lemma_cls, lemma),
        )

    # Source 3: bad inflect records -- both the x and the y side of the pair.
    for entry in load_pickle(os.path.join(bad_dir, "bad_inflect.pkl")):
        word = entry[0]
        lexeme_id = word['id']
        if is_skipped(lexeme_id):
            continue
        add_forms(
            lexeme_id,
            forced(lexeme_id, word['x_cls'], word['x_src']),
            forced(lexeme_id, word['y_cls'], word['y_src']),
        )

    # Source 4: bad main records -- every known form of the text, each paired
    # with the lemma form of its lexeme.
    for bad_item in load_pickle(os.path.join(bad_dir, "bad_main.pkl")):
        text = bad_item[0]['src']
        for form in vec_words[text]['forms']:
            lexeme_id = form['id']
            if is_skipped(lexeme_id):
                continue
            lemma, lemma_cls = lemma_dict[lexeme_id]
            add_forms(
                lexeme_id,
                forced(lexeme_id, tpl_cls_dict[form['main']]['i'], text),
                forced(lexeme_id, lemma_cls, lemma),
            )

    index, lexeme = create_dictionary(dict_words)
    save_dictionary(index, lexeme, REZ_PATHS, 'dict_correction')


# Script entry point: publish the correction dictionary first, then the main
# dictionary. There is no __main__ guard, so these calls also run whenever
# this module is imported.
release_correction_items()
release_dict_items()