File size: 6,567 Bytes

0240c6e

import logging
import pickle
from tqdm import tqdm
from lxml import etree
from utils import get_grams_info, CONFIG


# Settings read from the CONFIG mapping imported from utils.

# Source handed to etree.iterparse at the bottom of this file.
DIC_PATH = CONFIG['dict_path']
# Words whose text is longer than this are left out of the dataset grouping.
MAX_WORD_SIZE = CONFIG['max_word_size']
# Output file for the pickled {text: [word, ...]} dataset mapping.
DATASET_WORDS_PATH = CONFIG['dataset_words_path']
# Output file for the pickled list of dictionary-only words.
DICTS_WORDS_PATH = CONFIG['dict_words_path']
# 'post' values that send a word to the dictionary list instead of the dataset.
DICT_POST_TYPES = CONFIG['dict_post_types']
# Iterable of {key: value} mappings; set_lemma_and_inflect_id treats a word
# as a lemma form when it agrees with one of them on every key it carries.
LEMMAS_PROPS = CONFIG['lemma_same_word']
# Lower-cased <g v="..."> values that parse_words collects under 'ad_tags'.
AD_TAGS = CONFIG['ad_tags']
# Lookup from a lower-cased <g v="..."> value to a (gram_type, gram) pair,
# as unpacked in parse_words. The second value returned by get_grams_info
# is not used in this file.
SRC_CONVERT, _ = get_grams_info(CONFIG)
# Running counter; parse_words stamps it on every <l>/<f> item as 'index'.
i = 0


def parse_words(itr):
    """Yield one record per ``<lemma>`` element read from an event stream.

    *itr* is an iterator of ``(event, element)`` pairs where ``event`` is
    ``'start'`` or ``'end'`` and ``element`` exposes ``tag`` and ``attrib``.
    Events are consumed up to and including the closing ``lemmata`` event;
    anything after it is left in *itr* for the caller. If the stream ends
    before that closing event, the generator simply finishes instead of
    failing with ``RuntimeError: generator raised StopIteration``.

    Each yielded record has the shape::

        {'id': <lemma id attribute>,
         'lemma': <item built from the <l> element>,
         'forms': [<item built from each <f> element>, ...]}

    An item is a dict with ``'text'`` (the ``t`` attribute), ``'index'``
    (the value of the module-level counter ``i`` when the element opened),
    one ``gram_type: gram`` entry for every ``<g>`` whose lower-cased ``v``
    is a key of ``SRC_CONVERT``, and an ``'ad_tags'`` list holding every
    other lower-cased ``v`` that is listed in ``AD_TAGS``.

    Side effect: the module-level counter ``i`` is advanced once for every
    ``<l>`` and ``<f>`` element opened.
    """
    global i

    cur_word = None
    cur_item = None

    for event, element in itr:
        tag = element.tag

        if event == 'start':
            if tag == 'lemma':
                cur_word = {
                    'id': element.attrib['id'],
                    'lemma': None,
                    'forms': [],
                }
            elif tag in ('l', 'f'):
                # Text is filled in on the matching 'end' event.
                cur_item = {'text': None, 'index': i}
                i += 1
            continue

        if event != 'end':
            continue

        if tag == 'lemmata':
            # End of the section: stop without touching later events.
            return

        if tag == 'g':
            value = element.attrib['v'].lower()
            if value in SRC_CONVERT:
                gram_type, gram = SRC_CONVERT[value]
                cur_item[gram_type] = gram
            elif value in AD_TAGS:
                cur_item.setdefault('ad_tags', []).append(value)
        elif tag == 'l':
            cur_item['text'] = element.attrib['t']
            cur_word['lemma'] = cur_item
            cur_item = None
        elif tag == 'f':
            cur_item['text'] = element.attrib['t']
            cur_word['forms'].append(cur_item)
            cur_item = None
        elif tag == 'lemma':
            yield cur_word
            cur_word = None


def get_flat_words(words):
    """Flatten lemma records into one dict per form.

    For every record in *words* the lemma item is first annotated in place
    with ``'lemma'`` (a copy of its own ``'text'``) and ``'id'`` (the record
    id). Each form then produces a new dict that starts as a copy of the
    annotated lemma item and is overlaid with the form's own keys, so form
    values win on conflicts. An ``'ad_tags'`` list in the result is replaced
    by its comma-joined string.

    Records with no forms yield nothing.
    """
    for entry in words:
        base = entry['lemma']
        # In-place annotation: the caller's lemma dict gains these two keys.
        base.update(lemma=base['text'], id=entry['id'])

        for form in entry['forms']:
            flat = {**base, **form}

            if 'ad_tags' in flat:
                flat['ad_tags'] = ','.join(flat['ad_tags'])

            yield flat


def parse_link_types(itr):
    """Read the ``link_types`` section and map each type name to its id.

    Events are pulled from *itr* and discarded until the opening
    ``link_types`` event appears. From there every closing ``type`` element
    contributes ``element.text -> element.attrib['id']``. The closing
    ``link_types`` event is the last one consumed.

    Returns the resulting dict. ``StopIteration`` from *itr* propagates if
    the stream runs out before either boundary is reached.
    """
    # Fast-forward to the start of the section.
    while True:
        event, element = next(itr)
        if event == 'start' and element.tag == 'link_types':
            break

    names_to_ids = {}
    while True:
        event, element = next(itr)
        if event != 'end':
            continue
        if element.tag == 'link_types':
            return names_to_ids
        if element.tag == 'type':
            names_to_ids[element.text] = element.attrib['id']


def parse_links(itr):
    """Yield a ``{'from', 'to', 'type'}`` dict for every ``<link>`` element.

    *itr* is an iterator of ``(event, element)`` pairs. Each closing
    ``link`` event yields a dict built from the element's ``from``, ``to``
    and ``type`` attributes. Iteration stops at the closing ``links`` event,
    which is the last event consumed.

    If the stream ends before the closing ``links`` event, the generator
    finishes normally. A bare ``next()`` here would instead surface as
    ``RuntimeError: generator raised StopIteration``.
    """
    for event, element in itr:
        if event != 'end':
            continue

        if element.tag == 'links':
            return

        if element.tag == 'link':
            attrs = element.attrib
            yield {
                'from': attrs['from'],
                'to': attrs['to'],
                'type': attrs['type'],
            }


def set_lemma_and_inflect_id(words, link_types, links):
    """Fix up ``'lemma'`` and ``'inflect_id'`` on every word, in place.

    Args:
        words: list of flat word dicts; each needs ``'id'``, ``'text'`` and
            ``'post'``. The dicts are mutated.
        link_types: mapping of link type name (e.g. ``'INFN-VERB'``) to the
            type id used in *links*.
        links: iterable of dicts with ``'from'``, ``'to'`` and ``'type'``
            (a type id present in *link_types*).

    The work happens in three passes over *words*:

    1. A word is a lemma form when, for some entry of ``LEMMAS_PROPS``,
       every key the word has in common with that entry carries the same
       value. Lemma forms record their text under their id and lose their
       ``'lemma'`` key. Remaining words whose ``'post'`` is ``noun``,
       ``adjf`` or ``infn`` get ``inflect_id`` set to their own id.
    2. ``verb``/``prtf``/``grnd``/``adjs``/``comp`` words follow the matching
       incoming link. If the link source has a recorded lemma, the word
       takes it; otherwise its ``'lemma'`` key is removed. Whenever the
       link exists, ``inflect_id`` becomes the link source id.
    3. ``prts`` words follow their ``PRTF-PRTS`` link the same way, taking
       the lemma recorded for the participle and the ``inflect_id`` the
       participle received in pass 2.

    A missing ``'lemma'`` key or an unresolved participle link is treated as
    "no lemma / no inflect_id" rather than raising ``KeyError``.

    Returns:
        None.
    """
    same_inflect_id_post = ['noun', 'adjf', 'infn']
    # Link type leading to the source entry for each derived part of speech.
    link_type_by_post = {
        'verb': 'INFN-VERB',
        'prtf': 'INFN-PRTF',
        'grnd': 'INFN-GRND',
        'adjs': 'ADJF-ADJS',
        'comp': 'ADJF-COMP',
    }

    # Pass 1: find lemma forms and remember their text per entry id.
    lemmas_dict = {}
    for word in words:
        for norm_f in LEMMAS_PROPS:
            # Keys absent from the word are ignored, not treated as mismatch.
            is_lemma = all(
                key not in word or word[key] == norm_f[key]
                for key in norm_f
            )
            if is_lemma:
                lemmas_dict[word['id']] = word['text']
                word.pop('lemma', None)
                break

        if 'lemma' in word and word['post'] in same_inflect_id_post:
            word['inflect_id'] = word['id']

    inv_link_type_dict = {
        type_id: name
        for name, type_id in link_types.items()
    }

    # (target id, link type name) -> source id
    links = {
        (link['to'], inv_link_type_dict[link['type']]): link['from']
        for link in links
    }

    # Pass 2: words derived from an infinitive or a full adjective.
    prtf_dict = {}
    for word in words:
        link_type = link_type_by_post.get(word['post'])
        if not link_type:
            continue

        key = (word['id'], link_type)
        has_link = key in links

        if has_link and links[key] in lemmas_dict:
            lemma = lemmas_dict[links[key]]
            word['lemma'] = lemma
            lemmas_dict[word['id']] = lemma
        else:
            # Pass 1 may already have removed the key, so do not use `del`.
            word.pop('lemma', None)

        if has_link:
            word['inflect_id'] = links[key]
            if word['post'] == 'prtf':
                prtf_dict[word['id']] = word['inflect_id']

    # Pass 3: short participles resolve through their full participle.
    for word in words:
        if word['post'] != 'prts':
            continue

        key = (word['id'], 'PRTF-PRTS')
        has_link = key in links

        if has_link and links[key] in lemmas_dict:
            lemma = lemmas_dict[links[key]]
            word['lemma'] = lemma
            lemmas_dict[word['id']] = lemma
        else:
            word.pop('lemma', None)

        if has_link and links[key] in prtf_dict:
            word['inflect_id'] = prtf_dict[links[key]]


# Stream the dictionary XML as (event, element) pairs. The same iterator is
# handed to each parser in turn, so the calls below must stay in this order.
doc = etree.iterparse(DIC_PATH, events=('start', 'end'))
itr = iter(doc)
event, element = next(itr)
logging.info("Parsing dictionary xml")
# Skip everything that precedes the opening <lemmata> element.
while not (event == 'start' and element.tag == 'lemmata'):
    event, element = next(itr)

words = list(parse_words(itr))
link_types = parse_link_types(itr)
links = list(parse_links(itr))
words = list(get_flat_words(words))
set_lemma_and_inflect_id(words, link_types, links)
# Drop exact duplicates. dict.fromkeys keeps first-seen order, so the output
# no longer depends on the hash seed the way iterating a set of tuples does.
words = [
    dict(t)
    for t in dict.fromkeys(tuple(sorted(d.items())) for d in words)
]
dict_words = [word for word in words if word['post'] in DICT_POST_TYPES]
dataset_words = [word for word in words if word['post'] not in DICT_POST_TYPES]
# Group dataset words by surface text, leaving out over-long words.
dataset_words_dic = {}
for word in tqdm(dataset_words):
    w_len = len(word['text'])
    if w_len > MAX_WORD_SIZE:
        continue

    dataset_words_dic.setdefault(word['text'], []).append(word)

logging.info(f"Dict words: {len(dict_words)}")
logging.info(f"Dataset words: {len(dataset_words_dic)}")

with open(DATASET_WORDS_PATH, 'wb+') as f:
    pickle.dump(dataset_words_dic, f)

with open(DICTS_WORDS_PATH, 'wb+') as f:
    pickle.dump(dict_words, f)