import logging import pickle from tqdm import tqdm from lxml import etree from utils import get_grams_info, CONFIG DIC_PATH = CONFIG['dict_path'] MAX_WORD_SIZE = CONFIG['max_word_size'] DATASET_WORDS_PATH = CONFIG['dataset_words_path'] DICTS_WORDS_PATH = CONFIG['dict_words_path'] DICT_POST_TYPES = CONFIG['dict_post_types'] LEMMAS_PROPS = CONFIG['lemma_same_word'] AD_TAGS = CONFIG['ad_tags'] SRC_CONVERT, _ = get_grams_info(CONFIG) i = 0 def parse_words(itr): global i cur_word = None cur_item = None event, element = next(itr) while not (event == 'end' and element.tag == 'lemmata'): if event == 'start' and element.tag == 'lemma': cur_word = { 'id': element.attrib['id'], 'lemma': None, 'forms': [] } if event == 'start' and (element.tag == 'l' or element.tag == 'f'): cur_item = {'text': None, 'index': i} i += 1 if event == 'end' and element.tag == 'l': cur_item['text'] = element.attrib['t'] cur_word['lemma'] = cur_item cur_item = None if event == 'end' and element.tag == 'g' and element.attrib['v'].lower() in SRC_CONVERT: src_key = element.attrib['v'].lower() gram_type, gram = SRC_CONVERT[src_key] cur_item[gram_type] = gram elif event == 'end' and element.tag == 'g' \ and element.attrib['v'].lower() in AD_TAGS \ and 'ad_tags' not in cur_item: cur_item['ad_tags'] = [element.attrib['v'].lower()] elif event == 'end' and element.tag == 'g' and element.attrib['v'].lower() in AD_TAGS: cur_item['ad_tags'].append(element.attrib['v'].lower()) if event == 'end' and element.tag == 'f': cur_item['text'] = element.attrib['t'] cur_word['forms'].append(cur_item) cur_item = None if event == 'end' and element.tag == 'lemma': yield cur_word cur_word = None event, element = next(itr) def get_flat_words(words): for item in words: lemma = item['lemma'] lemma['lemma'] = lemma['text'] lemma['id'] = item['id'] for form in item['forms']: word = dict(lemma) for key in form: word[key] = form[key] if 'ad_tags' in word: word['ad_tags'] = ','.join(word['ad_tags']) yield word def parse_link_types(itr): event, element = next(itr) link_types = {} while not (event == 'start' and element.tag == 'link_types'): event, element = next(itr) while not (event == 'end' and element.tag == 'link_types'): if event == 'end' and element.tag == 'type': link_types[element.text] = element.attrib['id'] event, element = next(itr) return link_types def parse_links(itr): event, element = next(itr) while not (event == 'end' and element.tag == 'links'): if event == 'end' and element.tag == 'link': yield { 'from': element.attrib['from'], 'to': element.attrib['to'], 'type': element.attrib['type'] } event, element = next(itr) def set_lemma_and_inflect_id(words, link_types, links): same_inflect_id_post = ['noun', 'adjf', 'infn'] lemmas_dict = {} for word in words: for norm_f in LEMMAS_PROPS: is_lemma = True for key in norm_f: if key in word and word[key] != norm_f[key]: is_lemma = False break if is_lemma: lemmas_dict[word['id']] = word['text'] del word['lemma'] break if 'lemma' in word and word['post'] in same_inflect_id_post: word['inflect_id'] = word['id'] inv_link_type_dict = { link_types[key]: key for key in link_types } links = { (link['to'], inv_link_type_dict[link['type']]): link['from'] for link in links } prtf_dict = {} for word in words: link_type = None if word['post'] == 'verb': link_type = 'INFN-VERB' elif word['post'] == 'prtf': link_type = 'INFN-PRTF' elif word['post'] == 'grnd': link_type = 'INFN-GRND' elif word['post'] == 'adjs': link_type = 'ADJF-ADJS' elif word['post'] == 'comp': link_type = 'ADJF-COMP' if not link_type: continue key = (word['id'], link_type) if key in links and links[key] in lemmas_dict: lemma = lemmas_dict[links[key]] word['lemma'] = lemma lemmas_dict[word['id']] = lemma else: del word['lemma'] if key in links: word['inflect_id'] = links[key] if key in links and word['post'] == 'prtf': prtf_dict[word['id']] = word['inflect_id'] for word in words: if word['post'] != 'prts': continue key = (word['id'], 'PRTF-PRTS') if key in links: lemma = lemmas_dict[links[key]] word['lemma'] = lemma word['inflect_id'] = prtf_dict[links[key]] lemmas_dict[word['id']] = lemma else: del word['lemma'] doc = etree.iterparse(DIC_PATH, events=('start', 'end')) itr = iter(doc) event, element = next(itr) logging.info("Parsing dictionary xml") while not (event == 'start' and element.tag == 'lemmata'): event, element = next(itr) words = list(parse_words(itr)) link_types = parse_link_types(itr) links = list(parse_links(itr)) words = list(get_flat_words(words)) set_lemma_and_inflect_id(words, link_types, links) words = [dict(t) for t in {tuple(sorted(d.items())) for d in words}] dict_words = [word for word in words if word['post'] in DICT_POST_TYPES] dataset_words = [word for word in words if word['post'] not in DICT_POST_TYPES] dataset_words_dic = {} for word in tqdm(dataset_words): w_len = len(word['text']) if w_len > MAX_WORD_SIZE: continue if word['text'] not in dataset_words_dic: dataset_words_dic[word['text']] = [] dataset_words_dic[word['text']].append(word) logging.info(f"Dict words: {len(dict_words)}") logging.info(f"Dataset words: {len(dataset_words_dic)}") with open(DATASET_WORDS_PATH, 'wb+') as f: pickle.dump(dataset_words_dic, f) with open(DICTS_WORDS_PATH, 'wb+') as f: pickle.dump(dict_words, f)