import logging
import pickle

from tqdm import tqdm
from lxml import etree

from utils import get_grams_info, CONFIG

DIC_PATH = CONFIG['dict_path']
MAX_WORD_SIZE = CONFIG['max_word_size']
DATASET_WORDS_PATH = CONFIG['dataset_words_path']
DICTS_WORDS_PATH = CONFIG['dict_words_path']
DICT_POST_TYPES = CONFIG['dict_post_types']
LEMMAS_PROPS = CONFIG['lemma_same_word']
AD_TAGS = CONFIG['ad_tags']
SRC_CONVERT, _ = get_grams_info(CONFIG)

i = 0


def parse_words(itr):
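    """Consume iterparse events inside the <lemmata> section and yield one dict
    per <lemma>: its id, the lemma entry (<l>) and the list of its forms (<f>).
    Each <l>/<f> item stores its text, a globally unique 'index', the grammemes
    mapped through SRC_CONVERT and any additional tags listed in AD_TAGS."""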
    global i
    cur_word = None
    cur_item = None
    event, element = next(itr)
    while not (event == 'end' and element.tag == 'lemmata'):
        if event == 'start' and element.tag == 'lemma':
            cur_word = {
                'id': element.attrib['id'],
                'lemma': None,
                'forms': []
            }
        if event == 'start' and (element.tag == 'l' or element.tag == 'f'):
            cur_item = {'text': None, 'index': i}
            i += 1
        if event == 'end' and element.tag == 'l':
            cur_item['text'] = element.attrib['t']
            cur_word['lemma'] = cur_item
            cur_item = None
        if event == 'end' and element.tag == 'g' and element.attrib['v'].lower() in SRC_CONVERT:
            src_key = element.attrib['v'].lower()
            gram_type, gram = SRC_CONVERT[src_key]
            cur_item[gram_type] = gram
        elif event == 'end' and element.tag == 'g' \
                and element.attrib['v'].lower() in AD_TAGS \
                and 'ad_tags' not in cur_item:
            cur_item['ad_tags'] = [element.attrib['v'].lower()]
        elif event == 'end' and element.tag == 'g' and element.attrib['v'].lower() in AD_TAGS:
            cur_item['ad_tags'].append(element.attrib['v'].lower())
        if event == 'end' and element.tag == 'f':
            cur_item['text'] = element.attrib['t']
            cur_word['forms'].append(cur_item)
            cur_item = None
        if event == 'end' and element.tag == 'lemma':
            yield cur_word
            cur_word = None
        event, element = next(itr)


def get_flat_words(words):
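    """Flatten the parsed lemmas: copy each lemma's attributes, overlay every
    form's attributes on top of them, and yield one flat dict per word form
    with 'ad_tags' joined into a comma-separated string."""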
    for item in words:
        lemma = item['lemma']
        lemma['lemma'] = lemma['text']
        lemma['id'] = item['id']
        for form in item['forms']:
            word = dict(lemma)
            for key in form:
                word[key] = form[key]
            if 'ad_tags' in word:
                word['ad_tags'] = ','.join(word['ad_tags'])
            yield word


def parse_link_types(itr):
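    """Advance to the <link_types> section and return a mapping from link-type
    name (the element text) to its numeric id."""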
    event, element = next(itr)
    link_types = {}
    while not (event == 'start' and element.tag == 'link_types'):
        event, element = next(itr)
    while not (event == 'end' and element.tag == 'link_types'):
        if event == 'end' and element.tag == 'type':
            link_types[element.text] = element.attrib['id']
        event, element = next(itr)
    return link_types


def parse_links(itr):
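    """Yield one dict per <link> element ('from'/'to' lemma ids and link type
    id) until the closing </links> tag."""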
    event, element = next(itr)
    while not (event == 'end' and element.tag == 'links'):
        if event == 'end' and element.tag == 'link':
            yield {
                'from': element.attrib['from'],
                'to': element.attrib['to'],
                'type': element.attrib['type']
            }
        event, element = next(itr)


def set_lemma_and_inflect_id(words, link_types, links):
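    """Resolve the lemma text and inflection id for every flat word.

    Words matching one of the LEMMAS_PROPS patterns are treated as their own
    lemma; verbs, participles, gerunds, short/comparative adjectives and short
    participles inherit the lemma (and, where available, the inflection id) of
    the entry they are linked to in the <links> table."""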
    same_inflect_id_post = ['noun', 'adjf', 'infn']
    lemmas_dict = {}
    for word in words:
        for norm_f in LEMMAS_PROPS:
            is_lemma = True
            for key in norm_f:
                if key in word and word[key] != norm_f[key]:
                    is_lemma = False
                    break
            if is_lemma:
                lemmas_dict[word['id']] = word['text']
                del word['lemma']
                break
        if 'lemma' in word and word['post'] in same_inflect_id_post:
            word['inflect_id'] = word['id']
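    # Invert the link-type map and re-key the links as
    # (child lemma id, link-type name) -> parent lemma id for direct lookups.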
    inv_link_type_dict = {
        link_types[key]: key
        for key in link_types
    }
    links = {
        (link['to'], inv_link_type_dict[link['type']]): link['from']
        for link in links
    }
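    # First linking pass: derived parts of speech take their lemma and
    # inflection id from the linked parent entry; participle (prtf) inflection
    # ids are remembered for the short-participle pass below.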
    prtf_dict = {}
    for word in words:
        link_type = None
        if word['post'] == 'verb':
            link_type = 'INFN-VERB'
        elif word['post'] == 'prtf':
            link_type = 'INFN-PRTF'
        elif word['post'] == 'grnd':
            link_type = 'INFN-GRND'
        elif word['post'] == 'adjs':
            link_type = 'ADJF-ADJS'
        elif word['post'] == 'comp':
            link_type = 'ADJF-COMP'
        if not link_type:
            continue
        key = (word['id'], link_type)
        if key in links and links[key] in lemmas_dict:
            lemma = lemmas_dict[links[key]]
            word['lemma'] = lemma
            lemmas_dict[word['id']] = lemma
        else:
            # 'lemma' may already have been removed in the first pass above.
            word.pop('lemma', None)
        if key in links:
            word['inflect_id'] = links[key]
        if key in links and word['post'] == 'prtf':
            prtf_dict[word['id']] = word['inflect_id']
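    # Second linking pass: short participles (prts) reuse the lemma and
    # inflection id of the full participle (prtf) they are linked to.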
    for word in words:
        if word['post'] != 'prts':
            continue
        key = (word['id'], 'PRTF-PRTS')
        if key in links:
            lemma = lemmas_dict[links[key]]
            word['lemma'] = lemma
            word['inflect_id'] = prtf_dict[links[key]]
            lemmas_dict[word['id']] = lemma
        else:
            word.pop('lemma', None)
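

# Script body: stream the dictionary XML, flatten the lemmas, resolve lemma and
# inflection ids, then pickle the dictionary and dataset word collections.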
doc = etree.iterparse(DIC_PATH, events=('start', 'end'))
itr = iter(doc)
event, element = next(itr)
logging.info("Parsing dictionary xml")
while not (event == 'start' and element.tag == 'lemmata'):
    event, element = next(itr)
words = list(parse_words(itr))
link_types = parse_link_types(itr)
links = list(parse_links(itr))
words = list(get_flat_words(words))
set_lemma_and_inflect_id(words, link_types, links)
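# Deduplicate words whose key/value pairs are identical, then split them into
# the dictionary-only and dataset parts of speech.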
words = [dict(t) for t in {tuple(sorted(d.items())) for d in words}]
dict_words = [word for word in words if word['post'] in DICT_POST_TYPES]
dataset_words = [word for word in words if word['post'] not in DICT_POST_TYPES]
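# Group dataset words by surface form, skipping words longer than MAX_WORD_SIZE.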
dataset_words_dic = {}
for word in tqdm(dataset_words):
    w_len = len(word['text'])
    if w_len > MAX_WORD_SIZE:
        continue
    if word['text'] not in dataset_words_dic:
        dataset_words_dic[word['text']] = []
    dataset_words_dic[word['text']].append(word)

logging.info(f"Dict words: {len(dict_words)}")
logging.info(f"Dataset words: {len(dataset_words_dic)}")

with open(DATASET_WORDS_PATH, 'wb+') as f:
    pickle.dump(dataset_words_dic, f)
with open(DICTS_WORDS_PATH, 'wb+') as f:
    pickle.dump(dict_words, f)