# DeepMorphy / 1_load_dictionary.py
# Uploaded by niobures (revision 0240c6e, verified).
import logging
import pickle
from tqdm import tqdm
from lxml import etree
from utils import get_grams_info, CONFIG
# Paths and limits come from the shared project CONFIG (see utils.py).
DIC_PATH = CONFIG['dict_path']  # source XML morphological dictionary to parse
MAX_WORD_SIZE = CONFIG['max_word_size']  # words longer than this are skipped for the dataset
DATASET_WORDS_PATH = CONFIG['dataset_words_path']  # pickle output: {text: [word dicts]}
DICTS_WORDS_PATH = CONFIG['dict_words_path']  # pickle output: list of dictionary-only words
DICT_POST_TYPES = CONFIG['dict_post_types']  # POS tags routed to the dictionary pickle
LEMMAS_PROPS = CONFIG['lemma_same_word']  # property templates that identify lemma rows
AD_TAGS = CONFIG['ad_tags']  # additional grammeme tags collected into 'ad_tags'
SRC_CONVERT, _ = get_grams_info(CONFIG)  # maps source grammeme -> (gram type, gram)
i = 0  # global running index assigned to every parsed word form
def parse_words(itr):
    """Yield word entries parsed from the <lemmata> section of the XML stream.

    Each yielded entry is ``{'id': ..., 'lemma': item, 'forms': [item, ...]}``
    where every item is ``{'text': ..., 'index': ...}`` plus any grammeme
    keys recognized through SRC_CONVERT and an optional 'ad_tags' list.
    The module-level counter ``i`` numbers every <l>/<f> item globally.

    :param itr: iterator of (event, element) pairs positioned just inside
        the opening <lemmata> tag; consumed up to and including </lemmata>
    """
    global i
    cur_word = None
    cur_item = None
    event, element = next(itr)
    while not (event == 'end' and element.tag == 'lemmata'):
        tag = element.tag
        if event == 'start':
            if tag == 'lemma':
                cur_word = {
                    'id': element.attrib['id'],
                    'lemma': None,
                    'forms': [],
                }
            elif tag in ('l', 'f'):
                cur_item = {'text': None, 'index': i}
                i += 1
        else:  # event == 'end' (iterparse is configured for start/end only)
            if tag == 'l':
                # Closing lemma form: attach it as the word's lemma.
                cur_item['text'] = element.attrib['t']
                cur_word['lemma'] = cur_item
                cur_item = None
            elif tag == 'g':
                value = element.attrib['v'].lower()
                if value in SRC_CONVERT:
                    gram_type, gram = SRC_CONVERT[value]
                    cur_item[gram_type] = gram
                elif value in AD_TAGS:
                    # SRC_CONVERT takes priority; leftovers go to 'ad_tags'.
                    cur_item.setdefault('ad_tags', []).append(value)
            elif tag == 'f':
                # Closing inflected form: append it to the word's forms.
                cur_item['text'] = element.attrib['t']
                cur_word['forms'].append(cur_item)
                cur_item = None
            elif tag == 'lemma':
                yield cur_word
                cur_word = None
        event, element = next(itr)
def get_flat_words(words):
    """Flatten parsed lemma entries into one dict per inflected form.

    Every yielded dict starts from the lemma item (with 'lemma' set to the
    lemma's text and 'id' to the entry id) and is overridden by the form's
    own keys; a list-valued 'ad_tags' is collapsed into a comma-separated
    string.  NOTE: the lemma dict inside each entry is mutated in place.
    """
    for entry in words:
        base = entry['lemma']
        base['lemma'] = base['text']
        base['id'] = entry['id']
        for form in entry['forms']:
            merged = {**base, **form}
            if 'ad_tags' in merged:
                merged['ad_tags'] = ','.join(merged['ad_tags'])
            yield merged
def parse_link_types(itr):
    """Consume the <link_types> section and return {type name: type id}.

    Advances the iterator past everything before <link_types>, collects
    every closed <type> element, and stops after </link_types>.
    """
    result = {}
    event, element = next(itr)
    # Fast-forward to the opening <link_types> tag.
    while (event, element.tag) != ('start', 'link_types'):
        event, element = next(itr)
    # Record each fully-read <type>id</type> until the section closes.
    while (event, element.tag) != ('end', 'link_types'):
        if (event, element.tag) == ('end', 'type'):
            result[element.text] = element.attrib['id']
        event, element = next(itr)
    return result
def parse_links(itr):
    """Yield lemma-link records from the <links> section as plain dicts.

    Each closed <link> element becomes ``{'from', 'to', 'type'}`` (all ids
    as strings); iteration stops at </links>.
    """
    event, element = next(itr)
    while (event, element.tag) != ('end', 'links'):
        if (event, element.tag) == ('end', 'link'):
            attrs = element.attrib
            yield {
                'from': attrs['from'],
                'to': attrs['to'],
                'type': attrs['type'],
            }
        event, element = next(itr)
def set_lemma_and_inflect_id(words, link_types, links):
    """Fill in 'lemma' and 'inflect_id' on every flat word record, in place.

    Pass 1: a word matching any property template in LEMMAS_PROPS is treated
    as its own lemma; its text is recorded in lemmas_dict and its 'lemma'
    key removed.  Pass 2: derived POS forms (verb/prtf/grnd/adjs/comp) get
    their lemma through the corresponding inter-lemma link.  Pass 3: short
    participles (prts) resolve through their full participle (prtf).

    :param words: flat word dicts from get_flat_words (mutated in place)
    :param link_types: {link type name: link type id} from parse_link_types
    :param links: iterable of {'from', 'to', 'type'} dicts from parse_links
    """
    # POS types whose own entry id doubles as the inflection-group id.
    same_inflect_id_post = ['noun', 'adjf', 'infn']
    lemmas_dict = {}
    for word in words:
        # Pass 1: a word is a lemma if no key of some template contradicts it.
        for norm_f in LEMMAS_PROPS:
            is_lemma = True
            for key in norm_f:
                if key in word and word[key] != norm_f[key]:
                    is_lemma = False
                    break
            if is_lemma:
                lemmas_dict[word['id']] = word['text']
                del word['lemma']
                break
        if 'lemma' in word and word['post'] in same_inflect_id_post:
            word['inflect_id'] = word['id']
    # Invert link_types so numeric link-type ids map back to their names.
    inv_link_type_dict = {
        link_types[key]: key
        for key in link_types
    }
    # Re-key the links by (target id, link type name) -> source id.
    links = {
        (link['to'], inv_link_type_dict[link['type']]): link['from']
        for link in links
    }
    prtf_dict = {}
    for word in words:
        # Pass 2: pick the link type that leads from this POS to its lemma.
        link_type = None
        if word['post'] == 'verb':
            link_type = 'INFN-VERB'
        elif word['post'] == 'prtf':
            link_type = 'INFN-PRTF'
        elif word['post'] == 'grnd':
            link_type = 'INFN-GRND'
        elif word['post'] == 'adjs':
            link_type = 'ADJF-ADJS'
        elif word['post'] == 'comp':
            link_type = 'ADJF-COMP'
        if not link_type:
            continue
        key = (word['id'], link_type)
        if key in links and links[key] in lemmas_dict:
            lemma = lemmas_dict[links[key]]
            word['lemma'] = lemma
            # The derived word also becomes a lemma source for later passes.
            lemmas_dict[word['id']] = lemma
        else:
            # NOTE(review): if this word was already claimed as a lemma in
            # pass 1 its 'lemma' key is gone and this del raises KeyError —
            # presumably the dictionary data never hits that combination.
            del word['lemma']
        if key in links:
            word['inflect_id'] = links[key]
        if key in links and word['post'] == 'prtf':
            prtf_dict[word['id']] = word['inflect_id']
    for word in words:
        # Pass 3: short participles inherit lemma/inflect_id via PRTF-PRTS.
        if word['post'] != 'prts':
            continue
        key = (word['id'], 'PRTF-PRTS')
        if key in links:
            # assumes the linked prtf was resolved in pass 2 — TODO confirm
            lemma = lemmas_dict[links[key]]
            word['lemma'] = lemma
            word['inflect_id'] = prtf_dict[links[key]]
            lemmas_dict[word['id']] = lemma
        else:
            del word['lemma']
# --- Top-level pipeline: parse the XML dictionary, resolve lemmas, pickle. ---
xml_parser = etree.iterparse(DIC_PATH, events=('start', 'end'))
stream = iter(xml_parser)
event, element = next(stream)
logging.info("Parsing dictionary xml")
# Fast-forward to the opening <lemmata> tag.
while not (event == 'start' and element.tag == 'lemmata'):
    event, element = next(stream)
parsed = list(parse_words(stream))
link_types = parse_link_types(stream)
links = list(parse_links(stream))
words = list(get_flat_words(parsed))
set_lemma_and_inflect_id(words, link_types, links)
# Deduplicate identical word dicts via hashable sorted-item tuples.
words = [dict(items) for items in {tuple(sorted(w.items())) for w in words}]
dict_words = [word for word in words if word['post'] in DICT_POST_TYPES]
dataset_words = [word for word in words if word['post'] not in DICT_POST_TYPES]
# Group dataset words by surface text, skipping over-long words.
dataset_words_dic = {}
for word in tqdm(dataset_words):
    if len(word['text']) <= MAX_WORD_SIZE:
        dataset_words_dic.setdefault(word['text'], []).append(word)
logging.info(f"Dict words: {len(dict_words)}")
logging.info(f"Dataset words: {len(dataset_words_dic)}")
with open(DATASET_WORDS_PATH, 'wb+') as f:
    pickle.dump(dataset_words_dic, f)
with open(DICTS_WORDS_PATH, 'wb+') as f:
    pickle.dump(dict_words, f)