DeepMorphy / 11_release_tests.py
niobures's picture
DeepMorphy
0240c6e verified
import os
import pickle
import numpy as np
from utils import CONFIG, RANDOM, load_datasets
from lxml import etree
from xml.etree.ElementTree import ElementTree
# Settings pulled from the project configuration; read by the release_*
# routines in this file.
GRAM_TYPES = CONFIG['grammemes_types']
ROOT = CONFIG['publish_tests_path']
DICT_WORDS_PATH = CONFIG['dict_words_path']
NMB_DATA_PATH = CONFIG['numb_data_path']

# Both pickles are loaded once, when the module is executed.
with open(NMB_DATA_PATH, 'rb') as f:
    numb_data = pickle.load(f)

with open(CONFIG['tags_path'], 'rb') as f:
    tags = pickle.load(f)

# Two lookups keyed by each tag's 'i' value: its 'l' entry and its 'o' entry.
# NOTE(review): `tags` is presumably a plain dict (it is iterated and indexed
# by key elsewhere in this file) -- confirm against the script that pickles it.
is_lemma_dict = {info['i']: info['l'] for info in tags.values()}
tag_index_order = {info['i']: info['o'] for info in tags.values()}
def release_gram_tests(items, key, cls_dic, result_path):
    """Write the test cases for one grammeme type to ``<result_path>/<key>.xml``.

    Each entry of *items* becomes a ``<T>`` element under a ``<Tests>`` root.
    Attribute ``x`` is the entry's ``'src'`` value; attribute ``y`` is the
    ``;``-joined ``cls_dic`` values for every position where the entry's
    ``'y'`` equals 1.

    The elements are created with lxml's ``etree`` but serialized through the
    standard-library ``ElementTree`` wrapper, as imported at the top of the
    file.

    Args:
        items: iterable of mappings with ``'src'`` and ``'y'`` keys
            (``'y'`` is presumably a NumPy array -- it is compared to 1
            element-wise).
        key: stem of the output file name.
        cls_dic: mapping from a position in ``'y'`` to the string written out.
        result_path: directory that receives the XML file.
    """
    tests_el = etree.Element('Tests')
    for sample in items:
        active_positions = np.argwhere(sample['y'] == 1).ravel()
        joined = ';'.join(cls_dic[pos] for pos in active_positions)

        case_el = etree.Element("T")
        for attr, value in (('x', sample['src']), ('y', joined)):
            case_el.set(attr, value)
        tests_el.append(case_el)

    target = os.path.join(result_path, f'{key}.xml')
    with open(target, 'wb+') as out:
        ElementTree(tests_el).write(out, xml_declaration=True, encoding='utf-8')
def release_main_tests(items, result_path, y_is_index=True):
    """Write main-classification test cases to ``<result_path>/main.xml``.

    Each entry of *items* becomes a ``<T>`` element whose ``x`` attribute is
    the entry's ``'src'`` and whose ``y`` attribute is a ``;``-joined list of
    class indices.

    Args:
        items: iterable of mappings with ``'src'`` and ``'y'`` keys.
        result_path: directory that receives ``main.xml``.
        y_is_index: when true, ``'y'`` is iterated as-is and each element is
            stringified; when false, ``'y'`` is compared to 1 element-wise
            and the matching positions are written instead.
    """
    tests_el = etree.Element('Tests')
    for sample in items:
        if y_is_index:
            labels = sample['y']
        else:
            labels = np.argwhere(sample['y'] == 1).ravel()
        joined = ';'.join(map(str, labels))

        case_el = etree.Element("T")
        for attr, value in (('x', sample['src']), ('y', joined)):
            case_el.set(attr, value)
        tests_el.append(case_el)

    target = os.path.join(result_path, 'main.xml')
    with open(target, 'wb+') as out:
        ElementTree(tests_el).write(out, xml_declaration=True, encoding='utf-8')
def release_lemma_tests(items, result_path):
    """Write lemmatization test cases to ``<result_path>/lemma.xml``.

    Each entry of *items* becomes a ``<T>`` element with three attributes, set
    in this order: ``x`` (the entry's ``'x_src'``), ``x_c`` (``'main_cls'``
    converted with ``str``) and ``y`` (``'y_src'``).

    Args:
        items: iterable of mappings with ``'x_src'``, ``'main_cls'`` and
            ``'y_src'`` keys.
        result_path: directory that receives ``lemma.xml``.
    """
    tests_el = etree.Element('Tests')
    for sample in items:
        attributes = (
            ('x', sample['x_src']),
            ('x_c', str(sample['main_cls'])),
            ('y', sample['y_src']),
        )
        case_el = etree.Element("T")
        for attr, value in attributes:
            case_el.set(attr, value)
        tests_el.append(case_el)

    target = os.path.join(result_path, 'lemma.xml')
    with open(target, 'wb+') as out:
        ElementTree(tests_el).write(out, xml_declaration=True, encoding='utf-8')
def release_inflect_tests(items, result_path):
    """Write inflection test cases to ``<result_path>/inflect.xml``.

    Each entry of *items* becomes a ``<T>`` element with four attributes, set
    in this order: ``x`` (``'x_src'``), ``x_c`` (``'x_cls'`` as a string),
    ``y`` (``'y_src'``) and ``y_c`` (``'y_cls'`` as a string). Any other keys
    on the entry (callers in this file also pass ``'id'``) are not written.

    Args:
        items: iterable of mappings with ``'x_src'``, ``'x_cls'``,
            ``'y_src'`` and ``'y_cls'`` keys.
        result_path: directory that receives ``inflect.xml``.
    """
    tests_el = etree.Element('Tests')
    for sample in items:
        attributes = (
            ('x', sample['x_src']),
            ('x_c', str(sample['x_cls'])),
            ('y', sample['y_src']),
            ('y_c', str(sample['y_cls'])),
        )
        case_el = etree.Element("T")
        for attr, value in attributes:
            case_el.set(attr, value)
        tests_el.append(case_el)

    target = os.path.join(result_path, 'inflect.xml')
    with open(target, 'wb+') as out:
        ElementTree(tests_el).write(out, xml_declaration=True, encoding='utf-8')
def release_nn_tests():
    """Export the ``'test'`` split of every dataset into ``ROOT/Network``.

    One ``<gram>.xml`` file is written per entry of ``GRAM_TYPES``, followed
    by ``main.xml``, ``lemma.xml`` and ``inflect.xml``. The main dataset is
    written with ``y_is_index=False``.
    """
    out_dir = os.path.join(ROOT, 'Network')

    for gram_name, gram_cfg in GRAM_TYPES.items():
        samples = load_datasets(gram_name, 'test')
        # Invert the class table: configured 'index' -> class name.
        index_to_name = {
            spec['index']: cls_name
            for cls_name, spec in gram_cfg['classes'].items()
        }
        release_gram_tests(samples, gram_name, index_to_name, out_dir)

    main_samples = load_datasets('main', 'test')
    release_main_tests(main_samples, out_dir, False)

    lemma_samples = load_datasets('lemma', 'test')
    release_lemma_tests(lemma_samples, out_dir)

    inflect_samples = load_datasets('inflect', 'test')
    release_inflect_tests(inflect_samples, out_dir)
def merge_same_main(items):
    """Collapse entries that share a source text into one entry per text.

    Args:
        items: iterable of mappings with ``'src'`` and ``'y'`` keys.

    Returns:
        A list with one ``dict(src=..., y=[...])`` per distinct ``'src'``, in
        first-seen order. Each ``y`` list holds every ``'y'`` value recorded
        for that text, in input order (duplicates are kept).
    """
    ys_by_src = {}
    for item in items:
        # setdefault creates the bucket on first sight of a text, so only the
        # 'y' values need to be stored and a single pass is enough.
        ys_by_src.setdefault(item['src'], []).append(item['y'])
    return [dict(src=src, y=ys) for src, ys in ys_by_src.items()]
def release_dictionary_tests():
    """Build main/lemma/inflect test files from the pickled dictionary words.

    Words are loaded from ``DICT_WORDS_PATH``, entries whose ``'post'`` is
    ``'numb'`` are dropped, and the rest are grouped by ``'id'``. The results
    are written as ``main.xml``, ``lemma.xml`` and ``inflect.xml`` under
    ``ROOT/Dict``.
    """
    out_dir = os.path.join(ROOT, 'Dict')
    with open(DICT_WORDS_PATH, 'rb') as src:
        words = pickle.load(src)

    # Group the word forms by lexeme id, leaving out 'numb' entries.
    by_lexeme = {}
    for form in words:
        if form['post'] != 'numb':
            by_lexeme.setdefault(form['id'], []).append(form)

    main = []
    inflect = []
    lemmas = []
    for lexeme_id, forms in by_lexeme.items():
        main.extend(dict(src=form['text'], y=form['main']) for form in forms)

        for form in forms:
            # A form is its own lemma when its class is flagged in
            # is_lemma_dict or when it carries no explicit 'lemma' entry.
            if is_lemma_dict[form['main']] or 'lemma' not in form:
                lemma_text = form['text']
            else:
                lemma_text = form['lemma']
            lemmas.append(dict(
                x_src=form['text'],
                main_cls=form['main'],
                y_src=lemma_text
            ))

        # Keep one form per main class: the one with the smallest 'index'.
        seen_classes = set()
        unique_forms = []
        for form in sorted(forms, key=lambda w: w['index']):
            if form['main'] not in seen_classes:
                seen_classes.add(form['main'])
                unique_forms.append(form)

        # NOTE(review): the bounds below never use the last unique form as a
        # target, nor the last two as a source, and only pair a form with
        # later ones. Kept exactly as found -- confirm whether this is
        # intended sampling or an off-by-one.
        count = len(unique_forms)
        for i in range(count - 2):
            source = unique_forms[i]
            for target in unique_forms[i:count - 1]:
                if target['main'] == source['main']:
                    continue
                inflect.append(dict(
                    x_src=source['text'],
                    x_cls=source['main'],
                    y_src=target['text'],
                    y_cls=target['main'],
                    id=lexeme_id
                ))

    main = merge_same_main(main)
    release_main_tests(main, out_dir)
    release_lemma_tests(lemmas, out_dir)
    release_inflect_tests(inflect, out_dir)
def release_numb_tests():
    """Build main/lemma/inflect test files from the numeral data.

    Walks ``numb_data['numbers']`` (value -> type -> list of
    ``(text, class_index)`` pairs), skipping each value's ``'nar_end'`` and
    ``'lemma'`` entries, and writes ``main.xml``, ``lemma.xml`` and
    ``inflect.xml`` under ``ROOT/Numb``.
    """
    res_path = os.path.join(ROOT, 'Numb')
    main = []
    inflect = []
    lemmas = []
    for val, forms_by_type in numb_data['numbers'].items():
        for tp, items in forms_by_type.items():
            if tp in ('nar_end', 'lemma'):
                continue

            # The text of the first pair is used as the lemma for every form
            # of this type.
            lemma, _ = items[0]
            for text, index in items:
                main.append(dict(src=text, y=index))
                lemmas.append(dict(
                    x_src=text,
                    main_cls=index,
                    y_src=lemma
                ))

            # Keep only the first pair seen for each class index.
            seen_indices = []
            unique_items = []
            for item in items:
                if item[1] in seen_indices:
                    continue
                seen_indices.append(item[1])
                unique_items.append(item)

            # NOTE(review): these bounds never use the last unique pair as a
            # target, nor the last two as a source, and they do emit
            # self-pairs (j == i). Kept as found so the generated file is
            # unchanged -- confirm whether this is intended.
            count = len(unique_items)
            for i in range(count - 2):
                main_text, main_index = unique_items[i]
                for to_text, to_index in unique_items[i:count - 1]:
                    inflect.append(dict(
                        x_src=main_text,
                        x_cls=main_index,
                        y_src=to_text,
                        y_cls=to_index,
                        id=f"{val}{tp}"
                    ))

    main = merge_same_main(main)
    release_main_tests(main, res_path)
    release_lemma_tests(lemmas, res_path)
    release_inflect_tests(inflect, res_path)
def release_nar_numb_tests():
    """Build main/lemma/inflect test files for ``"<value>-<suffix>"`` forms.

    For each value in ``numb_data['numbers']`` the ``'nar_end'`` mapping
    (class index -> string) is expanded into texts of the form
    ``f"{val}-{string}"``. The lemma text uses the string stored under the
    class index of the first pair in the value's ``'p'`` list. Results are
    written as ``main.xml``, ``lemma.xml`` and ``inflect.xml`` under
    ``ROOT/NarNumb``.
    """
    res_path = os.path.join(ROOT, 'NarNumb')
    main = []
    inflect = []
    lemmas = []
    for val, entry in numb_data['numbers'].items():
        items = entry['nar_end']
        lemma_id = entry['p'][0][1]
        lemma = f"{val}-{items[lemma_id]}"

        for index in items:
            text = f"{val}-{items[index]}"
            main.append(dict(src=text, y=index))
            lemmas.append(dict(
                x_src=text,
                main_cls=index,
                y_src=lemma
            ))

        # NOTE(review): these bounds never use the last class index as a
        # target, nor the last two as a source, and they do emit self-pairs
        # (j == i). Kept as found so the generated file is unchanged --
        # confirm whether this is intended.
        ids = list(items.keys())
        count = len(ids)
        for i in range(count - 2):
            main_index = ids[i]
            main_text = f"{val}-{items[main_index]}"
            for to_index in ids[i:count - 1]:
                to_text = f"{val}-{items[to_index]}"
                inflect.append(dict(
                    x_src=main_text,
                    x_cls=main_index,
                    y_src=to_text,
                    y_cls=to_index,
                    id=f"{val}nar"
                ))

    main = merge_same_main(main)
    release_main_tests(main, res_path)
    release_lemma_tests(lemmas, res_path)
    release_inflect_tests(inflect, res_path)
def release_reg_tests():
    """Build main/lemma/inflect test files for punctuation, integers, Roman
    numerals and unknown Latin-script words.

    The class index for each category is taken from the ``'i'`` entry of the
    tag whose key contains ``'int'``, ``'romn'``, ``'unkn'`` or ``'punct'``
    (checked in that order; a later matching tag overwrites an earlier one).
    Every sample is its own lemma and its own inflection target. Files are
    written under ``ROOT/Reg``.

    Random values come from the shared ``RANDOM`` object, so the output
    depends on its state when this function is called.
    """
    out_dir = os.path.join(ROOT, 'Reg')

    # Resolve the four class indices; the first matching marker per tag wins.
    tag_ids = dict.fromkeys(('int', 'romn', 'unkn', 'punct'))
    for tag in tags:
        for marker in tag_ids:
            if marker in tag:
                tag_ids[marker] = tags[tag]['i']
                break
    int_tag = tag_ids['int']
    romn_tag = tag_ids['romn']
    unkn_tag = tag_ids['unkn']
    punct_tag = tag_ids['punct']

    main = []

    # Each punctuation mark on its own, then the same mark followed by a
    # random run of further marks.
    puncts = ['.', ',', '?', '!', '_', '"', '(', ')', ':', ';', '-']
    last_pos = len(puncts) - 1
    for mark in puncts:
        main.append(dict(src=mark, y=punct_tag))
        extra = RANDOM.randint(2, 5) - 1
        run = mark + ''.join(
            puncts[RANDOM.randint(0, last_pos)] for _ in range(extra)
        )
        main.append(dict(src=run, y=punct_tag))

    # One hundred random integers rendered as decimal strings.
    for _ in range(100):
        number = RANDOM.randint(0, 1000000)
        main.append(dict(src=str(number), y=int_tag))

    # NOTE(review): 'iv' is listed twice, so it is emitted twice -- confirm
    # whether another numeral was intended.
    roms = ['i', 'iii', 'iv', 'iv', 'c', 'd', 'm', 'md', 'mi']
    unkn = ['test', 'sdasdas', 'home']
    # Every literal is emitted as written and again in upper case.
    for literals, tag_id in ((roms, romn_tag), (unkn, unkn_tag)):
        for literal in literals:
            main.append(dict(src=literal, y=tag_id))
            main.append(dict(src=literal.upper(), y=tag_id))

    # Lemma and inflect cases are derived before `main` is merged, so they
    # contain one entry per generated sample (duplicates included).
    lemmas = [
        dict(x_src=sample['src'], main_cls=sample['y'], y_src=sample['src'])
        for sample in main
    ]
    inflect = [
        dict(
            x_src=sample['src'],
            x_cls=sample['y'],
            y_src=sample['src'],
            y_cls=sample['y'],
            id=sample['src'],
        )
        for sample in main
    ]

    main = merge_same_main(main)
    release_main_tests(main, out_dir)
    release_lemma_tests(lemmas, out_dir)
    release_inflect_tests(inflect, out_dir)
# Script entry: every generator below runs unconditionally when this file is
# executed (there is no ``__main__`` guard).
# release_reg_tests() is the only routine in this file that draws from RANDOM
# directly, so its output depends on RANDOM's state at this point.
release_reg_tests()
# Numeral-based suites; both read the module-level numb_data.
release_nar_numb_tests()
release_numb_tests()
# Dictionary-based suite; reads the pickle at DICT_WORDS_PATH.
release_dictionary_tests()
# Network suites; read the 'test' datasets through load_datasets.
release_nn_tests()