| import os | |
| import pickle | |
| import numpy as np | |
| from utils import CONFIG, RANDOM, load_datasets | |
| from lxml import etree | |
| from xml.etree.ElementTree import ElementTree | |
# Shortcuts to the configuration entries used throughout this script.
GRAM_TYPES = CONFIG['grammemes_types']
ROOT = CONFIG['publish_tests_path']
DICT_WORDS_PATH = CONFIG['dict_words_path']
NMB_DATA_PATH = CONFIG['numb_data_path']

# NOTE(review): both files are unpickled as-is; pickle can execute arbitrary
# code on load, so these paths must point to trusted files.
with open(NMB_DATA_PATH, 'rb') as f:
    numb_data = pickle.load(f)

with open(CONFIG['tags_path'], 'rb') as f:
    tags = pickle.load(f)

# Lookup tables keyed by each tag's 'i' value.
# Presumably 'i' is the tag index, 'l' an "is lemma form" flag and 'o' an
# ordering key -- confirm against the code that builds the tags pickle.
is_lemma_dict = {info['i']: info['l'] for info in tags.values()}
tag_index_order = {info['i']: info['o'] for info in tags.values()}
def release_gram_tests(items, key, cls_dic, result_path):
    """Write the test set of one grammeme classifier to ``<result_path>/<key>.xml``.

    Every item becomes a ``<T>`` element under a ``<Tests>`` root. Its ``x``
    attribute is the item's ``src`` text and its ``y`` attribute is the
    ``;``-joined class names of all positions where the item's ``y`` equals 1.

    Args:
        items: Iterable of mappings with a ``src`` string and a ``y`` value
            that is compared element-wise against 1 (expected to be a NumPy
            array).
        key: Base name of the output file, without the ``.xml`` extension.
        cls_dic: Mapping from a position in ``y`` to the class name to emit.
        result_path: Output directory; it is created when missing.
    """
    root = etree.Element('Tests')
    for word in items:
        active = np.argwhere(word['y'] == 1).ravel()
        test = etree.Element("T")
        test.set('x', word['src'])
        test.set('y', ';'.join(cls_dic[index] for index in active))
        root.append(test)

    # Without this, a missing output directory aborted the export with
    # FileNotFoundError when the file was opened.
    os.makedirs(result_path, exist_ok=True)
    rez_path = os.path.join(result_path, f'{key}.xml')
    with open(rez_path, 'wb') as f:
        ElementTree(root).write(f, xml_declaration=True, encoding='utf-8')
def release_main_tests(items, result_path, y_is_index=True):
    """Write the main-classifier test set to ``<result_path>/main.xml``.

    Every item becomes a ``<T>`` element under a ``<Tests>`` root. Its ``x``
    attribute is the item's ``src`` text and its ``y`` attribute is the
    ``;``-joined list of class indices.

    Args:
        items: Iterable of mappings with ``src`` and ``y`` entries.
        result_path: Output directory; it is created when missing.
        y_is_index: When true, ``y`` is already an iterable of class indices.
            Otherwise ``y`` is compared element-wise against 1 (expected to
            be a NumPy array) and the matching positions are used.
    """
    root = etree.Element('Tests')
    for word in items:
        if y_is_index:
            indices = word['y']
        else:
            indices = np.argwhere(word['y'] == 1).ravel()
        test = etree.Element("T")
        test.set('x', word['src'])
        test.set('y', ';'.join(str(index) for index in indices))
        root.append(test)

    # Without this, a missing output directory aborted the export with
    # FileNotFoundError when the file was opened.
    os.makedirs(result_path, exist_ok=True)
    rez_path = os.path.join(result_path, 'main.xml')
    with open(rez_path, 'wb') as f:
        ElementTree(root).write(f, xml_declaration=True, encoding='utf-8')
def release_lemma_tests(items, result_path):
    """Write the lemmatisation test set to ``<result_path>/lemma.xml``.

    Every item becomes a ``<T>`` element under a ``<Tests>`` root with the
    attributes ``x`` (from ``x_src``), ``x_c`` (``main_cls`` converted with
    ``str``) and ``y`` (from ``y_src``).

    Args:
        items: Iterable of mappings with ``x_src``, ``main_cls`` and
            ``y_src`` entries.
        result_path: Output directory; it is created when missing.
    """
    root = etree.Element('Tests')
    for word in items:
        test = etree.Element("T")
        test.set('x', word['x_src'])
        test.set('x_c', str(word['main_cls']))
        test.set('y', word['y_src'])
        root.append(test)

    # Without this, a missing output directory aborted the export with
    # FileNotFoundError when the file was opened.
    os.makedirs(result_path, exist_ok=True)
    rez_path = os.path.join(result_path, 'lemma.xml')
    with open(rez_path, 'wb') as f:
        ElementTree(root).write(f, xml_declaration=True, encoding='utf-8')
def release_inflect_tests(items, result_path):
    """Write the inflection test set to ``<result_path>/inflect.xml``.

    Every item becomes a ``<T>`` element under a ``<Tests>`` root with the
    attributes ``x``/``x_c`` (source text and its class) and ``y``/``y_c``
    (target text and its class). Classes are converted with ``str``.

    Args:
        items: Iterable of mappings with ``x_src``, ``x_cls``, ``y_src`` and
            ``y_cls`` entries; any other entries are ignored.
        result_path: Output directory; it is created when missing.
    """
    root = etree.Element('Tests')
    for word in items:
        test = etree.Element("T")
        test.set('x', word['x_src'])
        test.set('x_c', str(word['x_cls']))
        test.set('y', word['y_src'])
        test.set('y_c', str(word['y_cls']))
        root.append(test)

    # Without this, a missing output directory aborted the export with
    # FileNotFoundError when the file was opened.
    os.makedirs(result_path, exist_ok=True)
    rez_path = os.path.join(result_path, 'inflect.xml')
    with open(rez_path, 'wb') as f:
        ElementTree(root).write(f, xml_declaration=True, encoding='utf-8')
def release_nn_tests():
    """Publish the network test sets into the ``Network`` folder under ROOT.

    One file per grammeme type is written from that type's ``test`` dataset,
    followed by the ``main``, ``lemma`` and ``inflect`` test files.
    """
    out_dir = os.path.join(ROOT, 'Network')

    for gram_name in GRAM_TYPES:
        dataset = load_datasets(gram_name, 'test')
        classes = GRAM_TYPES[gram_name]['classes']
        # Invert the class table: configured index -> class name.
        index_to_name = {}
        for class_name in classes:
            index_to_name[classes[class_name]['index']] = class_name
        release_gram_tests(dataset, gram_name, index_to_name, out_dir)

    # The main dataset is passed with y_is_index=False, so its y values are
    # converted to indices by the writer.
    release_main_tests(load_datasets('main', 'test'), out_dir, y_is_index=False)
    release_lemma_tests(load_datasets('lemma', 'test'), out_dir)
    release_inflect_tests(load_datasets('inflect', 'test'), out_dir)
def merge_same_main(items):
    """Group items by their ``src`` text and collect the ``y`` value of each.

    Args:
        items: Iterable of mappings with ``src`` and ``y`` entries.

    Returns:
        A list with one ``{'src': text, 'y': [..]}`` dict per distinct
        ``src``, in order of first appearance. ``y`` holds the values of all
        items sharing that text, in input order; repeated values are kept.
    """
    classes_by_text = {}
    for entry in items:
        classes_by_text.setdefault(entry['src'], []).append(entry['y'])
    return [dict(src=text, y=ys) for text, ys in classes_by_text.items()]
def release_dictionary_tests():
    """Publish main, lemma and inflect tests built from the dictionary words.

    Words are loaded from ``DICT_WORDS_PATH`` and grouped by their ``id``
    (one group per lexeme); words whose ``post`` is ``'numb'`` are skipped.
    The three test files are written into the ``Dict`` folder under ROOT.

    For every lexeme:
      * each word adds a main test (text -> ``main`` class);
      * each word adds a lemma test; the expected lemma is the word's own
        text when ``is_lemma_dict`` marks its class or when the word has no
        ``lemma`` entry, otherwise the word's ``lemma`` value;
      * the forms are ordered by ``index``, reduced to the first form of
        each ``main`` class, and every ordered pair (earlier -> later) of
        the remaining forms adds an inflect test.
    """
    res_path = os.path.join(ROOT, 'Dict')
    # NOTE(review): pickle can execute arbitrary code on load; the words file
    # must come from a trusted source.
    with open(DICT_WORDS_PATH, 'rb') as f:
        words = pickle.load(f)

    lexeme_dict = {}
    for word in words:
        if word['post'] == 'numb':
            continue
        lexeme_dict.setdefault(word['id'], []).append(word)

    main = []
    inflect = []
    lemmas = []
    for word_id, lexeme_words in lexeme_dict.items():
        for word in lexeme_words:
            main.append(dict(src=word['text'], y=word['main']))

            lemma = word['lemma'] if 'lemma' in word else word['text']
            if is_lemma_dict[word['main']]:
                lemma = word['text']
            lemmas.append(dict(
                x_src=word['text'],
                main_cls=word['main'],
                y_src=lemma
            ))

        # Keep only the first form (by 'index') of every main class.
        seen_classes = set()
        forms = []
        for item in sorted(lexeme_words, key=lambda x: x['index']):
            if item['main'] in seen_classes:
                continue
            seen_classes.add(item['main'])
            forms.append(item)

        # Pair every form with each later one. The previous bounds,
        # range(0, n - 2) and range(i, n - 1), treated range()'s stop as
        # inclusive: the last form never took part in a pair and lexemes with
        # two distinct forms produced no inflect tests at all.
        # Classes in `forms` are unique, so starting at i + 1 replaces the old
        # "same class -> skip" check.
        for i, main_word in enumerate(forms[:-1]):
            for to_word in forms[i + 1:]:
                inflect.append(dict(
                    x_src=main_word['text'],
                    x_cls=main_word['main'],
                    y_src=to_word['text'],
                    y_cls=to_word['main'],
                    id=word_id
                ))

    main = merge_same_main(main)
    release_main_tests(main, res_path)
    release_lemma_tests(lemmas, res_path)
    release_inflect_tests(inflect, res_path)
def release_numb_tests():
    """Publish main, lemma and inflect tests for the numeral forms.

    Reads ``numb_data['numbers']``: for every number value and every form
    group of that value (the ``'nar_end'`` and ``'lemma'`` groups are
    skipped) the group's ``(text, class index)`` pairs are turned into tests.
    The three test files are written into the ``Numb`` folder under ROOT.

    For every group:
      * each pair adds a main test (text -> class index);
      * each pair adds a lemma test whose expected lemma is the text of the
        group's first pair;
      * the pairs are reduced to the first one per class index and each
        remaining form, except the last, is paired with itself and with every
        later form to add inflect tests.
    """
    res_path = os.path.join(ROOT, 'Numb')
    main = []
    inflect = []
    lemmas = []
    for val in numb_data['numbers']:
        groups = numb_data['numbers'][val]
        for tp in groups:
            if tp == 'nar_end' or tp == 'lemma':
                continue
            items = groups[tp]

            lemma, _ = items[0]
            for text, index in items:
                main.append(dict(src=text, y=index))
                lemmas.append(dict(
                    x_src=text,
                    main_cls=index,
                    y_src=lemma
                ))

            # Keep only the first form of every class index.
            seen_classes = []
            forms = []
            for item in items:
                if item[1] in seen_classes:
                    continue
                seen_classes.append(item[1])
                forms.append(item)

            # The previous bounds, range(0, n - 2) and range(i, n - 1),
            # treated range()'s stop as inclusive, so the last form was never
            # used and groups with two forms produced no inflect tests.
            # j still starts at i: a form is also paired with itself, as
            # before.
            count = len(forms)
            for i in range(count - 1):
                main_text, main_index = forms[i]
                for j in range(i, count):
                    to_text, to_index = forms[j]
                    inflect.append(dict(
                        x_src=main_text,
                        x_cls=main_index,
                        y_src=to_text,
                        y_cls=to_index,
                        id=f"{val}{tp}"
                    ))

    main = merge_same_main(main)
    release_main_tests(main, res_path)
    release_lemma_tests(lemmas, res_path)
    release_inflect_tests(inflect, res_path)
def release_nar_numb_tests():
    """Publish main, lemma and inflect tests for ``<number>-<ending>`` forms.

    For every value in ``numb_data['numbers']`` the ``'nar_end'`` mapping
    (class index -> ending) is combined with the value into texts of the form
    ``"{val}-{ending}"``. The three test files are written into the
    ``NarNumb`` folder under ROOT.

    For every value:
      * each ending adds a main test (text -> class index);
      * each ending adds a lemma test; the expected lemma uses the ending
        stored under the class index taken from the first pair of the
        value's ``'p'`` group;
      * each class index, except the last, is paired with itself and with
        every later index to add inflect tests.
    """
    res_path = os.path.join(ROOT, 'NarNumb')
    main = []
    inflect = []
    lemmas = []
    for val in numb_data['numbers']:
        number = numb_data['numbers'][val]
        items = number['nar_end']
        lemma_id = number['p'][0][1]
        lemma = f"{val}-{items[lemma_id]}"

        for index in items:
            text = f"{val}-{items[index]}"
            main.append(dict(src=text, y=index))
            lemmas.append(dict(
                x_src=text,
                main_cls=index,
                y_src=lemma
            ))

        # The previous bounds, range(0, n - 2) and range(i, n - 1), treated
        # range()'s stop as inclusive, so the last ending was never used and
        # values with two endings produced no inflect tests.
        # j still starts at i: an ending is also paired with itself, as
        # before.
        ids = list(items.keys())
        count = len(ids)
        for i in range(count - 1):
            main_index = ids[i]
            main_text = f"{val}-{items[main_index]}"
            for j in range(i, count):
                to_index = ids[j]
                to_text = f"{val}-{items[to_index]}"
                inflect.append(dict(
                    x_src=main_text,
                    x_cls=main_index,
                    y_src=to_text,
                    y_cls=to_index,
                    id=f"{val}nar"
                ))

    main = merge_same_main(main)
    release_main_tests(main, res_path)
    release_lemma_tests(lemmas, res_path)
    release_inflect_tests(inflect, res_path)
def release_reg_tests():
    """Publish tests for punctuation, integer, roman-numeral and unknown tokens.

    The class of each token kind is the ``'i'`` value of the last tag whose
    key contains the matching marker (``int``, ``romn``, ``unkn``,
    ``punct``); it stays ``None`` when no tag matches. Samples are partly
    drawn from ``RANDOM``. The lemma and inflect tests map every sample onto
    itself. All three files are written into the ``Reg`` folder under ROOT.
    """
    res_path = os.path.join(ROOT, 'Reg')

    # Markers are tried in this order and the first one found in a tag key
    # wins for that tag; later tags overwrite earlier matches.
    tag_by_marker = dict.fromkeys(('int', 'romn', 'unkn', 'punct'))
    for tag in tags:
        for marker in tag_by_marker:
            if marker in tag:
                tag_by_marker[marker] = tags[tag]['i']
                break
    int_tag = tag_by_marker['int']
    romn_tag = tag_by_marker['romn']
    unkn_tag = tag_by_marker['unkn']
    punct_tag = tag_by_marker['punct']

    main = []

    # Each punctuation mark on its own, then the same mark followed by
    # randomly picked marks.
    puncts = ['.', ',', '?', '!', '_', '"', '(', ')', ':', ';', '-']
    for p in puncts:
        main.append(dict(src=p, y=punct_tag))
        upper_bound = RANDOM.randint(2, 5)
        tail = [
            puncts[RANDOM.randint(0, len(puncts)-1)]
            for _ in range(1, upper_bound)
        ]
        main.append(dict(src=p + ''.join(tail), y=punct_tag))

    # One hundred random integers rendered as text.
    for _ in range(100):
        main.append(dict(src=str(RANDOM.randint(0, 1000000)), y=int_tag))

    # NOTE(review): 'iv' is listed twice, so its samples are emitted twice and
    # merge_same_main collects the same class twice for it -- confirm whether
    # another numeral was intended.
    roms = ['i', 'iii', 'iv', 'iv', 'c', 'd', 'm', 'md', 'mi']
    unkn = ['test', 'sdasdas', 'home']
    for samples, cls in ((roms, romn_tag), (unkn, unkn_tag)):
        for sample in samples:
            main.append(dict(src=sample, y=cls))
            main.append(dict(src=sample.upper(), y=cls))

    # Every sample is its own lemma and inflects to itself.
    lemmas = []
    inflect = []
    for item in main:
        src, cls = item['src'], item['y']
        lemmas.append(dict(x_src=src, main_cls=cls, y_src=src))
        inflect.append(dict(x_src=src, x_cls=cls, y_src=src, y_cls=cls, id=src))

    main = merge_same_main(main)
    release_main_tests(main, res_path)
    release_lemma_tests(lemmas, res_path)
    release_inflect_tests(inflect, res_path)
# Regenerate every published test suite as soon as the module is executed.
# The order of the calls is the same as before.
for _release_suite in (
    release_reg_tests,
    release_nar_numb_tests,
    release_numb_tests,
    release_dictionary_tests,
    release_nn_tests,
):
    _release_suite()