| import yaml | |
| import pickle | |
| from utils import CONFIG, create_cls_tuple | |
# Locations read from the shared project configuration.
VECT_PATH = CONFIG['vect_words_path']
CLS_CLASSES_PATH = CONFIG['cls_classes_path']
NMB_CLASSES_PATH = CONFIG['numb_classes_path']
NMB_DATA_PATH = CONFIG['numb_data_path']

# Russian consonant ("soglasnye") and vowel ("glasnye") letters, used by
# get_nar_end() to decide how many trailing characters form a word ending.
SOGL_CHARS = [
    'б', 'в', 'г', 'д', 'ж', 'з', 'й', 'к', 'л', 'м', 'н',
    'п', 'р', 'с', 'т', 'ф', 'х', 'ц', 'ч', 'ш', 'щ',
]
GLASN_CHARS = ['а', 'о', 'и', 'е', 'ё', 'э', 'ы', 'у', 'ю', 'я']

# New numeral classes are numbered starting right after the classes that are
# already stored: the counter begins at len(existing classes) + 1.
# NOTE: pickle.load can execute arbitrary code embedded in the file; this is
# only acceptable because the file is expected to be a locally generated,
# trusted project artifact.
with open(CLS_CLASSES_PATH, 'rb') as f:
    cur_classes_count = len(pickle.load(f)) + 1

# The file is opened in binary mode so that PyYAML decodes it according to the
# YAML specification (UTF-8, or UTF-16 with a BOM) instead of the platform
# locale encoding, which would corrupt Cyrillic text on some systems.
# safe_load is used because yaml.load() without an explicit Loader is
# deprecated since PyYAML 5.1 and raises TypeError since PyYAML 6.0; the
# source is expected to contain plain mappings, lists and scalars only.
with open('numb.yml', 'rb') as f:
    numbr_src_dic = yaml.safe_load(f)
def get_nar_end(text):
    """Return the trailing ending of *text*.

    The ending is the last two characters when the word ends in a consonant
    (a member of SOGL_CHARS) followed by a vowel (a member of GLASN_CHARS);
    otherwise it is the last character only.
    Presumably this is the suffix appended to a number written with digits
    (e.g. "2-го") - TODO confirm against the consumer of 'nar_end'.

    Args:
        text: The word form to inspect.

    Returns:
        The one- or two-character ending. A one-character *text* is returned
        unchanged and an empty *text* yields an empty string, instead of
        raising IndexError.
    """
    # The length guard prevents text[-2] from raising on one-character words.
    if len(text) >= 2 and text[-2] in SOGL_CHARS and text[-1] in GLASN_CHARS:
        return text[-2:]
    # Slicing (rather than indexing) keeps the empty string safe as well.
    return text[-1:]
# Class ids assigned to the first form listed under each type of each number.
lemma_cls_ids = set()
# number key -> {'nar_end': {class id: ending}, <type>: [(text, class id), ...]}
res_dict = {}
# class tuple (as built by create_cls_tuple) -> numeric class id
numb_cls_dict = {}

for n_key, forms_by_type in numbr_src_dic.items():
    n_key_data = {'nar_end': {}}
    res_dict[n_key] = n_key_data

    for t, forms in forms_by_type.items():
        for index, item in enumerate(forms):
            # Tag the source entry before building its class tuple; the tag is
            # written into the loaded YAML data itself.
            item['post'] = 'numb'
            cls_tpl = create_cls_tuple(item)

            # Allocate a fresh id the first time a class tuple is seen.
            if cls_tpl not in numb_cls_dict:
                numb_cls_dict[cls_tpl] = cur_classes_count
                cur_classes_count += 1
            cur_class = numb_cls_dict[cls_tpl]

            # The first form of every type counts as a lemma form. The set
            # deduplicates on its own, so no membership check is needed (the
            # previous check compared a class tuple against a set of ids and
            # therefore never filtered anything).
            if index == 0:
                lemma_cls_ids.add(cur_class)

            n_key_data.setdefault(t, []).append((item['text'], cur_class))

            # For type 'p' remember the ending of the first form seen for each
            # class (presumably 'p' marks ordinal forms - TODO confirm).
            if t == 'p' and cur_class not in n_key_data['nar_end']:
                n_key_data['nar_end'][cur_class] = get_nar_end(item['text'])
# Build one named alternation group per number key and combine all groups
# into a single anchored pattern that matches a sequence of number words.
# NOTE(review): the "(?<name>...)" named-group syntax is not accepted by the
# standard-library `re` module (which requires "(?P<name>...)"); presumably
# the consumer compiles this with the third-party `regex` package - confirm.
group_patterns = []
for val, forms_by_key in res_dict.items():
    words = set()
    for key, tuples in forms_by_key.items():
        # These entries do not hold (text, class id) lists.
        if key in ('nar_end', 'lemma'):
            continue
        words.update(tpl[0] for tpl in tuples)

    # Sort so that the generated pattern is identical on every run (plain set
    # iteration order depends on string hash randomisation). Longer words come
    # first so an alternative is never pre-empted by its own prefix.
    alternatives = '|'.join(sorted(words, key=lambda word: (-len(word), word)))
    group_patterns.append(f'(?<_{val}>{alternatives})')

# Groups are emitted in reverse order of the keys in res_dict.
regex = '|'.join(reversed(group_patterns))
regex = f"^({regex})+$"
# Store the class-tuple -> class-id table.
with open(NMB_CLASSES_PATH, 'wb+') as classes_file:
    pickle.dump(numb_cls_dict, classes_file)

# Store the combined pattern, the lemma class ids and the per-number data
# together in a single pickled dictionary.
numbers_payload = {
    'regex': regex,
    'lemma_cls_ids': list(lemma_cls_ids),
    'numbers': res_dict,
}
with open(NMB_DATA_PATH, 'wb+') as data_file:
    pickle.dump(numbers_payload, data_file)