# DeepMorphy / 7_build_numbers.py
# niobures's picture
# DeepMorphy
# 0240c6e verified
import yaml
import pickle
from utils import CONFIG, create_cls_tuple
# File locations taken from the shared project CONFIG mapping.
# CLS_CLASSES_PATH is read as a pickle below; NMB_CLASSES_PATH and
# NMB_DATA_PATH are the pickle files this script writes at the end.
# VECT_PATH is not referenced in the part of the script visible here.
VECT_PATH, CLS_CLASSES_PATH, NMB_CLASSES_PATH, NMB_DATA_PATH = (
    CONFIG[config_key]
    for config_key in (
        'vect_words_path',
        'cls_classes_path',
        'numb_classes_path',
        'numb_data_path',
    )
)
# Character classes consulted by get_nar_end(): an ending is widened to two
# characters when a SOGL_CHARS character is followed by a GLASN_CHARS one.
# Each name is a list of single-character strings.
SOGL_CHARS = list('бвгджзйклмнпрстфхцчшщ')
GLASN_CHARS = list('аоиеёэыуюя')
# Ids for the numeral classes created below start at
# len(<object stored in CLS_CLASSES_PATH>) + 1.
with open(CLS_CLASSES_PATH, 'rb') as f:
    # NOTE(review): pickle.load can execute arbitrary code from the file.
    # Presumably this pickle is produced by an earlier step of the same
    # pipeline - confirm it never comes from an untrusted source.
    cur_classes_count = len(pickle.load(f)) + 1

# Source description of the numerals. The file holds Cyrillic text, so the
# encoding is pinned instead of relying on the platform default.
# safe_load is used because yaml.load() without an explicit Loader is
# deprecated (PyYAML 5.1+) and a TypeError on PyYAML 6; it also refuses to
# construct arbitrary Python objects.
with open('numb.yml', encoding='utf-8') as f:
    numbr_src_dic = yaml.safe_load(f)
def get_nar_end(text):
    """Return the ending of *text* that is stored in the ``nar_end`` table.

    The ending is the last character of *text*. It is widened to the last
    two characters when the second-to-last character is in ``SOGL_CHARS``
    and the last character is in ``GLASN_CHARS``.

    Strings shorter than two characters (including the empty string) are
    returned unchanged; previously they raised ``IndexError``.
    """
    # Guard first: text[-2] does not exist for a one-character string and
    # text[-1] does not exist for an empty one.
    if len(text) < 2:
        return text
    if text[-2] in SOGL_CHARS and text[-1] in GLASN_CHARS:
        return text[-2:]
    return text[-1]
# Walk the numerals description and build three structures:
#   numb_cls_dict - value returned by create_cls_tuple(item) -> class id
#   lemma_cls_ids - class ids of the first item listed in every group
#                   (presumably the lemma form - confirm against numb.yml)
#   res_dict      - number key -> {'nar_end': {class id: ending},
#                                  <group key>: [(text, class id), ...]}
lemma_cls_ids = set()
res_dict = {}
numb_cls_dict = {}
for n_key, groups in numbr_src_dic.items():
    n_key_data = {
        'nar_end': {}
    }
    res_dict[n_key] = n_key_data
    for t, group_items in groups.items():
        for index, item in enumerate(group_items):
            # Every numeral item is tagged with the same 'post' value before
            # its class tuple is computed (the source item is mutated).
            item['post'] = 'numb'
            cls_tpl = create_cls_tuple(item)

            # Allocate a fresh class id the first time a class tuple is seen.
            if cls_tpl not in numb_cls_dict:
                numb_cls_dict[cls_tpl] = cur_classes_count
                cur_classes_count += 1
            cur_class = numb_cls_dict[cls_tpl]

            # The original guard tested `cls_tpl not in lemma_cls_ids`, i.e.
            # a class tuple against a set of class ids, so it could not act
            # as a duplicate check. set.add() is already idempotent, so the
            # id is simply added.
            if index == 0:
                lemma_cls_ids.add(cur_class)

            # The group list is created lazily, so a group without items
            # leaves no key in n_key_data.
            n_key_data.setdefault(t, []).append((item['text'], cur_class))

            # Only items of group 'p' contribute an ending; the first item
            # seen for a class id wins.
            if t == 'p' and cur_class not in n_key_data['nar_end']:
                n_key_data['nar_end'][cur_class] = get_nar_end(item['text'])
# Assemble a single anchored pattern "^(<group>|<group>|...)+$" with one
# named group "_<number key>" per entry of res_dict, listing every distinct
# text form of that number.
# NOTE(review): "(?<name>...)" is not the named-group syntax of Python's
# `re` module, so the pattern is presumably consumed by another regex
# engine - confirm before changing the syntax.
regex = []
for val, val_data in res_dict.items():
    forms = set()
    for key, tpls in val_data.items():
        # 'nar_end' maps class ids to endings rather than listing forms;
        # 'lemma' is skipped as well (no such key is written by the code
        # visible in this script).
        if key == 'nar_end' or key == 'lemma':
            continue
        forms.update(tpl[0] for tpl in tpls)

    # Iterating a set of strings directly yields an order that changes with
    # string-hash randomisation, which made the generated pattern differ
    # between runs. Sort instead: longest form first, so a form is never
    # tried after one of its own prefixes, then alphabetically.
    cur_group = '|'.join(sorted(forms, key=lambda form: (-len(form), form)))
    cur_group = f'(?<_{val}>{cur_group})'

    # Groups are emitted in reverse order of res_dict.
    regex.insert(0, cur_group)
regex = '|'.join(regex)
regex = f"^({regex})+$"
# Persist the results as two pickle files.

# 1) Class tuple -> class id mapping built for the numerals.
with open(NMB_CLASSES_PATH, 'wb+') as classes_file:
    pickle.dump(numb_cls_dict, classes_file)

# 2) Everything else: the combined pattern, the class ids collected in
#    lemma_cls_ids (converted from a set to a list) and the per-number table.
numb_data = {
    'regex': regex,
    'lemma_cls_ids': list(lemma_cls_ids),
    'numbers': res_dict,
}
with open(NMB_DATA_PATH, 'wb+') as data_file:
    pickle.dump(numb_data, data_file)