niobures
/

DeepMorphy

Model card Files Files and versions

DeepMorphy / 7_build_numbers.py

niobures's picture

DeepMorphy

0240c6e verified 6 months ago

history blame contribute delete

2.45 kB

	import yaml
	import pickle
	from utils import CONFIG, create_cls_tuple


	# File locations, all taken from the project-wide CONFIG mapping.
	VECT_PATH = CONFIG['vect_words_path']
	CLS_CLASSES_PATH = CONFIG['cls_classes_path']
	NMB_CLASSES_PATH = CONFIG['numb_classes_path']
	NMB_DATA_PATH = CONFIG['numb_data_path']
	# Russian consonant letters (one single-character string per list entry).
	SOGL_CHARS = list('бвгджзйклмнпрстфхцчшщ')
	# Russian vowel letters (one single-character string per list entry).
	GLASN_CHARS = list('аоиеёэыуюя')


	# Numeral class ids start after the already existing classes: the counter
	# is the size of the pickled classes object plus one.
	# NOTE(review): pickle.load runs arbitrary code from the file; this assumes
	# the file was produced by a trusted earlier step -- confirm.
	with open(CLS_CLASSES_PATH, 'rb') as f:
	    cur_classes_count = len(pickle.load(f)) + 1

	# safe_load is used because yaml.load() without an explicit Loader is
	# rejected by PyYAML >= 6 and can construct arbitrary objects on older
	# versions. The file is read as UTF-8 explicitly so the Cyrillic text in it
	# does not depend on the platform default encoding.
	# NOTE(review): assumes numb.yml is UTF-8 and uses only plain YAML types.
	with open('numb.yml', encoding='utf-8') as f:
	    numbr_src_dic = yaml.safe_load(f)


	def get_nar_end(text):
	    """Return the ending of ``text`` that is stored as its ``nar_end`` value.

	    The ending is the last two characters when the penultimate character is
	    listed in ``SOGL_CHARS`` (consonants) and the last character is listed in
	    ``GLASN_CHARS`` (vowels); in every other case it is the last character
	    alone.

	    Strings shorter than two characters are returned unchanged (so an empty
	    string yields ``''``) instead of raising ``IndexError``.

	    NOTE(review): "nar_end" presumably names the letters appended to a
	    numeral written with digits (as in "5-го") -- confirm with the consumer
	    of the generated data.
	    """
	    # Too short to inspect a penultimate character: nothing to trim.
	    if len(text) < 2:
	        return text

	    if text[-2] in SOGL_CHARS and text[-1] in GLASN_CHARS:
	        return text[-2:]

	    return text[-1]


	# Class ids of the first item of every form list.
	lemma_cls_ids = set()
	# n_key -> {'nar_end': {class id: ending}, <form type>: [(text, class id), ...]}
	res_dict = {}
	# Class tuple (as built by create_cls_tuple) -> numeric class id.
	numb_cls_dict = {}
	for n_key, forms_by_type in numbr_src_dic.items():
	    n_key_data = {
	        'nar_end': {}
	    }
	    res_dict[n_key] = n_key_data
	    for t, forms in forms_by_type.items():
	        for index, item in enumerate(forms):
	            # The source item is tagged in place before its class tuple is built.
	            item['post'] = 'numb'
	            cls_tpl = create_cls_tuple(item)
	            # A class tuple seen for the first time gets the next free id.
	            if cls_tpl not in numb_cls_dict:
	                numb_cls_dict[cls_tpl] = cur_classes_count
	                cur_classes_count += 1

	            cur_class = numb_cls_dict[cls_tpl]
	            # The set holds integer ids and already de-duplicates, so no
	            # membership test is needed (the old test compared a tuple
	            # against those ids and was therefore always true).
	            if index == 0:
	                lemma_cls_ids.add(cur_class)

	            # The list is created lazily so an empty form list adds no key.
	            n_key_data.setdefault(t, []).append((item['text'], cur_class))
	            # For form type 'p' the ending of the first text seen for each
	            # class id is recorded.
	            # NOTE(review): the meaning of 'p' is not visible here -- confirm.
	            if t == 'p' and cur_class not in n_key_data['nar_end']:
	                n_key_data['nar_end'][cur_class] = get_nar_end(item['text'])

	# Assemble a single pattern: one named group per numeral key, each group
	# being an alternation of every text form of that key.
	# NOTE(review): the `(?<name>...)` group syntax is not accepted by Python's
	# `re`; the pattern is presumably consumed by another regex engine -- confirm.
	regex = []
	for val, val_data in res_dict.items():
	    words = set()
	    for key, tpls in val_data.items():
	        # These keys do not hold (text, class id) lists.
	        if key in ('nar_end', 'lemma'):
	            continue

	        words.update(tpl[0] for tpl in tpls)

	    # Sorted (longest first, then alphabetically) so the generated pattern is
	    # identical from run to run; plain set iteration order is not.
	    ordered_words = sorted(words, key=lambda word: (-len(word), word))
	    # The separator must be a bare '|': '\|' is an invalid string escape that
	    # produces backslash + pipe, i.e. a literal pipe instead of alternation.
	    cur_group = '|'.join(ordered_words)
	    regex.append(f'(?<_{val}>{cur_group})')

	# Groups are emitted in reverse order of the keys in res_dict.
	regex.reverse()
	regex = '|'.join(regex)
	regex = f"^({regex})+$"


	# Store the class-tuple -> class-id mapping.
	with open(NMB_CLASSES_PATH, 'wb+') as classes_file:
	    pickle.dump(numb_cls_dict, classes_file)

	# Store the pattern, the lemma class ids (as a list) and the per-numeral data
	# together in one pickled dictionary.
	numbers_payload = {
	    'regex': regex,
	    'lemma_cls_ids': list(lemma_cls_ids),
	    'numbers': res_dict
	}
	with open(NMB_DATA_PATH, 'wb+') as data_file:
	    pickle.dump(numbers_payload, data_file)