nlp_proj / utils /text_utils.py
polinchi1's picture
Upload text_utils.py
79a5d22 verified
raw
history blame contribute delete
460 Bytes
def custom_tokenizer(text, negation='не'):
    """Tokenize *text* on whitespace, fusing each negation particle with
    the token that follows it into a single ``не_<word>`` bigram.

    Parameters
    ----------
    text : str
        Input string; tokens are whatever ``str.split()`` yields.
    negation : str, optional
        The particle to fuse with its successor. Defaults to ``'не'``
        (Russian "not"), preserving the original behavior.

    Returns
    -------
    list[str]
        All plain unigrams first, then the negation bigrams appended at
        the end — bigrams are NOT kept in their original positions.
    """
    tokens = text.split()
    unigrams = []
    bigrams_with_ne = []
    i = 0
    n = len(tokens)
    while i < n:
        # Fuse only when a successor exists; a trailing particle is
        # kept as an ordinary unigram.
        if tokens[i] == negation and i < n - 1:
            bigrams_with_ne.append(f'{negation}_{tokens[i + 1]}')
            i += 2  # skip the token consumed by the bigram
        else:
            unigrams.append(tokens[i])
            i += 1
    return unigrams + bigrams_with_ne