nlp_proj / utils /text_utils.py
polinchi1's picture
Upload text_utils.py
79a5d22 verified
raw
history blame contribute delete
460 Bytes
def custom_tokenizer(text, negation='не'):
    """Tokenize *text* on whitespace, fusing each negation particle with
    the token that follows it into a single ``не_<word>`` bigram.

    Parameters
    ----------
    text : str
        Input string; tokens are whatever ``str.split()`` yields.
    negation : str, optional
        The particle to fuse with its successor. Defaults to ``'не'``
        (Russian "not"), preserving the original behavior.

    Returns
    -------
    list[str]
        All plain unigrams first, then the negation bigrams appended at
        the end — bigrams are NOT kept in their original positions.
    """
    tokens = text.split()
    unigrams = []
    bigrams_with_ne = []
    i = 0
    n = len(tokens)
    while i < n:
        # Fuse only when a successor exists; a trailing particle is
        # kept as an ordinary unigram.
        if tokens[i] == negation and i < n - 1:
            bigrams_with_ne.append(f'{negation}_{tokens[i + 1]}')
            i += 2  # skip the token consumed by the bigram
        else:
            unigrams.append(tokens[i])
            i += 1
    return unigrams + bigrams_with_ne