File size: 460 Bytes
def custom_tokenizer(text, negation='не'):
    """Tokenize *text* on whitespace, fusing a negation particle with the
    following token.

    Each occurrence of *negation* (default: Russian 'не') that is followed
    by another token is merged into a single bigram token
    ``f'{negation}_{next_token}'``; the following token is consumed and does
    not also appear as a unigram. All other tokens are kept as unigrams.
    A trailing *negation* with nothing after it is kept as a plain unigram.

    Note: the result lists all unigrams first, then all negation bigrams —
    original token order is not preserved across the two groups (fine for
    bag-of-words vectorizers).

    Args:
        text: Input string; tokens are produced by ``str.split()``.
        negation: Particle to fuse with its successor (default 'не').

    Returns:
        list[str]: unigrams followed by the fused negation bigrams.
    """
    tokens = text.split()
    unigrams = []
    bigrams_with_ne = []
    i = 0
    n = len(tokens)
    while i < n:
        # Fuse only when a successor token exists; a trailing particle
        # has nothing to negate and stays a unigram.
        if tokens[i] == negation and i < n - 1:
            bigrams_with_ne.append(f'{negation}_{tokens[i + 1]}')
            i += 2  # consume the particle and the negated token together
        else:
            unigrams.append(tokens[i])
            i += 1
    return unigrams + bigrams_with_ne