def custom_tokenizer(text):
    """Split text on whitespace, fusing the Russian negation particle
    'не' with the following word into a single bigram token."""
    tokens = text.split()
    unigrams = []
    bigrams_with_ne = []
    skip_next = False
    for i in range(len(tokens)):
        if skip_next:
            # This token was already consumed as the second half of a 'не_...' bigram.
            skip_next = False
            continue
        if tokens[i] == 'не' and i < len(tokens) - 1:
            # Fuse the negation with the next word, e.g. 'не хорошо' -> 'не_хорошо'.
            bigrams_with_ne.append(f'не_{tokens[i+1]}')
            skip_next = True
        else:
            # Regular token (a trailing 'не' with no following word also lands here).
            unigrams.append(tokens[i])
    return unigrams + bigrams_with_ne
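
# A quick sanity check of the tokenizer; the sample sentence is
# illustrative only, not taken from the source:
print(custom_tokenizer('фильм не понравился совсем'))
# -> ['фильм', 'совсем', 'не_понравился']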