def custom_tokenizer(text):
    """Tokenize *text* on whitespace, fusing the Russian negation particle.

    Whenever a token equals 'не' and another token follows it, the pair is
    collapsed into a single bigram 'не_<next>' and the following token is
    consumed. All other tokens are kept as plain unigrams. A trailing 'не'
    with nothing after it stays a plain unigram.

    Returns:
        list[str]: all unigrams in original order, followed by all
        collected 'не_*' bigrams in order of occurrence.
    """
    tokens = text.split()
    unigrams = []
    bigrams_with_ne = []
    i = 0
    n = len(tokens)
    while i < n:
        if tokens[i] == 'не' and i < n - 1:
            # Fuse the negation with its target and consume both tokens.
            bigrams_with_ne.append(f'не_{tokens[i + 1]}')
            i += 2
        else:
            unigrams.append(tokens[i])
            i += 1
    return unigrams + bigrams_with_ne