File size: 460 Bytes
79a5d22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def custom_tokenizer(text, negation='не'):
    """Tokenize whitespace-separated *text* into unigrams plus negation bigrams.

    Each occurrence of the negation marker (default: Russian 'не') is fused
    with the token that follows it into a single ``'не_<token>'`` feature;
    both words are then excluded from the unigram list. A trailing marker
    with no following token is kept as a plain unigram.

    Args:
        text: Whitespace-separated input string.
        negation: The marker word to fuse with its successor. Defaults to
            'не', matching the original hard-coded behavior.

    Returns:
        A single list: all unigrams first, followed by all negation bigrams.
    """
    tokens = text.split()
    unigrams = []
    bigrams_with_ne = []

    i = 0
    n = len(tokens)
    while i < n:
        # Fuse the marker with its successor so downstream models see
        # "не_X" as one feature instead of two unrelated words.
        if tokens[i] == negation and i < n - 1:
            bigrams_with_ne.append(f'{negation}_{tokens[i + 1]}')
            i += 2  # consume both the marker and the negated word
        else:
            unigrams.append(tokens[i])
            i += 1
    return unigrams + bigrams_with_ne