File size: 460 Bytes
79a5d22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def custom_tokenizer(text, negation='не'):
    """Tokenize whitespace-separated *text* into unigrams plus negation bigrams.

    Each occurrence of the negation marker (default: Russian 'не') is fused
    with the token that follows it into a single ``'не_<token>'`` feature;
    both words are then excluded from the unigram list. A trailing marker
    with no following token is kept as a plain unigram.

    Args:
        text: Whitespace-separated input string.
        negation: The marker word to fuse with its successor. Defaults to
            'не', matching the original hard-coded behavior.

    Returns:
        A single list: all unigrams first, followed by all negation bigrams.
    """
    tokens = text.split()
    unigrams = []
    bigrams_with_ne = []

    i = 0
    n = len(tokens)
    while i < n:
        # Fuse the marker with its successor so downstream models see
        # "не_X" as one feature instead of two unrelated words.
        if tokens[i] == negation and i < n - 1:
            bigrams_with_ne.append(f'{negation}_{tokens[i + 1]}')
            i += 2  # consume both the marker and the negated word
        else:
            unigrams.append(tokens[i])
            i += 1
    return unigrams + bigrams_with_ne