Spaces:

a-v-bely
/

morph

Sleeping

File size: 6,436 Bytes

import pandas as pd
import streamlit as st

# Known suffixes: every line of suffixes.txt with surrounding whitespace
# stripped (a blank line in the file therefore yields an empty-string entry).
with open('suffixes.txt', encoding='utf-8') as f:
    suffixes = list(map(str.strip, f))

# Known prefixes, read the same way from prefixes.txt.
with open('prefixes.txt', encoding='utf-8') as f:
    prefixes = list(map(str.strip, f))

# Closed morpheme classes recognised by spelling alone.
_INTERFIXES = ('а', 'ар', 'е', 'ей', 'и', 'ич', 'л', 'о', 'у', 'ш')
_UNIFIXES = ('адьj', 'амт', 'ачей')
_POSTFIXES = ('же', 'либо', 'нибудь', 'с', 'сь', 'ся', 'то', 'те')


def _tag_inner_morpheme(morpheme, tagged, prefixes, suffixes):
    """Return the tag for a morpheme that is followed by '=' in the input.

    ``tagged`` holds the (morpheme, tag) pairs already assigned to everything
    to the right of ``morpheme``, nearest neighbour last.  The nearest
    neighbour may be relabelled in place as 'interfix'.
    """
    if not tagged:
        # Nothing to the right has been tagged yet (no non-empty ending).
        if morpheme in _UNIFIXES:
            return 'unifix'
        if morpheme in _POSTFIXES:
            return 'postfix'
        return 'suffix' if morpheme in suffixes else 'root'

    right_morpheme, right_tag = tagged[-1]
    if morpheme in prefixes and right_tag in ('root', 'prefix'):
        return 'prefix'
    if morpheme in suffixes and right_tag not in ('root', 'prefix'):
        return 'suffix'
    if morpheme in _UNIFIXES:
        return 'unifix'
    if right_tag in ('ending', 'suffix'):
        return 'root'

    if (len(tagged) >= 2 and tagged[-2][1] == 'root'
            and right_tag in ('prefix', 'interfix')):
        # Layout is  <morpheme> = <neighbour> = <root> ...
        is_interfix = morpheme in _INTERFIXES
        neighbour_is_interfix = right_morpheme in _INTERFIXES
        if is_interfix and neighbour_is_interfix:
            return 'interfix'
        if neighbour_is_interfix:
            # The neighbour links two stems, so it is an interfix rather than
            # a prefix.  The current morpheme must still be recorded: the
            # original code relabelled the neighbour and then dropped the
            # current morpheme when it was neither a suffix nor an interfix.
            # It is tagged 'root' here, as the leftmost-morpheme rule does.
            tagged[-1] = (right_morpheme, 'interfix')
            return 'suffix' if morpheme in suffixes else 'root'
        if is_interfix:
            return 'interfix'
        return 'suffix' if morpheme in suffixes else 'root'

    if right_tag in ('interfix', 'postfix'):
        return 'root'
    return 'unknown'


def _tag_leftmost_morpheme(morpheme, tagged, prefixes, suffixes):
    """Return the tag for the text left over at the very start of the word.

    May relabel the nearest tagged neighbour from 'prefix' to 'interfix'.
    """
    if morpheme in prefixes:
        return 'prefix'
    if morpheme in suffixes:
        return 'suffix'
    if not tagged:
        return 'root'

    right_morpheme, right_tag = tagged[-1]
    if len(tagged) >= 2 and tagged[-2][1] == 'root' and right_tag == 'prefix':
        # <root> = <"prefix"> = <root>: the middle element joins two roots.
        tagged[-1] = (right_morpheme, 'interfix')
        return 'root'
    if right_tag in ('ending', 'suffix', 'interfix', 'root'):
        return 'root'
    return 'unknown'


def annotate_morphemes(word, prefixes=prefixes, suffixes=suffixes):
    """Tag every morpheme of a pre-segmented word.

    The word is expected as a string in which '-' stands in front of the
    ending and '=' separates all other morphemes, e.g. ``"у=потребл=ениj-е"``.
    The word is scanned from right to left; each morpheme is classified from
    its spelling (membership in ``prefixes`` / ``suffixes`` and the built-in
    interfix, unifix and postfix lists) and from the tags already given to
    the morphemes on its right.

    Args:
        word: The segmented word.
        prefixes: Collection of known prefix spellings.
        suffixes: Collection of known suffix spellings.

    Returns:
        A list of ``(morpheme, tag)`` tuples in left-to-right order.  Tags are
        'prefix', 'root', 'interfix', 'suffix', 'unifix', 'ending', 'postfix'
        or 'unknown'.
    """
    tagged = []   # (morpheme, tag) pairs, collected right to left
    pending = []  # characters of the morpheme being read, right to left

    for char in reversed(list(word)):
        if char not in ('-', '='):
            pending.append(char)
            continue
        if char == '-' and not pending:
            # '-' with nothing after it: no ending text to record.
            continue

        morpheme = ''.join(reversed(pending))
        pending.clear()
        if char == '-':
            tag = 'ending'
        else:
            tag = _tag_inner_morpheme(morpheme, tagged, prefixes, suffixes)
        tagged.append((morpheme, tag))

    # Whatever is left is the leftmost morpheme; it has no separator before it.
    morpheme = ''.join(reversed(pending))
    tagged.append(
        (morpheme, _tag_leftmost_morpheme(morpheme, tagged, prefixes, suffixes))
    )
    return tagged[::-1]

# Page layout and instructions (user-facing text is in Russian).
st.set_page_config(layout='wide')
st.header('Аннотирование морфемного состава слова')
st.markdown('Введите разобранное по составу слово или слова (разделитель — пробел) в следующем формате.'
            '\n\nОкончание отделяется от предыдущей морфемы символом "-", остальные морфемы разделяются символом "=".'
            '\n\nНапример: "у=потребл=ениj-е", "пере=двиг=а-ть=ся быстр=о" .')
inpt = st.text_input(label='Аннотировать морфемы в слове(-ах): ')

# str.split() with no argument splits on any run of whitespace and yields
# nothing for empty or whitespace-only input, so one loop covers the empty,
# single-word and multi-word cases alike. Each table gets an index-based key
# so that identical words entered twice remain distinct elements.
for i, tk in enumerate(inpt.split()):
    morpheme_table = pd.DataFrame(annotate_morphemes(tk), columns=['Морфема', 'Тег'])
    st.dataframe(morpheme_table.set_index(['Морфема']), key=f'dataframe_{i}')