import pandas as pd
import streamlit as st

# Known affixes, one per line. Loaded once at start-up and used as the
# default lookup tables of annotate_morphemes().
with open('suffixes.txt', encoding='utf-8') as f:
    suffixes = [line.strip() for line in f]
with open('prefixes.txt', encoding='utf-8') as f:
    prefixes = [line.strip() for line in f]

# Closed morpheme classes that are recognised by literal lookup.
_INTERFIXES = ('а', 'ар', 'е', 'ей', 'и', 'ич', 'л', 'о', 'у', 'ш')
_UNIFIXES = ('адьj', 'амт', 'ачей')
_POSTFIXES = ('же', 'либо', 'нибудь', 'с', 'сь', 'ся', 'то', 'те')


def _tag_inner_morpheme(morpheme, annotation, prefixes, suffixes):
    """Tag a morpheme that is immediately followed by "=" and record it.

    ``annotation`` holds the (morpheme, tag) pairs found so far, ordered
    right-to-left, so ``annotation[-1]`` is the neighbour directly to the
    right of ``morpheme``. The new pair is appended in place; in one case
    the right neighbour is also retagged as an interfix.
    """
    right_tag = annotation[-1][1] if annotation else None

    if morpheme in prefixes and right_tag in ('root', 'prefix'):
        tag = 'prefix'
    elif morpheme in suffixes and annotation and right_tag not in ('root', 'prefix'):
        tag = 'suffix'
    elif morpheme in _UNIFIXES:
        tag = 'unifix'
    elif morpheme in _POSTFIXES and not annotation:
        tag = 'postfix'
    elif not annotation:
        # Rightmost morpheme of a word without an explicit ending.
        tag = 'suffix' if morpheme in suffixes else 'root'
    elif right_tag in ('ending', 'suffix'):
        tag = 'root'
    elif (len(annotation) >= 2
          and annotation[-2][1] == 'root'
          and right_tag in ('prefix', 'interfix')):
        # The right context is "<linker> + root"; decide whether that linker
        # is really an interfix joining two stems.
        right_morpheme = annotation[-1][0]
        if right_morpheme in _INTERFIXES:
            if morpheme in _INTERFIXES:
                tag = 'interfix'
            else:
                annotation[-1] = (right_morpheme, 'interfix')
                # Previously a non-suffix morpheme reaching this point was
                # dropped from the result; it is the stem to the left of the
                # interfix, so it is recorded as a root.
                tag = 'suffix' if morpheme in suffixes else 'root'
        elif morpheme in _INTERFIXES:
            tag = 'interfix'
        elif morpheme in suffixes:
            tag = 'suffix'
        else:
            tag = 'root'
    elif right_tag in ('interfix', 'postfix'):
        tag = 'root'
    else:
        tag = 'unknown'

    annotation.append((morpheme, tag))


def _tag_leftmost_morpheme(morpheme, annotation, prefixes, suffixes):
    """Tag the leftmost morpheme of the word and record it in ``annotation``."""
    right_tag = annotation[-1][1] if annotation else None

    if morpheme in prefixes:
        tag = 'prefix'
    elif morpheme in suffixes:
        tag = 'suffix'
    elif (len(annotation) >= 2
          and annotation[-2][1] == 'root'
          and right_tag == 'prefix'):
        # "X + prefix + root" where X is not an affix: the supposed prefix is
        # retagged as an interfix between two roots.
        annotation[-1] = (annotation[-1][0], 'interfix')
        tag = 'root'
    elif not annotation or right_tag in ('ending', 'suffix', 'interfix', 'root'):
        tag = 'root'
    else:
        tag = 'unknown'

    annotation.append((morpheme, tag))


def annotate_morphemes(word, prefixes=prefixes, suffixes=suffixes):
    """Label every morpheme of a pre-segmented word.

    The word is scanned from right to left. "-" closes an ending, "=" closes
    any other morpheme; each morpheme is tagged from the affix tables and the
    tags already given to the morphemes on its right.

    Args:
        word: Segmented word, e.g. "пере=двиг=а-ть=ся".
        prefixes: Collection of known prefixes (defaults to prefixes.txt).
        suffixes: Collection of known suffixes (defaults to suffixes.txt).

    Returns:
        A list of ``(morpheme, tag)`` tuples in word order. Tags are one of
        'prefix', 'root', 'suffix', 'ending', 'interfix', 'unifix',
        'postfix' or 'unknown'.
    """
    annotation = []  # (morpheme, tag) pairs, collected right-to-left
    morpheme = ''
    for char in reversed(word):
        if char == '-':
            # An empty segment to the right of "-" carries no ending to record.
            if morpheme:
                annotation.append((morpheme, 'ending'))
                morpheme = ''
        elif char == '=':
            _tag_inner_morpheme(morpheme, annotation, prefixes, suffixes)
            morpheme = ''
        else:
            # Prepend so the morpheme stays in reading order.
            morpheme = char + morpheme

    _tag_leftmost_morpheme(morpheme, annotation, prefixes, suffixes)
    return annotation[::-1]


def _show_annotation(token, key=None):
    """Render the morpheme table of one segmented word as a Streamlit dataframe."""
    table = pd.DataFrame(annotate_morphemes(token), columns=['Морфема', 'Тег'])
    st.dataframe(table.set_index(['Морфема']), key=key)


st.set_page_config(layout='wide')
st.header('Аннотирование морфемого состава слова')
st.markdown('Введите разобранное по составу слово или слова (разделитель — пробел) в следующем формате.'
            '\n\nОкончание отделяется от предыдущей морфемы символом "-", остальные морфемы разделяются символом "=".'
            '\n\nНапример: "у=потребл=ениj-е", "пере=двиг=а-ть=ся быстр=о" .')

inpt = st.text_input(label='Аннотировать морфемы в слове(-ах): ')

if ' ' in inpt:
    # Several words: one table per word, each with its own widget key.
    for i, tk in enumerate(inpt.split()):
        _show_annotation(tk, key=f'dataframe_{i}')
elif inpt:
    _show_annotation(inpt)