"""Streamlit front end for a Marathi part-of-speech tagger.

The page offers two ways to tag text with a flair ``SequenceTagger``
downloaded from the Hugging Face Hub: typing a sentence directly, or
uploading a ``.txt`` file (one sentence per line) and downloading the tagged
result as an Excel workbook.  The sidebar holds a small glossary for the tag
names.
"""

import io
import os
import random
import re
import time
import warnings

# Silence library warnings before the heavy NLP imports below can emit them.
warnings.filterwarnings('ignore')

import flair
import pandas as pd
import streamlit as st
from annotated_text import annotated_text
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from huggingface_hub import hf_hub_download
from indicnlp.tokenize import indic_tokenize
from pandas import DataFrame

st.set_page_config(
    page_title="Marathi POS Tagger",
    page_icon="✔️",
    layout="wide",
)

model_path = hf_hub_download(
    repo_id="atanu0491/MarathiPOSModel",
    filename="marathi-best-model.pt",
)


@st.cache_resource()
def load_model(model_name):
    """Load the flair sequence tagger stored at ``model_name``.

    Wrapped in ``st.cache_resource`` so the model is loaded once and reused
    across Streamlit reruns instead of being reloaded on every interaction.
    """
    return SequenceTagger.load(model_name)


def _tag_text(tagger, text):
    """Tokenize ``text`` and return ``[(word, tag), ...]`` from ``tagger``.

    Returns an empty list, without calling the tagger, when ``text`` contains
    no usable tokens.
    """
    # Blank text can come back from the tokenizer as an empty-string token;
    # drop such tokens so the tagger is never handed an empty word.
    tokens = [tok for tok in indic_tokenize.trivial_tokenize(text) if tok.strip()]
    if not tokens:
        return []
    sentence = Sentence(tokens)
    tagger.predict(sentence)
    return [(token.text, token.tag) for token in sentence]


# NOTE(review): unsafe_allow_html=True suggests this literal originally carried
# HTML markup; only the plain text is visible here, so it is kept as-is.
st.markdown('''

BIS POS Tagset

''', unsafe_allow_html=True)
# NOTE(review): the original also has a sidebar copy of this banner
# (st.sidebar.markdown with the same text) that appears to be commented out;
# it stays disabled here -- confirm against the deployed app.

activity = ['Select your Choice', 'File Upload', 'Text Input']
choice = st.selectbox('How you want to proceed?', activity)

# Tag -> description shown in the sidebar.  Insertion order is the order of
# the options in the selectbox, so the first entry is the placeholder.
TAG_DESCRIPTIONS = {
    'TAG': 'Select the TAG',
    '': 'Unknown',
    'CC_CCD': 'Co-ordinator',
    'CC_CCS': 'Subordinator',
    'CC_CCS_UT': 'Quotative',
    'DM_DMD': 'Deictic demonstrative',
    'DM_DMR': 'Relative demonstrative',
    'DM_DMQ': 'Wh-word',
    'JJ': 'Adjective',
    'N_NN': 'Common noun',
    'N_NNP': 'Proper noun',
    'N_NNV': 'Verbal noun',
    'N_NST': 'Locative noun',
    'PR_PRC': 'Reciprocal pronoun',
    'PR_PRF': 'Reflexive pronoun',
    'PR_PRL': 'Relative pronoun',
    'PR_PRP': 'Personal pronoun',
    'PR_PRQ': 'Wh-word',
    'PSP': 'Postposition',
    'QT_QTC': 'Cardinals',
    'QT_QTF': 'General quantifier',
    'RB': 'Adverb',
    'RD_ECH': 'Echo words',
    'RD_PUNC': 'Punctuation',
    'RD_RDF': 'Foreign words',
    'RD_SYM': 'Symbol',
    'RD_UNK': 'Unknown',
    'RP_CL': 'Classifier particle',
    'RP_INJ': 'Interjection particle',
    'RP_INTF': 'Intensifier particle',
    'RP_NEG': 'Negation particle',
    'RP_RPD': 'Default particle',
    'V_VAUX': 'Auxiliary verb',
    'V_VM': 'Main verb',
    'V_VM_VF': 'Finite verb',
    'V_VM_VINF': 'Infinite verb',
    'V_VM_VNF': 'Non-finite verb',
    'V_VM_VNG': 'Gerund verb',
    'QT_QTO': 'Ordinals',
}
tag_activity = list(TAG_DESCRIPTIONS)

tag_choice = st.sidebar.selectbox('Search the Tag you want to know', tag_activity)
# The selectbox can only return one of the dict's own keys.
st.sidebar.info(TAG_DESCRIPTIONS[tag_choice], icon="ℹ️")

st.sidebar.info('Last updated on: 24 September 2025', icon="✅")

model = load_model(model_path)

if choice == 'Text Input':
    input_data = st.text_area("Write your sentence below", value="", height=68)
    if st.button('Click to execute'):
        tagged_words = _tag_text(model, input_data)
        if tagged_words:
            annotated_text(tagged_words)
        else:
            st.warning('Please enter a sentence to tag.')

if choice == 'File Upload':
    uploaded_file = st.file_uploader("Upload your File in .txt format", type=["txt"])
    if uploaded_file is not None:
        try:
            # utf-8-sig also accepts plain UTF-8 and strips a leading BOM, which
            # would otherwise stay attached to the first token of the file.
            file_text = uploaded_file.read().decode('utf-8-sig')
        except UnicodeDecodeError:
            st.error('The uploaded file could not be read as UTF-8 text.')
        else:
            lines = file_text.splitlines()

            # Download name: "<original name without extension>_tagged.xlsx".
            # splitext keeps any extra dots in the stem ("a.b.txt" -> "a.b").
            base_name = os.path.splitext(os.path.basename(uploaded_file.name))[0]
            output_file_name = base_name + '_tagged.xlsx'

            raw_sentences = []
            tagged_sentences = []

            with st.spinner("Wait for processing the file..."):
                for line in lines:
                    # Blank lines yield an empty pair list and therefore an
                    # empty row, keeping rows aligned with the input lines.
                    tagged_words = _tag_text(model, line)
                    raw_sentences.append(
                        ' '.join(word for word, _ in tagged_words)
                    )
                    tagged_sentences.append(
                        ' '.join(f"{word}/{tag}" for word, tag in tagged_words)
                    )

            df = pd.DataFrame({
                "Raw Sentence": raw_sentences,
                "Tagged Sentence": tagged_sentences,
            })

            # Build the workbook in memory: no shared /tmp file for concurrent
            # sessions to overwrite, and nothing left behind on disk.
            excel_buffer = io.BytesIO()
            df.to_excel(excel_buffer, index=False)

            st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=excel_buffer.getvalue(),
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )

st.info('An initiative of Natural Language Processing Lab, Jadavpur University', icon="📚")