# BIS POS Tagset

import streamlit as st


from annotated_text import annotated_text
# from nltk.tokenize import word_tokenize

import warnings
import pandas as pd
from pandas import DataFrame
import os

warnings.filterwarnings('ignore')
import re, flair, random, time
# from bnlp import BasicTokenizer
from indicnlp.tokenize import indic_tokenize  
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus

from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

from huggingface_hub import hf_hub_download


# Browser-tab title, icon and page layout for the app.
st.set_page_config(page_title="Marathi POS Tagger", page_icon="✔️", layout="wide")

# Hugging Face Hub coordinates of the trained tagger checkpoint.
_MODEL_REPO_ID = "atanu0491/MarathiPOSModel"
_MODEL_FILENAME = "marathi-best-model.pt"

# Path of the checkpoint file; it is handed to load_model() further down.
model_path = hf_hub_download(repo_id=_MODEL_REPO_ID, filename=_MODEL_FILENAME)


@st.cache_resource()
def load_model(model_name):
    """Load and return the Flair sequence tagger identified by ``model_name``.

    The function is decorated with ``st.cache_resource`` so the loaded
    tagger can be reused by Streamlit rather than loaded again for the
    same ``model_name``.

    Args:
        model_name: Value passed straight to ``SequenceTagger.load``
            (the script passes the path of the downloaded checkpoint).

    Returns:
        The ``SequenceTagger`` instance produced by ``SequenceTagger.load``.
    """
    return SequenceTagger.load(model_name)


# Page heading. Raw HTML is used for centring/underlining, hence
# unsafe_allow_html. The <center> element is now closed properly (the
# previous markup opened a second <center> instead of closing the first).
st.markdown('''<h1><center><b><u>BIS POS Tagset</u></b></center></h1>''', unsafe_allow_html=True)

# Input modes offered in the main area; the first entry is a placeholder
# that matches neither of the processing branches below.
activity = ['Select your Choice', 'File Upload', 'Text Input']
choice = st.selectbox('How you want to proceed?', activity)

# Tags that can be looked up in the sidebar. 'TAG' is the placeholder entry
# shown before the user picks a real tag.
tag_activity = [
    'TAG', '<unk>',
    'CC_CCD', 'CC_CCS', 'CC_CCS_UT',
    'DM_DMD', 'DM_DMR', 'DM_DMQ',
    'JJ',
    'N_NN', 'N_NNP', 'N_NNV', 'N_NST',
    'PR_PRC', 'PR_PRF', 'PR_PRL', 'PR_PRP', 'PR_PRQ',
    'PSP',
    'QT_QTC', 'QT_QTF',
    'RB',
    'RD_ECH', 'RD_PUNC', 'RD_RDF', 'RD_SYM', 'RD_UNK',
    'RP_CL', 'RP_INJ', 'RP_INTF', 'RP_NEG', 'RP_RPD',
    'V_VAUX', 'V_VM', 'V_VM_VF', 'V_VM_VINF', 'V_VM_VNF', 'V_VM_VNG',
    'QT_QTO',
]
tag_choice = st.sidebar.selectbox('Search the Tag you want to know', tag_activity)

# Human-readable meaning of every entry offered in the sidebar tag selector.
# A single mapping replaces one `if` statement per tag, so adding or
# correcting a tag means editing exactly one line.
_TAG_DESCRIPTIONS = {
    'TAG': 'Select the TAG',
    '<unk>': 'Unknown',
    'CC_CCD': 'Co-ordinator',
    'CC_CCS': 'Subordinator',
    'CC_CCS_UT': 'Quotative',
    'DM_DMD': 'Deictic demonstrative',
    'DM_DMR': 'Relative demonstrative',
    'DM_DMQ': 'Wh-word',
    'JJ': 'Adjective',
    'N_NN': 'Common noun',
    'N_NNP': 'Proper noun',
    'N_NNV': 'Verbal noun',
    'N_NST': 'Locative noun',
    'PR_PRC': 'Reciprocal pronoun',
    'PR_PRF': 'Reflexive pronoun',
    'PR_PRL': 'Relative pronoun',
    'PR_PRP': 'Personal pronoun',
    'PR_PRQ': 'Wh-word',
    'PSP': 'Postposition',
    'QT_QTC': 'Cardinals',
    'QT_QTF': 'General quantifier',
    'RB': 'Adverb',
    'RD_ECH': 'Echo words',
    'RD_PUNC': 'Punctuation',
    'RD_RDF': 'Foreign words',
    'RD_SYM': 'Symbol',
    'RD_UNK': 'Unknown',
    'RP_CL': 'Classifier particle',
    'RP_INJ': 'Interjection particle',
    'RP_INTF': 'Intensifier particle',
    'RP_NEG': 'Negation particle',
    'RP_RPD': 'Default particle',
    'V_VAUX': 'Auxiliary verb',
    'V_VM': 'Main verb',
    'V_VM_VF': 'Finite verb',
    # Was labelled "Infinite verb"; the VINF category is the infinitive.
    'V_VM_VINF': 'Infinitive verb',
    'V_VM_VNF': 'Non-finite verb',
    'V_VM_VNG': 'Gerund verb',
    'QT_QTO': 'Ordinals',
}

# Show the description of the selected tag in the sidebar. `.get` keeps the
# earlier behaviour of showing nothing for a value that has no entry.
_tag_description = _TAG_DESCRIPTIONS.get(tag_choice)
if _tag_description is not None:
    st.sidebar.info(_tag_description, icon="ℹ️")

# Sidebar note stating when the app was last revised.
st.sidebar.info('Last updated on: 24 September 2025', icon="✅")


# Obtain the tagger through the cached loader, using the checkpoint path
# resolved near the top of the script.
model = load_model(model_path)
# Alternative loading calls kept for reference (local checkpoint file):
#model = load_model('best-model-002.pt')
#model = SequenceTagger.load('best-model-002.pt')


# --- Text Input mode: tag a sentence typed into the text area. ---
if choice == 'Text Input':
    input_data = st.text_area("Write your sentence below", value="", height=68)
    if st.button('Click to execute'):
        if not input_data.strip():
            # Blank or whitespace-only input has nothing to tag; tell the
            # user instead of silently rendering an empty result.
            st.warning('Please enter a sentence before running the tagger.')
        else:
            # Tokenise with the Indic NLP tokenizer and hand the token list
            # to Flair so that the tagger sees exactly these tokens.
            data = indic_tokenize.trivial_tokenize(input_data)
            sentence = Sentence(data)
            model.predict(sentence)

            # (token text, predicted tag) pairs are the format that
            # annotated_text renders as labelled spans.
            my_list = [(token.text, token.tag) for token in sentence]
            annotated_text(my_list)

# --- File Upload mode: tag every line of a .txt file and offer the result
# as an Excel workbook with one row per input line. ---
if choice == 'File Upload':
    uploaded_file = st.file_uploader("Upload your File in .txt format", type=["txt"])
    if uploaded_file is not None:
        # Local import: only this branch needs an in-memory byte buffer.
        import io

        try:
            # 'utf-8-sig' decodes plain UTF-8 too, but drops a leading
            # byte-order mark so it is not glued to the first token.
            lines = uploaded_file.read().decode('utf-8-sig').splitlines()
        except UnicodeDecodeError:
            # Report the problem in the page instead of crashing the run.
            st.error('The uploaded file could not be decoded as UTF-8 text. '
                     'Please save it with UTF-8 encoding and upload it again.')
        else:
            # Remove only the final extension, so "a.b.txt" becomes
            # "a.b_tagged.xlsx" rather than "a_tagged.xlsx".
            base_name = os.path.splitext(uploaded_file.name)[0]
            output_file_name = base_name + '_tagged.xlsx'

            raw_sentences = []
            tagged_sentences = []

            with st.spinner("Wait for processing the file..."):
                for line in lines:
                    data = indic_tokenize.trivial_tokenize(line)
                    sentence = Sentence(data)
                    model.predict(sentence)
                    my_list = [(token.text, token.tag) for token in sentence]

                    # Column 1: space-joined tokens; column 2: token/TAG pairs.
                    raw_sentences.append(' '.join(word for word, _ in my_list))
                    tagged_sentences.append(
                        ' '.join(f"{word}/{tag}" for word, tag in my_list)
                    )

                df = pd.DataFrame({
                    "Raw Sentence": raw_sentences,
                    "Tagged Sentence": tagged_sentences
                })

                # Build the workbook in memory. Writing to a fixed path under
                # /tmp let two sessions uploading files with the same name
                # overwrite each other's output, and /tmp does not exist on
                # every platform.
                excel_buffer = io.BytesIO()
                df.to_excel(excel_buffer, index=False)

            # Provide download button
            st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=excel_buffer.getvalue(),
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        

# Attribution banner rendered at the bottom of the main area in every mode.
st.info('An initiative of Natural Language Processing Lab, Jadavpur University', icon="📚")