# MarathiPOS / src/streamlit_app.py
# Author: atanu0491
# Last commit: "Update src/streamlit_app.py" (ab693c0, verified)
import streamlit as st
from annotated_text import annotated_text
# from nltk.tokenize import word_tokenize
import warnings
import pandas as pd
from pandas import DataFrame
import os
warnings.filterwarnings('ignore')
import re, flair, random, time
# from bnlp import BasicTokenizer
from indicnlp.tokenize import indic_tokenize
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from huggingface_hub import hf_hub_download
# Browser-tab title/icon and a full-width page layout.
st.set_page_config(page_title="Marathi POS Tagger", page_icon="✔️", layout="wide")

# Download the trained tagger checkpoint from the Hugging Face Hub; the value
# kept in ``model_path`` is what is later handed to ``load_model``.
model_path = hf_hub_download(
    repo_id="atanu0491/MarathiPOSModel",
    filename="marathi-best-model.pt",
)
@st.cache_resource()
def load_model(model_name):
    """Load a flair ``SequenceTagger`` from *model_name*.

    The function is wrapped in ``st.cache_resource`` so the loaded tagger is
    reused instead of being reloaded for every call with the same argument.

    Args:
        model_name: Model identifier or checkpoint path accepted by
            ``SequenceTagger.load``.

    Returns:
        The loaded ``SequenceTagger`` instance.
    """
    return SequenceTagger.load(model_name)
# Page heading. Raw HTML is used for centring/underlining, which is why
# ``unsafe_allow_html`` is enabled. The <center> element is now properly
# closed (the original emitted a second opening tag instead of </center>).
st.markdown('''<h1><center><b><u>BIS POS Tagset</u></b></center></h1>''', unsafe_allow_html=True)

# Input modes offered to the user. The first entry is only a placeholder: the
# branches further down react to 'Text Input' and 'File Upload' exclusively.
activity = ['Select your Choice', 'File Upload', 'Text Input']
choice = st.selectbox('How you want to proceed?', activity)
# Sidebar glossary for the BIS POS tagset.
#
# A single ordered mapping drives both the select box options and the
# explanation shown for the chosen tag, so the two can never drift apart
# (the original kept a separate list plus one ``if`` per tag).
# 'TAG' is the placeholder entry shown before the user picks a real tag.
TAG_DESCRIPTIONS = {
    'TAG': 'Select the TAG',
    '<unk>': 'Unknown',
    'CC_CCD': 'Co-ordinator',
    'CC_CCS': 'Subordinator',
    'CC_CCS_UT': 'Quotative',
    'DM_DMD': 'Deictic demonstrative',
    'DM_DMR': 'Relative demonstrative',
    'DM_DMQ': 'Wh-word',
    'JJ': 'Adjective',
    'N_NN': 'Common noun',
    'N_NNP': 'Proper noun',
    'N_NNV': 'Verbal noun',
    'N_NST': 'Locative noun',
    'PR_PRC': 'Reciprocal pronoun',
    'PR_PRF': 'Reflexive pronoun',
    'PR_PRL': 'Relative pronoun',
    'PR_PRP': 'Personal pronoun',
    'PR_PRQ': 'Wh-word',
    'PSP': 'Postposition',
    'QT_QTC': 'Cardinals',
    'QT_QTF': 'General quantifier',
    'RB': 'Adverb',
    'RD_ECH': 'Echo words',
    'RD_PUNC': 'Punctuation',
    'RD_RDF': 'Foreign words',
    'RD_SYM': 'Symbol',
    'RD_UNK': 'Unknown',
    'RP_CL': 'Classifier particle',
    'RP_INJ': 'Interjection particle',
    'RP_INTF': 'Intensifier particle',
    'RP_NEG': 'Negation particle',
    'RP_RPD': 'Default particle',
    'V_VAUX': 'Auxiliary verb',
    'V_VM': 'Main verb',
    'V_VM_VF': 'Finite verb',
    # Fixed typo: the original text read 'Infinite verb'.
    'V_VM_VINF': 'Infinitive verb',
    'V_VM_VNF': 'Non-finite verb',
    'V_VM_VNG': 'Gerund verb',
    'QT_QTO': 'Ordinals',
}

# Dicts preserve insertion order, so the options appear exactly as listed above.
tag_activity = list(TAG_DESCRIPTIONS)
tag_choice = st.sidebar.selectbox('Search the Tag you want to know', tag_activity)

# The select box can only return one of the keys above, so a direct lookup is safe.
st.sidebar.info(TAG_DESCRIPTIONS[tag_choice], icon="ℹ️")
st.sidebar.info('Last updated on: 24 September 2025', icon="✅")
# Tagger shared by both the text-input and the file-upload modes.
model = load_model(model_path)

if choice == 'Text Input':
    input_data = st.text_area("Write your sentence below", value="", height=68)
    if st.button('Click to execute'):
        if not input_data.strip():
            # Nothing to tag: tell the user instead of sending an empty
            # sentence through the model.
            st.warning('Please write a sentence before executing.')
        else:
            # Tokenise with the Indic NLP tokenizer and wrap the token list
            # in a flair Sentence so the tagger can annotate it in place.
            sentence = Sentence(indic_tokenize.trivial_tokenize(input_data))
            model.predict(sentence)
            # One (word, tag) pair per token, rendered as highlighted text.
            annotated_text([(token.text, token.tag) for token in sentence])
if choice == 'File Upload':
    uploaded_file = st.file_uploader("Upload your File in .txt format", type=["txt"])
    if uploaded_file is not None:
        # Local import: only this branch needs an in-memory byte buffer.
        import io

        try:
            # 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading
            # byte-order mark, which would otherwise stick to the first token.
            lines = uploaded_file.read().decode('utf-8-sig').splitlines()
        except UnicodeDecodeError:
            lines = None
            st.error('The uploaded file is not valid UTF-8 text. '
                     'Please save it with UTF-8 encoding and upload it again.')

        if lines is not None:
            # splitext keeps dots inside the name ("my.data.txt" -> "my.data"),
            # and basename discards any directory part of the client-supplied name.
            stem = os.path.splitext(os.path.basename(uploaded_file.name))[0]
            output_file_name = stem + '_tagged.xlsx'

            raw_sentences = []
            tagged_sentences = []
            with st.spinner("Wait for processing the file..."):
                for line in lines:
                    if not line.strip():
                        # Keep an empty row so output rows stay aligned with the
                        # input lines, but skip the pointless model call.
                        raw_sentences.append('')
                        tagged_sentences.append('')
                        continue

                    sentence = Sentence(indic_tokenize.trivial_tokenize(line))
                    model.predict(sentence)
                    pairs = [(token.text, token.tag) for token in sentence]
                    raw_sentences.append(' '.join(f"{word}" for word, _ in pairs))
                    tagged_sentences.append(' '.join(f"{word}/{tag}" for word, tag in pairs))

            df = pd.DataFrame({
                "Raw Sentence": raw_sentences,
                "Tagged Sentence": tagged_sentences,
            })

            # Build the workbook in memory instead of a shared /tmp path:
            # no stale files are left behind and two sessions uploading a file
            # with the same name cannot overwrite each other's output.
            excel_buffer = io.BytesIO()
            df.to_excel(excel_buffer, index=False)

            st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=excel_buffer.getvalue(),
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )

# Footer credit, shown regardless of the selected mode.
st.info('An initiative of Natural Language Processing Lab, Jadavpur University', icon="📚")