Spaces:

atanu0491
/

MarathiPOS

Running

App Files Files Community

MarathiPOS / src /streamlit_app.py

atanu0491

Update src/streamlit_app.py

ab693c0 verified 3 months ago

raw

history blame contribute delete

7.64 kB

	import streamlit as st


	from annotated_text import annotated_text
	# from nltk.tokenize import word_tokenize

	import warnings
	import pandas as pd
	from pandas import DataFrame
	import os

	warnings.filterwarnings('ignore')
	import re, flair, random, time
	# from bnlp import BasicTokenizer
	from indicnlp.tokenize import indic_tokenize
	from flair.data import Corpus, Sentence
	from flair.datasets import ColumnCorpus

	from flair.models import SequenceTagger
	from flair.trainers import ModelTrainer

	from huggingface_hub import hf_hub_download


	# Configure the browser tab title, the tab icon and the wide page layout.
	st.set_page_config(
	page_title="Marathi POS Tagger",
	page_icon="✔️",
	layout="wide",
	)

	# Fetch the tagger checkpoint file from the named Hugging Face Hub repository.
	# The result is later handed to load_model(); presumably it is the local
	# path of the downloaded file (see the huggingface_hub docs) — confirm.
	model_path = hf_hub_download(
	repo_id="atanu0491/MarathiPOSModel",
	filename="marathi-best-model.pt"
	)



	@st.cache_resource()
	def load_model(model_name):
	    """Load and return the flair ``SequenceTagger`` identified by ``model_name``.

	    ``model_name`` is passed straight to ``SequenceTagger.load``. The function
	    is decorated with ``st.cache_resource``; per the Streamlit documentation
	    that keeps the returned object cached between script reruns.
	    """
	    return SequenceTagger.load(model_name)


	# Page heading rendered as raw HTML (hence unsafe_allow_html=True).
	# NOTE(review): the second <center> looks like it was meant to be a closing
	# </center> tag — confirm and fix the markup if so.
	st.markdown('''<h1><center><b><u>BIS POS Tagset</u></b><center></h1>''', unsafe_allow_html=True)

	# Input modes offered to the user. The first entry is a placeholder: only
	# 'File Upload' and 'Text Input' are acted on further down in the script.
	activity = ['Select your Choice', 'File Upload', 'Text Input']
	# Earlier sidebar variant of the selector (its prompt was in Bengali):
	#choice = st.sidebar.selectbox('How do you want to process this?', activity)
	choice = st.selectbox('How you want to proceed?', activity)

	# Earlier variant that rendered the heading in the sidebar instead:
	# st.sidebar.markdown('''<h1><center><b><u>BIS POS Tagset</u></b><center></h1>''', unsafe_allow_html=True)

	# Entries of the sidebar tag lookup. 'TAG' is the placeholder entry; the
	# remaining entries are the tag labels whose descriptions are shown below.
	tag_activity = ['TAG', '<unk>', 'CC_CCD', 'CC_CCS', 'CC_CCS_UT', 'DM_DMD', 'DM_DMR', 'DM_DMQ', 'JJ', 'N_NN', 'N_NNP', 'N_NNV', 'N_NST', 'PR_PRC', 'PR_PRF', 'PR_PRL', 'PR_PRP', 'PR_PRQ', 'PSP', 'QT_QTC', 'QT_QTF', 'RB', 'RD_ECH', 'RD_PUNC', 'RD_RDF', 'RD_SYM', 'RD_UNK', 'RP_CL', 'RP_INJ', 'RP_INTF', 'RP_NEG', 'RP_RPD', 'V_VAUX', 'V_VM', 'V_VM_VF', 'V_VM_VINF', 'V_VM_VNF', 'V_VM_VNG', 'QT_QTO']
	tag_choice = st.sidebar.selectbox('Search the Tag you want to know', tag_activity)

	# Text shown in the sidebar for each entry of the tag selector.
	# 'TAG' is the selector's placeholder entry, so it maps to a prompt rather
	# than to a description. A single dictionary lookup replaces the former
	# chain of independent `if` statements, which were all evaluated on every
	# script run even though at most one of them could match.
	TAG_DESCRIPTIONS = {
	    'TAG': 'Select the TAG',
	    '<unk>': 'Unknown',
	    'CC_CCD': 'Co-ordinator',
	    'CC_CCS': 'Subordinator',
	    'CC_CCS_UT': 'Quotative',
	    'DM_DMD': 'Deictic demonstrative',
	    'DM_DMR': 'Relative demonstrative',
	    'DM_DMQ': 'Wh-word',
	    'JJ': 'Adjective',
	    'N_NN': 'Common noun',
	    'N_NNP': 'Proper noun',
	    'N_NNV': 'Verbal noun',
	    'N_NST': 'Locative noun',
	    'PR_PRC': 'Reciprocal pronoun',
	    'PR_PRF': 'Reflexive pronoun',
	    'PR_PRL': 'Relative pronoun',
	    'PR_PRP': 'Personal pronoun',
	    'PR_PRQ': 'Wh-word',
	    'PSP': 'Postposition',
	    'QT_QTC': 'Cardinals',
	    'QT_QTF': 'General quantifier',
	    'RB': 'Adverb',
	    'RD_ECH': 'Echo words',
	    'RD_PUNC': 'Punctuation',
	    'RD_RDF': 'Foreign words',
	    'RD_SYM': 'Symbol',
	    'RD_UNK': 'Unknown',
	    'RP_CL': 'Classifier particle',
	    'RP_INJ': 'Interjection particle',
	    'RP_INTF': 'Intensifier particle',
	    'RP_NEG': 'Negation particle',
	    'RP_RPD': 'Default particle',
	    'V_VAUX': 'Auxiliary verb',
	    'V_VM': 'Main verb',
	    'V_VM_VF': 'Finite verb',
	    # Previously displayed as 'Infinite verb'; VINF is the infinitive form.
	    'V_VM_VINF': 'Infinitive verb',
	    'V_VM_VNF': 'Non-finite verb',
	    'V_VM_VNG': 'Gerund verb',
	    'QT_QTO': 'Ordinals',
	}

	# Show the text for the selected entry. A value without an entry shows
	# nothing, exactly as the former `if` chain did when nothing matched.
	tag_description = TAG_DESCRIPTIONS.get(tag_choice)
	if tag_description is not None:
	    st.sidebar.info(tag_description, icon="ℹ️")

	st.sidebar.info('Last updated on: 24 September 2025', icon="✅")



	# Load the tagger once at script level; both input modes below call
	# model.predict() on it.
	model = load_model(model_path)
	# Earlier variants that loaded a local checkpoint file instead:
	#model = load_model('best-model-002.pt')
	#model = SequenceTagger.load('best-model-002.pt')



	# "Text Input" mode: tag one piece of typed text and render the result inline.
	if choice == 'Text Input':
	    input_data = st.text_area("Write your sentence below", value="", height=68)
	    if st.button('Click to execute'):
	        if not input_data.strip():
	            # Nothing to tag: tell the user instead of sending an empty
	            # sentence through the tokenizer and the model.
	            st.warning('Please write a sentence before clicking the button.')
	        else:
	            # Tokenize with the Indic NLP tokenizer, then let the tagger
	            # attach a tag to every token of the sentence.
	            sentence = Sentence(indic_tokenize.trivial_tokenize(input_data))
	            model.predict(sentence)

	            # One (token text, predicted tag) pair per token, handed to
	            # annotated_text as a single list, as before.
	            annotated_text([(token.text, token.tag) for token in sentence])

	# "File Upload" mode: tag every line of an uploaded .txt file and offer the
	# result as an Excel workbook with one row per input line.
	if choice == 'File Upload':
	    uploaded_file = st.file_uploader("Upload your File in .txt format", type=["txt"])
	    if uploaded_file is not None:
	        # Local import: only this branch needs an in-memory byte buffer.
	        from io import BytesIO

	        try:
	            # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM,
	            # which would otherwise become part of the first token.
	            lines = uploaded_file.read().decode('utf-8-sig').splitlines()
	        except UnicodeDecodeError:
	            lines = None
	            st.error('The uploaded file is not valid UTF-8 text. '
	                     'Please save it with UTF-8 encoding and upload it again.')

	        if lines is not None:
	            # Download name: the upload's name with only its final extension
	            # replaced, so "my.data.txt" becomes "my.data_tagged.xlsx".
	            stem = os.path.splitext(os.path.basename(uploaded_file.name))[0]
	            output_file_name = stem + '_tagged.xlsx'

	            raw_sentences = []
	            tagged_sentences = []

	            with st.spinner("Wait for processing the file..."):
	                for line in lines:
	                    if not line.strip():
	                        # Keep a row for blank lines so output rows stay
	                        # aligned with input lines, but skip the model call.
	                        raw_sentences.append('')
	                        tagged_sentences.append('')
	                        continue

	                    sentence = Sentence(indic_tokenize.trivial_tokenize(line))
	                    model.predict(sentence)
	                    tagged_tokens = [(token.text, token.tag) for token in sentence]

	                    raw_sentences.append(' '.join(word for word, _ in tagged_tokens))
	                    tagged_sentences.append(
	                        ' '.join(f"{word}/{tag}" for word, tag in tagged_tokens)
	                    )

	            df = pd.DataFrame({
	                "Raw Sentence": raw_sentences,
	                "Tagged Sentence": tagged_sentences
	            })

	            # Build the workbook in memory instead of at a fixed /tmp path:
	            # two sessions uploading the same file name can no longer
	            # overwrite each other's output, and no file is left on disk.
	            excel_buffer = BytesIO()
	            df.to_excel(excel_buffer, index=False)

	            # Provide download button
	            st.download_button(
	                label="Download the tagged data in Excel (.xlsx) format",
	                data=excel_buffer.getvalue(),
	                file_name=output_file_name,
	                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	            )



	# Attribution banner.
	# NOTE(review): indentation was lost in this copy of the file; this is
	# presumably a top-level statement shown in every input mode — confirm.
	st.info('An initiative of Natural Language Processing Lab, Jadavpur University', icon="📚")