Spaces:
Running
Running
| import streamlit as st | |
| from annotated_text import annotated_text | |
| import warnings | |
| import pandas as pd | |
| from pandas import DataFrame | |
| warnings.filterwarnings('ignore') | |
| import re, flair, random, time | |
| from bnlp import BasicTokenizer | |
| from flair.data import Corpus, Sentence | |
| from flair.datasets import ColumnCorpus | |
| from flair.models import SequenceTagger | |
| from flair.trainers import ModelTrainer | |
# Configure the browser tab title, favicon and wide page layout.
# Must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="বাংলা POS Tagger",
    page_icon="✔️",
    layout="wide",
)
@st.cache_resource
def load_model(model_name):
    """Load and cache the flair POS ``SequenceTagger``.

    Streamlit re-executes the entire script on every widget interaction;
    without caching, the large tagger model would be re-read from disk on
    each rerun.  ``st.cache_resource`` keeps one shared instance per
    server process.

    Args:
        model_name: Path or identifier of the trained flair model file
            (e.g. ``'best-model-002.pt'``).

    Returns:
        The loaded ``SequenceTagger`` instance.
    """
    return SequenceTagger.load(model_name)
| st.info('যাদবপুর বিশ্ববিদ্যালয়ের কম্পিউটার সায়েন্স অ্যান্ড ইঞ্জিনিয়ারিং বিভাগের একটি উদ্যোগ', icon="📚") | |
| activity = ['আপনার পছন্দ নির্বাচন করুন', 'ফাইল আপলোড (for SCTR use only)', 'ফাইল আপলোড (for PUBLIC use)', 'টেক্সট ইনপুট'] | |
| choice = st.sidebar.selectbox('আপনি কিভাবে এটি প্রক্রিয়া করতে চান?',activity) | |
| st.sidebar.markdown('''<h3><center><b><u>BIS POS Tagset</u></b><center></h3>''', unsafe_allow_html=True) | |
| st.sidebar.info(''' | |
| <unk> --> _Unknown_ | |
| CC_CCD --> _Co-ordinator_ | |
| CC_CCS --> _Subordinator_ | |
| CC_CCS_UT --> _Quotative_ | |
| DM_DMD --> _Deictic demonstrative_ | |
| DM_DMR --> _Relative demonstrative_ | |
| DM_DMQ --> _Wh-word_ | |
| JJ --> _Adjective_ | |
| N_NN --> _Common noun_ | |
| N_NNP --> _Proper noun_ | |
| N_NNV --> _Verbal noun_ | |
| N_NST --> _Locative noun_ | |
| PR_PRC --> _Reciprocal pronoun_ | |
| PR_PRF --> _Reflexive pronoun_ | |
| PR_PRL --> _Relative pronoun_ | |
| PR_PRP --> _Personal pronoun_ | |
| PR_PRQ --> _Wh-word_ | |
| PSP --> _Postposition_ | |
| QT_QTC --> _Cardinals_ | |
| QT_QTF --> _General quantifier_ | |
| RB --> _Adverb_ | |
| RD_ECH --> _Echo words_ | |
| RD_PUNC --> _Punctuation_ | |
| RD_RDF --> _Foreign words_ | |
| RD_SYM --> _Symbol_ | |
| RD_UNK --> _Unknown_ | |
| RP_CL --> _Classifier particle_ | |
| RP_INJ --> _Interjection particle_ | |
| RP_INTF --> _Intensifier particle_ | |
| RP_NEG --> _Negation particle_ | |
| RP_RPD --> _Default particle_ | |
| V_VAUX --> _Auxiliary verb_ | |
| V_VM --> _Main verb_ | |
| V_VM_VF --> _Finite verb_ | |
| V_VM_VINF --> _Infinite verb_ | |
| V_VM_VNF --> _Non-finite verb_ | |
| V_VM_VNG --> _Gerund verb_ | |
| QT_QTO --> _Ordinals_ | |
| ''') | |
| st.sidebar.info('সর্বশেষ সংশোধিত তারিখ: ০৪ এপ্রিল ২০২৫', icon="ℹ️") | |
| model = load_model('best-model-002.pt') | |
| #model = SequenceTagger.load('best-model-002.pt') | |
if choice == 'টেক্সট ইনপুট':
    # Free-text mode: tag one user-typed Bengali sentence on demand.
    input_data = st.text_area("আপনার বাংলা বাক্য লিখুন", value="", height=10)
    if st.button('প্রক্রিয়া শুরু করতে ক্লিক করুন'):
        tokens = BasicTokenizer().tokenize(input_data)
        sentence = Sentence(tokens)
        model.predict(sentence)
        # Render each (word, tag) pair as an inline annotation chip.
        annotated_text([(tok.text, tok.tag) for tok in sentence])
if choice == 'ফাইল আপলোড (for PUBLIC use)':
    # Public mode: tag every line of an uploaded UTF-8 .txt file and offer
    # an Excel workbook with raw and word/TAG sentences side by side.
    uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
    if uploaded_file is not None:
        lines = uploaded_file.read().decode('utf-8').splitlines()
        # rsplit keeps dotted base names ("a.b.txt" -> "a.b") intact.
        output_file_name = uploaded_file.name.rsplit('.', 1)[0] + '_tagged.xlsx'
        tokenizer = BasicTokenizer()  # hoisted: one tokenizer for all lines
        raw_sentences = []
        tagged_sentences = []
        with st.spinner("Wait for processing the file..."):
            for line in lines:
                sentence = Sentence(tokenizer.tokenize(line))
                model.predict(sentence)
                pairs = [(token.text, token.tag) for token in sentence]
                raw_sentences.append(' '.join(word for word, _ in pairs))
                tagged_sentences.append(' '.join(f"{word}/{tag}" for word, tag in pairs))
        # Persist as Excel, then stream the file back via a download button.
        df = pd.DataFrame({
            "Raw Sentence": raw_sentences,
            "Tagged Sentence": tagged_sentences,
        })
        df.to_excel(output_file_name, index=False)
        with open(output_file_name, "rb") as f:
            btn = st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=f,
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
| # if choice == 'ফাইল আপলোড (for PUBLIC use)': | |
| # uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt') | |
| # if uploaded_file is not None: | |
| # lines = uploaded_file.read().decode('utf-8').splitlines() | |
| # # Define output file name | |
| # output_file_name = uploaded_file.name.split('.')[0] + '_tagged.txt' | |
| # with open(output_file_name, 'w', encoding='utf-8') as out_file: | |
| # for line in lines: | |
| # data = BasicTokenizer().tokenize(line) | |
| # sentence = Sentence(data) | |
| # model.predict(sentence) | |
| # my_list = [] | |
| # for token in sentence: | |
| # word = (token.text, token.tag) | |
| # my_list.append(word) | |
| # # Write line to output file | |
| # tagged_line = ' '.join([f"{word}/{tag}" for word, tag in my_list]) | |
| # out_file.write(tagged_line + '\n') | |
| # # Show annotated text | |
| # # annotated_text(*my_list) | |
| # #btn = st.download_button(label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন",data=out_file, file_name=output_file_name) | |
| # with open(output_file_name, "rb") as f: | |
| # btn = st.download_button( | |
| # label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন", | |
| # data=f, | |
| # file_name=output_file_name, | |
| # mime="text/plain" | |
| # ) | |
if choice == 'ফাইল আপলোড (for SCTR use only)':
    # SCTR mode: the uploaded Excel file's base name ends with the word
    # under study; column 'Unnamed: 4' holds the example sentence per row.
    # Output is a TSV: first four original columns, tokenized sentence,
    # word/TAG sentence, and the tagged occurrences of the search word.
    uploaded_files = st.file_uploader("আপনার ফাইল নির্বাচন করুন")
    if uploaded_files is not None:
        # Last space-separated token of the base name is the search word.
        search_word_def = uploaded_files.name.split('.')[0].split(' ')[-1]
        f_name = search_word_def + '.tsv'
        dataframe = pd.read_excel(uploaded_files)
        tokenizer = BasicTokenizer()  # hoisted: one tokenizer for all rows
        # 'w' (not 'a') so Streamlit reruns don't append duplicate rows;
        # explicit utf-8 so Bengali text is written correctly on any platform.
        with open(f_name, 'w', encoding='utf-8') as f:
            for index, row in dataframe.iterrows():
                if pd.notnull(row['Unnamed: 4']):
                    data = tokenizer.tokenize(row['Unnamed: 4'])
                    sentence = Sentence(data)
                    model.predict(sentence)
                    tagged = [f"{token.text}/{token.tag}" for token in sentence]
                    # word/TAG pairs for tokens matching the search word only.
                    hits = [f"{token.text}/{token.tag}" for token in sentence
                            if token.text == search_word_def]
                    f.write(str(row['Unnamed: 0']) + '\t' + str(row['Unnamed: 1'])
                            + '\t' + str(row['Unnamed: 2']) + '\t' + str(row['Unnamed: 3'])
                            + '\t' + " ".join(data) + '\t' + " ".join(tagged)
                            + '\t' + " ".join(hits) + "\n")
        with open(f_name, "rb") as file:
            btn = st.download_button(label="TSV ফাইল হিসাবে ডেটা ডাউনলোড করুন", data=file, file_name=f_name)