# postagger_beta / app.py
# Origin: Hugging Face Space by atanu0491 — "Update app.py", commit c70567d (verified)
import streamlit as st
from annotated_text import annotated_text
import warnings
import pandas as pd
from pandas import DataFrame
warnings.filterwarnings('ignore')
import re, flair, random, time
from bnlp import BasicTokenizer
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
# Page-level Streamlit configuration. Must be the first st.* call in the script.
st.set_page_config(
    page_title="বাংলা POS Tagger",
    page_icon="✔️",
    layout="wide",
)
@st.cache_resource()
def load_model(model_name):
    """Load a flair ``SequenceTagger`` from *model_name* and cache it.

    ``st.cache_resource`` keeps one shared model instance across Streamlit
    reruns and sessions, so the (expensive) load happens only once.
    """
    return SequenceTagger.load(model_name)
# --- Top-of-page banner and sidebar controls -------------------------------
st.info('যাদবপুর বিশ্ববিদ্যালয়ের কম্পিউটার সায়েন্স অ্যান্ড ইঞ্জিনিয়ারিং বিভাগের একটি উদ্যোগ', icon="📚")

# Processing modes offered in the sidebar; `choice` drives the branches below.
activity = ['আপনার পছন্দ নির্বাচন করুন', 'ফাইল আপলোড (for SCTR use only)', 'ফাইল আপলোড (for PUBLIC use)', 'টেক্সট ইনপুট']
choice = st.sidebar.selectbox('আপনি কিভাবে এটি প্রক্রিয়া করতে চান?',activity)

# Legend of the BIS POS tagset shown to the user (rendered as one info box).
st.sidebar.markdown('''<h3><center><b><u>BIS POS Tagset</u></b><center></h3>''', unsafe_allow_html=True)
st.sidebar.info('''
<unk> --> _Unknown_
CC_CCD --> _Co-ordinator_
CC_CCS --> _Subordinator_
CC_CCS_UT --> _Quotative_
DM_DMD --> _Deictic demonstrative_
DM_DMR --> _Relative demonstrative_
DM_DMQ --> _Wh-word_
JJ --> _Adjective_
N_NN --> _Common noun_
N_NNP --> _Proper noun_
N_NNV --> _Verbal noun_
N_NST --> _Locative noun_
PR_PRC --> _Reciprocal pronoun_
PR_PRF --> _Reflexive pronoun_
PR_PRL --> _Relative pronoun_
PR_PRP --> _Personal pronoun_
PR_PRQ --> _Wh-word_
PSP --> _Postposition_
QT_QTC --> _Cardinals_
QT_QTF --> _General quantifier_
RB --> _Adverb_
RD_ECH --> _Echo words_
RD_PUNC --> _Punctuation_
RD_RDF --> _Foreign words_
RD_SYM --> _Symbol_
RD_UNK --> _Unknown_
RP_CL --> _Classifier particle_
RP_INJ --> _Interjection particle_
RP_INTF --> _Intensifier particle_
RP_NEG --> _Negation particle_
RP_RPD --> _Default particle_
V_VAUX --> _Auxiliary verb_
V_VM --> _Main verb_
V_VM_VF --> _Finite verb_
V_VM_VINF --> _Infinite verb_
V_VM_VNF --> _Non-finite verb_
V_VM_VNG --> _Gerund verb_
QT_QTO --> _Ordinals_
''')
st.sidebar.info('সর্বশেষ সংশোধিত তারিখ: ০৪ এপ্রিল ২০২৫', icon="ℹ️")

# Load the tagger once per process; `load_model` is cached via st.cache_resource.
model = load_model('best-model-002.pt')
#model = SequenceTagger.load('best-model-002.pt')
# --- Mode: single sentence typed by the user --------------------------------
if choice == 'টেক্সট ইনপুট':
    input_data = st.text_area("আপনার বাংলা বাক্য লিখুন", value="", height=10)
    if st.button('প্রক্রিয়া শুরু করতে ক্লিক করুন'):
        # Tokenize with bnlp, wrap as a flair Sentence, and run the tagger.
        data = BasicTokenizer().tokenize(input_data)
        sentence = Sentence(data)
        model.predict(sentence)
        # (word, tag) pairs rendered inline as colored annotations.
        my_list = [(token.text, token.tag) for token in sentence]
        annotated_text(my_list)
# --- Mode: public .txt upload, tagged output downloadable as .xlsx ----------
# Fixes over the previous version:
#   * The Excel workbook is built in an in-memory BytesIO buffer instead of
#     being written to the server's working directory — the old on-disk file
#     (named after the upload) could be clobbered by concurrent users and was
#     never cleaned up.
#   * BasicTokenizer is instantiated once, not once per input line.
if choice == 'ফাইল আপলোড (for PUBLIC use)':
    uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
    if uploaded_file is not None:
        from io import BytesIO  # stdlib; local import keeps the script's import block untouched

        # One text line per sentence; assumes UTF-8 input.
        lines = uploaded_file.read().decode('utf-8').splitlines()
        # Download name offered to the user, derived from the upload's name.
        output_file_name = uploaded_file.name.split('.')[0] + '_tagged.xlsx'

        tokenizer = BasicTokenizer()  # hoisted out of the loop
        raw_sentences = []
        tagged_sentences = []
        with st.spinner("Wait for processing the file..."):
            for line in lines:
                data = tokenizer.tokenize(line)
                sentence = Sentence(data)
                model.predict(sentence)
                pairs = [(token.text, token.tag) for token in sentence]
                # Column 1: tokens only; column 2: token/TAG pairs.
                raw_sentences.append(' '.join(word for word, _ in pairs))
                tagged_sentences.append(' '.join(f"{word}/{tag}" for word, tag in pairs))

        df = pd.DataFrame({
            "Raw Sentence": raw_sentences,
            "Tagged Sentence": tagged_sentences
        })

        # Serialize the workbook into memory and hand it straight to the
        # download button — nothing is left behind on the server.
        buffer = BytesIO()
        df.to_excel(buffer, index=False)
        buffer.seek(0)
        btn = st.download_button(
            label="Download the tagged data in Excel (.xlsx) format",
            data=buffer,
            file_name=output_file_name,
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
# if choice == 'ফাইল আপলোড (for PUBLIC use)':
# uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
# if uploaded_file is not None:
# lines = uploaded_file.read().decode('utf-8').splitlines()
# # Define output file name
# output_file_name = uploaded_file.name.split('.')[0] + '_tagged.txt'
# with open(output_file_name, 'w', encoding='utf-8') as out_file:
# for line in lines:
# data = BasicTokenizer().tokenize(line)
# sentence = Sentence(data)
# model.predict(sentence)
# my_list = []
# for token in sentence:
# word = (token.text, token.tag)
# my_list.append(word)
# # Write line to output file
# tagged_line = ' '.join([f"{word}/{tag}" for word, tag in my_list])
# out_file.write(tagged_line + '\n')
# # Show annotated text
# # annotated_text(*my_list)
# #btn = st.download_button(label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন",data=out_file, file_name=output_file_name)
# with open(output_file_name, "rb") as f:
# btn = st.download_button(
# label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন",
# data=f,
# file_name=output_file_name,
# mime="text/plain"
# )
# --- Mode: SCTR-internal Excel upload, tagged output downloadable as .tsv ---
# Fixes over the previous version:
#   * Output file is opened with mode 'w' instead of 'a': Streamlit re-runs
#     the whole script on every widget interaction, so append mode silently
#     duplicated all rows in the TSV on each rerun.
#   * Explicit encoding='utf-8' — without it, Bengali text crashes on hosts
#     whose default locale encoding is not UTF-8.
#   * The file is managed by a `with` block, so it is closed even if
#     prediction raises mid-loop (the old code leaked the handle on error).
#   * Removed the unused `search_w` list; tokenizer instantiated once.
if choice == 'ফাইল আপলোড (for SCTR use only)':
    uploaded_files = st.file_uploader("আপনার ফাইল নির্বাচন করুন")
    if uploaded_files is not None:
        # The target word is the last space-separated token of the file stem,
        # e.g. "concordance অমল.xlsx" -> "অমল".
        search_word_def = uploaded_files.name.split('.')[0].split(' ')[-1]
        f_name = search_word_def + '.tsv'
        dataframe = pd.read_excel(uploaded_files)
        tokenizer = BasicTokenizer()
        with open(f_name, 'w', encoding='utf-8') as f:
            # NOTE(review): the sheet is assumed headerless, columns addressed
            # as 'Unnamed: 0'..'Unnamed: 4' with the sentence in 'Unnamed: 4'
            # — confirm against the SCTR export format.
            for index, row in dataframe.iterrows():
                if pd.notnull(row['Unnamed: 4']):
                    data = tokenizer.tokenize(row['Unnamed: 4'])
                    sentence = Sentence(data)
                    model.predict(sentence)
                    search_w_d = []  # word/TAG pairs matching the target word
                    my_list = []     # word/TAG pairs for the whole sentence
                    for token in sentence:
                        tagged = f"{token.text}/{token.tag}"
                        if token.text == search_word_def:
                            search_w_d.append(tagged)
                        my_list.append(tagged)
                    # One TSV row: the four metadata columns, the tokenized
                    # sentence, the fully tagged sentence, and the tagged hits.
                    f.write('\t'.join([
                        str(row['Unnamed: 0']),
                        str(row['Unnamed: 1']),
                        str(row['Unnamed: 2']),
                        str(row['Unnamed: 3']),
                        " ".join(data),
                        " ".join(my_list),
                        " ".join(search_w_d),
                    ]) + "\n")
        with open(f_name, "rb") as file:
            btn = st.download_button(label="TSV ফাইল হিসাবে ডেটা ডাউনলোড করুন", data=file, file_name=f_name)