Spaces:
Sleeping
Sleeping
File size: 8,425 Bytes
1276577 2480de6 5233188 663a99e b428cbd 1276577 db2835b 1276577 b428cbd 1276577 578b222 5ae8e1a e719bac 533a1fc ff2ae0d 999d27c ff2ae0d 13de761 f7f6558 a1b9fef 1276577 f7f6558 bf5fd82 85b069f 3eb697a 3eace1e 5992751 55344d6 1999770 be9b287 fa5ea7d 9fec093 c625cd0 04cfc60 c625cd0 0c8faf0 c625cd0 0c8faf0 c625cd0 fa5ea7d be9b287 04cfc60 cbaf2f7 ce9b7db 5ee82f2 a41710a 5992751 1276577 74fa79d 753d2e6 74fa79d f4ef5b0 66024cb 74fa79d f4ef5b0 ea7950a b7507ff 71f87e2 74fa79d 16e813a 71f87e2 28121b0 3eace1e 1b4d60f d30ebcf cd48228 3eace1e c70567d 8ca33fd c70567d 8ca33fd c70567d 8ca33fd c70567d 8ca33fd c70567d 26a5a7d c70567d 26a5a7d c70567d 26a5a7d 3eace1e c70567d 5b621e7 3eace1e de5556d 3eace1e f189b6f a8ddd45 5233188 cce15d0 41ba46c 9b6e733 709e17d 0638c34 41ba46c 0be453c 709e17d 0242d98 af59036 dbdd7c2 0242d98 709e17d af59036 9e288ed 0638c34 bfc3ac8 55344d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 |
import warnings
warnings.filterwarnings('ignore')  # kept before the heavy third-party imports, as in the original

import random
import re
import time
from io import BytesIO

import flair
import pandas as pd
import streamlit as st
from annotated_text import annotated_text
from bnlp import BasicTokenizer
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from pandas import DataFrame
# Streamlit page chrome: Bengali page title, check-mark favicon, full-width layout.
st.set_page_config(
    page_title="বাংলা POS Tagger",
    page_icon="✔️",
    layout="wide",
)
@st.cache_resource()
def load_model(model_name):
    """Load a trained flair SequenceTagger from *model_name* and cache it.

    st.cache_resource keeps the loaded model alive across Streamlit reruns,
    so the (expensive) deserialization happens only once per process.
    """
    return SequenceTagger.load(model_name)
# Banner crediting the originating department (Jadavpur University CSE) — Bengali UI text.
st.info('যাদবপুর বিশ্ববিদ্যালয়ের কম্পিউটার সায়েন্স অ্যান্ড ইঞ্জিনিয়ারিং বিভাগের একটি উদ্যোগ', icon="📚")
# Input-mode options for the sidebar selectbox; the first entry acts as a "choose one" placeholder.
activity = ['আপনার পছন্দ নির্বাচন করুন', 'ফাইল আপলোড (for SCTR use only)', 'ফাইল আপলোড (for PUBLIC use)', 'টেক্সট ইনপুট']
# `choice` drives which processing branch below is executed on this rerun.
choice = st.sidebar.selectbox('আপনি কিভাবে এটি প্রক্রিয়া করতে চান?',activity)
# Sidebar legend: the BIS POS tagset the model's labels come from.
st.sidebar.markdown('''<h3><center><b><u>BIS POS Tagset</u></b><center></h3>''', unsafe_allow_html=True)
st.sidebar.info('''
<unk> --> _Unknown_
CC_CCD --> _Co-ordinator_
CC_CCS --> _Subordinator_
CC_CCS_UT --> _Quotative_
DM_DMD --> _Deictic demonstrative_
DM_DMR --> _Relative demonstrative_
DM_DMQ --> _Wh-word_
JJ --> _Adjective_
N_NN --> _Common noun_
N_NNP --> _Proper noun_
N_NNV --> _Verbal noun_
N_NST --> _Locative noun_
PR_PRC --> _Reciprocal pronoun_
PR_PRF --> _Reflexive pronoun_
PR_PRL --> _Relative pronoun_
PR_PRP --> _Personal pronoun_
PR_PRQ --> _Wh-word_
PSP --> _Postposition_
QT_QTC --> _Cardinals_
QT_QTF --> _General quantifier_
RB --> _Adverb_
RD_ECH --> _Echo words_
RD_PUNC --> _Punctuation_
RD_RDF --> _Foreign words_
RD_SYM --> _Symbol_
RD_UNK --> _Unknown_
RP_CL --> _Classifier particle_
RP_INJ --> _Interjection particle_
RP_INTF --> _Intensifier particle_
RP_NEG --> _Negation particle_
RP_RPD --> _Default particle_
V_VAUX --> _Auxiliary verb_
V_VM --> _Main verb_
V_VM_VF --> _Finite verb_
V_VM_VINF --> _Infinite verb_
V_VM_VNF --> _Non-finite verb_
V_VM_VNG --> _Gerund verb_
QT_QTO --> _Ordinals_
''')
# "Last modified: 04 April 2025" notice (Bengali).
st.sidebar.info('সর্বশেষ সংশোধিত তারিখ: ০৪ এপ্রিল ২০২৫', icon="ℹ️")
# Load the trained tagger once; cached by @st.cache_resource across reruns.
model = load_model('best-model-002.pt')
#model = SequenceTagger.load('best-model-002.pt')
if choice == 'টেক্সট ইনপুট':
    # Free-text mode: tokenize one user-entered sentence, tag it, and render
    # the (word, tag) pairs inline with annotated_text.
    input_data = st.text_area("আপনার বাংলা বাক্য লিখুন", value="", height=10)
    if st.button('প্রক্রিয়া শুরু করতে ক্লিক করুন'):
        tokens = BasicTokenizer().tokenize(input_data)
        # Guard: skip prediction entirely when the box is empty/whitespace,
        # instead of feeding an empty token list to flair.
        if tokens:
            sentence = Sentence(tokens)
            model.predict(sentence)
            # Build (word, tag) tuples directly rather than via a temp list.
            tagged_pairs = [(token.text, token.tag) for token in sentence]
            annotated_text(tagged_pairs)
if choice == 'ফাইল আপলোড (for PUBLIC use)':
    # Batch mode for the public: upload a .txt file (one sentence per line),
    # tag every line, and offer the result as a downloadable Excel workbook.
    uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
    if uploaded_file is not None:
        lines = uploaded_file.read().decode('utf-8').splitlines()
        # rsplit keeps the full stem for file names that contain extra dots
        # (the original split('.')[0] truncated e.g. "my.data.txt" to "my").
        output_file_name = uploaded_file.name.rsplit('.', 1)[0] + '_tagged.xlsx'
        raw_sentences = []
        tagged_sentences = []
        tokenizer = BasicTokenizer()  # hoisted: one tokenizer for the whole file
        with st.spinner("Wait for processing the file..."):
            for line in lines:
                sentence = Sentence(tokenizer.tokenize(line))
                model.predict(sentence)
                pairs = [(token.text, token.tag) for token in sentence]
                raw_sentences.append(' '.join(word for word, _tag in pairs))
                tagged_sentences.append(' '.join(f"{word}/{tag}" for word, tag in pairs))
        df = pd.DataFrame({
            "Raw Sentence": raw_sentences,
            "Tagged Sentence": tagged_sentences,
        })
        # Build the workbook in memory: the original wrote (and left) a file
        # on the server's disk for every upload.
        buffer = BytesIO()
        df.to_excel(buffer, index=False)
        buffer.seek(0)
        st.download_button(
            label="Download the tagged data in Excel (.xlsx) format",
            data=buffer,
            file_name=output_file_name,
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
# if choice == 'ফাইল আপলোড (for PUBLIC use)':
# uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
# if uploaded_file is not None:
# lines = uploaded_file.read().decode('utf-8').splitlines()
# # Define output file name
# output_file_name = uploaded_file.name.split('.')[0] + '_tagged.txt'
# with open(output_file_name, 'w', encoding='utf-8') as out_file:
# for line in lines:
# data = BasicTokenizer().tokenize(line)
# sentence = Sentence(data)
# model.predict(sentence)
# my_list = []
# for token in sentence:
# word = (token.text, token.tag)
# my_list.append(word)
# # Write line to output file
# tagged_line = ' '.join([f"{word}/{tag}" for word, tag in my_list])
# out_file.write(tagged_line + '\n')
# # Show annotated text
# # annotated_text(*my_list)
# #btn = st.download_button(label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন",data=out_file, file_name=output_file_name)
# with open(output_file_name, "rb") as f:
# btn = st.download_button(
# label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন",
# data=f,
# file_name=output_file_name,
# mime="text/plain"
# )
if choice == 'ফাইল আপলোড (for SCTR use only)':
    # SCTR mode: upload an Excel sheet whose column 'Unnamed: 4' holds the
    # sentences; emit a TSV row per sentence with the tagged text plus the
    # tags of a specific search word (taken from the uploaded file's name).
    uploaded_files = st.file_uploader("আপনার ফাইল নির্বাচন করুন")
    if uploaded_files is not None:
        # The search word is the last space-separated token of the file stem;
        # rsplit keeps stems intact for names containing extra dots.
        search_word_def = uploaded_files.name.rsplit('.', 1)[0].split(' ')[-1]
        f_name = search_word_def + '.tsv'
        dataframe = pd.read_excel(uploaded_files)
        tokenizer = BasicTokenizer()  # hoisted: one tokenizer for all rows
        # 'w' (not 'a') so re-processing the same file does not append duplicate
        # rows across reruns; explicit utf-8 so Bengali text is written
        # correctly regardless of the platform default; `with` guarantees the
        # handle is closed even if prediction raises.
        with open(f_name, 'w', encoding='utf-8') as out_file:
            for index, row in dataframe.iterrows():
                if pd.notnull(row['Unnamed: 4']):
                    data = tokenizer.tokenize(row['Unnamed: 4'])
                    sentence = Sentence(data)
                    model.predict(sentence)
                    my_list = []      # every token as "word/tag"
                    search_w_d = []   # only the search word's "word/tag" hits
                    for token in sentence:
                        tagged = f"{token.text}/{token.tag}"
                        my_list.append(tagged)
                        if token.text == search_word_def:
                            search_w_d.append(tagged)
                    out_file.write('\t'.join([
                        str(row['Unnamed: 0']),
                        str(row['Unnamed: 1']),
                        str(row['Unnamed: 2']),
                        str(row['Unnamed: 3']),
                        ' '.join(data),
                        ' '.join(my_list),
                        ' '.join(search_w_d),
                    ]) + '\n')
        with open(f_name, "rb") as file:
            st.download_button(label="TSV ফাইল হিসাবে ডেটা ডাউনলোড করুন", data=file, file_name=f_name)
|