File size: 8,425 Bytes
1276577
 
 
 
2480de6
5233188
663a99e
b428cbd
1276577
 
 
db2835b
1276577
b428cbd
1276577
 
 
578b222
5ae8e1a
e719bac
533a1fc
ff2ae0d
 
999d27c
ff2ae0d
 
13de761
f7f6558
a1b9fef
1276577
 
 
f7f6558
bf5fd82
85b069f
3eb697a
3eace1e
5992751
55344d6
1999770
be9b287
fa5ea7d
9fec093
c625cd0
 
 
 
 
 
 
04cfc60
c625cd0
 
 
 
 
 
 
 
 
 
 
 
 
0c8faf0
c625cd0
 
 
 
 
 
 
 
 
 
 
 
 
0c8faf0
c625cd0
fa5ea7d
be9b287
04cfc60
cbaf2f7
ce9b7db
5ee82f2
 
a41710a
5992751
1276577
74fa79d
 
753d2e6
74fa79d
f4ef5b0
66024cb
74fa79d
f4ef5b0
ea7950a
b7507ff
71f87e2
74fa79d
16e813a
71f87e2
 
 
 
 
28121b0
3eace1e
1b4d60f
d30ebcf
cd48228
3eace1e
c70567d
 
 
 
 
8ca33fd
c70567d
8ca33fd
 
 
 
 
 
 
 
 
 
c70567d
8ca33fd
 
c70567d
 
8ca33fd
c70567d
 
 
 
 
 
 
 
 
26a5a7d
 
c70567d
26a5a7d
 
c70567d
26a5a7d
3eace1e
c70567d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b621e7
3eace1e
de5556d
3eace1e
 
f189b6f
a8ddd45
5233188
cce15d0
41ba46c
 
9b6e733
709e17d
0638c34
41ba46c
0be453c
709e17d
0242d98
af59036
 
dbdd7c2
0242d98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709e17d
af59036
9e288ed
0638c34
bfc3ac8
55344d6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import streamlit as st
from annotated_text import annotated_text

import warnings
import pandas as pd
from pandas import DataFrame


warnings.filterwarnings('ignore')
import re, flair, random, time
from bnlp import BasicTokenizer
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus

from flair.models import SequenceTagger
from flair.trainers import ModelTrainer





# Global Streamlit page configuration: Bengali page title, checkmark icon,
# wide layout so the annotated output has room. Must run before any other
# Streamlit call in the script.
st.set_page_config(
   page_title="বাংলা POS Tagger",
   page_icon="✔️",
   layout="wide",
)


@st.cache_resource()
def load_model(model_name):
    """Load and cache a flair SequenceTagger.

    ``st.cache_resource`` makes the expensive model load happen once per
    server process instead of on every Streamlit script rerun.

    Args:
        model_name: Path or identifier forwarded to ``SequenceTagger.load``.

    Returns:
        The loaded ``SequenceTagger`` instance.
    """
    # Return directly: the original's intermediate variable and the
    # redundant parentheses in `return (model)` added nothing.
    return SequenceTagger.load(model_name)

# Credit banner for the CSE department of Jadavpur University.
st.info('যাদবপুর বিশ্ববিদ্যালয়ের কম্পিউটার সায়েন্স অ্যান্ড ইঞ্জিনিয়ারিং বিভাগের একটি উদ্যোগ', icon="📚")

# Processing modes offered in the sidebar; `choice` drives the branches below.
mode_options = [
    'আপনার পছন্দ নির্বাচন করুন',
    'ফাইল আপলোড (for SCTR use only)',
    'ফাইল আপলোড (for PUBLIC use)',
    'টেক্সট ইনপুট',
]
choice = st.sidebar.selectbox('আপনি কিভাবে এটি প্রক্রিয়া করতে চান?', mode_options)

# Sidebar heading for the tagset legend rendered just below it.
st.sidebar.markdown('''<h3><center><b><u>BIS POS Tagset</u></b><center></h3>''', unsafe_allow_html=True)

# Sidebar legend mapping every BIS POS tag emitted by the model to a short
# English gloss (Markdown: `-->` arrow, italic description). Shown on every
# page load so users can decode the annotated output.
st.sidebar.info('''
<unk> --> _Unknown_  
CC_CCD --> _Co-ordinator_  
CC_CCS --> _Subordinator_  
CC_CCS_UT --> _Quotative_  
DM_DMD --> _Deictic demonstrative_  
DM_DMR --> _Relative demonstrative_  
DM_DMQ --> _Wh-word_  
JJ --> _Adjective_  
N_NN --> _Common noun_  
N_NNP --> _Proper noun_  
N_NNV --> _Verbal noun_  
N_NST --> _Locative noun_  
PR_PRC --> _Reciprocal pronoun_  
PR_PRF --> _Reflexive pronoun_  
PR_PRL --> _Relative pronoun_  
PR_PRP --> _Personal pronoun_  
PR_PRQ --> _Wh-word_  
PSP --> _Postposition_  
QT_QTC --> _Cardinals_  
QT_QTF --> _General quantifier_  
RB --> _Adverb_  
RD_ECH --> _Echo words_  
RD_PUNC --> _Punctuation_  
RD_RDF --> _Foreign words_  
RD_SYM --> _Symbol_  
RD_UNK --> _Unknown_  
RP_CL --> _Classifier particle_  
RP_INJ --> _Interjection particle_  
RP_INTF --> _Intensifier particle_  
RP_NEG --> _Negation particle_  
RP_RPD --> _Default particle_  
V_VAUX --> _Auxiliary verb_  
V_VM --> _Main verb_  
V_VM_VF --> _Finite verb_  
V_VM_VINF --> _Infinite verb_  
V_VM_VNF --> _Non-finite verb_  
V_VM_VNG --> _Gerund verb_  
QT_QTO --> _Ordinals_  
''')
# "Last modified" notice (Bengali date: 04 April 2025).
st.sidebar.info('সর্বশেষ সংশোধিত তারিখ: ০৪ এপ্রিল ২০২৫', icon="ℹ️")



# Load the trained POS tagging model once (cached by load_model across reruns).
# The .pt checkpoint is expected next to this script.
model = load_model('best-model-002.pt')
#model = SequenceTagger.load('best-model-002.pt')



# Interactive mode: tag a single user-typed sentence and render it inline.
if choice == 'টেক্সট ইনপুট':
    input_data = st.text_area("আপনার বাংলা বাক্য লিখুন", value="", height=10)
    if st.button('প্রক্রিয়া শুরু করতে ক্লিক করুন'):
        # Tokenize the raw text, wrap it in a flair Sentence, and predict tags.
        tokens = BasicTokenizer().tokenize(input_data)
        sentence = Sentence(tokens)
        model.predict(sentence)

        # One (text, tag) pair per token; annotated_text renders each pair
        # as a highlighted chip labelled with its POS tag.
        tagged_pairs = [(token.text, token.tag) for token in sentence]
        annotated_text(tagged_pairs)

# Public mode: tag every line of an uploaded .txt file and offer the result
# as a two-column Excel workbook (raw sentence / word-by-word tagged sentence).
if choice == 'ফাইল আপলোড (for PUBLIC use)':
    uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
    if uploaded_file is not None:
        import io  # local import keeps this fix self-contained

        lines = uploaded_file.read().decode('utf-8').splitlines()

        # Download name offered to the user, derived from the upload name.
        output_file_name = uploaded_file.name.split('.')[0] + '_tagged.xlsx'

        tokenizer = BasicTokenizer()  # hoisted: one instance for all lines
        raw_sentences = []
        tagged_sentences = []

        with st.spinner("Wait for processing the file..."):
            for line in lines:
                sentence = Sentence(tokenizer.tokenize(line))
                model.predict(sentence)

                pairs = [(token.text, token.tag) for token in sentence]
                raw_sentences.append(' '.join(word for word, _ in pairs))
                tagged_sentences.append(' '.join(f"{word}/{tag}" for word, tag in pairs))

            df = pd.DataFrame({
                "Raw Sentence": raw_sentences,
                "Tagged Sentence": tagged_sentences
            })

            # Build the workbook in memory instead of writing to the server's
            # working directory: the original left a file named after the
            # upload on disk, so two concurrent users with the same file name
            # would clobber each other and files accumulated on the server.
            excel_buffer = io.BytesIO()
            df.to_excel(excel_buffer, index=False)
            excel_buffer.seek(0)

        st.download_button(
            label="Download the tagged data in Excel (.xlsx) format",
            data=excel_buffer,
            file_name=output_file_name,
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

# if choice == 'ফাইল আপলোড (for PUBLIC use)':
#     uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
#     if uploaded_file is not None:
#         lines = uploaded_file.read().decode('utf-8').splitlines()

#         # Define output file name
#         output_file_name = uploaded_file.name.split('.')[0] + '_tagged.txt'

#         with open(output_file_name, 'w', encoding='utf-8') as out_file:
#             for line in lines:
#                 data = BasicTokenizer().tokenize(line)
#                 sentence = Sentence(data)
#                 model.predict(sentence)

#                 my_list = []

#                 for token in sentence:
#                     word = (token.text, token.tag)
#                     my_list.append(word)

#                 # Write line to output file
#                 tagged_line = ' '.join([f"{word}/{tag}" for word, tag in my_list])
#                 out_file.write(tagged_line + '\n')

#                 # Show annotated text
#                 # annotated_text(*my_list)

#             #btn = st.download_button(label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন",data=out_file, file_name=output_file_name)
#         with open(output_file_name, "rb") as f:
#             btn = st.download_button(
#                 label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন",
#                 data=f,
#                 file_name=output_file_name,
#                 mime="text/plain"
#             )

        


    
    

# SCTR mode: tag one Excel sheet and emit a TSV. The target word is taken
# from the upload's file name; its tagged occurrences in each sentence are
# collected into the last TSV column.
if choice == 'ফাইল আপলোড (for SCTR use only)':
    uploaded_files = st.file_uploader("আপনার ফাইল নির্বাচন করুন")

    if uploaded_files is not None:
        # Last space-separated token of the base file name is the search word.
        search_word_def = uploaded_files.name.split('.')[0].split(' ')[-1]
        f_name = search_word_def + '.tsv'

        dataframe = pd.read_excel(uploaded_files)
        tokenizer = BasicTokenizer()  # hoisted: one instance for all rows

        # Context manager + explicit utf-8 in 'w' mode: the original used
        # open(f_name, 'a') with the platform default encoding and never
        # closed via `with` — append mode duplicated rows when the same word
        # file was processed twice in one server lifetime, and a non-UTF-8
        # default encoding crashes on Bengali text.
        with open(f_name, 'w', encoding='utf-8') as f:
            for index, row in dataframe.iterrows():
                # Column "Unnamed: 4" holds the sentence text — assumes the
                # SCTR sheet layout; TODO confirm against the source template.
                if pd.notnull(row['Unnamed: 4']):
                    data = tokenizer.tokenize(row['Unnamed: 4'])
                    sentence = Sentence(data)
                    model.predict(sentence)

                    tagged = []   # every token rendered as word/TAG
                    matches = []  # only tokens equal to the search word
                    for token in sentence:
                        pair = f"{token.text}/{token.tag}"
                        tagged.append(pair)
                        if token.text == search_word_def:
                            matches.append(pair)

                    # One TSV row: four passthrough columns, then the
                    # tokenized sentence, the tagged sentence, and the
                    # tagged occurrences of the search word.
                    f.write('\t'.join([
                        str(row['Unnamed: 0']),
                        str(row['Unnamed: 1']),
                        str(row['Unnamed: 2']),
                        str(row['Unnamed: 3']),
                        ' '.join(data),
                        ' '.join(tagged),
                        ' '.join(matches),
                    ]) + '\n')

        with open(f_name, "rb") as file:
            st.download_button(label="TSV ফাইল হিসাবে ডেটা ডাউনলোড করুন", data=file, file_name=f_name)