File size: 7,644 Bytes
ce55d8a
199c8c7
a909add
 
c215acd
a909add
 
 
 
97ab123
a909add
 
 
3b2769b
 
a909add
 
 
 
 
 
 
22fe396
a909add
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c215acd
a909add
 
 
 
 
 
 
 
 
 
 
 
 
 
3b2769b
 
a909add
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
071047a
a909add
 
 
 
ab693c0
b3a25e5
a909add
 
 
 
 
 
ed281a8
 
a909add
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3a25e5
a909add
 
 
 
 
 
 
c215acd
a909add
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import streamlit as st


from annotated_text import annotated_text
# from nltk.tokenize import word_tokenize

import warnings
import pandas as pd
from pandas import DataFrame
import os

warnings.filterwarnings('ignore')
import re, flair, random, time
# from bnlp import BasicTokenizer
from indicnlp.tokenize import indic_tokenize  
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus

from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

from huggingface_hub import hf_hub_download


# Browser-tab title, icon and layout. This is issued before any other
# Streamlit call that appears in this script.
_PAGE_SETTINGS = {
    "page_title": "Marathi POS Tagger",
    "page_icon": "✔️",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Location of the tagger checkpoint on the Hugging Face Hub. `model_path`
# holds whatever `hf_hub_download` returns and is later passed to
# `load_model` (presumably a local file path -- confirm against the
# huggingface_hub documentation).
_CHECKPOINT = {
    "repo_id": "atanu0491/MarathiPOSModel",
    "filename": "marathi-best-model.pt",
}
model_path = hf_hub_download(**_CHECKPOINT)



@st.cache_resource()
def load_model(model_name):
    """Load a Flair sequence tagger.

    The function is wrapped by ``st.cache_resource``, so repeated calls with
    the same argument are presumably served from Streamlit's resource cache
    instead of reloading the checkpoint (confirm against the Streamlit docs).

    Args:
        model_name: Value forwarded unchanged to ``SequenceTagger.load``.

    Returns:
        The object produced by ``SequenceTagger.load(model_name)``.
    """
    return SequenceTagger.load(model_name)


# Page heading, rendered as raw HTML (hence unsafe_allow_html=True).
# The <center> element is now closed with </center>; the original markup
# opened a second <center> where the closing tag belonged.
st.markdown('''<h1><center><b><u>BIS POS Tagset</u></b></center></h1>''', unsafe_allow_html=True)

# Input modes offered to the user. The first entry is a placeholder: neither
# the 'Text Input' nor the 'File Upload' branch further down reacts to it.
activity = ['Select your Choice', 'File Upload', 'Text Input']
choice = st.selectbox('How you want to proceed?', activity)

# BIS part-of-speech tags mapped to the description shown in the sidebar.
# Insertion order is the order of the entries in the dropdown; 'TAG' is the
# placeholder entry and maps to a prompt rather than a real description.
TAG_DESCRIPTIONS = {
    'TAG': 'Select the TAG',
    '<unk>': 'Unknown',
    'CC_CCD': 'Co-ordinator',
    'CC_CCS': 'Subordinator',
    'CC_CCS_UT': 'Quotative',
    'DM_DMD': 'Deictic demonstrative',
    'DM_DMR': 'Relative demonstrative',
    'DM_DMQ': 'Wh-word',
    'JJ': 'Adjective',
    'N_NN': 'Common noun',
    'N_NNP': 'Proper noun',
    'N_NNV': 'Verbal noun',
    'N_NST': 'Locative noun',
    'PR_PRC': 'Reciprocal pronoun',
    'PR_PRF': 'Reflexive pronoun',
    'PR_PRL': 'Relative pronoun',
    'PR_PRP': 'Personal pronoun',
    'PR_PRQ': 'Wh-word',
    'PSP': 'Postposition',
    'QT_QTC': 'Cardinals',
    'QT_QTF': 'General quantifier',
    'RB': 'Adverb',
    'RD_ECH': 'Echo words',
    'RD_PUNC': 'Punctuation',
    'RD_RDF': 'Foreign words',
    'RD_SYM': 'Symbol',
    'RD_UNK': 'Unknown',
    'RP_CL': 'Classifier particle',
    'RP_INJ': 'Interjection particle',
    'RP_INTF': 'Intensifier particle',
    'RP_NEG': 'Negation particle',
    'RP_RPD': 'Default particle',
    'V_VAUX': 'Auxiliary verb',
    'V_VM': 'Main verb',
    'V_VM_VF': 'Finite verb',
    # Was 'Infinite verb'; the VINF tag denotes the infinitive form.
    'V_VM_VINF': 'Infinitive verb',
    'V_VM_VNF': 'Non-finite verb',
    'V_VM_VNG': 'Gerund verb',
    'QT_QTO': 'Ordinals',
}

# Dropdown entries are derived from the mapping so the two cannot drift apart.
tag_activity = list(TAG_DESCRIPTIONS)
tag_choice = st.sidebar.selectbox('Search the Tag you want to know', tag_activity)

# A single lookup replaces the former chain of independent `if` tests.
# As before, nothing is shown when the selection matches no known tag.
tag_description = TAG_DESCRIPTIONS.get(tag_choice)
if tag_description is not None:
    st.sidebar.info(tag_description, icon="ℹ️")

st.sidebar.info('Last updated on: 24 September 2025', icon="✅")



# Tagger used by both input modes below; `load_model` is wrapped by
# st.cache_resource.
model = load_model(model_path)


if choice == 'Text Input':
    input_data = st.text_area("Write your sentence below", value="", height=68)
    if st.button('Click to execute'):
        if not input_data.strip():
            # Empty or whitespace-only input has nothing to tag, so ask for
            # text instead of handing it to the tokenizer and the tagger.
            st.warning('Please write a sentence before clicking the button.')
        else:
            # Tokenise with the Indic NLP tokenizer and tag the tokens.
            data = indic_tokenize.trivial_tokenize(input_data)
            sentence = Sentence(data)
            model.predict(sentence)

            # (word, tag) pairs in sentence order, rendered by annotated_text.
            my_list = [(token.text, token.tag) for token in sentence]
            annotated_text(my_list)

if choice == 'File Upload':
    import io  # local import: only this branch needs an in-memory buffer

    uploaded_file = st.file_uploader("Upload your File in .txt format", type=["txt"])
    if uploaded_file is not None:
        try:
            # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order
            # mark, which would otherwise stay attached to the first token.
            text = uploaded_file.getvalue().decode('utf-8-sig')
        except UnicodeDecodeError:
            # Report the problem in the page instead of crashing the script.
            text = None
            st.error('The uploaded file is not valid UTF-8 text. '
                     'Please save it with UTF-8 encoding and upload it again.')

        if text is not None:
            # Download name derived from the upload's name. splitext keeps
            # every part of a multi-dot name except the final extension.
            base_name = os.path.splitext(os.path.basename(uploaded_file.name))[0]
            output_file_name = base_name + '_tagged.xlsx'

            with st.spinner("Wait for processing the file..."):
                # One Sentence per non-blank line; blank lines carry no
                # tokens to tag and are skipped.
                sentences = [
                    Sentence(indic_tokenize.trivial_tokenize(line))
                    for line in text.splitlines()
                    if line.strip()
                ]

                # Tag all sentences in one call rather than one call per line.
                if sentences:
                    model.predict(sentences)

                raw_sentences = [
                    ' '.join(token.text for token in sentence)
                    for sentence in sentences
                ]
                tagged_sentences = [
                    ' '.join(f"{token.text}/{token.tag}" for token in sentence)
                    for sentence in sentences
                ]

                df = pd.DataFrame({
                    "Raw Sentence": raw_sentences,
                    "Tagged Sentence": tagged_sentences
                })

                # Build the workbook in memory: no shared path under /tmp
                # that concurrent sessions could overwrite, and nothing is
                # left behind on disk.
                excel_buffer = io.BytesIO()
                df.to_excel(excel_buffer, index=False)

            # Provide download button
            st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=excel_buffer.getvalue(),
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        


# Attribution banner. It sits outside both mode branches, so it is emitted on
# every run of the script regardless of the selected mode.
st.info('An initiative of Natural Language Processing Lab, Jadavpur University', icon="📚")