|
|
import streamlit as st |
|
|
|
|
|
|
|
|
from annotated_text import annotated_text |
|
|
from nltk.tokenize import word_tokenize |
|
|
|
|
|
import warnings |
|
|
import pandas as pd |
|
|
from pandas import DataFrame |
|
|
|
|
|
|
|
|
warnings.filterwarnings('ignore') |
|
|
import re, flair, random, time |
|
|
from bnlp import BasicTokenizer |
|
|
from flair.data import Corpus, Sentence |
|
|
from flair.datasets import ColumnCorpus |
|
|
|
|
|
from flair.models import SequenceTagger |
|
|
from flair.trainers import ModelTrainer |
|
|
|
|
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
|
|
|
# Streamlit page setup; set_page_config must be the first Streamlit call
# executed in the script, so it stays at the top of the app body.
st.set_page_config(
    page_title="Indic POS Tagger",
    page_icon="✔️",
    layout="wide",
)

# Download the pretrained Flair POS model weights from the Hugging Face Hub.
# hf_hub_download caches the file locally, so repeated runs reuse the copy.
model_path = hf_hub_download(
    repo_id="atanu0491/IndicPOSModel",
    filename="best-model-002.pt"
)
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource()
def load_model(model_name):
    """Load a Flair SequenceTagger from *model_name* (a local path or model id).

    Cached by Streamlit so the (expensive) model load happens once per
    server process rather than on every script rerun.
    """
    return SequenceTagger.load(model_name)
|
|
|
|
|
|
|
|
# Page heading.
st.markdown('''<h1><center><b><u>BIS POS Tagset</u></b><center></h1>''', unsafe_allow_html=True)

# Top-level workflow selector: the user either uploads a file or types text.
_mode_options = ['Select your Choice', 'File Upload', 'Text Input']
choice = st.selectbox('How you want to proceed?', _mode_options)
|
|
|
|
|
|
|
|
|
|
|
# Sidebar glossary: maps each BIS tagset label to its human-readable meaning.
# A single dict lookup replaces the original 39-branch if-chain; insertion
# order matches the original tag_activity list, so the selectbox is unchanged.
_TAG_DESCRIPTIONS = {
    'TAG': 'Select the TAG',
    '<unk>': 'Unknown',
    'CC_CCD': 'Co-ordinator',
    'CC_CCS': 'Subordinator',
    'CC_CCS_UT': 'Quotative',
    'DM_DMD': 'Deictic demonstrative',
    'DM_DMR': 'Relative demonstrative',
    'DM_DMQ': 'Wh-word',
    'JJ': 'Adjective',
    'N_NN': 'Common noun',
    'N_NNP': 'Proper noun',
    'N_NNV': 'Verbal noun',
    'N_NST': 'Locative noun',
    'PR_PRC': 'Reciprocal pronoun',
    'PR_PRF': 'Reflexive pronoun',
    'PR_PRL': 'Relative pronoun',
    'PR_PRP': 'Personal pronoun',
    'PR_PRQ': 'Wh-word',
    'PSP': 'Postposition',
    'QT_QTC': 'Cardinals',
    'QT_QTF': 'General quantifier',
    'RB': 'Adverb',
    'RD_ECH': 'Echo words',
    'RD_PUNC': 'Punctuation',
    'RD_RDF': 'Foreign words',
    'RD_SYM': 'Symbol',
    'RD_UNK': 'Unknown',
    'RP_CL': 'Classifier particle',
    'RP_INJ': 'Interjection particle',
    'RP_INTF': 'Intensifier particle',
    'RP_NEG': 'Negation particle',
    'RP_RPD': 'Default particle',
    'V_VAUX': 'Auxiliary verb',
    'V_VM': 'Main verb',
    'V_VM_VF': 'Finite verb',
    'V_VM_VINF': 'Infinite verb',
    'V_VM_VNF': 'Non-finite verb',
    'V_VM_VNG': 'Gerund verb',
    'QT_QTO': 'Ordinals',
}

# The selectbox options are exactly the glossary keys, in insertion order.
tag_activity = list(_TAG_DESCRIPTIONS)

tag_choice = st.sidebar.selectbox('Search the Tag you want to know', tag_activity)

# Show the description for the selected tag (every option has an entry).
_description = _TAG_DESCRIPTIONS.get(tag_choice)
if _description is not None:
    st.sidebar.info(_description, icon="ℹ️")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sidebar footer: last-updated notice.
st.sidebar.info('Last updated on: 23 April 2025', icon="✅")

# Instantiate the (cached) tagger from the path downloaded above; must run
# before either workflow branch uses `model`.
model = load_model(model_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Direct text entry workflow: tag a single typed sentence and show the
# word/tag pairs inline with annotated_text.
if choice == 'Text Input':
    input_data = st.text_area("Write your sentence below", value="", height=68)
    if st.button('Click to execute'):
        # Tokenize the raw input, wrap it as a Flair Sentence, and tag it.
        tokens = BasicTokenizer().tokenize(input_data)
        sentence = Sentence(tokens)
        model.predict(sentence)

        # Render each token alongside its predicted tag.
        tagged_pairs = [(token.text, token.tag) for token in sentence]
        annotated_text(tagged_pairs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# File upload workflow: tag every line of an uploaded .txt file and offer
# the result as a two-column Excel download (raw vs. word/tag sentences).
if choice == 'File Upload':
    uploaded_file = st.file_uploader("Upload your File in .txt format", type='.txt')
    if uploaded_file is not None:
        lines = uploaded_file.read().decode('utf-8').splitlines()

        # Output name: "<original stem>_tagged.xlsx".
        output_file_name = uploaded_file.name.split('.')[0] + '_tagged.xlsx'

        raw_sentences = []
        tagged_sentences = []

        # Build the tokenizer once — the original constructed a new
        # BasicTokenizer for every line, which is loop-invariant work.
        tokenizer = BasicTokenizer()

        with st.spinner("Wait for processing the file..."):
            for line in lines:
                sentence = Sentence(tokenizer.tokenize(line))
                model.predict(sentence)

                pairs = [(token.text, token.tag) for token in sentence]

                raw_sentences.append(' '.join(word for word, _tag in pairs))
                tagged_sentences.append(' '.join(f"{word}/{tag}" for word, tag in pairs))

        df = pd.DataFrame({
            "Raw Sentence": raw_sentences,
            "Tagged Sentence": tagged_sentences
        })

        # Write the workbook to disk, then stream it back for download.
        df.to_excel(output_file_name, index=False)

        with open(output_file_name, "rb") as f:
            btn = st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=f,
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
|
|
|
|
|
|
|
|
|
|
|
# Page footer: institutional credit line.
st.info('An initiative of Natural Language Processing Lab, Jadavpur University', icon="📚")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|