Spaces:
Running
Running
| import streamlit as st | |
| from annotated_text import annotated_text | |
| # from nltk.tokenize import word_tokenize | |
| import warnings | |
| import pandas as pd | |
| from pandas import DataFrame | |
| import os | |
| warnings.filterwarnings('ignore') | |
| import re, flair, random, time | |
| # from bnlp import BasicTokenizer | |
| from indicnlp.tokenize import indic_tokenize | |
| from flair.data import Corpus, Sentence | |
| from flair.datasets import ColumnCorpus | |
| from flair.models import SequenceTagger | |
| from flair.trainers import ModelTrainer | |
| from huggingface_hub import hf_hub_download | |
# Browser-tab title and icon, plus the wide page layout.
PAGE_SETTINGS = {
    "page_title": "Marathi POS Tagger",
    "page_icon": "✔️",
    "layout": "wide",
}
st.set_page_config(**PAGE_SETTINGS)

# Fetch the trained tagger checkpoint from the Hugging Face Hub and keep the
# path returned by the download call for the model loader.
MODEL_REPO_ID = "atanu0491/MarathiPOSModel"
MODEL_FILENAME = "marathi-best-model.pt"
model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
@st.cache_resource
def load_model(model_name):
    """Load a flair ``SequenceTagger`` and keep it cached across reruns.

    Streamlit re-executes the whole script on every widget interaction, so
    the loaded tagger is stored with ``st.cache_resource`` and reused for
    subsequent calls that pass the same ``model_name``.

    Args:
        model_name: Path (or model identifier) accepted by
            ``SequenceTagger.load``.

    Returns:
        The loaded ``SequenceTagger`` instance.
    """
    return SequenceTagger.load(model_name)
# Page heading, rendered as raw HTML (centred, bold, underlined). The
# <center> element is closed with a matching </center> tag.
st.markdown(
    '''<h1><center><b><u>BIS POS Tagset</u></b></center></h1>''',
    unsafe_allow_html=True,
)

# Input mode selector shown in the main page area. The first entry is a
# placeholder, so neither processing branch runs until the user picks a mode.
# (An earlier variant placed this selector in the sidebar with a Bengali
# prompt.)
activity = ['Select your Choice', 'File Upload', 'Text Input']
choice = st.selectbox('How you want to proceed?', activity)

# Tags offered in the sidebar look-up; 'TAG' is the placeholder entry.
tag_activity = [
    'TAG', '<unk>',
    'CC_CCD', 'CC_CCS', 'CC_CCS_UT',
    'DM_DMD', 'DM_DMR', 'DM_DMQ',
    'JJ',
    'N_NN', 'N_NNP', 'N_NNV', 'N_NST',
    'PR_PRC', 'PR_PRF', 'PR_PRL', 'PR_PRP', 'PR_PRQ',
    'PSP',
    'QT_QTC', 'QT_QTF',
    'RB',
    'RD_ECH', 'RD_PUNC', 'RD_RDF', 'RD_SYM', 'RD_UNK',
    'RP_CL', 'RP_INJ', 'RP_INTF', 'RP_NEG', 'RP_RPD',
    'V_VAUX', 'V_VM', 'V_VM_VF', 'V_VM_VINF', 'V_VM_VNF', 'V_VM_VNG',
    'QT_QTO',
]
tag_choice = st.sidebar.selectbox('Search the Tag you want to know', tag_activity)
# Human-readable description for every entry of the sidebar tag selector.
# A single lookup table replaces one `if` statement per tag, so adding or
# correcting a tag is a one-line change.
TAG_DESCRIPTIONS = {
    'TAG': 'Select the TAG',
    '<unk>': 'Unknown',
    'CC_CCD': 'Co-ordinator',
    'CC_CCS': 'Subordinator',
    'CC_CCS_UT': 'Quotative',
    'DM_DMD': 'Deictic demonstrative',
    'DM_DMR': 'Relative demonstrative',
    'DM_DMQ': 'Wh-word',
    'JJ': 'Adjective',
    'N_NN': 'Common noun',
    'N_NNP': 'Proper noun',
    'N_NNV': 'Verbal noun',
    'N_NST': 'Locative noun',
    'PR_PRC': 'Reciprocal pronoun',
    'PR_PRF': 'Reflexive pronoun',
    'PR_PRL': 'Relative pronoun',
    'PR_PRP': 'Personal pronoun',
    'PR_PRQ': 'Wh-word',
    'PSP': 'Postposition',
    'QT_QTC': 'Cardinals',
    'QT_QTF': 'General quantifier',
    'RB': 'Adverb',
    'RD_ECH': 'Echo words',
    'RD_PUNC': 'Punctuation',
    'RD_RDF': 'Foreign words',
    'RD_SYM': 'Symbol',
    'RD_UNK': 'Unknown',
    'RP_CL': 'Classifier particle',
    'RP_INJ': 'Interjection particle',
    'RP_INTF': 'Intensifier particle',
    'RP_NEG': 'Negation particle',
    'RP_RPD': 'Default particle',
    'V_VAUX': 'Auxiliary verb',
    'V_VM': 'Main verb',
    'V_VM_VF': 'Finite verb',
    # VINF is the infinitive form of the main verb (not "infinite").
    'V_VM_VINF': 'Infinitive verb',
    'V_VM_VNF': 'Non-finite verb',
    'V_VM_VNG': 'Gerund verb',
    'QT_QTO': 'Ordinals',
}

# Show the description of the selected tag; a value missing from the table
# simply shows nothing, as with the per-tag checks.
tag_description = TAG_DESCRIPTIONS.get(tag_choice)
if tag_description is not None:
    st.sidebar.info(tag_description, icon="ℹ️")

st.sidebar.info('Last updated on: 24 September 2025', icon="✅")
# Tagger instance shared by the "Text Input" and "File Upload" modes.
model = load_model(model_path)

# --- Text Input mode: tag a single piece of text typed by the user ---------
if choice == 'Text Input':
    input_data = st.text_area("Write your sentence below", value="", height=68)
    if st.button('Click to execute'):
        if not input_data.strip():
            # Nothing to tag: tell the user instead of running the tokenizer
            # and the model on blank text.
            st.warning('Please enter a sentence before executing.')
        else:
            tokens = indic_tokenize.trivial_tokenize(input_data)
            sentence = Sentence(tokens)
            model.predict(sentence)
            # One (word, tag) pair per token, rendered as highlighted spans.
            annotated_text([(token.text, token.tag) for token in sentence])
@st.cache_data(show_spinner=False, max_entries=8)
def _tag_lines(lines, _tagger):
    """Tag every line of an uploaded file.

    The result is cached on ``lines`` so that script reruns (for example the
    one triggered by pressing the download button) do not tag the same file
    again. ``_tagger`` carries a leading underscore so Streamlit leaves it
    out of the cache key.

    Args:
        lines: Tuple of text lines, one sentence per line.
        _tagger: Model object exposing ``predict(sentence)``.

    Returns:
        A list with one entry per input line; each entry is a list of
        ``(word, tag)`` tuples. Blank lines yield an empty list so output
        rows stay aligned with the input lines.
    """
    tagged = []
    for line in lines:
        if not line.strip():
            tagged.append([])
            continue
        sentence = Sentence(indic_tokenize.trivial_tokenize(line))
        _tagger.predict(sentence)
        tagged.append([(token.text, token.tag) for token in sentence])
    return tagged


def _to_xlsx_bytes(frame):
    """Serialise ``frame`` to an in-memory .xlsx workbook and return its bytes."""
    import io  # local import: only needed by the file-upload mode

    buffer = io.BytesIO()
    frame.to_excel(buffer, index=False)
    return buffer.getvalue()


# --- File Upload mode: tag a .txt file line by line, offer an Excel export --
if choice == 'File Upload':
    uploaded_file = st.file_uploader("Upload your File in .txt format", type=["txt"])
    if uploaded_file is not None:
        try:
            # 'utf-8-sig' also accepts plain UTF-8 and strips a leading BOM,
            # which would otherwise end up glued to the first token.
            text = uploaded_file.getvalue().decode('utf-8-sig')
        except UnicodeDecodeError:
            st.error('The uploaded file could not be read as UTF-8 text.')
            text = None

        if text is not None:
            # Download name: "<upload name without extension>_tagged.xlsx".
            stem = os.path.splitext(os.path.basename(uploaded_file.name))[0]
            output_file_name = stem + '_tagged.xlsx'

            with st.spinner("Wait for processing the file..."):
                tagged_lines = _tag_lines(tuple(text.splitlines()), model)

            raw_sentences = [
                ' '.join(f"{word}" for word, tag in pairs)
                for pairs in tagged_lines
            ]
            tagged_sentences = [
                ' '.join(f"{word}/{tag}" for word, tag in pairs)
                for pairs in tagged_lines
            ]
            df = pd.DataFrame({
                "Raw Sentence": raw_sentences,
                "Tagged Sentence": tagged_sentences
            })

            # The workbook is built in memory, so nothing is written to a
            # shared temporary directory.
            st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=_to_xlsx_bytes(df),
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )

# Footer credit, shown in every mode.
st.info('An initiative of Natural Language Processing Lab, Jadavpur University', icon="📚")