Spaces:
Running
Running
| import streamlit as st | |
| from annotated_text import annotated_text | |
| import warnings | |
| import pandas as pd | |
| from pandas import DataFrame | |
| warnings.filterwarnings('ignore') | |
| import re, flair, random, time | |
| from bnlp import BasicTokenizer | |
| from flair.data import Corpus, Sentence | |
| from flair.datasets import ColumnCorpus | |
| from flair.models import SequenceTagger | |
| from flair.trainers import ModelTrainer | |
# Configure the browser tab title, favicon and wide page layout.
# Must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="বাংলা POS Tagger",
    page_icon="✔️",
    layout="wide",
)
@st.cache_resource
def load_model(model_name):
    """Load and cache the flair POS ``SequenceTagger``.

    Streamlit re-executes the entire script on every widget interaction;
    without caching, the large tagger model would be re-read from disk on
    each rerun.  ``st.cache_resource`` keeps one shared instance per
    server process.

    Args:
        model_name: Path or identifier of the trained flair model file
            (e.g. ``'best-model-002.pt'``).

    Returns:
        The loaded ``SequenceTagger`` instance.
    """
    return SequenceTagger.load(model_name)
| st.info('যাদবপুর বিশ্ববিদ্যালয়ের কম্পিউটার সায়েন্স অ্যান্ড ইঞ্জিনিয়ারিং বিভাগের একটি উদ্যোগ', icon="📚") | |
| activity = ['আপনার পছন্দ নির্বাচন করুন', 'ফাইল আপলোড (for SCTR use only)', 'ফাইল আপলোড (for PUBLIC use)', 'টেক্সট ইনপুট'] | |
| choice = st.sidebar.selectbox('আপনি কিভাবে এটি প্রক্রিয়া করতে চান?',activity) | |
| st.sidebar.markdown('''<h3><center><b><u>BIS POS Tagset</u></b><center></h3>''', unsafe_allow_html=True) | |
| st.sidebar.info(''' | |
| <unk> --> _Unknown_ | |
| CC_CCD --> _Co-ordinator_ | |
| CC_CCS --> _Subordinator_ | |
| CC_CCS_UT --> _Quotative_ | |
| DM_DMD --> _Deictic demonstrative_ | |
| DM_DMR --> _Relative demonstrative_ | |
| DM_DMQ --> _Wh-word_ | |
| JJ --> _Adjective_ | |
| N_NN --> _Common noun_ | |
| N_NNP --> _Proper noun_ | |
| N_NNV --> _Verbal noun_ | |
| N_NST --> _Locative noun_ | |
| PR_PRC --> _Reciprocal pronoun_ | |
| PR_PRF --> _Reflexive pronoun_ | |
| PR_PRL --> _Relative pronoun_ | |
| PR_PRP --> _Personal pronoun_ | |
| PR_PRQ --> _Wh-word_ | |
| PSP --> _Postposition_ | |
| QT_QTC --> _Cardinals_ | |
| QT_QTF --> _General quantifier_ | |
| RB --> _Adverb_ | |
| RD_ECH --> _Echo words_ | |
| RD_PUNC --> _Punctuation_ | |
| RD_RDF --> _Foreign words_ | |
| RD_SYM --> _Symbol_ | |
| RD_UNK --> _Unknown_ | |
| RP_CL --> _Classifier particle_ | |
| RP_INJ --> _Interjection particle_ | |
| RP_INTF --> _Intensifier particle_ | |
| RP_NEG --> _Negation particle_ | |
| RP_RPD --> _Default particle_ | |
| V_VAUX --> _Auxiliary verb_ | |
| V_VM --> _Main verb_ | |
| V_VM_VF --> _Finite verb_ | |
| V_VM_VINF --> _Infinite verb_ | |
| V_VM_VNF --> _Non-finite verb_ | |
| V_VM_VNG --> _Gerund verb_ | |
| QT_QTO --> _Ordinals_ | |
| ''') | |
| st.sidebar.info('সর্বশেষ সংশোধিত তারিখ: ০৪ এপ্রিল ২০২৫', icon="ℹ️") | |
| model = load_model('best-model-002.pt') | |
| #model = SequenceTagger.load('best-model-002.pt') | |
if choice == 'টেক্সট ইনপুট':
    # Free-text mode: tag one user-typed Bengali sentence on demand.
    input_data = st.text_area("আপনার বাংলা বাক্য লিখুন", value="", height=10)
    if st.button('প্রক্রিয়া শুরু করতে ক্লিক করুন'):
        tokens = BasicTokenizer().tokenize(input_data)
        sentence = Sentence(tokens)
        model.predict(sentence)
        # Render each (word, tag) pair as an inline annotation chip.
        annotated_text([(tok.text, tok.tag) for tok in sentence])
if choice == 'ফাইল আপলোড (for PUBLIC use)':
    # Public mode: tag every line of an uploaded UTF-8 .txt file and offer
    # an Excel workbook with raw and word/TAG sentences side by side.
    uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
    if uploaded_file is not None:
        lines = uploaded_file.read().decode('utf-8').splitlines()
        # rsplit keeps dotted base names ("a.b.txt" -> "a.b") intact.
        output_file_name = uploaded_file.name.rsplit('.', 1)[0] + '_tagged.xlsx'
        tokenizer = BasicTokenizer()  # hoisted: one tokenizer for all lines
        raw_sentences = []
        tagged_sentences = []
        with st.spinner("Wait for processing the file..."):
            for line in lines:
                sentence = Sentence(tokenizer.tokenize(line))
                model.predict(sentence)
                pairs = [(token.text, token.tag) for token in sentence]
                raw_sentences.append(' '.join(word for word, _ in pairs))
                tagged_sentences.append(' '.join(f"{word}/{tag}" for word, tag in pairs))
        # Persist as Excel, then stream the file back via a download button.
        df = pd.DataFrame({
            "Raw Sentence": raw_sentences,
            "Tagged Sentence": tagged_sentences,
        })
        df.to_excel(output_file_name, index=False)
        with open(output_file_name, "rb") as f:
            btn = st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=f,
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
| # if choice == 'ফাইল আপলোড (for PUBLIC use)': | |
| # uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt') | |
| # if uploaded_file is not None: | |
| # lines = uploaded_file.read().decode('utf-8').splitlines() | |
| # # Define output file name | |
| # output_file_name = uploaded_file.name.split('.')[0] + '_tagged.txt' | |
| # with open(output_file_name, 'w', encoding='utf-8') as out_file: | |
| # for line in lines: | |
| # data = BasicTokenizer().tokenize(line) | |
| # sentence = Sentence(data) | |
| # model.predict(sentence) | |
| # my_list = [] | |
| # for token in sentence: | |
| # word = (token.text, token.tag) | |
| # my_list.append(word) | |
| # # Write line to output file | |
| # tagged_line = ' '.join([f"{word}/{tag}" for word, tag in my_list]) | |
| # out_file.write(tagged_line + '\n') | |
| # # Show annotated text | |
| # # annotated_text(*my_list) | |
| # #btn = st.download_button(label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন",data=out_file, file_name=output_file_name) | |
| # with open(output_file_name, "rb") as f: | |
| # btn = st.download_button( | |
| # label="TXT ফাইল হিসাবে ডেটা ডাউনলোড করুন", | |
| # data=f, | |
| # file_name=output_file_name, | |
| # mime="text/plain" | |
| # ) | |
if choice == 'ফাইল আপলোড (for SCTR use only)':
    # SCTR mode: the uploaded Excel file's base name ends with the word
    # under study; column 'Unnamed: 4' holds the example sentence per row.
    # Output is a TSV: first four original columns, tokenized sentence,
    # word/TAG sentence, and the tagged occurrences of the search word.
    uploaded_files = st.file_uploader("আপনার ফাইল নির্বাচন করুন")
    if uploaded_files is not None:
        # Last space-separated token of the base name is the search word.
        search_word_def = uploaded_files.name.split('.')[0].split(' ')[-1]
        f_name = search_word_def + '.tsv'
        dataframe = pd.read_excel(uploaded_files)
        tokenizer = BasicTokenizer()  # hoisted: one tokenizer for all rows
        # 'w' (not 'a') so Streamlit reruns don't append duplicate rows;
        # explicit utf-8 so Bengali text is written correctly on any platform.
        with open(f_name, 'w', encoding='utf-8') as f:
            for index, row in dataframe.iterrows():
                if pd.notnull(row['Unnamed: 4']):
                    data = tokenizer.tokenize(row['Unnamed: 4'])
                    sentence = Sentence(data)
                    model.predict(sentence)
                    tagged = [f"{token.text}/{token.tag}" for token in sentence]
                    # word/TAG pairs for tokens matching the search word only.
                    hits = [f"{token.text}/{token.tag}" for token in sentence
                            if token.text == search_word_def]
                    f.write(str(row['Unnamed: 0']) + '\t' + str(row['Unnamed: 1'])
                            + '\t' + str(row['Unnamed: 2']) + '\t' + str(row['Unnamed: 3'])
                            + '\t' + " ".join(data) + '\t' + " ".join(tagged)
                            + '\t' + " ".join(hits) + "\n")
        with open(f_name, "rb") as file:
            btn = st.download_button(label="TSV ফাইল হিসাবে ডেটা ডাউনলোড করুন", data=file, file_name=f_name)