Spaces:
Build error
Build error
| import streamlit as st | |
| import numpy as np | |
| from pandas import DataFrame | |
| from keybert import KeyBERT | |
| # For Flair (Keybert) | |
| from flair.embeddings import TransformerDocumentEmbeddings | |
| import seaborn as sns | |
| # For download buttons | |
| from functionforDownloadButtons import download_button | |
| import os | |
| import json | |
# Set the page title and page icon for the app.
# NOTE(review): the icon literal looks like a mis-decoded emoji (mojibake);
# the intended character cannot be recovered from this file - confirm upstream.
st.set_page_config(page_title="BERT Keyword Extractor", page_icon="π")
def _max_width_():
    """Inject a <style> tag that caps the main block container at 1400px."""
    width_rule = "max-width: 1400px;"
    # Assembled line by line; the result is the same text the original
    # triple-quoted template produced.
    css_lines = [
        "",
        "<style>",
        ".reportview-container .main .block-container{",
        width_rule,
        "}",
        "</style>",
        "",
    ]
    # unsafe_allow_html is needed so the raw <style> element is not escaped.
    st.markdown("\n".join(css_lines), unsafe_allow_html=True)
# Apply the page-width CSS before rendering any content.
_max_width_()

# Title row: only the first column (c30) receives content; c31 and c32 are
# left empty.
c30, c31, c32 = st.columns([2.5, 1, 3])

with c30:
    # NOTE(review): the leading glyph is a mis-decoded emoji whose original
    # bytes are not recoverable from this file - confirm the intended icon.
    st.title("π BERT Keyword Extractor")
    st.header("")  # empty header, used purely as vertical spacing

with st.expander("ℹ️ - About this app", expanded=True):
    # The Markdown link must have no space between "]" and "(", otherwise it
    # is rendered as plain text instead of a hyperlink.
    st.write(
        "\n"
        "- The *BERT Keyword Extractor* app is an easy-to-use interface built in Streamlit for the amazing [KeyBERT](https://github.com/MaartenGr/KeyBERT) library from Maarten Grootendorst!\n"
        "- It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.\n"
    )
    st.markdown("")

st.markdown("")
# A closing "**" preceded by a space does not close bold emphasis in
# CommonMark, so the trailing space inside the markers has been removed.
# NOTE(review): the leading glyph is unrecoverable mojibake - confirm icon.
st.markdown("## **π Paste document**")
import re  # only needed for the word count below

# Maximum number of words of the pasted document that are analysed.
MAX_WORDS = 500


def load_model(model_type="DistilBERT (Default)"):
    """Return the KeyBERT extractor matching the radio selection.

    Args:
        model_type: One of the labels offered by the "Choose your model"
            radio widget. Any value other than "Flair" selects DistilBERT.

    Returns:
        A KeyBERT instance backed by the chosen embedding model.
    """
    if model_type == "Flair":
        # The original compared against a label the radio never produces and
        # referenced an undefined `roberta`, so Flair could never be picked.
        # "roberta-base" follows the Flair example in the KeyBERT docs.
        # NOTE(review): confirm this is the intended Flair checkpoint.
        return KeyBERT(model=TransformerDocumentEmbeddings("roberta-base"))
    return KeyBERT("distilbert-base-nli-mean-tokens")


with st.form(key="my_form"):
    # The 0.07-wide columns are spacers; c1 holds the settings and c2 the
    # document text area.
    ce, c1, ce, c2, c3 = st.columns([0.07, 1, 0.07, 5, 0.07])

    with c1:
        ModelType = st.radio(
            "Choose your model",
            ["DistilBERT (Default)", "Flair"],
            help="At present, you can choose between 2 models (Flair or DistilBERT) to embed your text. More to come!",
        )
        kw_model = load_model(ModelType)

        top_N = st.slider(
            "# of results",
            min_value=1,
            max_value=30,
            value=10,
            help="You can choose the number of keywords/keyphrases to display. Between 1 and 30, default number is 10.",
        )
        min_Ngrams = st.number_input(
            "Minimum Ngram",
            min_value=1,
            max_value=4,
            help=(
                "The minimum value for the ngram range.\n"
                "*Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.\n"
                "To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases."
            ),
        )
        max_Ngrams = st.number_input(
            "Maximum Ngram",
            value=2,
            min_value=1,
            max_value=4,
            help=(
                "The maximum value for the keyphrase_ngram_range.\n"
                "*Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.\n"
                "To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases."
            ),
        )
        StopWordsCheckbox = st.checkbox(
            "Remove stop words",
            help="Tick this box to remove stop words from the document (currently English only)",
        )
        use_MMR = st.checkbox(
            "Use MMR",
            value=True,
            help="You can use Maximal Margin Relevance (MMR) to diversify the results. It creates keywords/keyphrases based on cosine similarity. Try high/low 'Diversity' settings below for interesting variations.",
        )
        Diversity = st.slider(
            "Keyword diversity (MMR only)",
            value=0.5,
            min_value=0.0,
            max_value=1.0,
            step=0.1,
            help=(
                "The higher the setting, the more diverse the keywords.\n"
                "Note that the *Keyword diversity* slider only works if the *MMR* checkbox is ticked.\n"
            ),
        )

    with c2:
        doc = st.text_area(
            f"Paste your text below (max {MAX_WORDS} words)",
            height=510,
        )

        # Record where each word ends so the text can be cut after exactly
        # MAX_WORDS words. The original sliced doc[:MAX_WORDS], which kept
        # 500 *characters* rather than 500 words.
        word_ends = [match.end() for match in re.finditer(r"\w+", doc)]
        res = len(word_ends)
        if res > MAX_WORDS:
            # NOTE(review): the trailing glyph is a mis-decoded emoji that
            # cannot be recovered from this file - confirm the intended icon.
            st.warning(
                f"⚠️ Your text contains {res} words."
                f" Only the first {MAX_WORDS} words will be reviewed. Stay tuned as increased allowance is coming! π"
            )
            doc = doc[: word_ends[MAX_WORDS - 1]]

        submit_button = st.form_submit_button(label="✨ Get me the data!")
# Translate the two checkboxes into the values passed to KeyBERT below.
mmr = bool(use_MMR)
StopWords = "english" if StopWordsCheckbox else None

# Halt the script here until the form's submit button has been pressed.
if not submit_button:
    st.stop()

# Reject an inverted n-gram range before calling the extractor.
if not (min_Ngrams <= max_Ngrams):
    st.warning("min_Ngrams can't be greater than max_Ngrams")
    st.stop()

extraction_kwargs = {
    "keyphrase_ngram_range": (min_Ngrams, max_Ngrams),
    "use_mmr": mmr,
    "stop_words": StopWords,
    "top_n": top_N,
    "diversity": Diversity,
}
keywords = kw_model.extract_keywords(doc, **extraction_kwargs)
# A closing "**" preceded by a space does not close bold emphasis in
# CommonMark, so the trailing space inside the markers has been removed.
# NOTE(review): the leading glyph is unrecoverable mojibake - confirm icon.
st.markdown("## **π Check & download results**")
st.header("")  # empty header, used purely as vertical spacing

# The outer columns (cs, cLast) stay empty; the three inner columns each hold
# one download button.
cs, c1, c2, c3, cLast = st.columns([2, 1.5, 1.5, 1.5, 2])

download_targets = (
    (c1, "Data.csv", "📥 Download (.csv)"),
    (c2, "Data.txt", "📥 Download (.txt)"),
    (c3, "Data.json", "📥 Download (.json)"),
)
for column, file_name, label in download_targets:
    with column:
        # All three buttons receive the same `keywords` object; presumably
        # download_button picks the format from the file name - verify
        # against functionforDownloadButtons.
        CSVButton2 = download_button(keywords, file_name, label)

st.header("")

# Tabulate the (keyword, score) pairs, highest score first, with a 1-based
# index for display.
df = (
    DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
    .sort_values(by="Relevancy", ascending=False)
    .reset_index(drop=True)
)
df.index += 1

# Add styling
cmGreen = sns.light_palette("green", as_cmap=True)
# Not used below; retained in case code outside this view references it.
cmRed = sns.light_palette("red", as_cmap=True)

format_dictionary = {
    "Relevancy": "{:.1%}",
}
# `df` is rebound to a pandas Styler: green gradient on the score column,
# scores shown as percentages with one decimal.
df = df.style.background_gradient(
    cmap=cmGreen,
    subset=["Relevancy"],
).format(format_dictionary)

# Show the table in the wide middle column.
c1, c2, c3 = st.columns([1, 3, 1])
with c2:
    st.table(df)