Aigerimtbin committed on
Commit
db55bf7
·
verified ·
1 Parent(s): 8f84a28

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +229 -0
app.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ from pandas import DataFrame
4
+ from keybert import KeyBERT
5
+ # For Flair (Keybert)
6
+ from flair.embeddings import TransformerDocumentEmbeddings
7
+ import seaborn as sns
8
+ # For download buttons
9
+ from functionforDownloadButtons import download_button
10
+ import os
11
+ import json
12
+
13
# Page-level settings handed to Streamlit: the page title and the emoji icon.
PAGE_SETTINGS = {
    "page_title": "BERT Keyword Extractor",
    "page_icon": "🎈",
}
st.set_page_config(**PAGE_SETTINGS)
17
+
18
+
19
def _max_width_(max_width_px: int = 1400) -> None:
    """Inject a CSS rule that sets a max-width on the main block container.

    Args:
        max_width_px: Maximum width in pixels. Defaults to 1400, the value
            that used to be hard-coded, so existing ``_max_width_()`` calls
            behave as before.
    """
    # Only the declaration line needs interpolation; keeping the rest as
    # plain strings lets the CSS braces be written without f-string escaping.
    css = (
        "\n<style>\n"
        ".reportview-container .main .block-container{\n"
        f"max-width: {max_width_px}px;\n"
        "}\n"
        "</style>\n"
    )
    # unsafe_allow_html=True is required for the <style> element to be
    # emitted as markup rather than escaped text.
    st.markdown(css, unsafe_allow_html=True)
31
+
32
+
33
# Apply the wide-layout CSS before any content is rendered.
_max_width_()

# Header row: three columns, of which only the first one is written to.
c30, c31, c32 = st.columns([2.5, 1, 3])

with c30:
    st.title("🔑 BERT Keyword Extractor")
    st.header("")

# Expanded-by-default "About" panel describing the app.
with st.expander("ℹ️ - About this app", expanded=True):
    # The Transformers link must have no space between "]" and "(",
    # otherwise Markdown renders it as plain text instead of a hyperlink.
    st.write(
        """
- The *BERT Keyword Extractor* app is an easy-to-use interface built in Streamlit for the amazing [KeyBERT](https://github.com/MaartenGr/KeyBERT) library from Maarten Grootendorst!
- It uses a minimal keyword extraction technique that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗 to create keywords/keyphrases that are most similar to a document.
"""
    )

    st.markdown("")

st.markdown("")
56
import re  # used below for the word count and word-based truncation

# No space before the closing "**": with one, Markdown shows literal asterisks.
st.markdown("## **📌 Paste document**")

# Input form: model/extraction settings on the left (c1), document text on the
# right (c2). It defines kw_model, top_N, min_Ngrams, max_Ngrams,
# StopWordsCheckbox, use_MMR, Diversity, doc and submit_button for the code
# that follows the form.
with st.form(key="my_form"):

    # The 0.07-wide columns are never written to inside the form.
    _, c1, _, c2, c3 = st.columns([0.07, 1, 0.07, 5, 0.07])

    with c1:
        ModelType = st.radio(
            "Choose your model",
            ["DistilBERT (Default)", "Flair"],
            help="At present, you can choose between 2 models (Flair or DistilBERT) to embed your text. More to come!",
        )

        # The previous check compared against "Default (DistilBERT)", which is
        # not one of the radio options, so the Flair choice was never honoured
        # (and its branch referenced an undefined name, `roberta`).
        if ModelType == "Flair":

            @st.cache(allow_output_mutation=True)
            def load_model():
                """Return a KeyBERT model backed by Flair document embeddings."""
                # NOTE(review): "roberta-base" follows the Flair example in the
                # KeyBERT README -- confirm this is the intended checkpoint.
                return KeyBERT(model=TransformerDocumentEmbeddings("roberta-base"))

        else:

            @st.cache(allow_output_mutation=True)
            def load_model():
                """Return the default DistilBERT-based KeyBERT model."""
                return KeyBERT("distilbert-base-nli-mean-tokens")

        kw_model = load_model()

        top_N = st.slider(
            "# of results",
            min_value=1,
            max_value=30,
            value=10,
            help="You can choose the number of keywords/keyphrases to display. Between 1 and 30, default number is 10.",
        )

        min_Ngrams = st.number_input(
            "Minimum Ngram",
            min_value=1,
            max_value=4,
            help="""The minimum value for the ngram range.

*Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.

To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
        )

        max_Ngrams = st.number_input(
            "Maximum Ngram",
            value=2,
            min_value=1,
            max_value=4,
            help="""The maximum value for the keyphrase_ngram_range.

*Keyphrase_ngram_range* sets the length of the resulting keywords/keyphrases.

To extract keyphrases, simply set *keyphrase_ngram_range* to (1, 2) or higher depending on the number of words you would like in the resulting keyphrases.""",
        )

        StopWordsCheckbox = st.checkbox(
            "Remove stop words",
            help="Tick this box to remove stop words from the document (currently English only)",
        )

        use_MMR = st.checkbox(
            "Use MMR",
            value=True,
            help="You can use Maximal Margin Relevance (MMR) to diversify the results. It creates keywords/keyphrases based on cosine similarity. Try high/low 'Diversity' settings below for interesting variations.",
        )

        Diversity = st.slider(
            "Keyword diversity (MMR only)",
            value=0.5,
            min_value=0.0,
            max_value=1.0,
            step=0.1,
            help="""The higher the setting, the more diverse the keywords.

Note that the *Keyword diversity* slider only works if the *MMR* checkbox is ticked.

""",
        )

    with c2:
        doc = st.text_area(
            "Paste your text below (max 500 words)",
            height=510,
        )

        MAX_WORDS = 500
        # Keep the match objects (not just the count) so the text can be cut
        # at a word boundary below.
        word_matches = list(re.finditer(r"\w+", doc))
        res = len(word_matches)
        if res > MAX_WORDS:
            st.warning(
                "⚠️ Your text contains "
                + str(res)
                + " words."
                + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
            )

            # Truncate right after the MAX_WORDS-th word. The previous
            # `doc[:MAX_WORDS]` kept only the first 500 *characters*.
            doc = doc[: word_matches[MAX_WORDS - 1].end()]

        submit_button = st.form_submit_button(label="✨ Get me the data!")
159
+
160
# Translate the two checkbox states into the values passed to KeyBERT.
mmr = bool(use_MMR)
StopWords = "english" if StopWordsCheckbox else None

# Halt the script here until the form has been submitted.
if not submit_button:
    st.stop()

# Reject an inverted n-gram range before running the extraction.
if min_Ngrams > max_Ngrams:
    st.warning("min_Ngrams can't be greater than max_Ngrams")
    st.stop()

# Collect the extraction settings chosen in the form, then run KeyBERT.
extraction_options = {
    "keyphrase_ngram_range": (min_Ngrams, max_Ngrams),
    "use_mmr": mmr,
    "stop_words": StopWords,
    "top_n": top_N,
    "diversity": Diversity,
}
keywords = kw_model.extract_keywords(doc, **extraction_options)
185
+
186
# No space before the closing "**": with one, Markdown shows literal asterisks.
st.markdown("## **🎈 Check & download results**")

st.header("")

# Five-column row; the three middle columns each hold one download button and
# the outer two are left empty.
_, c1, c2, c3, _ = st.columns([2, 1.5, 1.5, 1.5, 2])

download_targets = (
    (c1, "Data.csv", "📥 Download (.csv)"),
    (c2, "Data.txt", "📥 Download (.txt)"),
    (c3, "Data.json", "📥 Download (.json)"),
)
for column, filename, label in download_targets:
    with column:
        # The return value was previously stored in an unused variable.
        download_button(keywords, filename, label)

st.header("")

# Build the results table: highest relevancy first, rows numbered from 1.
df = (
    DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
    .sort_values(by="Relevancy", ascending=False)
    .reset_index(drop=True)
)
df.index += 1

# Green gradient on the Relevancy column, shown as a percentage with one
# decimal place.
cmGreen = sns.light_palette("green", as_cmap=True)
styled = df.style.background_gradient(
    cmap=cmGreen,
    subset=["Relevancy"],
).format({"Relevancy": "{:.1%}"})

# Render the table in the wide middle column of a 1:3:1 row.
_, table_col, _ = st.columns([1, 3, 1])
with table_col:
    st.table(styled)
229
+