AIEcosystem committed on
Commit
f9b3fc6
·
verified ·
1 Parent(s): dfb6588

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +251 -184
src/streamlit_app.py CHANGED
@@ -12,6 +12,7 @@ from streamlit_extras.stylable_container import stylable_container
12
  from typing import Optional
13
  from gliner import GLiNER
14
  from comet_ml import Experiment
 
15
  st.markdown(
16
  """
17
  <style>
@@ -51,7 +52,9 @@ st.markdown(
51
  }
52
  </style>
53
  """,
54
- unsafe_allow_html=True)
 
 
55
  # --- Page Configuration and UI Elements ---
56
  st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
57
  st.subheader("PiiGuard", divider="violet")
@@ -59,17 +62,18 @@ st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
59
  expander = st.expander("**Important notes**")
60
  expander.write("""**Named Entities:** This PiiGuard web app predicts fifty-one (51) labels: "person", "organization", "social_media_handle", "username", "insurance_company", "phone_number", "email", "email_address", "mobile_phone_number", "landline_phone_number", "fax_number", "credit_card_number", "credit_card_expiration_date", "credit_card_brand", "cvv", "cvc", "bank_account_number", "iban", "transaction_number", "cpf", "cnpj", "passport_number", "passport_expiration_date", "driver's_license_number", "tax_identification_number", "identity_card_number", "national_id_number", "identity_document_number", "birth_certificate_number", "social_security_number", "health_insurance_id_number", "health_insurance_number", "national_health_insurance_number", "student_id_number", "registration_number", "insurance_number", "serial_number", "visa_number", "reservation_number", "train_ticket_number", "medication", "medical_condition", "blood_type", "date_of_birth", "address", "ip_address", "postal_code", "flight_number", "license_plate_number", "vehicle_registration_number", "digital_signature"
61
 
62
- Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
63
 
64
- **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
65
 
66
  **Usage Limits:** You can request results unlimited times for one (1) month.
67
 
68
- **Supported Languages:** English
69
 
70
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
71
 
72
  For any errors or inquiries, please contact us at info@nlpblogs.com""")
 
73
  with st.sidebar:
74
  st.write("Use the following code to embed the PiiGuard web app on your website. Feel free to adjust the width and height values to fit your page.")
75
  code = '''
@@ -87,6 +91,7 @@ with st.sidebar:
87
  st.divider()
88
  st.subheader("🚀 Ready to build your own AI Web App?", divider="violet")
89
  st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
 
90
  # --- Comet ML Setup ---
91
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
92
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
@@ -94,68 +99,20 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
94
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
95
  if not comet_initialized:
96
  st.warning("Comet ML not initialized. Check environment variables.")
 
97
  # --- Label Definitions ---
98
- labels = [ "person",
99
- "organization",
100
- "social_media_handle",
101
- "username",
102
- "insurance_company", "phone_number",
103
- "email",
104
- "email_address",
105
- "mobile_phone_number",
106
- "landline_phone_number",
107
- "fax_number", "credit_card_number",
108
- "credit_card_expiration_date",
109
- "credit_card_brand",
110
- "cvv",
111
- "cvc",
112
- "bank_account_number",
113
- "iban",
114
- "transaction_number",
115
- "cpf",
116
- "cnpj", "passport_number",
117
- "passport_expiration_date",
118
- "driver's_license_number",
119
- "tax_identification_number",
120
- "identity_card_number",
121
- "national_id_number",
122
- "identity_document_number",
123
- "birth_certificate_number",
124
- "social_security_number",
125
- "health_insurance_id_number",
126
- "health_insurance_number",
127
- "national_health_insurance_number",
128
- "student_id_number",
129
- "registration_number",
130
- "insurance_number",
131
- "serial_number",
132
- "visa_number",
133
- "reservation_number",
134
- "train_ticket_number", "medication",
135
- "medical_condition",
136
- "blood_type",
137
- "date_of_birth", "address",
138
- "ip_address",
139
- "postal_code", "flight_number",
140
- "license_plate_number",
141
- "vehicle_registration_number", "digital_signature"]
142
- # Corrected mapping dictionary
143
- category_mapping = { "People_and_Groups": [
144
  "person",
145
  "organization",
146
  "social_media_handle",
147
  "username",
148
- "insurance_company"
149
- ],
150
- "Contact_Information": [
151
  "phone_number",
152
  "email",
153
  "email_address",
154
  "mobile_phone_number",
155
  "landline_phone_number",
156
- "fax_number"
157
- ],
158
- "Financial_and_Transactions": [
159
  "credit_card_number",
160
  "credit_card_expiration_date",
161
  "credit_card_brand",
@@ -165,9 +122,7 @@ category_mapping = { "People_and_Groups": [
165
  "iban",
166
  "transaction_number",
167
  "cpf",
168
- "cnpj"
169
- ],
170
- "Identification_and_Documents": [
171
  "passport_number",
172
  "passport_expiration_date",
173
  "driver's_license_number",
@@ -186,164 +141,276 @@ category_mapping = { "People_and_Groups": [
186
  "serial_number",
187
  "visa_number",
188
  "reservation_number",
189
- "train_ticket_number"
190
- ],
191
- "Health_and_Personal": [
192
  "medication",
193
  "medical_condition",
194
  "blood_type",
195
- "date_of_birth"
196
- ],
197
- "Locations_and_Addresses": [
198
  "address",
199
  "ip_address",
200
- "postal_code"
201
- ],
202
- "Transportation_and_Logistics": [
203
  "flight_number",
204
  "license_plate_number",
205
- "vehicle_registration_number"
206
- ],
207
- "Digital_and_Security": [
208
  "digital_signature"
209
- ]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # --- Model Loading ---
211
- @st.cache_resourcedef load_ner_model():
 
212
  """Loads the GLiNER model and caches it."""
213
  try:
214
- return GLiNER.from_pretrained("knowledgator/gliner-multitask-v1.0", nested_ner=True, num_gen_sequences=2, gen_constraints= labels)
215
  except Exception as e:
216
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
217
  st.stop()
218
  model = load_ner_model()
 
219
  # Flatten the mapping to a single dictionary
220
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
 
 
 
 
 
 
 
 
 
 
 
221
  # --- Text Input and Clear Button ---
222
  word_limit = 200
223
  text = st.text_area(f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter", height=250, key='my_text_area')
224
  word_count = len(text.split())
225
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
 
226
  def clear_text():
227
- """Clears the text area."""
228
  st.session_state['my_text_area'] = ""
 
 
 
 
229
  st.button("Clear text", on_click=clear_text)
 
230
  # --- Results Section ---
231
  if st.button("Results"):
232
- start_time = time.time()
233
  if not text.strip():
234
  st.warning("Please enter some text to extract entities.")
 
235
  elif word_count > word_limit:
236
  st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
 
237
  else:
238
- with st.spinner("Extracting entities...", show_time=True):
239
- entities = model.predict_entities(text, labels)
240
- df = pd.DataFrame(entities)
241
- if not df.empty:
242
- df['category'] = df['label'].map(reverse_category_mapping)
243
- if comet_initialized:
244
- experiment = Experiment(
245
- api_key=COMET_API_KEY,
246
- workspace=COMET_WORKSPACE,
247
- project_name=COMET_PROJECT_NAME,
248
- )
249
- experiment.log_parameter("input_text", text)
250
- experiment.log_table("predicted_entities", df)
251
- st.subheader("Grouped Entities by Category", divider = "violet")
252
- # Create tabs for each category
253
- category_names = sorted(list(category_mapping.keys()))
254
- category_tabs = st.tabs(category_names)
255
- for i, category_name in enumerate(category_names):
256
- with category_tabs[i]:
257
- df_category_filtered = df[df['category'] == category_name]
258
- if not df_category_filtered.empty:
259
- st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
260
- else:
261
- st.info(f"No entities found for the '{category_name}' category.")
262
- with st.expander("See Glossary of tags"):
263
- st.write('''
264
- - **text**: ['entity extracted from your text data']
265
- - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
266
- - **label**: ['label (tag) assigned to a given extracted entity']
267
- - **start**: ['index of the start of the corresponding entity']
268
- - **end**: ['index of the end of the corresponding entity']
269
- ''')
270
- st.divider()
271
- # Tree map
272
- st.subheader("Tree map", divider = "violet")
273
- fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
274
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), paper_bgcolor='#E8F5E9', plot_bgcolor='#E8F5E9')
275
- st.plotly_chart(fig_treemap)
276
- # Pie and Bar charts
277
- grouped_counts = df['category'].value_counts().reset_index()
278
- grouped_counts.columns = ['category', 'count']
279
- col1, col2 = st.columns(2)
280
- with col1:
281
- st.subheader("Pie chart", divider = "violet")
282
- fig_pie = px.pie(grouped_counts, values='count', names='category', hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
283
- fig_pie.update_traces(textposition='inside', textinfo='percent+label')
284
- fig_pie.update_layout(
285
- paper_bgcolor='#E8F5E9',
286
- plot_bgcolor='#E8F5E9'
287
- )
288
- st.plotly_chart(fig_pie)
289
- with col2:
290
- st.subheader("Bar chart", divider = "violet")
291
- fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True, title='Occurrences of predicted categories')
292
- fig_bar.update_layout( # Changed from fig_pie to fig_bar
293
- paper_bgcolor='#E8F5E9',
294
- plot_bgcolor='#E8F5E9'
295
- )
296
- st.plotly_chart(fig_bar)
297
- # Most Frequent Entities
298
- st.subheader("Most Frequent Entities", divider="gray")
299
- word_counts = df['text'].value_counts().reset_index()
300
- word_counts.columns = ['Entity', 'Count']
301
- repeating_entities = word_counts[word_counts['Count'] > 1]
302
- if not repeating_entities.empty:
303
- st.dataframe(repeating_entities, use_container_width=True)
304
- fig_repeating_bar = px.bar(repeating_entities, x='Entity', y='Count', color='Entity')
305
- fig_repeating_bar.update_layout(xaxis={'categoryorder': 'total descending'},
306
- paper_bgcolor='#F5F5F5',
307
- plot_bgcolor='#F5F5F5')
308
- st.plotly_chart(fig_repeating_bar)
309
  else:
310
- st.warning("No entities were found that occur more than once.")
311
- # Download Section
312
- st.divider()
313
- dfa = pd.DataFrame(
314
- data={
315
- 'Column Name': ['text', 'label', 'score', 'start', 'end'],
316
- 'Description': [
317
- 'entity extracted from your text data',
318
- 'label (tag) assigned to a given extracted entity',
319
- 'accuracy score; how accurately a tag has been assigned to a given entity',
320
- 'index of the start of the corresponding entity',
321
- 'index of the end of the corresponding entity',
322
- ]
323
- }
324
- )
325
- buf = io.BytesIO()
326
- with zipfile.ZipFile(buf, "w") as myzip:
327
- myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
328
- myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
329
- with stylable_container(
330
- key="download_button",
331
- css_styles="""button { background-color: red; border: 1px solid black; padding: 5px; color: white; }""",
332
- ):
333
- st.download_button(
334
- label="Download results and glossary (zip)",
335
- data=buf.getvalue(),
336
- file_name="nlpblogs_results.zip",
337
- mime="application/zip",
338
- )
339
- if comet_initialized:
340
- experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap_categories")
341
- experiment.end()
342
- else: # If df is empty
343
- st.warning("No entities were found in the provided text.")
344
- end_time = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  elapsed_time = end_time - start_time
346
  st.text("")
347
  st.text("")
348
- st.info(f"Results processed in **{elapsed_time:.2f} seconds**.")
349
-
 
12
  from typing import Optional
13
  from gliner import GLiNER
14
  from comet_ml import Experiment
15
+
16
  st.markdown(
17
  """
18
  <style>
 
52
  }
53
  </style>
54
  """,
55
+ unsafe_allow_html=True
56
+ )
57
+
58
  # --- Page Configuration and UI Elements ---
59
  st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
60
  st.subheader("PiiGuard", divider="violet")
 
62
  expander = st.expander("**Important notes**")
63
  expander.write("""**Named Entities:** This PiiGuard web app predicts fifty-one (51) labels: "person", "organization", "social_media_handle", "username", "insurance_company", "phone_number", "email", "email_address", "mobile_phone_number", "landline_phone_number", "fax_number", "credit_card_number", "credit_card_expiration_date", "credit_card_brand", "cvv", "cvc", "bank_account_number", "iban", "transaction_number", "cpf", "cnpj", "passport_number", "passport_expiration_date", "driver's_license_number", "tax_identification_number", "identity_card_number", "national_id_number", "identity_document_number", "birth_certificate_number", "social_security_number", "health_insurance_id_number", "health_insurance_number", "national_health_insurance_number", "student_id_number", "registration_number", "insurance_number", "serial_number", "visa_number", "reservation_number", "train_ticket_number", "medication", "medical_condition", "blood_type", "date_of_birth", "address", "ip_address", "postal_code", "flight_number", "license_plate_number", "vehicle_registration_number", "digital_signature"
64
 
65
+ Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
66
 
67
+ **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
68
 
69
  **Usage Limits:** You can request results unlimited times for one (1) month.
70
 
71
+ **Supported Languages:** English
72
 
73
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
74
 
75
  For any errors or inquiries, please contact us at info@nlpblogs.com""")
76
+
77
  with st.sidebar:
78
  st.write("Use the following code to embed the PiiGuard web app on your website. Feel free to adjust the width and height values to fit your page.")
79
  code = '''
 
91
  st.divider()
92
  st.subheader("🚀 Ready to build your own AI Web App?", divider="violet")
93
  st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
94
+
95
  # --- Comet ML Setup ---
96
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
97
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 
99
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
100
  if not comet_initialized:
101
  st.warning("Comet ML not initialized. Check environment variables.")
102
+
103
  # --- Label Definitions ---
104
+ labels = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  "person",
106
  "organization",
107
  "social_media_handle",
108
  "username",
109
+ "insurance_company",
 
 
110
  "phone_number",
111
  "email",
112
  "email_address",
113
  "mobile_phone_number",
114
  "landline_phone_number",
115
+ "fax_number",
 
 
116
  "credit_card_number",
117
  "credit_card_expiration_date",
118
  "credit_card_brand",
 
122
  "iban",
123
  "transaction_number",
124
  "cpf",
125
+ "cnpj",
 
 
126
  "passport_number",
127
  "passport_expiration_date",
128
  "driver's_license_number",
 
141
  "serial_number",
142
  "visa_number",
143
  "reservation_number",
144
+ "train_ticket_number",
 
 
145
  "medication",
146
  "medical_condition",
147
  "blood_type",
148
+ "date_of_birth",
 
 
149
  "address",
150
  "ip_address",
151
+ "postal_code",
 
 
152
  "flight_number",
153
  "license_plate_number",
154
+ "vehicle_registration_number",
 
 
155
  "digital_signature"
156
+ ]
157
+ # Corrected mapping dictionary
158
+ category_mapping = {
159
+ "People_and_Groups": [
160
+ "person",
161
+ "organization",
162
+ "social_media_handle",
163
+ "username",
164
+ "insurance_company"
165
+ ],
166
+ "Contact_Information": [
167
+ "phone_number",
168
+ "email",
169
+ "email_address",
170
+ "mobile_phone_number",
171
+ "landline_phone_number",
172
+ "fax_number"
173
+ ],
174
+ "Financial_and_Transactions": [
175
+ "credit_card_number",
176
+ "credit_card_expiration_date",
177
+ "credit_card_brand",
178
+ "cvv",
179
+ "cvc",
180
+ "bank_account_number",
181
+ "iban",
182
+ "transaction_number",
183
+ "cpf",
184
+ "cnpj"
185
+ ],
186
+ "Identification_and_Documents": [
187
+ "passport_number",
188
+ "passport_expiration_date",
189
+ "driver's_license_number",
190
+ "tax_identification_number",
191
+ "identity_card_number",
192
+ "national_id_number",
193
+ "identity_document_number",
194
+ "birth_certificate_number",
195
+ "social_security_number",
196
+ "health_insurance_id_number",
197
+ "health_insurance_number",
198
+ "national_health_insurance_number",
199
+ "student_id_number",
200
+ "registration_number",
201
+ "insurance_number",
202
+ "serial_number",
203
+ "visa_number",
204
+ "reservation_number",
205
+ "train_ticket_number"
206
+ ],
207
+ "Health_and_Personal": [
208
+ "medication",
209
+ "medical_condition",
210
+ "blood_type",
211
+ "date_of_birth"
212
+ ],
213
+ "Locations_and_Addresses": [
214
+ "address",
215
+ "ip_address",
216
+ "postal_code"
217
+ ],
218
+ "Transportation_and_Logistics": [
219
+ "flight_number",
220
+ "license_plate_number",
221
+ "vehicle_registration_number"
222
+ ],
223
+ "Digital_and_Security": [
224
+ "digital_signature"
225
+ ]
226
+ }
227
+
228
  # --- Model Loading ---
229
+ @st.cache_resource
230
+ def load_ner_model():
231
  """Loads the GLiNER model and caches it."""
232
  try:
233
+ return GLiNER.from_pretrained("knowledgator/gliner-multitask-v1.0", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
234
  except Exception as e:
235
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
236
  st.stop()
237
  model = load_ner_model()
238
+
239
  # Flatten the mapping to a single dictionary
240
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
241
+
242
+ # --- Session State Initialization ---
243
+ if 'show_results' not in st.session_state:
244
+ st.session_state.show_results = False
245
+ if 'last_text' not in st.session_state:
246
+ st.session_state.last_text = ""
247
+ if 'results_df' not in st.session_state:
248
+ st.session_state.results_df = pd.DataFrame()
249
+ if 'elapsed_time' not in st.session_state:
250
+ st.session_state.elapsed_time = 0.0
251
+
252
  # --- Text Input and Clear Button ---
253
  word_limit = 200
254
  text = st.text_area(f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter", height=250, key='my_text_area')
255
  word_count = len(text.split())
256
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
257
+
258
  def clear_text():
259
+ """Clears the text area and hides results."""
260
  st.session_state['my_text_area'] = ""
261
+ st.session_state.show_results = False
262
+ st.session_state.last_text = ""
263
+ st.session_state.results_df = pd.DataFrame()
264
+ st.session_state.elapsed_time = 0.0
265
  st.button("Clear text", on_click=clear_text)
266
+
267
  # --- Results Section ---
268
  if st.button("Results"):
 
269
  if not text.strip():
270
  st.warning("Please enter some text to extract entities.")
271
+ st.session_state.show_results = False
272
  elif word_count > word_limit:
273
  st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
274
+ st.session_state.show_results = False
275
  else:
276
+ # Check if the text is different from the last time
277
+ if text != st.session_state.last_text:
278
+ st.session_state.show_results = True
279
+ st.session_state.last_text = text
280
+ start_time = time.time()
281
+ with st.spinner("Extracting entities...", show_time=True):
282
+ entities = model.predict_entities(text, labels)
283
+ df = pd.DataFrame(entities)
284
+ st.session_state.results_df = df
285
+ if not df.empty:
286
+ df['category'] = df['label'].map(reverse_category_mapping)
287
+ if comet_initialized:
288
+ experiment = Experiment(
289
+ api_key=COMET_API_KEY,
290
+ workspace=COMET_WORKSPACE,
291
+ project_name=COMET_PROJECT_NAME,
292
+ )
293
+ experiment.log_parameter("input_text", text)
294
+ experiment.log_table("predicted_entities", df)
295
+ experiment.end()
296
+ end_time = time.time()
297
+ st.session_state.elapsed_time = end_time - start_time
298
+ else:
299
+ # If the text is the same, just show the cached results without re-running
300
+ st.session_state.show_results = True
301
+
302
+ # Display results if the state variable is True
303
+ if st.session_state.show_results:
304
+ df = st.session_state.results_df
305
+ if not df.empty:
306
+ df['category'] = df['label'].map(reverse_category_mapping)
307
+ st.subheader("Grouped Entities by Category", divider="violet")
308
+
309
+ # Create tabs for each category
310
+ category_names = sorted(list(category_mapping.keys()))
311
+ category_tabs = st.tabs(category_names)
312
+
313
+ for i, category_name in enumerate(category_names):
314
+ with category_tabs[i]:
315
+ df_category_filtered = df[df['category'] == category_name]
316
+ if not df_category_filtered.empty:
317
+ st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  else:
319
+ st.info(f"No entities found for the '{category_name}' category.")
320
+
321
+ with st.expander("See Glossary of tags"):
322
+ st.write('''
323
+ - **text**: ['entity extracted from your text data']
324
+ - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
325
+ - **label**: ['label (tag) assigned to a given extracted entity']
326
+ - **start**: ['index of the start of the corresponding entity']
327
+ - **end**: ['index of the end of the corresponding entity']
328
+ ''')
329
+ st.divider()
330
+
331
+ # Tree map
332
+ st.subheader("Tree map", divider="violet")
333
+ fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
334
+ fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), paper_bgcolor='#E8F5E9', plot_bgcolor='#E8F5E9')
335
+ st.plotly_chart(fig_treemap)
336
+
337
+ # Pie and Bar charts
338
+ grouped_counts = df['category'].value_counts().reset_index()
339
+ grouped_counts.columns = ['category', 'count']
340
+ col1, col2 = st.columns(2)
341
+
342
+ with col1:
343
+ st.subheader("Pie chart", divider="violet")
344
+ fig_pie = px.pie(grouped_counts, values='count', names='category', hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
345
+ fig_pie.update_traces(textposition='inside', textinfo='percent+label')
346
+ fig_pie.update_layout(
347
+ paper_bgcolor='#E8F5E9',
348
+ plot_bgcolor='#E8F5E9'
349
+ )
350
+ st.plotly_chart(fig_pie)
351
+
352
+ with col2:
353
+ st.subheader("Bar chart", divider="violet")
354
+ fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True, title='Occurrences of predicted categories')
355
+ fig_bar.update_layout(
356
+ paper_bgcolor='#E8F5E9',
357
+ plot_bgcolor='#E8F5E9'
358
+ )
359
+ st.plotly_chart(fig_bar)
360
+
361
+ # Most Frequent Entities
362
+ st.subheader("Most Frequent Entities", divider="gray")
363
+ word_counts = df['text'].value_counts().reset_index()
364
+ word_counts.columns = ['Entity', 'Count']
365
+ repeating_entities = word_counts[word_counts['Count'] > 1]
366
+
367
+ if not repeating_entities.empty:
368
+ st.dataframe(repeating_entities, use_container_width=True)
369
+ fig_repeating_bar = px.bar(repeating_entities, x='Entity', y='Count', color='Entity')
370
+ fig_repeating_bar.update_layout(xaxis={'categoryorder': 'total descending'},
371
+ paper_bgcolor='#E8F5E9',
372
+ plot_bgcolor='#E8F5E9')
373
+ st.plotly_chart(fig_repeating_bar)
374
+ else:
375
+ st.warning("No entities were found that occur more than once.")
376
+
377
+ # Download Section
378
+ st.divider()
379
+ dfa = pd.DataFrame(
380
+ data={
381
+ 'Column Name': ['text', 'label', 'score', 'start', 'end'],
382
+ 'Description': [
383
+ 'entity extracted from your text data',
384
+ 'label (tag) assigned to a given extracted entity',
385
+ 'accuracy score; how accurately a tag has been assigned to a given entity',
386
+ 'index of the start of the corresponding entity',
387
+ 'index of the end of the corresponding entity',
388
+ ]
389
+ }
390
+ )
391
+ buf = io.BytesIO()
392
+ with zipfile.ZipFile(buf, "w") as myzip:
393
+ myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
394
+ myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
395
+
396
+ with stylable_container(
397
+ key="download_button",
398
+ css_styles="""button { background-color: red; border: 1px solid black; padding: 5px; color: white; }""",
399
+ ):
400
+ st.download_button(
401
+ label="Download results and glossary (zip)",
402
+ data=buf.getvalue(),
403
+ file_name="nlpblogs_results.zip",
404
+ mime="application/zip",
405
+ )
406
+
407
+ if comet_initialized:
408
+ experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap_categories")
409
+ experiment.end()
410
+ else: # If df is empty
411
+ st.warning("No entities were found in the provided text.")
412
+ end_time = time.time()
413
  elapsed_time = end_time - start_time
414
  st.text("")
415
  st.text("")
416
+ st.info(f"Results processed in **{elapsed_time:.2f} seconds**.")