AIEcosystem committed on
Commit
c9ea574
·
verified ·
1 Parent(s): 871a036

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +174 -119
src/streamlit_app.py CHANGED
@@ -7,11 +7,11 @@ import io
7
  import plotly.express as px
8
  import zipfile
9
  import json
10
- from cryptography.fernet import Fernet
11
  from streamlit_extras.stylable_container import stylable_container
12
  from typing import Optional
13
  from gliner import GLiNER
14
  from comet_ml import Experiment
 
15
  st.markdown(
16
  """
17
  <style>
@@ -55,7 +55,9 @@ st.markdown(
55
  }
56
  </style>
57
  """,
58
- unsafe_allow_html=True)
 
 
59
  # --- Page Configuration and UI Elements ---
60
  st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
61
  st.subheader("Uncover", divider="red")
@@ -63,13 +65,13 @@ st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
63
  expander = st.expander("**Important notes**")
64
  expander.write("""**Named Entities:** This Uncover web app predicts twenty-eight (28) labels: "Names", "Aliases", "Identifiers", "Roles", "Government_agencies", "Businesses", "Criminal_groups", "Financial_institutions", "Addresses", "Geographic_coordinates", "Landmarks", "Jurisdictions", "Dates", "Timestamps", "Time_ranges", "Weapons", "Vehicles", "Financial_information", "Evidence", "Relationships", "Demographics", "Biometrics", "Psychological_states", "Software_types", "Hardware_components", "Equipment", "Events", "Activities"
65
 
66
- Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
67
 
68
- **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
69
 
70
- **Usage Limits:** You can request results unlimited times for one (1) month.
71
 
72
- **Supported Languages:** English
73
 
74
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
75
 
@@ -92,6 +94,7 @@ with st.sidebar:
92
  st.divider()
93
  st.subheader("🚀 Ready to build your own AI Web App?", divider="red")
94
  st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
 
95
  # --- Comet ML Setup ---
96
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
97
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
@@ -99,153 +102,205 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
99
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
100
  if not comet_initialized:
101
  st.warning("Comet ML not initialized. Check environment variables.")
 
102
  # --- Label Definitions ---
103
  labels = ["Names","Aliases","Identifiers","Roles","Government_agencies","Businesses","Criminal_groups","Financial_institutions","Addresses","Geographic_coordinates","Landmarks","Jurisdictions","Dates","Timestamps","Time_ranges","Weapons","Vehicles","Financial_information","Evidence","Relationships","Demographics","Biometrics","Psychological_states","Software_types","Hardware_components","Equipment","Events","Activities"]
 
104
  # Create a mapping dictionary for labels to categories
105
  category_mapping = {
106
- "People & Identities": ["Names", "Aliases", "Identifiers", "Roles", "Demographics", "Biometrics", "Psychological_states", "Relationships"],
107
  "Organizations & Groups": ["Government_agencies", "Businesses", "Criminal_groups", "Financial_institutions"],
108
  "Locations & Jurisdictions": ["Addresses", "Geographic_coordinates", "Landmarks", "Jurisdictions"],
109
  "Times & Events" : ["Dates", "Timestamps", "Time_ranges", "Events", "Activities"],
110
  "Objects & Information": ["Weapons", "Vehicles", "Equipment", "Financial_information", "Evidence", "Software_types", "Hardware_components"],
111
- }
 
112
  # --- Model Loading ---
113
  @st.cache_resource
114
  def load_ner_model():
115
  """Loads the GLiNER model and caches it."""
116
  try:
117
- return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints= labels)
118
  except Exception as e:
119
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
120
  st.stop()
121
  model = load_ner_model()
 
122
  # Flatten the mapping to a single dictionary
123
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
 
 
 
 
 
 
 
 
 
 
 
124
  # --- Text Input and Clear Button ---
125
  word_limit = 200
126
  text = st.text_area(f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter", height=250, key='my_text_area')
127
  word_count = len(text.split())
128
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
 
129
  def clear_text():
130
- """Clears the text area."""
131
  st.session_state['my_text_area'] = ""
 
 
 
 
132
  st.button("Clear text", on_click=clear_text)
 
133
  # --- Results Section ---
134
  if st.button("Results"):
135
- start_time = time.time()
136
  if not text.strip():
137
  st.warning("Please enter some text to extract entities.")
 
138
  elif word_count > word_limit:
139
  st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
 
140
  else:
141
- with st.spinner("Extracting entities...", show_time=True):
142
- entities = model.predict_entities(text, labels)
143
- df = pd.DataFrame(entities)
144
- if not df.empty:
145
- df['category'] = df['label'].map(reverse_category_mapping)
146
- if comet_initialized:
147
- experiment = Experiment(
148
- api_key=COMET_API_KEY,
149
- workspace=COMET_WORKSPACE,
150
- project_name=COMET_PROJECT_NAME,
151
- )
152
- experiment.log_parameter("input_text", text)
153
- experiment.log_table("predicted_entities", df)
154
- st.subheader("Grouped Entities by Category", divider = "red")
155
- # Create tabs for each category
156
- category_names = sorted(list(category_mapping.keys()))
157
- category_tabs = st.tabs(category_names)
158
- for i, category_name in enumerate(category_names):
159
- with category_tabs[i]:
160
- df_category_filtered = df[df['category'] == category_name]
161
- if not df_category_filtered.empty:
162
- st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
163
- else:
164
- st.info(f"No entities found for the '{category_name}' category.")
165
- with st.expander("See Glossary of tags"):
166
- st.write('''
167
- - **text**: ['entity extracted from your text data']
168
- - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
169
- - **label**: ['label (tag) assigned to a given extracted entity']
170
- - **start**: ['index of the start of the corresponding entity']
171
- - **end**: ['index of the end of the corresponding entity']
172
- ''')
173
- st.divider()
174
- # Tree map
175
- st.subheader("Tree map", divider = "red")
176
- fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
177
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), paper_bgcolor='#FFE5E5', plot_bgcolor='#FFE5E5')
178
- st.plotly_chart(fig_treemap)
179
- # Pie and Bar charts
180
- grouped_counts = df['category'].value_counts().reset_index()
181
- grouped_counts.columns = ['category', 'count']
182
- col1, col2 = st.columns(2)
183
- with col1:
184
- st.subheader("Pie chart", divider = "red")
185
- fig_pie = px.pie(grouped_counts, values='count', names='category', hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
186
- fig_pie.update_traces(textposition='inside', textinfo='percent+label')
187
- fig_pie.update_layout(
188
- paper_bgcolor='#FFE5E5',
189
- plot_bgcolor='#FFE5E5'
190
- )
191
- st.plotly_chart(fig_pie)
192
- with col2:
193
- st.subheader("Bar chart", divider = "red")
194
- fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True, title='Occurrences of predicted categories')
195
- fig_bar.update_layout(
196
- paper_bgcolor='#FFE5E5',
197
- plot_bgcolor='#FFE5E5'
198
- )
199
- st.plotly_chart(fig_bar)
200
- # Most Frequent Entities
201
- st.subheader("Most Frequent Entities", divider="red")
202
- word_counts = df['text'].value_counts().reset_index()
203
- word_counts.columns = ['Entity', 'Count']
204
- repeating_entities = word_counts[word_counts['Count'] > 1]
205
- if not repeating_entities.empty:
206
- st.dataframe(repeating_entities, use_container_width=True)
207
- fig_repeating_bar = px.bar(repeating_entities, x='Entity', y='Count', color='Entity')
208
- fig_repeating_bar.update_layout(xaxis={'categoryorder': 'total descending'},
209
- paper_bgcolor='#FFE5E5',
210
- plot_bgcolor='#FFE5E5')
211
- st.plotly_chart(fig_repeating_bar)
212
  else:
213
- st.warning("No entities were found that occur more than once.")
214
- # Download Section
215
- st.divider()
216
- dfa = pd.DataFrame(
217
- data={
218
- 'Column Name': ['text', 'label', 'score', 'start', 'end'],
219
- 'Description': [
220
- 'entity extracted from your text data',
221
- 'label (tag) assigned to a given extracted entity',
222
- 'accuracy score; how accurately a tag has been assigned to a given entity',
223
- 'index of the start of the corresponding entity',
224
- 'index of the end of the corresponding entity',
225
- ]
226
- }
227
- )
228
- buf = io.BytesIO()
229
- with zipfile.ZipFile(buf, "w") as myzip:
230
- myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
231
- myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
232
- with stylable_container(
233
- key="download_button",
234
- css_styles="""button { background-color: red; border: 1px solid black; padding: 5px; color: white; }""",
235
- ):
236
- st.download_button(
237
- label="Download results and glossary (zip)",
238
- data=buf.getvalue(),
239
- file_name="nlpblogs_results.zip",
240
- mime="application/zip",
241
- )
242
- if comet_initialized:
243
- experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap_categories")
244
- experiment.end()
245
- else: # If df is empty
246
- st.warning("No entities were found in the provided text.")
247
- end_time = time.time()
248
- elapsed_time = end_time - start_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  st.text("")
250
  st.text("")
251
- st.info(f"Results processed in **{elapsed_time:.2f} seconds**.")
 
7
  import plotly.express as px
8
  import zipfile
9
  import json
 
10
  from streamlit_extras.stylable_container import stylable_container
11
  from typing import Optional
12
  from gliner import GLiNER
13
  from comet_ml import Experiment
14
+
15
  st.markdown(
16
  """
17
  <style>
 
55
  }
56
  </style>
57
  """,
58
+ unsafe_allow_html=True
59
+ )
60
+
61
  # --- Page Configuration and UI Elements ---
62
  st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
63
  st.subheader("Uncover", divider="red")
 
65
  expander = st.expander("**Important notes**")
66
  expander.write("""**Named Entities:** This Uncover web app predicts twenty-eight (28) labels: "Names", "Aliases", "Identifiers", "Roles", "Government_agencies", "Businesses", "Criminal_groups", "Financial_institutions", "Addresses", "Geographic_coordinates", "Landmarks", "Jurisdictions", "Dates", "Timestamps", "Time_ranges", "Weapons", "Vehicles", "Financial_information", "Evidence", "Relationships", "Demographics", "Biometrics", "Psychological_states", "Software_types", "Hardware_components", "Equipment", "Events", "Activities"
67
 
68
+ Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
69
 
70
+ **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
71
 
72
+ **Usage Limits:** You can request results unlimited times for one (1) month.
73
 
74
+ **Supported Languages:** English
75
 
76
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
77
 
 
94
  st.divider()
95
  st.subheader("🚀 Ready to build your own AI Web App?", divider="red")
96
  st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
97
+
98
  # --- Comet ML Setup ---
99
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
100
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 
102
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
103
  if not comet_initialized:
104
  st.warning("Comet ML not initialized. Check environment variables.")
105
+
106
  # --- Label Definitions ---
107
  labels = ["Names","Aliases","Identifiers","Roles","Government_agencies","Businesses","Criminal_groups","Financial_institutions","Addresses","Geographic_coordinates","Landmarks","Jurisdictions","Dates","Timestamps","Time_ranges","Weapons","Vehicles","Financial_information","Evidence","Relationships","Demographics","Biometrics","Psychological_states","Software_types","Hardware_components","Equipment","Events","Activities"]
108
+
109
  # Create a mapping dictionary for labels to categories
110
  category_mapping = {
111
+ "People & Identities": ["Names", "Aliases", "Identifiers", "Roles", "Demographics", "Biometrics", "Psychological_states", "Relationships"],
112
  "Organizations & Groups": ["Government_agencies", "Businesses", "Criminal_groups", "Financial_institutions"],
113
  "Locations & Jurisdictions": ["Addresses", "Geographic_coordinates", "Landmarks", "Jurisdictions"],
114
  "Times & Events" : ["Dates", "Timestamps", "Time_ranges", "Events", "Activities"],
115
  "Objects & Information": ["Weapons", "Vehicles", "Equipment", "Financial_information", "Evidence", "Software_types", "Hardware_components"],
116
+ }
117
+
118
# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Load the GLiNER multitask model once per server process.

    The `st.cache_resource` decorator keeps the loaded model alive across
    Streamlit reruns and sessions, so the expensive download/initialisation
    happens only on first use. On failure the app shows an error and halts.
    """
    model_name = "knowledgator/gliner-multitask-large-v0.5"
    try:
        ner_model = GLiNER.from_pretrained(
            model_name,
            nested_ner=True,
            num_gen_sequences=2,
            gen_constraints=labels,
        )
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()
    else:
        return ner_model

model = load_ner_model()
128
+
129
  # Flatten the mapping to a single dictionary
130
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
131
+
132
# --- Session State Initialization ---
# Seed the per-session keys used by the Results workflow so they exist
# before the first button click; each key is set only when missing.
_session_defaults = {
    'show_results': False,
    'last_text': "",
    'results_df': pd.DataFrame(),
    'elapsed_time': 0.0,
}
for _key, _default in _session_defaults.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
141
+
142
  # --- Text Input and Clear Button ---
143
  word_limit = 200
144
  text = st.text_area(f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter", height=250, key='my_text_area')
145
  word_count = len(text.split())
146
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
147
+
148
def clear_text():
    """Reset the text area and discard any cached results.

    Used as the `on_click` callback of the "Clear text" button: empties the
    input widget and returns every Results-related session-state key to its
    initial value so stale output is not redisplayed.
    """
    st.session_state['my_text_area'] = ""
    st.session_state['show_results'] = False
    st.session_state['last_text'] = ""
    st.session_state['results_df'] = pd.DataFrame()
    st.session_state['elapsed_time'] = 0.0
155
  st.button("Clear text", on_click=clear_text)
156
+
157
# --- Results Section ---
# Comet experiment created during a fresh extraction run. It stays None on
# cached reruns so the display section knows there is nothing to log or end.
# (Previous revision raised NameError on cached reruns and called
# experiment.end() twice on fresh runs.)
experiment = None

if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract entities.")
        st.session_state.show_results = False
    elif word_count > word_limit:
        st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
        st.session_state.show_results = False
    else:
        # Re-run the model only when the input text changed; otherwise reuse
        # the results cached in session state.
        if text != st.session_state.last_text:
            st.session_state.show_results = True
            st.session_state.last_text = text
            start_time = time.time()
            with st.spinner("Extracting entities...", show_time=True):
                entities = model.predict_entities(text, labels)
                df = pd.DataFrame(entities)
                st.session_state.results_df = df
                if not df.empty:
                    df['category'] = df['label'].map(reverse_category_mapping)
                    if comet_initialized:
                        # NOTE: the experiment is ended in the display section
                        # below, after the treemap figure has been logged.
                        experiment = Experiment(
                            api_key=COMET_API_KEY,
                            workspace=COMET_WORKSPACE,
                            project_name=COMET_PROJECT_NAME,
                        )
                        experiment.log_parameter("input_text", text)
                        experiment.log_table("predicted_entities", df)
            end_time = time.time()
            st.session_state.elapsed_time = end_time - start_time
        else:
            # Same text as last time: show the cached results without re-running.
            st.session_state.show_results = True

# Display results whenever the session flag is set (survives widget reruns,
# e.g. after clicking the download button).
if st.session_state.show_results:
    df = st.session_state.results_df
    if not df.empty:
        # Recompute the category column: the cached frame may predate it.
        df['category'] = df['label'].map(reverse_category_mapping)
        st.subheader("Grouped Entities by Category", divider="red")

        # One tab per category, in stable alphabetical order.
        category_names = sorted(list(category_mapping.keys()))
        category_tabs = st.tabs(category_names)

        for i, category_name in enumerate(category_names):
            with category_tabs[i]:
                df_category_filtered = df[df['category'] == category_name]
                if not df_category_filtered.empty:
                    st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
                else:
                    st.info(f"No entities found for the '{category_name}' category.")

        with st.expander("See Glossary of tags"):
            st.write('''
    - **text**: ['entity extracted from your text data']
    - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
    - **label**: ['label (tag) assigned to a given extracted entity']
    - **start**: ['index of the start of the corresponding entity']
    - **end**: ['index of the end of the corresponding entity']
    ''')
        st.divider()

        # Tree map
        st.subheader("Tree map", divider="red")
        fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
        fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), paper_bgcolor='#FFE5E5', plot_bgcolor='#FFE5E5')
        st.plotly_chart(fig_treemap)

        # Pie and Bar charts
        grouped_counts = df['category'].value_counts().reset_index()
        grouped_counts.columns = ['category', 'count']
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Pie chart", divider="red")
            fig_pie = px.pie(grouped_counts, values='count', names='category', hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
            fig_pie.update_traces(textposition='inside', textinfo='percent+label')
            fig_pie.update_layout(
                paper_bgcolor='#FFE5E5',
                plot_bgcolor='#FFE5E5'
            )
            st.plotly_chart(fig_pie)

        with col2:
            st.subheader("Bar chart", divider="red")
            fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True, title='Occurrences of predicted categories')
            fig_bar.update_layout(
                paper_bgcolor='#FFE5E5',
                plot_bgcolor='#FFE5E5'
            )
            st.plotly_chart(fig_bar)

        # Most Frequent Entities (only those appearing more than once)
        st.subheader("Most Frequent Entities", divider="red")
        word_counts = df['text'].value_counts().reset_index()
        word_counts.columns = ['Entity', 'Count']
        repeating_entities = word_counts[word_counts['Count'] > 1]

        if not repeating_entities.empty:
            st.dataframe(repeating_entities, use_container_width=True)
            fig_repeating_bar = px.bar(repeating_entities, x='Entity', y='Count', color='Entity')
            fig_repeating_bar.update_layout(xaxis={'categoryorder': 'total descending'},
                                            paper_bgcolor='#FFE5E5',
                                            plot_bgcolor='#FFE5E5')
            st.plotly_chart(fig_repeating_bar)
        else:
            st.warning("No entities were found that occur more than once.")

        # Download Section: zip the results table together with a glossary.
        st.divider()
        dfa = pd.DataFrame(
            data={
                'Column Name': ['text', 'label', 'score', 'start', 'end'],
                'Description': [
                    'entity extracted from your text data',
                    'label (tag) assigned to a given extracted entity',
                    'accuracy score; how accurately a tag has been assigned to a given entity',
                    'index of the start of the corresponding entity',
                    'index of the end of the corresponding entity',
                ]
            }
        )
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w") as myzip:
            myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
            myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))

        with stylable_container(
            key="download_button",
            css_styles="""button { background-color: #FF9999; border: 1px solid black; padding: 5px; color: #000000; }""",
        ):
            st.download_button(
                label="Download results and glossary (zip)",
                data=buf.getvalue(),
                file_name="nlpblogs_results.zip",
                mime="application/zip",
            )

        # Log the treemap only when a fresh extraction created an experiment
        # during this run; on cached reruns `experiment` is None. Ending the
        # experiment here (and only here) attaches the figure and closes it
        # exactly once.
        if comet_initialized and experiment is not None:
            experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap_categories")
            experiment.end()
    else:  # If df is empty
        st.warning("No entities were found in the provided text.")

st.text("")
st.text("")
st.info(f"Results processed in **{st.session_state.elapsed_time:.2f} seconds**.")