c-ho committed on
Commit
71f491e
·
verified ·
1 Parent(s): 216163d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -180
app.py CHANGED
@@ -1,4 +1,3 @@
1
- import html
2
  import gradio as gr
3
  from transformers import pipeline
4
 
@@ -40,13 +39,11 @@ model_info = {
40
  m: {
41
  "link": f"https://huggingface.co/{m}",
42
  "usage": f'''from transformers import pipeline
43
-
44
  ner = pipeline(
45
  "ner",
46
  model="{m}",
47
  aggregation_strategy="simple"
48
  )
49
-
50
  result = ner("Hello world")
51
  print(result)
52
  '''
@@ -116,26 +113,38 @@ def merge_subwords(results):
116
  # ---------------------------------------------------
117
 
118
  def analyze_text(text, model_name):
119
-
120
  ner = get_model(model_name)
121
 
122
  results = ner(text)
123
 
 
124
  results = merge_subwords(results)
125
 
126
- entities = []
 
 
127
 
128
  table_rows = []
129
 
130
  for ent in results:
131
 
 
 
 
132
  label = ent["entity_group"]
133
 
134
- entities.append({
135
- "start": ent["start"],
136
- "end": ent["end"],
137
- "label": label,
138
- })
 
 
 
 
 
 
 
139
 
140
  table_rows.append([
141
  ent["word"],
@@ -143,13 +152,13 @@ def analyze_text(text, model_name):
143
  round(ent["score"], 3)
144
  ])
145
 
146
- rendered_html = render_highlighted_html(
147
- text,
148
- entities,
149
- COLOR_MAP
150
- )
151
 
152
- return rendered_html, table_rows
153
 
154
  # ---------------------------------------------------
155
  # Entity colors
@@ -159,36 +168,36 @@ COLOR_MAP = {
159
  # -----------------------------------
160
  # Academic / theoretical
161
  # -----------------------------------
162
- "AcademicDiscipline": "#5339a8", # intense purple
163
- "AmbiguouslyDefinedConcept": "#ab8fbd", # muted purple
164
- "UnclassifiedLinguisticConcept": "#d4a1c7", # soft gray-pink
165
 
166
  # -----------------------------------
167
  # Language / general linguistic
168
  # -----------------------------------
169
  "LanguageRelatedTerm": "#E9C46A", # warm sand yellow
170
- "OtherLinguisticTerm": "#b2d1d1", # pale cyan
171
- "LanguageResourceInformation": "#5397c2", # medium blue
172
 
173
  # -----------------------------------
174
  # Phonology / graphemics
175
  # -----------------------------------
176
- "PhonologicalPhenomenon": "#eb8167", # coral red
177
- "GraphemicPhenomenon": "#bd9779", # latte
178
 
179
  # -----------------------------------
180
  # Morphology / syntax
181
  # -----------------------------------
182
- "MorphologicalPhenomenon": "#37bdac", # turquoise green
183
- "MorphosyntacticPhenomenon": "#43916d", # medium green
184
- "SyntacticPhenomenon": "#53703a", # darker moss
185
 
186
  # -----------------------------------
187
  # Lexicon / semantics / discourse
188
  # -----------------------------------
189
  "LexicalPhenomenon": "#577590", # slate blue
190
  "SemanticPhenomenon": "#4361EE", # vivid blue
191
- "DiscoursePhenomenon": "#3a488c", # deep blue
192
 
193
  # -----------------------------------
194
  # Special / misc
@@ -200,154 +209,6 @@ COLOR_MAP = {
200
  "O": "#FFFFFF"
201
  }
202
 
203
- def render_highlighted_html(text, entities, color_map):
204
- """
205
- Creates:
206
- - clickable category legend
207
- - inline highlighted entities
208
- - stable spacing/layout during filtering
209
- """
210
-
211
- escaped_text = html.escape(text)
212
-
213
- # Sort entities by start position
214
- entities = sorted(entities, key=lambda x: x["start"])
215
-
216
- html_parts = []
217
-
218
- last_idx = 0
219
-
220
- for ent in entities:
221
- start = ent["start"]
222
- end = ent["end"]
223
- label = ent["label"]
224
-
225
- color = color_map.get(label, "#cccccc")
226
-
227
- # normal text
228
- if start > last_idx:
229
- html_parts.append(
230
- html.escape(text[last_idx:start])
231
- )
232
-
233
- entity_text = html.escape(text[start:end])
234
-
235
- html_parts.append(f'''
236
- <span
237
- class="entity entity-{label}"
238
- data-label="{label}"
239
- style="
240
- background:{color};
241
- padding:2px 4px;
242
- margin:1px;
243
- border-radius:4px;
244
- display:inline-block;
245
- white-space:pre-wrap;
246
- "
247
- >
248
- {entity_text}
249
- <span style="
250
- font-size:0.7em;
251
- opacity:0.75;
252
- margin-left:4px;
253
- ">
254
- {label}
255
- </span>
256
- </span>
257
- ''')
258
-
259
- last_idx = end
260
-
261
- # remaining text
262
- if last_idx < len(text):
263
- html_parts.append(
264
- html.escape(text[last_idx:])
265
- )
266
-
267
- categories = sorted(set(ent["label"] for ent in entities))
268
-
269
- legend_html = ""
270
-
271
- for cat in categories:
272
- color = color_map.get(cat, "#cccccc")
273
-
274
- legend_html += f'''
275
- <button
276
- class="legend-btn"
277
- data-label="{cat}"
278
- onclick="toggleCategory('{cat}')"
279
- style="
280
- background:{color};
281
- border:none;
282
- padding:6px 10px;
283
- margin:4px;
284
- border-radius:6px;
285
- cursor:pointer;
286
- font-weight:600;
287
- "
288
- >
289
- {cat}
290
- </button>
291
- '''
292
-
293
- final_html = f'''
294
- <div>
295
-
296
- <div style="margin-bottom:12px;">
297
- {legend_html}
298
- </div>
299
-
300
- <div
301
- id="annotated-text"
302
- style="
303
- line-height:2.1;
304
- white-space:pre-wrap;
305
- font-size:1rem;
306
- "
307
- >
308
- {''.join(html_parts)}
309
- </div>
310
-
311
- </div>
312
-
313
- <script>
314
- let activeCategory = null;
315
-
316
- function toggleCategory(category) {{
317
-
318
- const entities = document.querySelectorAll('.entity');
319
-
320
- // second click = restore all
321
- if (activeCategory === category) {{
322
- activeCategory = null;
323
-
324
- entities.forEach(el => {{
325
- el.style.opacity = '1';
326
- el.style.visibility = 'visible';
327
- }});
328
-
329
- return;
330
- }}
331
-
332
- activeCategory = category;
333
-
334
- entities.forEach(el => {{
335
- if (el.dataset.label === category) {{
336
- el.style.opacity = '1';
337
- el.style.visibility = 'visible';
338
- }} else {{
339
- // IMPORTANT:
340
- // preserve spacing/layout
341
- el.style.opacity = '0.15';
342
- el.style.visibility = 'visible';
343
- }}
344
- }});
345
- }}
346
- </script>
347
- '''
348
-
349
- return final_html
350
-
351
  # ---------------------------------------------------
352
  # UI
353
  # ---------------------------------------------------
@@ -357,7 +218,6 @@ with gr.Blocks(title="Linguistic Annotation Demo") as demo:
357
  gr.Markdown(
358
  """
359
  # Linguistic Annotation Demo
360
-
361
  This Space demonstrates custom linguistic sequence tagging models
362
  for detecting linguistic terminology and phenomena.
363
  """
@@ -389,16 +249,12 @@ for detecting linguistic terminology and phenomena.
389
 
390
  link_output = gr.Markdown()
391
 
392
- '''
393
  highlighted_output = gr.HighlightedText(
394
  label="Annotated Text",
395
  combine_adjacent=True,
396
  color_map=COLOR_MAP,
397
  show_legend=True
398
  )
399
- '''
400
-
401
- highlighted_output = gr.HTML(label="Annotated Text")
402
 
403
  entity_table = gr.Dataframe(
404
  headers=["Text", "Label", "Confidence"],
 
 
1
  import gradio as gr
2
  from transformers import pipeline
3
 
 
39
  m: {
40
  "link": f"https://huggingface.co/{m}",
41
  "usage": f'''from transformers import pipeline
 
42
  ner = pipeline(
43
  "ner",
44
  model="{m}",
45
  aggregation_strategy="simple"
46
  )
 
47
  result = ner("Hello world")
48
  print(result)
49
  '''
 
113
  # ---------------------------------------------------
114
 
115
  def analyze_text(text, model_name):
 
116
  ner = get_model(model_name)
117
 
118
  results = ner(text)
119
 
120
+ # merge subwords first
121
  results = merge_subwords(results)
122
 
123
+ highlighted_text = []
124
+
125
+ last_idx = 0
126
 
127
  table_rows = []
128
 
129
  for ent in results:
130
 
131
+ start = ent["start"]
132
+ end = ent["end"]
133
+
134
  label = ent["entity_group"]
135
 
136
+ # Add normal text before entity
137
+ if start > last_idx:
138
+ highlighted_text.append(
139
+ (text[last_idx:start], None)
140
+ )
141
+
142
+ # Add highlighted entity
143
+ highlighted_text.append(
144
+ (text[start:end], label)
145
+ )
146
+
147
+ last_idx = end
148
 
149
  table_rows.append([
150
  ent["word"],
 
152
  round(ent["score"], 3)
153
  ])
154
 
155
+ # Add remaining text
156
+ if last_idx < len(text):
157
+ highlighted_text.append(
158
+ (text[last_idx:], None)
159
+ )
160
 
161
+ return highlighted_text, table_rows
162
 
163
  # ---------------------------------------------------
164
  # Entity colors
 
168
  # -----------------------------------
169
  # Academic / theoretical
170
  # -----------------------------------
171
+ "AcademicDiscipline": "#264653", # deep teal
172
+ "AmbiguouslyDefinedConcept": "#6D597A", # muted purple
173
+ "UnclassifiedLinguisticConcept": "#9A8C98", # soft gray-purple
174
 
175
  # -----------------------------------
176
  # Language / general linguistic
177
  # -----------------------------------
178
  "LanguageRelatedTerm": "#E9C46A", # warm sand yellow
179
+ "OtherLinguisticTerm": "#A8DADC", # pale cyan
180
+ "LanguageResourceInformation": "#457B9D", # medium blue
181
 
182
  # -----------------------------------
183
  # Phonology / graphemics
184
  # -----------------------------------
185
+ "PhonologicalPhenomenon": "#E76F51", # coral red
186
+ "GraphemicPhenomenon": "#F4A261", # orange
187
 
188
  # -----------------------------------
189
  # Morphology / syntax
190
  # -----------------------------------
191
+ "MorphologicalPhenomenon": "#2A9D8F", # turquoise green
192
+ "MorphosyntacticPhenomenon": "#52B788", # medium green
193
+ "SyntacticPhenomenon": "#40916C", # darker green
194
 
195
  # -----------------------------------
196
  # Lexicon / semantics / discourse
197
  # -----------------------------------
198
  "LexicalPhenomenon": "#577590", # slate blue
199
  "SemanticPhenomenon": "#4361EE", # vivid blue
200
+ "DiscoursePhenomenon": "#B5179E", # magenta-purple
201
 
202
  # -----------------------------------
203
  # Special / misc
 
209
  "O": "#FFFFFF"
210
  }
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  # ---------------------------------------------------
213
  # UI
214
  # ---------------------------------------------------
 
218
  gr.Markdown(
219
  """
220
  # Linguistic Annotation Demo
 
221
  This Space demonstrates custom linguistic sequence tagging models
222
  for detecting linguistic terminology and phenomena.
223
  """
 
249
 
250
  link_output = gr.Markdown()
251
 
 
252
  highlighted_output = gr.HighlightedText(
253
  label="Annotated Text",
254
  combine_adjacent=True,
255
  color_map=COLOR_MAP,
256
  show_legend=True
257
  )
 
 
 
258
 
259
  entity_table = gr.Dataframe(
260
  headers=["Text", "Label", "Confidence"],