c-ho committed on
Commit
5c82ebc
·
verified ·
1 Parent(s): f520020

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -16
app.py CHANGED
@@ -119,20 +119,34 @@ def analyze_text(text, model_name):
119
 
120
  results = ner(text)
121
 
 
122
  results = merge_subwords(results)
123
 
124
- entities = []
 
 
125
 
126
  table_rows = []
127
 
128
  for ent in results:
 
 
 
 
129
  label = ent["entity_group"]
130
 
131
- entities.append({
132
- "start": ent["start"],
133
- "end": ent["end"],
134
- "label": label,
135
- })
 
 
 
 
 
 
 
136
 
137
  table_rows.append([
138
  ent["word"],
@@ -140,23 +154,113 @@ def analyze_text(text, model_name):
140
  round(ent["score"], 3)
141
  ])
142
 
143
- highlighted_output = {
144
- "text": text,
145
- "entities": entities
146
- }
 
147
 
148
- return highlighted_output, table_rows
149
 
150
  # ---------------------------------------------------
151
  # Entity colors
152
  # ---------------------------------------------------
153
 
154
  COLOR_MAP = {
155
- "LanguageRelatedTerm": "#ffcc00",
156
- "OtherLinguisticTerm": "#99ccff",
157
- "PhonologicalPhenomenon": "#ff9999",
158
- "MorphosyntacticPhenomenon": "#99ff99",
159
- "TOPNODE_DUMMY": "#dddddd",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  }
161
 
162
  # ---------------------------------------------------
 
119
 
120
  results = ner(text)
121
 
122
+ # merge subwords first
123
  results = merge_subwords(results)
124
 
125
+ highlighted_text = []
126
+
127
+ last_idx = 0
128
 
129
  table_rows = []
130
 
131
  for ent in results:
132
+
133
+ start = ent["start"]
134
+ end = ent["end"]
135
+
136
  label = ent["entity_group"]
137
 
138
+ # Add normal text before entity
139
+ if start > last_idx:
140
+ highlighted_text.append(
141
+ (text[last_idx:start], None)
142
+ )
143
+
144
+ # Add highlighted entity
145
+ highlighted_text.append(
146
+ (text[start:end], label)
147
+ )
148
+
149
+ last_idx = end
150
 
151
  table_rows.append([
152
  ent["word"],
 
154
  round(ent["score"], 3)
155
  ])
156
 
157
+ # Add remaining text
158
+ if last_idx < len(text):
159
+ highlighted_text.append(
160
+ (text[last_idx:], None)
161
+ )
162
 
163
+ return highlighted_text, table_rows
164
 
165
  # ---------------------------------------------------
166
  # Entity colors
167
  # ---------------------------------------------------
168
 
169
  COLOR_MAP = {
170
+ # -----------------------------------
171
+ # Academic / theoretical
172
+ # -----------------------------------
173
+ "AcademicDiscipline": "#264653", # deep teal
174
+ "AmbiguouslyDefinedConcept": "#6D597A", # muted purple
175
+ "UnclassifiedLinguisticConcept": "#9A8C98", # soft gray-purple
176
+
177
+ # -----------------------------------
178
+ # Language / general linguistic
179
+ # -----------------------------------
180
+ "LanguageRelatedTerm": "#E9C46A", # warm sand yellow
181
+ "OtherLinguisticTerm": "#A8DADC", # pale cyan
182
+ "LanguageResourceInformation": "#457B9D", # medium blue
183
+
184
+ # -----------------------------------
185
+ # Phonology / graphemics
186
+ # -----------------------------------
187
+ "PhonologicalPhenomenon": "#E76F51", # coral red
188
+ "GraphemicPhenomenon": "#F4A261", # orange
189
+
190
+ # -----------------------------------
191
+ # Morphology / syntax
192
+ # -----------------------------------
193
+ "MorphologicalPhenomenon": "#2A9D8F", # turquoise green
194
+ "MorphosyntacticPhenomenon": "#52B788", # medium green
195
+ "SyntacticPhenomenon": "#40916C", # darker green
196
+
197
+ # -----------------------------------
198
+ # Lexicon / semantics / discourse
199
+ # -----------------------------------
200
+ "LexicalPhenomenon": "#577590", # slate blue
201
+ "SemanticPhenomenon": "#4361EE", # vivid blue
202
+ "DiscoursePhenomenon": "#B5179E", # magenta-purple
203
+
204
+ # -----------------------------------
205
+ # Special / misc
206
+ # -----------------------------------
207
+ "NEW_TAG": "#FF006E", # neon pink
208
+ "TOPNODE_DUMMY": "#BDBDBD", # neutral gray
209
+
210
+ # -----------------------------------
211
+ # Optional BIO aliases
212
+ # (safe fallback if model outputs raw BIO labels)
213
+ # -----------------------------------
214
+ "B-AcademicDiscipline": "#264653",
215
+ "I-AcademicDiscipline": "#264653",
216
+
217
+ "B-AmbiguouslyDefinedConcept": "#6D597A",
218
+ "I-AmbiguouslyDefinedConcept": "#6D597A",
219
+
220
+ "B-DiscoursePhenomenon": "#B5179E",
221
+ "I-DiscoursePhenomenon": "#B5179E",
222
+
223
+ "B-GraphemicPhenomenon": "#F4A261",
224
+ "I-GraphemicPhenomenon": "#F4A261",
225
+
226
+ "B-LanguageRelatedTerm": "#E9C46A",
227
+ "I-LanguageRelatedTerm": "#E9C46A",
228
+
229
+ "B-LanguageResourceInformation": "#457B9D",
230
+ "I-LanguageResourceInformation": "#457B9D",
231
+
232
+ "B-LexicalPhenomenon": "#577590",
233
+ "I-LexicalPhenomenon": "#577590",
234
+
235
+ "B-MorphologicalPhenomenon": "#2A9D8F",
236
+ "I-MorphologicalPhenomenon": "#2A9D8F",
237
+
238
+ "B-MorphosyntacticPhenomenon": "#52B788",
239
+ "I-MorphosyntacticPhenomenon": "#52B788",
240
+
241
+ "B-OtherLinguisticTerm": "#A8DADC",
242
+ "I-OtherLinguisticTerm": "#A8DADC",
243
+
244
+ "B-PhonologicalPhenomenon": "#E76F51",
245
+ "I-PhonologicalPhenomenon": "#E76F51",
246
+
247
+ "B-SemanticPhenomenon": "#4361EE",
248
+ "I-SemanticPhenomenon": "#4361EE",
249
+
250
+ "B-SyntacticPhenomenon": "#40916C",
251
+ "I-SyntacticPhenomenon": "#40916C",
252
+
253
+ "B-TOPNODE_DUMMY": "#BDBDBD",
254
+ "I-TOPNODE_DUMMY": "#BDBDBD",
255
+
256
+ "B-UnclassifiedLinguisticConcept": "#9A8C98",
257
+ "I-UnclassifiedLinguisticConcept": "#9A8C98",
258
+
259
+ "B-NEW_TAG": "#FF006E",
260
+ "I-NEW_TAG": "#FF006E",
261
+
262
+ # Outside tag
263
+ "O": "#FFFFFF"
264
  }
265
 
266
  # ---------------------------------------------------