File size: 31,748 Bytes
6c37d4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dd46f692-4e46-40f2-bbb3-73240922a3e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "import spacy\n",
    "from spacy.training import offsets_to_biluo_tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d2795420-7d1b-4bff-8662-fe0c7e41cdb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('data/ner-training/03-15-labeled.json', 'r') as file:\n",
    "    raw_data = json.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e8f1f763-f649-4e89-af08-209c72d17093",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Record 8] Mismatch in offset for label 'Usage', text='to develop a sensory garde'\n",
      "[Record 12] Mismatch in offset for label 'Context', text='in a place of lower income, and a very tight budge'\n",
      "[Record 14] Leading/trailing whitespace in span ' This is an extra activity to the curriculum, hence, budgets for seeds, etc are limited.'\n",
      "[Record 20] Mismatch in offset for label 'Usage', text='to support all our children to access the gardening projects around nurser'\n",
      "[Record 20] Leading/trailing whitespace in span ' We are a very outdoors nursery'\n",
      "[Record 25] Mismatch in offset for label 'Context', text='We are a small village Preschool within the main school and We are following the Hygge approach in Preschoo'\n",
      "[Record 34] Leading/trailing whitespace in span 'We have lots of opportunity as we have a pond and bench area but no money to bring it back to life! '\n",
      "[Record 34] Mismatch in offset for label 'Context', text='We have lots of opportunity as we have a pond and bench area but no money to bring it back to life! '\n",
      "[Record 34] Leading/trailing whitespace in span ' bench needs a repaint, the pond could do with some clearing and some logs around the pond would be great to make it a welcomed area.'\n",
      "[Record 34] Mismatch in offset for label 'Usage', text=' bench needs a repaint, the pond could do with some clearing and some logs around the pond would be great to make it a welcomed area.'\n",
      "[Record 40] Mismatch in offset for label 'Context', text='ur gardening club is brilliant, but we never have enough gloves or resources'\n",
      "[Record 41] Leading/trailing whitespace in span ' invest in our outside areas to get children active'\n",
      "[Record 41] Mismatch in offset for label 'Usage', text=' invest in our outside areas to get children active'\n",
      "[Record 42] Leading/trailing whitespace in span ' help us transform more of the concrete box into a naturally diverse play environment.'\n",
      "[Record 42] Mismatch in offset for label 'Usage', text=' help us transform more of the concrete box into a naturally diverse play environment.'\n",
      "[Record 51] Mismatch in offset for label 'Context', text='t Netherbrook, we are committed to hands-on, outdoor learning'\n",
      "[Record 52] Leading/trailing whitespace in span ' to resource this area with compost or seeds / plants'\n",
      "[Record 52] Mismatch in offset for label 'Usage', text=' to resource this area with compost or seeds / plants'\n",
      "[Record 55] Mismatch in offset for label 'Context', text='e have been utilising a large flower bed and the children have been working hard to get it ready to plant flowers and food that we can eat at the end of the summer term'\n",
      "[Record 56] Leading/trailing whitespace in span 'We run a free after-school gardening club Providing opportunities for about twenty children each week '\n",
      "[Record 56] Mismatch in offset for label 'Context', text='We run a free after-school gardening club Providing opportunities for about twenty children each week '\n",
      "[Record 57] Leading/trailing whitespace in span ' to revamp our outdoor area'\n",
      "[Record 57] Mismatch in offset for label 'Usage', text=' to revamp our outdoor area'\n",
      "[Record 58] Leading/trailing whitespace in span ' Our school is in a deprived area so a lot of the children live in flats etc so do not have a garden'\n",
      "[Record 58] Mismatch in offset for label 'Benefit', text='ives them the opportunity to support the school and learn all about gardening'\n",
      "[Record 60] Leading/trailing whitespace in span ' have some planters in the reception outdoor area which need some love and attention!'\n",
      "[Record 60] Mismatch in offset for label 'Context', text=' have some planters in the reception outdoor area which need some love and attention!'\n",
      "[Record 64] Mismatch in offset for label 'Usage', text='to tidy up the village and local care homes with flower displays and planter'\n",
      "[Record 70] Leading/trailing whitespace in span ' Our outdoor area is in desperate need of some love but unfortunately there just isn't the budget for it'\n",
      "[Record 72] Leading/trailing whitespace in span 'We plan to build sensory gardens (such as a Zen garden and fairy garden) '\n",
      "[Record 72] Mismatch in offset for label 'Context', text='We plan to build sensory gardens (such as a Zen garden and fairy garden) '\n",
      "[Record 73] Leading/trailing whitespace in span ' These groups are focused upon our Catholic Social Teaching Principals, one of these being Stewardship'\n",
      "[Record 73] Leading/trailing whitespace in span ' This pupil led group are starting a small project to help create a prayer and reflection space in the Early Years outdoor area'\n",
      "[Record 92] Leading/trailing whitespace in span ' allowing the children to escape there for some peace and enjoy the colours, smells, and textures the garden will bring'\n",
      "[Record 92] Mismatch in offset for label 'Benefit', text=' allowing the children to escape there for some peace and enjoy the colours, smells, and textures the garden will bring'\n",
      "[Record 100] Leading/trailing whitespace in span ' We have dedicated areas across school for students engage with gardening that have fallen into disrepair due to lack of resources'\n",
      "[Record 100] Mismatch in offset for label 'Context', text=' We have dedicated areas across school for students engage with gardening that have fallen into disrepair due to lack of resources'\n",
      "[Record 101] Leading/trailing whitespace in span ' to set up a sensory room for pupils'\n",
      "[Record 101] Mismatch in offset for label 'Usage', text=' to set up a sensory room for pupils'\n",
      "[Record 111] Leading/trailing whitespace in span ' supplies and resources'\n",
      "[Record 111] Mismatch in offset for label 'Usage', text=' supplies and resources'\n",
      "[Record 112] Leading/trailing whitespace in span ' enhance our Learning Support resource room'\n",
      "[Record 112] Mismatch in offset for label 'Benefit', text=' enhance our Learning Support resource room'\n",
      "[Record 114] Leading/trailing whitespace in span ' to improve our school hall or to buy new swing seats for the secondary'\n",
      "[Record 114] Mismatch in offset for label 'Usage', text=' to improve our school hall or to buy new swing seats for the secondary'\n",
      "[Record 126] Mismatch in offset for label 'Usage', text='to develop a small stage area and secure props, costumes, and sound equipmen'\n",
      "[Record 134] Mismatch in offset for label 'Usage', text='o convert an unused space into a community heritage hub featuring interactive exhibits and archival collections'\n",
      "[Record 141] Leading/trailing whitespace in span ' our children have limited exposure to performing arts'\n",
      "[Record 141] Mismatch in offset for label 'Context', text=' our children have limited exposure to performing arts'\n",
      "[Record 148] Leading/trailing whitespace in span ' to purchase sensory equipment, including weighted blankets, fidget toys, and calming lighting'\n",
      "[Record 148] Mismatch in offset for label 'Usage', text=' to purchase sensory equipment, including weighted blankets, fidget toys, and calming lighting'\n",
      "[Record 151] Leading/trailing whitespace in span ' improve literacy and encourage a love of reading in reluctant readers'\n",
      "[Record 151] Mismatch in offset for label 'Benefit', text=' improve literacy and encourage a love of reading in reluctant readers'\n",
      "[Record 157] Leading/trailing whitespace in span ' improve focus, behaviour, and attendance'\n",
      "[Record 157] Mismatch in offset for label 'Benefit', text=' improve focus, behaviour, and attendance'\n",
      "[Record 160] Leading/trailing whitespace in span ' training materials and team-building activities'\n",
      "[Record 160] Mismatch in offset for label 'Usage', text=' training materials and team-building activities'\n",
      "[Record 163] Leading/trailing whitespace in span ' ensure equitable access to learning and promote independence.'\n",
      "[Record 163] Mismatch in offset for label 'Benefit', text=' ensure equitable access to learning and promote independence.'\n",
      "[Record 175] Leading/trailing whitespace in span ' to purchase soft floor mats, noise-canceling headphones, and interactive light panels'\n",
      "[Record 175] Mismatch in offset for label 'Usage', text=' to purchase soft floor mats, noise-canceling headphones, and interactive light panels'\n",
      "[Record 177] Leading/trailing whitespace in span ' I’ve watched our old kiln become almost unusable, and we’re constantly low on clay and paints'\n",
      "[Record 177] Mismatch in offset for label 'Context', text=' I’ve watched our old kiln become almost unusable, and we’re constantly low on clay and paints'\n",
      "[Record 178] Mismatch in offset for label 'Benefit', text='Building trust, resilience, and communication outside the classroom often translates into better collaboration back at schoo'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have quite a high level of ALN children in our ...\" with entities \"[(101, 127, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We would love to enhance the environment and resou...\" with entities \"[(190, 240, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"To win the money or the gardening bundle would be ...\" with entities \"[(59, 133, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are a small village Preschool within the main s...\" with entities \"[(0, 107, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We would love to add to our garden area in the bac...\" with entities \"[(80, 180, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We would love to add to our garden area in the bac...\" with entities \"[(183, 316, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our gardening club is brilliant, but we never have...\" with entities \"[(1, 77, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are an infant school in the middle of a city bu...\" with entities \"[(213, 264, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are an urban school with a concrete playgroung....\" with entities \"[(238, 324, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"At Netherbrook, we are committed to hands-on, outd...\" with entities \"[(1, 62, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have recently had our outdoor area redesigned a...\" with entities \"[(125, 178, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our early years classes have been loving our mud k...\" with entities \"[(107, 275, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We run a free after-school gardening club Providin...\" with entities \"[(0, 102, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are currently trying to revamp our outdoor area...\" with entities \"[(23, 50, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"The Eco-Council have been working hard to prepare ...\" with entities \"[(506, 583, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are an urban school without a green space (e.g....\" with entities \"[(68, 153, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are a small, rural school of 60 children with l...\" with entities \"[(389, 465, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"I work in an SEN school, where we are introducing ...\" with entities \"[(90, 163, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have a patch of grass that we are planning to t...\" with entities \"[(433, 552, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"At our school, no child gets left behind. We offer...\" with entities \"[(266, 396, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are moving to a new site in September and I wou...\" with entities \"[(57, 93, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our children come from very difficult backgrounds ...\" with entities \"[(218, 241, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Budgets are tight, needs are great. We're a small ...\" with entities \"[(393, 436, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Greetings from our SEN school Harford Manor in Nor...\" with entities \"[(1004, 1075, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our school is proud to have recently formed a dram...\" with entities \"[(156, 232, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"At Heritage Intermediate, our curriculum lacks opp...\" with entities \"[(108, 219, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Coleridge Primary hopes to launch a drama club tha...\" with entities \"[(128, 182, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our school has recently seen an increase in childr...\" with entities \"[(113, 207, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our Year 5 cohort is behind in reading comprehensi...\" with entities \"[(162, 232, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our pupils often arrive at school hungry. We would...\" with entities \"[(137, 178, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are launching a peer mentoring scheme for our Y...\" with entities \"[(94, 142, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have a number of students with visual impairmen...\" with entities \"[(155, 217, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"During my time working with students who have sens...\" with entities \"[(265, 351, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"As the lead art teacher, I’ve watched our old kiln...\" with entities \"[(24, 118, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n",
      "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"I help run a leadership program for high school ju...\" with entities \"[(373, 497, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "debug_nlp = spacy.blank(\"en\")\n",
    "\n",
    "for i, record in enumerate(raw_data):\n",
    "    text = record[\"additional_info\"]\n",
    "    doc = debug_nlp.make_doc(text)\n",
    "    \n",
    "    for ann in record[\"label\"]:\n",
    "        label = ann[\"labels\"][0]\n",
    "        start, end = ann[\"start\"], ann[\"end\"]\n",
    "        span_text = text[start:end]\n",
    "\n",
    "        # Quick check: leading or trailing whitespace?\n",
    "        if span_text != span_text.strip():\n",
    "            print(f\"[Record {i}] Leading/trailing whitespace in span '{span_text}'\")\n",
    "\n",
    "        # Attempt to convert offset(s) -> BILUO\n",
    "        try:\n",
    "            biluo_tags = offsets_to_biluo_tags(doc, [(start, end, label)])\n",
    "            # If any tag is '-' -> it means partial mismatch\n",
    "            if any(t == \"-\" for t in biluo_tags):\n",
    "                print(f\"[Record {i}] Mismatch in offset for label '{label}', text='{span_text}'\")\n",
    "        except Exception as e:\n",
    "            print(f\"[Record {i}] Error converting offsets for '{span_text}': {e}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a219e932-b161-4634-8914-5d5a9f7def54",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "\n",
    "def trim_and_fix_offsets(raw_data, context_key=\"additional_info\"):\n",
    "    \"\"\"\n",
    "    Attempt to fix leading/trailing whitespace in spans and recalc offsets.\n",
    "    Then do a local substring search to fix minor misalignments.\n",
    "    \"\"\"\n",
    "    fixed_data = []\n",
    "    for i, record in enumerate(raw_data):\n",
    "        text = record[context_key]\n",
    "        new_labels = []\n",
    "        for ann in record[\"label\"]:\n",
    "            label = ann[\"labels\"][0]\n",
    "            old_start, old_end = ann[\"start\"], ann[\"end\"]\n",
    "            original_substring = text[old_start:old_end]\n",
    "            trimmed_substring = original_substring.strip()\n",
    "            \n",
    "            # 1) Trim leading/trailing whitespace offsets\n",
    "            # Move start forward while it points to space\n",
    "            start = old_start\n",
    "            while start < old_end and text[start].isspace():\n",
    "                start += 1\n",
    "            # Move end backward while it points to space\n",
    "            end = old_end\n",
    "            while end > start and text[end - 1].isspace():\n",
    "                end -= 1\n",
    "            \n",
    "            # After naive trimming, see if the substring still matches\n",
    "            new_substring = text[start:end]\n",
    "            if new_substring == trimmed_substring:\n",
    "                # Great, we can trust these offsets directly\n",
    "                pass\n",
    "            else:\n",
    "                # Possibly there's hidden Unicode or the original offset was off.\n",
    "                # We'll do a local substring search around `old_start`.\n",
    "                # We'll search for `trimmed_substring` in a window of +/- 30 chars.\n",
    "                window_size = 30\n",
    "                \n",
    "                # Define a safe search window in the text\n",
    "                search_start = max(0, old_start - window_size)\n",
    "                search_end = min(len(text), old_end + window_size)\n",
    "                window_text = text[search_start:search_end]\n",
    "                \n",
    "                # Try to find the first occurrence of trimmed_substring in that window\n",
    "                local_pos = window_text.find(trimmed_substring)\n",
    "                if local_pos != -1:\n",
    "                    # Recalc absolute offset\n",
    "                    start = search_start + local_pos\n",
    "                    end = start + len(trimmed_substring)\n",
    "                    new_substring = text[start:end]\n",
    "                else:\n",
    "                    # We failed to find it in the local region\n",
    "                    print(f\"[Record {i}] Can't find '{trimmed_substring}' near offset {old_start}-{old_end}\")\n",
    "                    # We'll leave this annotation as-is or skip it\n",
    "                    start, end = old_start, old_end\n",
    "                    new_substring = original_substring\n",
    "\n",
    "            new_labels.append({\n",
    "                \"start\": start,\n",
    "                \"end\": end,\n",
    "                \"text\": new_substring,\n",
    "                \"labels\": [label]\n",
    "            })\n",
    "        \n",
    "        # Update the record with the new label data\n",
    "        new_record = dict(record)\n",
    "        new_record[\"label\"] = new_labels\n",
    "        fixed_data.append(new_record)\n",
    "    \n",
    "    return fixed_data\n",
    "\n",
    "\n",
    "# Usage example:\n",
    "# 1) Read your JSON\n",
    "with open(\"data/ner-training/03-15-labeled.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    raw_data = json.load(f)\n",
    "\n",
    "# 2) Fix whitespace + do local substring search\n",
    "fixed_data = trim_and_fix_offsets(raw_data, context_key=\"additional_info\")\n",
    "\n",
    "# 3) Write the fixed data back out\n",
    "with open(\"data/ner-training/03-15-labeled-fixed.json\", \"w\", encoding=\"utf-8\") as out:\n",
    "    json.dump(fixed_data, out, indent=2, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2a1a315-b7c5-4a91-875a-c540e05efe78",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}