File size: 31,748 Bytes
6c37d4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "dd46f692-4e46-40f2-bbb3-73240922a3e0",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import random\n",
"import warnings\n",
"\n",
"import spacy\n",
"from spacy.training import offsets_to_biluo_tags"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d2795420-7d1b-4bff-8662-fe0c7e41cdb0",
"metadata": {},
"outputs": [],
"source": [
"# Load the labelled export (a list of records). The encoding is pinned to UTF-8\n",
"# because the texts contain non-ASCII characters (e.g. curly apostrophes); decoding\n",
"# with a platform-default codec could change the string and shift every offset.\n",
"with open('data/ner-training/03-15-labeled.json', 'r', encoding='utf-8') as file:\n",
"    raw_data = json.load(file)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e8f1f763-f649-4e89-af08-209c72d17093",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Record 8] Mismatch in offset for label 'Usage', text='to develop a sensory garde'\n",
"[Record 12] Mismatch in offset for label 'Context', text='in a place of lower income, and a very tight budge'\n",
"[Record 14] Leading/trailing whitespace in span ' This is an extra activity to the curriculum, hence, budgets for seeds, etc are limited.'\n",
"[Record 20] Mismatch in offset for label 'Usage', text='to support all our children to access the gardening projects around nurser'\n",
"[Record 20] Leading/trailing whitespace in span ' We are a very outdoors nursery'\n",
"[Record 25] Mismatch in offset for label 'Context', text='We are a small village Preschool within the main school and We are following the Hygge approach in Preschoo'\n",
"[Record 34] Leading/trailing whitespace in span 'We have lots of opportunity as we have a pond and bench area but no money to bring it back to life! '\n",
"[Record 34] Mismatch in offset for label 'Context', text='We have lots of opportunity as we have a pond and bench area but no money to bring it back to life! '\n",
"[Record 34] Leading/trailing whitespace in span ' bench needs a repaint, the pond could do with some clearing and some logs around the pond would be great to make it a welcomed area.'\n",
"[Record 34] Mismatch in offset for label 'Usage', text=' bench needs a repaint, the pond could do with some clearing and some logs around the pond would be great to make it a welcomed area.'\n",
"[Record 40] Mismatch in offset for label 'Context', text='ur gardening club is brilliant, but we never have enough gloves or resources'\n",
"[Record 41] Leading/trailing whitespace in span ' invest in our outside areas to get children active'\n",
"[Record 41] Mismatch in offset for label 'Usage', text=' invest in our outside areas to get children active'\n",
"[Record 42] Leading/trailing whitespace in span ' help us transform more of the concrete box into a naturally diverse play environment.'\n",
"[Record 42] Mismatch in offset for label 'Usage', text=' help us transform more of the concrete box into a naturally diverse play environment.'\n",
"[Record 51] Mismatch in offset for label 'Context', text='t Netherbrook, we are committed to hands-on, outdoor learning'\n",
"[Record 52] Leading/trailing whitespace in span ' to resource this area with compost or seeds / plants'\n",
"[Record 52] Mismatch in offset for label 'Usage', text=' to resource this area with compost or seeds / plants'\n",
"[Record 55] Mismatch in offset for label 'Context', text='e have been utilising a large flower bed and the children have been working hard to get it ready to plant flowers and food that we can eat at the end of the summer term'\n",
"[Record 56] Leading/trailing whitespace in span 'We run a free after-school gardening club Providing opportunities for about twenty children each week '\n",
"[Record 56] Mismatch in offset for label 'Context', text='We run a free after-school gardening club Providing opportunities for about twenty children each week '\n",
"[Record 57] Leading/trailing whitespace in span ' to revamp our outdoor area'\n",
"[Record 57] Mismatch in offset for label 'Usage', text=' to revamp our outdoor area'\n",
"[Record 58] Leading/trailing whitespace in span ' Our school is in a deprived area so a lot of the children live in flats etc so do not have a garden'\n",
"[Record 58] Mismatch in offset for label 'Benefit', text='ives them the opportunity to support the school and learn all about gardening'\n",
"[Record 60] Leading/trailing whitespace in span ' have some planters in the reception outdoor area which need some love and attention!'\n",
"[Record 60] Mismatch in offset for label 'Context', text=' have some planters in the reception outdoor area which need some love and attention!'\n",
"[Record 64] Mismatch in offset for label 'Usage', text='to tidy up the village and local care homes with flower displays and planter'\n",
"[Record 70] Leading/trailing whitespace in span ' Our outdoor area is in desperate need of some love but unfortunately there just isn't the budget for it'\n",
"[Record 72] Leading/trailing whitespace in span 'We plan to build sensory gardens (such as a Zen garden and fairy garden) '\n",
"[Record 72] Mismatch in offset for label 'Context', text='We plan to build sensory gardens (such as a Zen garden and fairy garden) '\n",
"[Record 73] Leading/trailing whitespace in span ' These groups are focused upon our Catholic Social Teaching Principals, one of these being Stewardship'\n",
"[Record 73] Leading/trailing whitespace in span ' This pupil led group are starting a small project to help create a prayer and reflection space in the Early Years outdoor area'\n",
"[Record 92] Leading/trailing whitespace in span ' allowing the children to escape there for some peace and enjoy the colours, smells, and textures the garden will bring'\n",
"[Record 92] Mismatch in offset for label 'Benefit', text=' allowing the children to escape there for some peace and enjoy the colours, smells, and textures the garden will bring'\n",
"[Record 100] Leading/trailing whitespace in span ' We have dedicated areas across school for students engage with gardening that have fallen into disrepair due to lack of resources'\n",
"[Record 100] Mismatch in offset for label 'Context', text=' We have dedicated areas across school for students engage with gardening that have fallen into disrepair due to lack of resources'\n",
"[Record 101] Leading/trailing whitespace in span ' to set up a sensory room for pupils'\n",
"[Record 101] Mismatch in offset for label 'Usage', text=' to set up a sensory room for pupils'\n",
"[Record 111] Leading/trailing whitespace in span ' supplies and resources'\n",
"[Record 111] Mismatch in offset for label 'Usage', text=' supplies and resources'\n",
"[Record 112] Leading/trailing whitespace in span ' enhance our Learning Support resource room'\n",
"[Record 112] Mismatch in offset for label 'Benefit', text=' enhance our Learning Support resource room'\n",
"[Record 114] Leading/trailing whitespace in span ' to improve our school hall or to buy new swing seats for the secondary'\n",
"[Record 114] Mismatch in offset for label 'Usage', text=' to improve our school hall or to buy new swing seats for the secondary'\n",
"[Record 126] Mismatch in offset for label 'Usage', text='to develop a small stage area and secure props, costumes, and sound equipmen'\n",
"[Record 134] Mismatch in offset for label 'Usage', text='o convert an unused space into a community heritage hub featuring interactive exhibits and archival collections'\n",
"[Record 141] Leading/trailing whitespace in span ' our children have limited exposure to performing arts'\n",
"[Record 141] Mismatch in offset for label 'Context', text=' our children have limited exposure to performing arts'\n",
"[Record 148] Leading/trailing whitespace in span ' to purchase sensory equipment, including weighted blankets, fidget toys, and calming lighting'\n",
"[Record 148] Mismatch in offset for label 'Usage', text=' to purchase sensory equipment, including weighted blankets, fidget toys, and calming lighting'\n",
"[Record 151] Leading/trailing whitespace in span ' improve literacy and encourage a love of reading in reluctant readers'\n",
"[Record 151] Mismatch in offset for label 'Benefit', text=' improve literacy and encourage a love of reading in reluctant readers'\n",
"[Record 157] Leading/trailing whitespace in span ' improve focus, behaviour, and attendance'\n",
"[Record 157] Mismatch in offset for label 'Benefit', text=' improve focus, behaviour, and attendance'\n",
"[Record 160] Leading/trailing whitespace in span ' training materials and team-building activities'\n",
"[Record 160] Mismatch in offset for label 'Usage', text=' training materials and team-building activities'\n",
"[Record 163] Leading/trailing whitespace in span ' ensure equitable access to learning and promote independence.'\n",
"[Record 163] Mismatch in offset for label 'Benefit', text=' ensure equitable access to learning and promote independence.'\n",
"[Record 175] Leading/trailing whitespace in span ' to purchase soft floor mats, noise-canceling headphones, and interactive light panels'\n",
"[Record 175] Mismatch in offset for label 'Usage', text=' to purchase soft floor mats, noise-canceling headphones, and interactive light panels'\n",
"[Record 177] Leading/trailing whitespace in span ' I’ve watched our old kiln become almost unusable, and we’re constantly low on clay and paints'\n",
"[Record 177] Mismatch in offset for label 'Context', text=' I’ve watched our old kiln become almost unusable, and we’re constantly low on clay and paints'\n",
"[Record 178] Mismatch in offset for label 'Benefit', text='Building trust, resilience, and communication outside the classroom often translates into better collaboration back at schoo'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have quite a high level of ALN children in our ...\" with entities \"[(101, 127, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We would love to enhance the environment and resou...\" with entities \"[(190, 240, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"To win the money or the gardening bundle would be ...\" with entities \"[(59, 133, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are a small village Preschool within the main s...\" with entities \"[(0, 107, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We would love to add to our garden area in the bac...\" with entities \"[(80, 180, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We would love to add to our garden area in the bac...\" with entities \"[(183, 316, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our gardening club is brilliant, but we never have...\" with entities \"[(1, 77, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are an infant school in the middle of a city bu...\" with entities \"[(213, 264, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are an urban school with a concrete playgroung....\" with entities \"[(238, 324, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"At Netherbrook, we are committed to hands-on, outd...\" with entities \"[(1, 62, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have recently had our outdoor area redesigned a...\" with entities \"[(125, 178, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our early years classes have been loving our mud k...\" with entities \"[(107, 275, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We run a free after-school gardening club Providin...\" with entities \"[(0, 102, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are currently trying to revamp our outdoor area...\" with entities \"[(23, 50, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"The Eco-Council have been working hard to prepare ...\" with entities \"[(506, 583, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are an urban school without a green space (e.g....\" with entities \"[(68, 153, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are a small, rural school of 60 children with l...\" with entities \"[(389, 465, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"I work in an SEN school, where we are introducing ...\" with entities \"[(90, 163, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have a patch of grass that we are planning to t...\" with entities \"[(433, 552, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"At our school, no child gets left behind. We offer...\" with entities \"[(266, 396, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are moving to a new site in September and I wou...\" with entities \"[(57, 93, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our children come from very difficult backgrounds ...\" with entities \"[(218, 241, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Budgets are tight, needs are great. We're a small ...\" with entities \"[(393, 436, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Greetings from our SEN school Harford Manor in Nor...\" with entities \"[(1004, 1075, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our school is proud to have recently formed a dram...\" with entities \"[(156, 232, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"At Heritage Intermediate, our curriculum lacks opp...\" with entities \"[(108, 219, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Coleridge Primary hopes to launch a drama club tha...\" with entities \"[(128, 182, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our school has recently seen an increase in childr...\" with entities \"[(113, 207, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our Year 5 cohort is behind in reading comprehensi...\" with entities \"[(162, 232, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our pupils often arrive at school hungry. We would...\" with entities \"[(137, 178, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are launching a peer mentoring scheme for our Y...\" with entities \"[(94, 142, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have a number of students with visual impairmen...\" with entities \"[(155, 217, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"During my time working with students who have sens...\" with entities \"[(265, 351, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"As the lead art teacher, I’ve watched our old kiln...\" with entities \"[(24, 118, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n",
"/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"I help run a leadership program for high school ju...\" with entities \"[(373, 497, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
" warnings.warn(\n"
]
}
],
"source": [
"# Report every annotation whose character offsets are suspicious: spans that begin\n",
"# or end on whitespace, and spans that do not line up with spaCy token boundaries.\n",
"debug_nlp = spacy.blank(\"en\")\n",
"\n",
"with warnings.catch_warnings():\n",
"    # spaCy raises a W030 UserWarning for each span it cannot align. The loop below\n",
"    # already prints one line per misaligned span, so the duplicate warnings are muted\n",
"    # for the duration of this check only.\n",
"    warnings.filterwarnings(\"ignore\", message=r\"\\[W030\\]\", category=UserWarning)\n",
"\n",
"    for i, record in enumerate(raw_data):\n",
"        text = record[\"additional_info\"]\n",
"        doc = debug_nlp.make_doc(text)\n",
"\n",
"        # A record with no \"label\" entry is treated as having no annotations.\n",
"        for ann in record.get(\"label\") or []:\n",
"            label = ann[\"labels\"][0]\n",
"            start, end = ann[\"start\"], ann[\"end\"]\n",
"            span_text = text[start:end]\n",
"\n",
"            # Quick check: leading or trailing whitespace?\n",
"            if span_text != span_text.strip():\n",
"                print(f\"[Record {i}] Leading/trailing whitespace in span '{span_text}'\")\n",
"\n",
"            # Convert this single span to BILUO tags; a '-' tag marks a token that the\n",
"            # span only partially covers, i.e. the offsets cut through a token.\n",
"            try:\n",
"                biluo_tags = offsets_to_biluo_tags(doc, [(start, end, label)])\n",
"                if any(t == \"-\" for t in biluo_tags):\n",
"                    print(f\"[Record {i}] Mismatch in offset for label '{label}', text='{span_text}'\")\n",
"            except Exception as e:\n",
"                # Keep scanning the remaining annotations; the error is reported, not hidden.\n",
"                print(f\"[Record {i}] Error converting offsets for '{span_text}': {e}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a219e932-b161-4634-8914-5d5a9f7def54",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import re\n",
"\n",
"def _trim_span(text, start, end):\n",
"    \"\"\"Clamp (start, end) to `text`, then shrink it past surrounding whitespace.\"\"\"\n",
"    end = max(0, min(end, len(text)))\n",
"    start = max(0, min(start, end))\n",
"    while start < end and text[start].isspace():\n",
"        start += 1\n",
"    while end > start and text[end - 1].isspace():\n",
"        end -= 1\n",
"    return start, end\n",
"\n",
"\n",
"def _find_nearest(text, needle, anchor, lo, hi):\n",
"    \"\"\"Return the start of the occurrence of `needle` inside text[lo:hi] closest to `anchor`, or None.\"\"\"\n",
"    best = None\n",
"    pos = text.find(needle, lo, hi)\n",
"    while pos != -1:\n",
"        if best is None or abs(pos - anchor) < abs(best - anchor):\n",
"            best = pos\n",
"        pos = text.find(needle, pos + 1, hi)\n",
"    return best\n",
"\n",
"\n",
"def trim_and_fix_offsets(raw_data, context_key=\"additional_info\", window_size=30):\n",
"    \"\"\"\n",
"    Strip whitespace from annotated spans and repair slightly misplaced offsets.\n",
"\n",
"    Each span is first shrunk so that it neither starts nor ends on whitespace.\n",
"    If the annotation also carries its own \"text\" value and the offsets do not\n",
"    select that text, the text is searched for within `window_size` characters\n",
"    of the original span; the occurrence closest to the original start is used.\n",
"    When nothing is found a message is printed and the whitespace-trimmed\n",
"    offsets are kept.\n",
"\n",
"    Args:\n",
"        raw_data: list of records. Each record holds the document string under\n",
"            `context_key` and, optionally, a list of annotations under \"label\";\n",
"            every annotation has \"start\", \"end\" and \"labels\".\n",
"        context_key: record key that holds the annotated text.\n",
"        window_size: number of characters searched on either side of the\n",
"            original span when re-anchoring it.\n",
"\n",
"    Returns:\n",
"        A new list of shallow-copied records whose \"label\" entries are dicts\n",
"        with \"start\", \"end\", \"text\" and \"labels\" (first label only). The input\n",
"        records are not modified.\n",
"    \"\"\"\n",
"    fixed_data = []\n",
"    for i, record in enumerate(raw_data):\n",
"        text = record[context_key]\n",
"        new_labels = []\n",
"        # A record with no \"label\" entry is treated as having no annotations.\n",
"        for ann in record.get(\"label\") or []:\n",
"            label = ann[\"labels\"][0]\n",
"            old_start, old_end = ann[\"start\"], ann[\"end\"]\n",
"\n",
"            # 1) Trim leading/trailing whitespace off the span.\n",
"            start, end = _trim_span(text, old_start, old_end)\n",
"\n",
"            # 2) Re-anchor the span if it does not select the labelled text.\n",
"            # The reference must come from the annotation itself: comparing the\n",
"            # slice with its own stripped copy can never fail, so it would never\n",
"            # trigger the search.\n",
"            # NOTE(review): assumes the export stores the labelled string under\n",
"            # \"text\" - confirm. Without it only the whitespace trim is applied.\n",
"            expected = ann.get(\"text\")\n",
"            expected = expected.strip() if isinstance(expected, str) else \"\"\n",
"            if expected and text[start:end] != expected:\n",
"                search_start = max(0, old_start - window_size)\n",
"                search_end = min(len(text), old_end + window_size)\n",
"                found = _find_nearest(text, expected, old_start, search_start, search_end)\n",
"                if found is None:\n",
"                    print(f\"[Record {i}] Can't find '{expected}' near offset {old_start}-{old_end}\")\n",
"                else:\n",
"                    start, end = found, found + len(expected)\n",
"\n",
"            new_labels.append({\n",
"                \"start\": start,\n",
"                \"end\": end,\n",
"                \"text\": text[start:end],\n",
"                \"labels\": [label]\n",
"            })\n",
"\n",
"        # Copy the record so the caller's data keeps its original annotations.\n",
"        new_record = dict(record)\n",
"        new_record[\"label\"] = new_labels\n",
"        fixed_data.append(new_record)\n",
"\n",
"    return fixed_data\n",
"\n",
"\n",
"# Example run: load the labelled export, repair its spans, save a corrected copy.\n",
"# Step 1 - load the labelled export.\n",
"with open(\"data/ner-training/03-15-labeled.json\", mode=\"r\", encoding=\"utf-8\") as labeled_file:\n",
"    raw_data = json.loads(labeled_file.read())\n",
"\n",
"# Step 2 - trim whitespace and re-check each span's offsets.\n",
"fixed_data = trim_and_fix_offsets(raw_data, \"additional_info\")\n",
"\n",
"# Step 3 - save the repaired annotations next to the original export.\n",
"with open(\"data/ner-training/03-15-labeled-fixed.json\", mode=\"w\", encoding=\"utf-8\") as fixed_file:\n",
"    fixed_file.write(json.dumps(fixed_data, indent=2, ensure_ascii=False))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2a1a315-b7c5-4a91-875a-c540e05efe78",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|