{ "cells": [ { "cell_type": "code", "execution_count": 3, "id": "dd46f692-4e46-40f2-bbb3-73240922a3e0", "metadata": {}, "outputs": [], "source": [ "import json\n", "import random\n", "import spacy\n", "from spacy.training import offsets_to_biluo_tags" ] }, { "cell_type": "code", "execution_count": 2, "id": "d2795420-7d1b-4bff-8662-fe0c7e41cdb0", "metadata": {}, "outputs": [], "source": [ "with open('data/ner-training/03-15-labeled.json', 'r') as file:\n", " raw_data = json.load(file)" ] }, { "cell_type": "code", "execution_count": 4, "id": "e8f1f763-f649-4e89-af08-209c72d17093", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Record 8] Mismatch in offset for label 'Usage', text='to develop a sensory garde'\n", "[Record 12] Mismatch in offset for label 'Context', text='in a place of lower income, and a very tight budge'\n", "[Record 14] Leading/trailing whitespace in span ' This is an extra activity to the curriculum, hence, budgets for seeds, etc are limited.'\n", "[Record 20] Mismatch in offset for label 'Usage', text='to support all our children to access the gardening projects around nurser'\n", "[Record 20] Leading/trailing whitespace in span ' We are a very outdoors nursery'\n", "[Record 25] Mismatch in offset for label 'Context', text='We are a small village Preschool within the main school and We are following the Hygge approach in Preschoo'\n", "[Record 34] Leading/trailing whitespace in span 'We have lots of opportunity as we have a pond and bench area but no money to bring it back to life! '\n", "[Record 34] Mismatch in offset for label 'Context', text='We have lots of opportunity as we have a pond and bench area but no money to bring it back to life! '\n", "[Record 34] Leading/trailing whitespace in span ' bench needs a repaint, the pond could do with some clearing and some logs around the pond would be great to make it a welcomed area.'\n", "[Record 34] Mismatch in offset for label 'Usage', text=' bench needs a repaint, the pond could do with some clearing and some logs around the pond would be great to make it a welcomed area.'\n", "[Record 40] Mismatch in offset for label 'Context', text='ur gardening club is brilliant, but we never have enough gloves or resources'\n", "[Record 41] Leading/trailing whitespace in span ' invest in our outside areas to get children active'\n", "[Record 41] Mismatch in offset for label 'Usage', text=' invest in our outside areas to get children active'\n", "[Record 42] Leading/trailing whitespace in span ' help us transform more of the concrete box into a naturally diverse play environment.'\n", "[Record 42] Mismatch in offset for label 'Usage', text=' help us transform more of the concrete box into a naturally diverse play environment.'\n", "[Record 51] Mismatch in offset for label 'Context', text='t Netherbrook, we are committed to hands-on, outdoor learning'\n", "[Record 52] Leading/trailing whitespace in span ' to resource this area with compost or seeds / plants'\n", "[Record 52] Mismatch in offset for label 'Usage', text=' to resource this area with compost or seeds / plants'\n", "[Record 55] Mismatch in offset for label 'Context', text='e have been utilising a large flower bed and the children have been working hard to get it ready to plant flowers and food that we can eat at the end of the summer term'\n", "[Record 56] Leading/trailing whitespace in span 'We run a free after-school gardening club Providing opportunities for about twenty children each week '\n", "[Record 56] Mismatch in offset for label 'Context', text='We run a free after-school gardening club Providing opportunities for about twenty children each week '\n", "[Record 57] Leading/trailing whitespace in span ' to revamp our outdoor area'\n", "[Record 57] Mismatch in offset for label 'Usage', text=' to revamp our outdoor area'\n", "[Record 58] Leading/trailing whitespace in span ' Our school is in a deprived area so a lot of the children live in flats etc so do not have a garden'\n", "[Record 58] Mismatch in offset for label 'Benefit', text='ives them the opportunity to support the school and learn all about gardening'\n", "[Record 60] Leading/trailing whitespace in span ' have some planters in the reception outdoor area which need some love and attention!'\n", "[Record 60] Mismatch in offset for label 'Context', text=' have some planters in the reception outdoor area which need some love and attention!'\n", "[Record 64] Mismatch in offset for label 'Usage', text='to tidy up the village and local care homes with flower displays and planter'\n", "[Record 70] Leading/trailing whitespace in span ' Our outdoor area is in desperate need of some love but unfortunately there just isn't the budget for it'\n", "[Record 72] Leading/trailing whitespace in span 'We plan to build sensory gardens (such as a Zen garden and fairy garden) '\n", "[Record 72] Mismatch in offset for label 'Context', text='We plan to build sensory gardens (such as a Zen garden and fairy garden) '\n", "[Record 73] Leading/trailing whitespace in span ' These groups are focused upon our Catholic Social Teaching Principals, one of these being Stewardship'\n", "[Record 73] Leading/trailing whitespace in span ' This pupil led group are starting a small project to help create a prayer and reflection space in the Early Years outdoor area'\n", "[Record 92] Leading/trailing whitespace in span ' allowing the children to escape there for some peace and enjoy the colours, smells, and textures the garden will bring'\n", "[Record 92] Mismatch in offset for label 'Benefit', text=' allowing the children to escape there for some peace and enjoy the colours, smells, and textures the garden will bring'\n", "[Record 100] Leading/trailing whitespace in span ' We have dedicated areas across school for students engage with gardening that have fallen into disrepair due to lack of resources'\n", "[Record 100] Mismatch in offset for label 'Context', text=' We have dedicated areas across school for students engage with gardening that have fallen into disrepair due to lack of resources'\n", "[Record 101] Leading/trailing whitespace in span ' to set up a sensory room for pupils'\n", "[Record 101] Mismatch in offset for label 'Usage', text=' to set up a sensory room for pupils'\n", "[Record 111] Leading/trailing whitespace in span ' supplies and resources'\n", "[Record 111] Mismatch in offset for label 'Usage', text=' supplies and resources'\n", "[Record 112] Leading/trailing whitespace in span ' enhance our Learning Support resource room'\n", "[Record 112] Mismatch in offset for label 'Benefit', text=' enhance our Learning Support resource room'\n", "[Record 114] Leading/trailing whitespace in span ' to improve our school hall or to buy new swing seats for the secondary'\n", "[Record 114] Mismatch in offset for label 'Usage', text=' to improve our school hall or to buy new swing seats for the secondary'\n", "[Record 126] Mismatch in offset for label 'Usage', text='to develop a small stage area and secure props, costumes, and sound equipmen'\n", "[Record 134] Mismatch in offset for label 'Usage', text='o convert an unused space into a community heritage hub featuring interactive exhibits and archival collections'\n", "[Record 141] Leading/trailing whitespace in span ' our children have limited exposure to performing arts'\n", "[Record 141] Mismatch in offset for label 'Context', text=' our children have limited exposure to performing arts'\n", "[Record 148] Leading/trailing whitespace in span ' to purchase sensory equipment, including weighted blankets, fidget toys, and calming lighting'\n", "[Record 148] Mismatch in offset for label 'Usage', text=' to purchase sensory equipment, including weighted blankets, fidget toys, and calming lighting'\n", "[Record 151] Leading/trailing whitespace in span ' improve literacy and encourage a love of reading in reluctant readers'\n", "[Record 151] Mismatch in offset for label 'Benefit', text=' improve literacy and encourage a love of reading in reluctant readers'\n", "[Record 157] Leading/trailing whitespace in span ' improve focus, behaviour, and attendance'\n", "[Record 157] Mismatch in offset for label 'Benefit', text=' improve focus, behaviour, and attendance'\n", "[Record 160] Leading/trailing whitespace in span ' training materials and team-building activities'\n", "[Record 160] Mismatch in offset for label 'Usage', text=' training materials and team-building activities'\n", "[Record 163] Leading/trailing whitespace in span ' ensure equitable access to learning and promote independence.'\n", "[Record 163] Mismatch in offset for label 'Benefit', text=' ensure equitable access to learning and promote independence.'\n", "[Record 175] Leading/trailing whitespace in span ' to purchase soft floor mats, noise-canceling headphones, and interactive light panels'\n", "[Record 175] Mismatch in offset for label 'Usage', text=' to purchase soft floor mats, noise-canceling headphones, and interactive light panels'\n", "[Record 177] Leading/trailing whitespace in span ' I’ve watched our old kiln become almost unusable, and we’re constantly low on clay and paints'\n", "[Record 177] Mismatch in offset for label 'Context', text=' I’ve watched our old kiln become almost unusable, and we’re constantly low on clay and paints'\n", "[Record 178] Mismatch in offset for label 'Benefit', text='Building trust, resilience, and communication outside the classroom often translates into better collaboration back at schoo'\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have quite a high level of ALN children in our ...\" with entities \"[(101, 127, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We would love to enhance the environment and resou...\" with entities \"[(190, 240, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"To win the money or the gardening bundle would be ...\" with entities \"[(59, 133, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are a small village Preschool within the main s...\" with entities \"[(0, 107, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We would love to add to our garden area in the bac...\" with entities \"[(80, 180, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We would love to add to our garden area in the bac...\" with entities \"[(183, 316, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our gardening club is brilliant, but we never have...\" with entities \"[(1, 77, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are an infant school in the middle of a city bu...\" with entities \"[(213, 264, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are an urban school with a concrete playgroung....\" with entities \"[(238, 324, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"At Netherbrook, we are committed to hands-on, outd...\" with entities \"[(1, 62, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have recently had our outdoor area redesigned a...\" with entities \"[(125, 178, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our early years classes have been loving our mud k...\" with entities \"[(107, 275, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We run a free after-school gardening club Providin...\" with entities \"[(0, 102, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are currently trying to revamp our outdoor area...\" with entities \"[(23, 50, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"The Eco-Council have been working hard to prepare ...\" with entities \"[(506, 583, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are an urban school without a green space (e.g....\" with entities \"[(68, 153, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are a small, rural school of 60 children with l...\" with entities \"[(389, 465, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"I work in an SEN school, where we are introducing ...\" with entities \"[(90, 163, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have a patch of grass that we are planning to t...\" with entities \"[(433, 552, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"At our school, no child gets left behind. We offer...\" with entities \"[(266, 396, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are moving to a new site in September and I wou...\" with entities \"[(57, 93, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our children come from very difficult backgrounds ...\" with entities \"[(218, 241, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Budgets are tight, needs are great. We're a small ...\" with entities \"[(393, 436, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Greetings from our SEN school Harford Manor in Nor...\" with entities \"[(1004, 1075, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our school is proud to have recently formed a dram...\" with entities \"[(156, 232, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"At Heritage Intermediate, our curriculum lacks opp...\" with entities \"[(108, 219, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Coleridge Primary hopes to launch a drama club tha...\" with entities \"[(128, 182, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our school has recently seen an increase in childr...\" with entities \"[(113, 207, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our Year 5 cohort is behind in reading comprehensi...\" with entities \"[(162, 232, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"Our pupils often arrive at school hungry. We would...\" with entities \"[(137, 178, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We are launching a peer mentoring scheme for our Y...\" with entities \"[(94, 142, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"We have a number of students with visual impairmen...\" with entities \"[(155, 217, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"During my time working with students who have sens...\" with entities \"[(265, 351, 'Usage')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"As the lead art teacher, I’ve watched our old kiln...\" with entities \"[(24, 118, 'Context')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n", "/Users/lynn/venv/lib/python3.12/site-packages/spacy/training/iob_utils.py:149: UserWarning: [W030] Some entities could not be aligned in the text \"I help run a leadership program for high school ju...\" with entities \"[(373, 497, 'Benefit')]\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n", " warnings.warn(\n" ] } ], "source": [ "debug_nlp = spacy.blank(\"en\")\n", "\n", "for i, record in enumerate(raw_data):\n", " text = record[\"additional_info\"]\n", " doc = debug_nlp.make_doc(text)\n", " \n", " for ann in record[\"label\"]:\n", " label = ann[\"labels\"][0]\n", " start, end = ann[\"start\"], ann[\"end\"]\n", " span_text = text[start:end]\n", "\n", " # Quick check: leading or trailing whitespace?\n", " if span_text != span_text.strip():\n", " print(f\"[Record {i}] Leading/trailing whitespace in span '{span_text}'\")\n", "\n", " # Attempt to convert offset(s) -> BILUO\n", " try:\n", " biluo_tags = offsets_to_biluo_tags(doc, [(start, end, label)])\n", " # If any tag is '-' -> it means partial mismatch\n", " if any(t == \"-\" for t in biluo_tags):\n", " print(f\"[Record {i}] Mismatch in offset for label '{label}', text='{span_text}'\")\n", " except Exception as e:\n", " print(f\"[Record {i}] Error converting offsets for '{span_text}': {e}\")\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "a219e932-b161-4634-8914-5d5a9f7def54", "metadata": {}, "outputs": [], "source": [ "import json\n", "import re\n", "\n", "def trim_and_fix_offsets(raw_data, context_key=\"additional_info\"):\n", " \"\"\"\n", " Attempt to fix leading/trailing whitespace in spans and recalc offsets.\n", " Then do a local substring search to fix minor misalignments.\n", " \"\"\"\n", " fixed_data = []\n", " for i, record in enumerate(raw_data):\n", " text = record[context_key]\n", " new_labels = []\n", " for ann in record[\"label\"]:\n", " label = ann[\"labels\"][0]\n", " old_start, old_end = ann[\"start\"], ann[\"end\"]\n", " original_substring = text[old_start:old_end]\n", " trimmed_substring = original_substring.strip()\n", " \n", " # 1) Trim leading/trailing whitespace offsets\n", " # Move start forward while it points to space\n", " start = old_start\n", " while start < old_end and text[start].isspace():\n", " start += 1\n", " # Move end backward while it points to space\n", " end = old_end\n", " while end > start and text[end - 1].isspace():\n", " end -= 1\n", " \n", " # After naive trimming, see if the substring still matches\n", " new_substring = text[start:end]\n", " if new_substring == trimmed_substring:\n", " # Great, we can trust these offsets directly\n", " pass\n", " else:\n", " # Possibly there's hidden Unicode or the original offset was off.\n", " # We'll do a local substring search around `old_start`.\n", " # We'll search for `trimmed_substring` in a window of +/- 30 chars.\n", " window_size = 30\n", " \n", " # Define a safe search window in the text\n", " search_start = max(0, old_start - window_size)\n", " search_end = min(len(text), old_end + window_size)\n", " window_text = text[search_start:search_end]\n", " \n", " # Try to find the first occurrence of trimmed_substring in that window\n", " local_pos = window_text.find(trimmed_substring)\n", " if local_pos != -1:\n", " # Recalc absolute offset\n", " start = search_start + local_pos\n", " end = start + len(trimmed_substring)\n", " new_substring = text[start:end]\n", " else:\n", " # We failed to find it in the local region\n", " print(f\"[Record {i}] Can't find '{trimmed_substring}' near offset {old_start}-{old_end}\")\n", " # We'll leave this annotation as-is or skip it\n", " start, end = old_start, old_end\n", " new_substring = original_substring\n", "\n", " new_labels.append({\n", " \"start\": start,\n", " \"end\": end,\n", " \"text\": new_substring,\n", " \"labels\": [label]\n", " })\n", " \n", " # Update the record with the new label data\n", " new_record = dict(record)\n", " new_record[\"label\"] = new_labels\n", " fixed_data.append(new_record)\n", " \n", " return fixed_data\n", "\n", "\n", "# Usage example:\n", "# 1) Read your JSON\n", "with open(\"data/ner-training/03-15-labeled.json\", \"r\", encoding=\"utf-8\") as f:\n", " raw_data = json.load(f)\n", "\n", "# 2) Fix whitespace + do local substring search\n", "fixed_data = trim_and_fix_offsets(raw_data, context_key=\"additional_info\")\n", "\n", "# 3) Write the fixed data back out\n", "with open(\"data/ner-training/03-15-labeled-fixed.json\", \"w\", encoding=\"utf-8\") as out:\n", " json.dump(fixed_data, out, indent=2, ensure_ascii=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "d2a1a315-b7c5-4a91-875a-c540e05efe78", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 5 }