In [3]:
import json
import random
import spacy
from spacy.training import offsets_to_biluo_tags

In [2]:
# Load the labeled export. The encoding is pinned to UTF-8 so that character
# offsets stored in the annotations index the same characters on every
# platform (the locale default, e.g. cp1252 on Windows, can decode multi-byte
# characters differently and shift every offset after them).
with open('data/ner-training/03-15-labeled.json', 'r', encoding='utf-8') as file:
    raw_data = json.load(file)

In [4]:
# Diagnostic pass over the raw annotations: for every labeled span, report
# stray leading/trailing whitespace and offsets that cannot be converted to
# clean BILUO tags for the tokenized text.
debug_nlp = spacy.blank("en")

for i, record in enumerate(raw_data):
    text = record["additional_info"]
    doc = debug_nlp.make_doc(text)

    for ann in record["label"]:
        label = ann["labels"][0]
        start = ann["start"]
        end = ann["end"]
        span_text = text[start:end]

        # A span that changes when stripped starts or ends on whitespace.
        stripped_text = span_text.strip()
        if stripped_text != span_text:
            print(f"[Record {i}] Leading/trailing whitespace in span '{span_text}'")

        # Convert this single span on its own so each problem is reported
        # against the annotation that caused it.
        try:
            biluo_tags = offsets_to_biluo_tags(doc, [(start, end, label)])
            # A '-' tag marks a token the span only partially covers.
            if "-" in biluo_tags:
                print(f"[Record {i}] Mismatch in offset for label '{label}', text='{span_text}'")
        except Exception as e:
            print(f"[Record {i}] Error converting offsets for '{span_text}': {e}")


[Record 8] Mismatch in offset for label 'Usage', text='to develop a sensory garde'
[Record 12] Mismatch in offset for label 'Context', text='in a place of lower income, and a very tight budge'
[Record 14] Leading/trailing whitespace in span ' This is an extra activity to the curriculum, hence, budgets for seeds, etc are limited.'
[Record 20] Mismatch in offset for label 'Usage', text='to support all our children to access the gardening projects around nurser'
[Record 20] Leading/trailing whitespace in span ' We are a very outdoors nursery'
[Record 25] Mismatch in offset for label 'Context', text='We are a small village Preschool within the main school and We are following the Hygge approach in Preschoo'
[Record 34] Leading/trailing whitespace in span 'We have lots of opportunity as we have a pond and bench area but no money to bring it back to life! '
[Record 34] Mismatch in offset for label 'Context', text='We have lots of opportunity as we have a pond and bench area but no money to b



In [6]:
import json
import re

def _nearest_occurrence(haystack, needle, lo, hi, anchor):
    """Return the index of the occurrence of `needle` in haystack[lo:hi] closest to `anchor`, or None."""
    best = None
    pos = haystack.find(needle, lo, hi)
    while pos != -1:
        if best is None or abs(pos - anchor) < abs(best - anchor):
            best = pos
        pos = haystack.find(needle, pos + 1, hi)
    return best


def trim_and_fix_offsets(raw_data, context_key="additional_info", window_size=30):
    """
    Trim leading/trailing whitespace from annotated spans and repair offsets.

    For every annotation the start/end offsets are first clamped to the
    bounds of the text and then tightened so the span neither starts nor
    ends on whitespace. If the annotation carries its own recorded ``text``
    value and that value (stripped) differs from what the trimmed offsets
    select, the recorded value is searched for within ``window_size``
    characters of the original span and, when found, the offsets are moved
    to the occurrence nearest the original start.

    Args:
        raw_data: iterable of record dicts. Each record must hold the source
            string under ``context_key`` and a list of annotations under
            ``"label"``; each annotation needs ``"start"``, ``"end"`` and a
            non-empty ``"labels"`` list, and may have ``"text"``.
        context_key: key of the source text inside each record.
        window_size: how many characters on each side of the original span
            to search when relocating a span by its recorded text.

    Returns:
        A new list of shallow-copied records whose ``"label"`` entries are
        rebuilt as ``{"start", "end", "text", "labels"}`` dicts. Only the
        first label of each annotation is kept. The input is not modified.
    """
    fixed_data = []
    for i, record in enumerate(raw_data):
        text = record[context_key]
        text_len = len(text)
        new_labels = []
        for ann in record["label"]:
            label = ann["labels"][0]
            old_start, old_end = ann["start"], ann["end"]

            # Clamp to the text bounds first: an offset past the end of the
            # text would otherwise raise IndexError in the loops below.
            start = min(max(old_start, 0), text_len)
            end = min(max(old_end, 0), text_len)

            # 1) Tighten the span so it does not start or end on whitespace.
            while start < end and text[start].isspace():
                start += 1
            while end > start and text[end - 1].isspace():
                end -= 1
            new_substring = text[start:end]

            # 2) The slice itself can never disagree with its own offsets, so
            # the only independent evidence of a misaligned span is the text
            # recorded on the annotation (when there is one).
            recorded = ann.get("text")
            expected = recorded.strip() if isinstance(recorded, str) else ""
            if expected and expected != new_substring:
                search_start = max(0, start - window_size)
                search_end = min(text_len, end + window_size)
                found = _nearest_occurrence(text, expected, search_start, search_end, start)
                if found is not None:
                    start = found
                    end = found + len(expected)
                    new_substring = text[start:end]
                else:
                    # Keep the whitespace-trimmed offsets; they are still an
                    # improvement over the raw ones.
                    print(f"[Record {i}] Can't find '{expected}' near offset {old_start}-{old_end}")

            new_labels.append({
                "start": start,
                "end": end,
                "text": new_substring,
                "labels": [label]
            })

        # Shallow-copy the record so the caller's data is left untouched.
        new_record = dict(record)
        new_record["label"] = new_labels
        fixed_data.append(new_record)

    return fixed_data


# Run the repair end to end: read the labeled export, fix the span offsets,
# and save the result next to the original under a "-fixed" name.
source_path = "data/ner-training/03-15-labeled.json"
target_path = "data/ner-training/03-15-labeled-fixed.json"

# Step 1: load the annotated records.
with open(source_path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Step 2: trim whitespace and realign offsets for every annotation.
fixed_data = trim_and_fix_offsets(raw_data, context_key="additional_info")

# Step 3: persist the repaired records (non-ASCII characters kept as-is).
with open(target_path, "w", encoding="utf-8") as out:
    json.dump(fixed_data, out, indent=2, ensure_ascii=False)