| | import sys |
| | import json |
| | import re |
| |
|
| |
|
# Command-line interface:
#   argv[1] -> path of the input JSON file (a list of annotation records)
#   argv[2] -> name of the key holding the document text in each record
#   argv[3] -> path where the corrected JSON is written
# NOTE(review): no argc check — a missing argument raises IndexError here.
json_file_path = sys.argv[1]
text_key = sys.argv[2]
out_path = sys.argv[3]
| |
|
| | |
| |
|
def trim_and_fix_offsets(raw_data, context_key=None):
    """
    Fix leading/trailing whitespace in annotation spans and recalculate offsets.

    For each record, every span in record["label"] is tightened so that it no
    longer starts or ends on whitespace.  When the annotation carries its own
    surface string (ann["text"]) and the tightened span does not match it, a
    local substring search (+/- 30 chars around the original offsets) tries to
    re-align the span; if that fails, the original offsets are kept and a
    warning is printed.

    Parameters
    ----------
    raw_data : list[dict]
        Records, each with the document text under ``context_key`` and a
        ``"label"`` list of dicts holding ``"start"``, ``"end"``, ``"labels"``
        and optionally ``"text"`` (the expected surface string).
    context_key : str, optional
        Key of the document text in each record.  Defaults to the script-level
        ``text_key`` (looked up lazily, not at def time, so the function can be
        imported and called independently of the CLI globals).

    Returns
    -------
    list[dict]
        Shallow copies of the input records with a rebuilt ``"label"`` list of
        ``{"start", "end", "text", "labels"}`` dicts; the input is not mutated.
    """
    if context_key is None:
        # Late binding: resolve the CLI-provided key only when actually needed.
        context_key = text_key

    fixed_data = []
    for i, record in enumerate(raw_data):
        text = record[context_key]
        new_labels = []
        for ann in record["label"]:
            label = ann["labels"][0]  # NOTE(review): assumes one label per span
            old_start, old_end = ann["start"], ann["end"]
            original_substring = text[old_start:old_end]

            # Ground truth for the span: prefer the annotation's own surface
            # string when present.  (Previously this was always derived from
            # the stored offsets, which made the re-alignment branch below
            # unreachable — text[start:end] after the whitespace walk is by
            # construction equal to original_substring.strip().)
            expected = ann.get("text", original_substring)
            trimmed_substring = expected.strip()

            # Walk the offsets inward past any leading/trailing whitespace.
            start = old_start
            while start < old_end and text[start].isspace():
                start += 1
            end = old_end
            while end > start and text[end - 1].isspace():
                end -= 1

            new_substring = text[start:end]
            if new_substring != trimmed_substring:
                # Offsets look misaligned: search a small window around the
                # original span for the expected (trimmed) surface text.
                window_size = 30
                search_start = max(0, old_start - window_size)
                search_end = min(len(text), old_end + window_size)
                window_text = text[search_start:search_end]

                local_pos = window_text.find(trimmed_substring) if trimmed_substring else -1
                if local_pos != -1:
                    # Translate the window-relative hit back to document offsets.
                    start = search_start + local_pos
                    end = start + len(trimmed_substring)
                    new_substring = text[start:end]
                else:
                    # Best effort failed: keep the original span and warn.
                    print(f"[Record {i}] Can't find '{trimmed_substring}' near offset {old_start}-{old_end}")
                    start, end = old_start, old_end
                    new_substring = original_substring

            new_labels.append({
                "start": start,
                "end": end,
                "text": new_substring,
                "labels": [label]
            })

        # Shallow-copy the record so the caller's input list stays untouched.
        new_record = dict(record)
        new_record["label"] = new_labels
        fixed_data.append(new_record)

    return fixed_data
| |
|
| |
|
| | |
# Load the raw annotation records (expected: a JSON array of dicts).
with open(json_file_path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Tighten span offsets and attempt local re-alignment.
fixed_data = trim_and_fix_offsets(raw_data, context_key=text_key)

# Write the corrected records; ensure_ascii=False keeps non-ASCII text readable.
with open(out_path, "w", encoding="utf-8") as out:
    json.dump(fixed_data, out, indent=2, ensure_ascii=False)
| |
|