lynn-twinkl committed on
Commit
b86fc29
·
1 Parent(s): e080251

Moved file

Browse files
Files changed (1) hide show
  1. notebooks/debug-labeled-data.py +0 -89
notebooks/debug-labeled-data.py DELETED
@@ -1,89 +0,0 @@
1
- import sys
2
- import json
3
- import re
4
-
5
-
6
# Positional command-line arguments, in order:
#   1. path of the JSON file to read,
#   2. key under which each record stores its text,
#   3. path the corrected JSON is written to.
# A missing argument raises IndexError, exactly as direct indexing would.
json_file_path, text_key, out_path = (sys.argv[pos] for pos in (1, 2, 3))
9
-
10
- # ------------ FUNCTION ------------
11
-
12
def trim_and_fix_offsets(raw_data, context_key=text_key, window_size=30):
    """
    Trim whitespace from labelled spans and recompute their character offsets.

    For every annotation in a record's ``"label"`` list, the span
    ``text[start:end]`` is narrowed so that it neither starts nor ends with
    whitespace. If the narrowed span does not equal the stripped original
    substring, the stripped substring is searched for in a window of
    ``window_size`` characters on either side of the original span and the
    first match is used. When no match is found, a message is printed and the
    annotation keeps its original offsets and text.

    Offsets that lie beyond the end of the text are clamped to ``len(text)``
    (with a printed notice) before trimming, so out-of-range annotations no
    longer raise ``IndexError``. Negative offsets are not normalised.

    Args:
        raw_data: Iterable of record dicts. Each record must hold its text
            under ``context_key`` and a ``"label"`` list of annotation dicts
            providing ``"start"``, ``"end"`` and a non-empty ``"labels"`` list.
        context_key: Key under which each record stores its text.
        window_size: Number of characters searched on each side of the
            original span when the trimmed offsets do not line up.

    Returns:
        A new list of shallow copies of the records. Each copy's ``"label"``
        list is rebuilt from dicts with the keys ``"start"``, ``"end"``,
        ``"text"`` and ``"labels"`` (only the first label of each annotation
        is kept). The input records themselves are not modified.

    Raises:
        KeyError: If a record or annotation lacks one of the required keys.
    """
    fixed_data = []
    for i, record in enumerate(raw_data):
        text = record[context_key]
        text_len = len(text)
        new_labels = []
        for ann in record["label"]:
            label = ann["labels"][0]
            old_start, old_end = ann["start"], ann["end"]
            original_substring = text[old_start:old_end]
            trimmed_substring = original_substring.strip()

            # Slicing silently tolerates offsets past the end of the text, but
            # the index-based scans below do not: clamp so they cannot raise
            # IndexError on an out-of-range annotation.
            if old_start > text_len or old_end > text_len:
                print(f"[Record {i}] Offsets {old_start}-{old_end} exceed text length {text_len}; clamping")
            start = min(old_start, text_len)
            end = min(old_end, text_len)

            # 1) Trim leading/trailing whitespace offsets.
            while start < end and text[start].isspace():
                start += 1
            while end > start and text[end - 1].isspace():
                end -= 1

            # 2) If the trimmed offsets do not reproduce the stripped text,
            #    look for it in a window around the original span.
            new_substring = text[start:end]
            if new_substring != trimmed_substring:
                search_start = max(0, old_start - window_size)
                search_end = min(text_len, old_end + window_size)
                window_text = text[search_start:search_end]

                local_pos = window_text.find(trimmed_substring)
                if local_pos != -1:
                    # Convert the window-relative hit to an absolute offset.
                    start = search_start + local_pos
                    end = start + len(trimmed_substring)
                    new_substring = text[start:end]
                else:
                    print(f"[Record {i}] Can't find '{trimmed_substring}' near offset {old_start}-{old_end}")
                    # Leave this annotation exactly as it was.
                    start, end = old_start, old_end
                    new_substring = original_substring

            new_labels.append({
                "start": start,
                "end": end,
                "text": new_substring,
                "labels": [label],
            })

        # Shallow-copy so the caller's record keeps its original labels.
        new_record = dict(record)
        new_record["label"] = new_labels
        fixed_data.append(new_record)

    return fixed_data
80
-
81
-
82
- # ----------------- USAGE ----------------
83
# Read the labelled records from the input JSON file.
with open(json_file_path, encoding="utf-8") as f:
    raw_data = json.loads(f.read())

# Re-align every annotation span against the text stored under `text_key`.
fixed_data = trim_and_fix_offsets(raw_data, context_key=text_key)

# Write the corrected records, pretty-printed and with non-ASCII kept as-is.
with open(out_path, "w", encoding="utf-8") as out:
    out.write(json.dumps(fixed_data, indent=2, ensure_ascii=False))