lynn-twinkl commited on
Commit
949c6cc
·
1 Parent(s): 7868ff7

This code is used to fix trailing whitespace and punctuation in labeled training data

Browse files
Files changed (1) hide show
  1. notebooks/debug-labeled-data.py +89 -0
notebooks/debug-labeled-data.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import json
3
+ import re
4
+
5
+
6
+ json_file_path = sys.argv[1]
7
+ text_key = sys.argv[2]
8
+ out_path = sys.argv[3]
9
+
10
+ # ------------ FUNCTION ------------
11
+
12
+ def trim_and_fix_offsets(raw_data, context_key=text_key):
13
+ """
14
+ Attempt to fix leading/trailing whitespace in spans and recalc offsets.
15
+ Then do a local substring search to fix minor misalignments.
16
+ """
17
+ fixed_data = []
18
+ for i, record in enumerate(raw_data):
19
+ text = record[context_key]
20
+ new_labels = []
21
+ for ann in record["label"]:
22
+ label = ann["labels"][0]
23
+ old_start, old_end = ann["start"], ann["end"]
24
+ original_substring = text[old_start:old_end]
25
+ trimmed_substring = original_substring.strip()
26
+
27
+ # 1) Trim leading/trailing whitespace offsets
28
+ # Move start forward while it points to space
29
+ start = old_start
30
+ while start < old_end and text[start].isspace():
31
+ start += 1
32
+ # Move end backward while it points to space
33
+ end = old_end
34
+ while end > start and text[end - 1].isspace():
35
+ end -= 1
36
+
37
+ # After naive trimming, see if the substring still matches
38
+ new_substring = text[start:end]
39
+ if new_substring == trimmed_substring:
40
+ # Great, we can trust these offsets directly
41
+ pass
42
+ else:
43
+ # Possibly there's hidden Unicode or the original offset was off.
44
+ # We'll do a local substring search around `old_start`.
45
+ # We'll search for `trimmed_substring` in a window of +/- 30 chars.
46
+ window_size = 30
47
+
48
+ # Define a safe search window in the text
49
+ search_start = max(0, old_start - window_size)
50
+ search_end = min(len(text), old_end + window_size)
51
+ window_text = text[search_start:search_end]
52
+
53
+ # Try to find the first occurrence of trimmed_substring in that window
54
+ local_pos = window_text.find(trimmed_substring)
55
+ if local_pos != -1:
56
+ # Recalc absolute offset
57
+ start = search_start + local_pos
58
+ end = start + len(trimmed_substring)
59
+ new_substring = text[start:end]
60
+ else:
61
+ # We failed to find it in the local region
62
+ print(f"[Record {i}] Can't find '{trimmed_substring}' near offset {old_start}-{old_end}")
63
+ # We'll leave this annotation as-is or skip it
64
+ start, end = old_start, old_end
65
+ new_substring = original_substring
66
+
67
+ new_labels.append({
68
+ "start": start,
69
+ "end": end,
70
+ "text": new_substring,
71
+ "labels": [label]
72
+ })
73
+
74
+ # Update the record with the new label data
75
+ new_record = dict(record)
76
+ new_record["label"] = new_labels
77
+ fixed_data.append(new_record)
78
+
79
+ return fixed_data
80
+
81
+
82
+ # ----------------- USAGE ----------------
83
+ with open(json_file_path, "r", encoding="utf-8") as f:
84
+ raw_data = json.load(f)
85
+
86
+ fixed_data = trim_and_fix_offsets(raw_data, context_key=text_key)
87
+
88
+ with open(out_path, "w", encoding="utf-8") as out:
89
+ json.dump(fixed_data, out, indent=2, ensure_ascii=False)