#!/usr/bin/env python3
"""Fix trailing-period and zero-length span issues in cybersecurity NER JSONL files."""
import json, sys, os
# Absolute paths of the JSONL datasets to clean. The __main__ loop passes each
# entry to fix_file(), which skips missing paths and overwrites existing ones
# in place.
FILES = [
    "/home/ubuntu/alkyline/data/processed/enriched_5class_train_cleaned.jsonl",
    "/home/ubuntu/alkyline/data/processed/aptner_5class_train.jsonl",
    "/home/ubuntu/alkyline/data/processed/securebert2_5class_train.jsonl",
    "/home/ubuntu/alkyline/data/processed/enriched_5class_valid_cleaned.jsonl",
]
def is_sentence_final_period(text: str, start: int, end: int) -> bool:
    """Return True if the span text[start:end] ends in a sentence-final period.

    A trailing period counts as sentence-final when it is followed by
    end-of-string or whitespace and the span does not look like something
    whose dots are part of the token (IP address, URL, version, file name,
    dotted abbreviation).

    Args:
        text: The full example text the offsets refer to.
        start: Inclusive start offset of the span.
        end: Exclusive end offset of the span.

    Returns:
        True if the last character of the span is a period that should be
        stripped; False otherwise, including when the offsets do not describe
        a non-empty span inside ``text``.
    """
    # Offsets outside the text have no character at text[end - 1]; slicing
    # would silently truncate and could report a period that a caller's
    # ``end -= 1`` would never actually remove.
    if not 0 <= start < end <= len(text):
        return False

    surf = text[start:end]
    if not surf.endswith("."):
        return False

    # What follows the span must be end-of-string or whitespace.
    if end < len(text) and text[end] not in (" ", "\t", "\n", "\r"):
        return False

    core = surf[:-1]  # surface without the trailing period

    # Two or more internal periods: IPs, URLs, versions
    # (e.g. "192.168.1.1." or "www.evil.com.") keep their trailing dot.
    if core.count(".") >= 2:
        return False

    # Exactly one internal period followed by a short alphabetic suffix looks
    # like a file/domain extension ("malware.exe."). This also catches
    # single-dot abbreviations such as "U.S." and "e.g.".
    if "." in core:
        ext = core.rsplit(".", 1)[1]
        if len(ext) <= 4 and ext.isalpha():
            return False

    # Short, purely alphabetic dotted tokens that the extension check above
    # did not already reject are treated as abbreviations.
    if core.replace(".", "").isalpha() and len(core) <= 6 and core.count(".") >= 1:
        return False

    return True
def _clean_spans(text: str, old_spans: dict):
    """Return (new_spans, periods_fixed, spans_removed) for one example.

    ``old_spans`` maps ``"LABEL: surface"`` keys to lists of ``[start, end]``
    offsets into ``text``. Empty or inverted spans are dropped, spans ending
    in a sentence-final period are shortened by one character, and every
    surviving offset is filed under a key rebuilt from its own surface text.
    """
    new_spans = {}
    periods_fixed = 0
    spans_removed = 0
    for key, offsets in old_spans.items():
        label = key.split(": ", 1)[0]
        for s, e in offsets:
            # Remove zero-length or negative spans.
            if s >= e:
                spans_removed += 1
                continue
            # Strip a trailing sentence-final period.
            if is_sentence_final_period(text, s, e):
                e -= 1
                periods_fixed += 1
                if s >= e:
                    # The span was nothing but the period.
                    spans_removed += 1
                    continue
            # Key each offset by its own surface: within one group some
            # occurrences may be trimmed and others not, and a key taken from
            # the first offset only would then mislabel the rest.
            bucket = new_spans.setdefault(f"{label}: {text[s:e]}", [])
            # Groups that collapse onto the same key can carry the same
            # offset twice; keep a single copy.
            if [s, e] not in bucket:
                bucket.append([s, e])
    return new_spans, periods_fixed, spans_removed


def fix_file(path: str):
    """Clean the span annotations of one JSONL file, rewriting it in place.

    Each non-blank line must be a JSON object with a ``"text"`` string and an
    optional ``"spans"`` mapping of ``"LABEL: surface"`` to ``[start, end]``
    offset lists. Zero-length spans are removed and trailing sentence-final
    periods are trimmed from spans. A one-line summary is printed per file;
    a missing file is reported and skipped.

    Args:
        path: Path of the JSONL file to fix.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an example has no ``"text"`` field.
    """
    if not os.path.exists(path):
        print(f"SKIP (not found): {path}")
        return

    # Read everything first so the file is closed before it is reopened for
    # writing. UTF-8 is explicit because the output keeps non-ASCII
    # characters unescaped (ensure_ascii=False).
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()

    fixed_lines = []
    tp_fixed = 0
    zl_removed = 0
    for line in lines:
        # Blank lines (e.g. a trailing newline) carry no example.
        if not line.strip():
            continue
        d = json.loads(line)
        new_spans, fixed, removed = _clean_spans(d["text"], d.get("spans", {}))
        tp_fixed += fixed
        zl_removed += removed
        d["spans"] = new_spans
        fixed_lines.append(json.dumps(d, ensure_ascii=False) + "\n")

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(fixed_lines)

    basename = os.path.basename(path)
    print(f"{basename}: {tp_fixed} trailing periods fixed, {zl_removed} zero-length removed ({len(fixed_lines)} examples)")
if __name__ == "__main__":
    # Clean every configured dataset, one file at a time.
    for dataset_path in FILES:
        fix_file(dataset_path)
|