arcspan / scripts /fix_span_boundaries.py
chairulridjal's picture
Add files using upload-large-folder tool
3dac39e verified
#!/usr/bin/env python3
"""Fix trailing-period and zero-length span issues in cybersecurity NER JSONL files."""
import json, sys, os
# Absolute paths of the JSONL datasets processed when this script is run
# directly; the __main__ block passes each one to fix_file(), which rewrites
# the file in place.
FILES = [
"/home/ubuntu/alkyline/data/processed/enriched_5class_train_cleaned.jsonl",
"/home/ubuntu/alkyline/data/processed/aptner_5class_train.jsonl",
"/home/ubuntu/alkyline/data/processed/securebert2_5class_train.jsonl",
"/home/ubuntu/alkyline/data/processed/enriched_5class_valid_cleaned.jsonl",
]
def is_sentence_final_period(text: str, start: int, end: int) -> bool:
    """Decide whether the span text[start:end] ends in a sentence-final period.

    Returns True only when the span's last character is "." and that period
    appears to be punctuation rather than part of the entity itself (an IP
    address, URL, version string, file/domain extension or abbreviation).
    """
    surface = text[start:end]
    if surface[-1:] != ".":
        return False

    # The period must be followed by whitespace or sit at the end of the text.
    if end < len(text):
        following = text[end]
        if following not in " \t\n\r":
            return False

    stem = surface[:-1]  # span text minus the trailing period
    inner_dots = stem.count(".")

    # Two or more inner periods: dotted token such as "192.168.1.1." or
    # "www.evil.com." -- keep the span as it is.
    if inner_dots >= 2:
        return False

    # No inner period at all: nothing suggests the dot belongs to the entity.
    if inner_dots == 0:
        return True

    # Exactly one inner period from here on.
    tail = stem.rpartition(".")[2]
    # Short alphabetic suffix, e.g. "malware.exe." or "evil.com.".
    looks_like_extension = tail.isalpha() and len(tail) <= 4
    # Short, purely alphabetic dotted token, e.g. "U.S." or "e.g.".
    looks_like_abbreviation = len(stem) <= 6 and stem.replace(".", "").isalpha()
    return not (looks_like_extension or looks_like_abbreviation)
def fix_file(path: str) -> None:
    """Clean the span offsets of one JSONL file and rewrite it in place.

    Each non-blank line must be a JSON object with a "text" string and an
    optional "spans" mapping of "<label>: <surface>" keys to lists of
    [start, end] character offsets into "text". For every offset pair:

    * zero-length or negative spans (start >= end) are dropped;
    * a trailing period judged sentence-final by is_sentence_final_period()
      is trimmed off the end of the span.

    Keys are rebuilt as "<label>: <surface>" using the text covered by the
    first surviving offset of each group. Groups that end up with the same key
    are merged, and identical [start, end] pairs are stored only once.

    If *path* does not exist a SKIP message is printed and nothing is written.
    Blank lines in the input are ignored and not written back. A summary line
    with the fix counts is printed after the file has been rewritten.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no "text" field.
    """
    if not os.path.exists(path):
        print(f"SKIP (not found): {path}")
        return

    # Offsets index into the decoded string and the output is written with
    # ensure_ascii=False, so read and write explicitly as UTF-8 instead of
    # relying on the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline at EOF) that would
        # otherwise make json.loads() fail.
        lines = [line for line in f if line.strip()]

    fixed_lines = []
    tp_fixed = 0
    zl_removed = 0
    for line in lines:
        d = json.loads(line)
        text = d["text"]
        # Treat both a missing key and an explicit null as "no spans".
        old_spans = d.get("spans") or {}
        new_spans = {}
        for key, offsets in old_spans.items():
            label = key.split(": ", 1)[0]
            new_offsets = []
            for s, e in offsets:
                # Remove zero-length or negative spans
                if s >= e:
                    zl_removed += 1
                    continue
                # Fix trailing sentence-final period
                if is_sentence_final_period(text, s, e):
                    e -= 1
                    tp_fixed += 1
                    # The span was nothing but the period itself.
                    if s >= e:
                        zl_removed += 1
                        continue
                if [s, e] not in new_offsets:
                    new_offsets.append([s, e])
            if new_offsets:
                # The key's surface form comes from the first surviving offset.
                new_surf = text[new_offsets[0][0]:new_offsets[0][1]]
                new_key = f"{label}: {new_surf}"
                # Merge on key collision (e.g. "X: foo." trimmed onto an
                # existing "X: foo") without storing the same offset twice.
                merged = new_spans.setdefault(new_key, [])
                for pair in new_offsets:
                    if pair not in merged:
                        merged.append(pair)
        d["spans"] = new_spans
        fixed_lines.append(json.dumps(d, ensure_ascii=False) + "\n")

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(fixed_lines)

    basename = os.path.basename(path)
    print(f"{basename}: {tp_fixed} trailing periods fixed, {zl_removed} zero-length removed ({len(lines)} examples)")
if __name__ == "__main__":
    # Script entry point: clean every configured dataset, in listed order.
    for dataset_path in FILES:
        fix_file(dataset_path)