#!/usr/bin/env python3
"""Fix trailing-period and zero-length span issues in cybersecurity NER JSONL files."""

import json
import os
import sys

FILES = [
    "/home/ubuntu/alkyline/data/processed/enriched_5class_train_cleaned.jsonl",
    "/home/ubuntu/alkyline/data/processed/aptner_5class_train.jsonl",
    "/home/ubuntu/alkyline/data/processed/securebert2_5class_train.jsonl",
    "/home/ubuntu/alkyline/data/processed/enriched_5class_valid_cleaned.jsonl",
]

# Characters that may legitimately follow a sentence-final period.
_AFTER_PERIOD = (" ", "\t", "\n", "\r")


def is_sentence_final_period(text: str, start: int, end: int) -> bool:
    """Return True if the period at text[end-1] is sentence-final.

    The span ``text[start:end]`` is inspected with a few heuristics meant to
    keep periods that belong to the entity itself (abbreviations, file
    extensions, IP addresses, URLs, versions).

    Args:
        text: The full example text.
        start: Inclusive start offset of the span.
        end: Exclusive end offset of the span.

    Returns:
        True when the span ends with a period that should be stripped,
        False otherwise.
    """
    surf = text[start:end]
    if not surf.endswith("."):
        return False

    # What follows the span must be end-of-string or whitespace.
    if end < len(text) and text[end] not in _AFTER_PERIOD:
        return False

    core = surf[:-1]  # surface without the trailing period

    # Several internal periods: "192.168.1.1." or "www.evil.com." -- keep.
    if core.count(".") >= 2:
        return False

    # Looks like "*.ext" with a short alphabetic extension
    # (e.g. "malware.exe.") -- keep.
    if "." in core:
        ext = core.rsplit(".", 1)[1]
        if len(ext) <= 4 and ext.isalpha():
            return False

    # Short dotted alphabetic tokens (abbreviation-like) -- keep.
    if core.replace(".", "").isalpha() and len(core) <= 6 and core.count(".") >= 1:
        return False

    return True


def _fix_spans(text: str, spans: dict) -> tuple:
    """Rewrite one example's span mapping.

    Zero-length/negative spans are dropped and sentence-final periods are
    trimmed. Every surviving offset is filed under ``"<label>: <surface>"``
    built from its *own* surface text, so offsets of one input group that end
    up with different surfaces are not lumped under a single key. Identical
    offsets under the same key are stored once.

    Args:
        text: The example text the offsets index into.
        spans: Mapping of ``"<label>: <surface>"`` to a list of
            ``[start, end]`` pairs.

    Returns:
        A ``(new_spans, trimmed, dropped)`` tuple: the rewritten mapping, the
        number of trailing periods trimmed, and the number of spans removed.
    """
    fixed = {}
    trimmed = 0
    dropped = 0
    for key, offsets in spans.items():
        label = key.split(": ", 1)[0]
        for start, end in offsets:
            # Remove zero-length or negative spans.
            if start >= end:
                dropped += 1
                continue
            # Trim a trailing sentence-final period.
            if is_sentence_final_period(text, start, end):
                end -= 1
                trimmed += 1
                if start >= end:
                    # The span was nothing but the period.
                    dropped += 1
                    continue
            bucket = fixed.setdefault(f"{label}: {text[start:end]}", [])
            # Groups that collapse onto one key may carry the same offset.
            if [start, end] not in bucket:
                bucket.append([start, end])
    return fixed, trimmed, dropped


def fix_file(path: str) -> None:
    """Clean the spans of every example in the JSONL file at *path*, in place.

    Each non-blank line is parsed as a JSON object with a ``"text"`` field and
    an optional ``"spans"`` mapping; the mapping is replaced by its cleaned
    version and the file is rewritten. A one-line summary is printed. Missing
    files are reported and skipped.

    Args:
        path: Path of the JSONL file to rewrite.
    """
    if not os.path.exists(path):
        print(f"SKIP (not found): {path}")
        return

    records = []
    tp_fixed = 0
    zl_removed = 0
    # Explicit UTF-8: the output is written with ensure_ascii=False.
    with open(path, encoding="utf-8") as src:
        for line in src:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            record = json.loads(line)
            new_spans, trimmed, dropped = _fix_spans(
                record["text"], record.get("spans") or {}
            )
            record["spans"] = new_spans
            tp_fixed += trimmed
            zl_removed += dropped
            records.append(json.dumps(record, ensure_ascii=False) + "\n")

    with open(path, "w", encoding="utf-8") as dst:
        dst.writelines(records)

    basename = os.path.basename(path)
    print(
        f"{basename}: {tp_fixed} trailing periods fixed, "
        f"{zl_removed} zero-length removed ({len(records)} examples)"
    )


if __name__ == "__main__":
    for fp in FILES:
        fix_file(fp)