| |
| """Fix trailing-period and zero-length span issues in cybersecurity NER JSONL files.""" |
|
|
| import json, sys, os |
|
|
# Absolute paths of the JSONL datasets handed to fix_file(), one after the
# other and in this order, when the script is executed directly.
FILES = [
    "/home/ubuntu/alkyline/data/processed/enriched_5class_train_cleaned.jsonl",
    "/home/ubuntu/alkyline/data/processed/aptner_5class_train.jsonl",
    "/home/ubuntu/alkyline/data/processed/securebert2_5class_train.jsonl",
    "/home/ubuntu/alkyline/data/processed/enriched_5class_valid_cleaned.jsonl",
]
|
|
|
|
def is_sentence_final_period(text: str, start: int, end: int) -> bool:
    """Heuristically decide whether ``text[start:end]`` ends in a sentence-final period.

    Returns ``True`` only when the span's last character is ``"."`` and none
    of the "this dot belongs to the token" heuristics below fire.

    Args:
        text: The full example text the span offsets refer to.
        start: Inclusive start offset of the span.
        end: Exclusive end offset of the span.

    Returns:
        ``True`` if the trailing period looks like sentence punctuation,
        ``False`` if the span does not end in a period or the period appears
        to be part of the token itself.
    """
    span = text[start:end]
    if not span.endswith("."):
        return False

    # The period must be the last character of the text or be followed by
    # one of these whitespace characters; anything else directly after it
    # means the dot is glued to more text.
    if end < len(text):
        following = text[end]
        if following not in (" ", "\t", "\n", "\r"):
            return False

    stem = span[:-1]  # the span without its trailing period
    inner_dots = stem.count(".")

    # Two or more interior dots (e.g. dotted quads, multi-part names):
    # leave the span untouched.
    if inner_dots >= 2:
        return False

    if inner_dots == 1:
        # A short, purely alphabetic piece after the single dot looks like
        # a file extension or abbreviation.
        suffix = stem.rpartition(".")[2]
        if suffix.isalpha() and len(suffix) <= 4:
            return False

        # Short, all-letter dotted tokens (at most 6 characters including
        # the dot) are likewise treated as abbreviations.
        if len(stem) <= 6 and stem.replace(".", "").isalpha():
            return False

    return True
|
|
|
|
def fix_file(path: str):
    """Repair span annotations in one JSONL file, rewriting it in place.

    Every non-blank line is parsed as a JSON object that must contain a
    ``"text"`` string and may contain a ``"spans"`` mapping whose keys look
    like ``"<label>: <surface>"`` and whose values are lists of
    ``[start, end]`` offsets into ``text``. For each offset:

    * zero-length or inverted spans (``start >= end``) are dropped;
    * if ``is_sentence_final_period`` says the span ends in a sentence-final
      period, ``end`` is reduced by one (and the span is dropped if that
      leaves it empty).

    Surviving offsets are regrouped under ``"<label>: <text[start:end]>"``.
    A summary line is printed per file; a missing file is reported and
    skipped.

    Args:
        path: Path of the JSONL file to fix. The file is overwritten.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` field.
    """
    if not os.path.exists(path):
        print(f"SKIP (not found): {path}")
        return

    # Read with an explicit encoding and a context manager: records are
    # written back with ensure_ascii=False, so relying on the locale default
    # could fail or corrupt non-ASCII text, and the handle must be closed
    # before the same path is reopened for writing.
    with open(path, encoding="utf-8") as src:
        lines = src.readlines()

    fixed_lines = []
    tp_fixed = 0
    zl_removed = 0
    n_examples = 0

    for line in lines:
        if not line.strip():
            # Blank lines (commonly a trailing one) carry no record; keep
            # them verbatim instead of letting json.loads abort the run.
            fixed_lines.append(line)
            continue

        n_examples += 1
        d = json.loads(line)
        text = d["text"]
        # `or {}` also tolerates an explicit `"spans": null`.
        old_spans = d.get("spans") or {}
        new_spans = {}

        for key, offsets in old_spans.items():
            label = key.split(": ", 1)[0]
            for s, e in offsets:
                if s >= e:
                    zl_removed += 1
                    continue

                if is_sentence_final_period(text, s, e):
                    e -= 1
                    tp_fixed += 1
                    if s >= e:
                        # The span was nothing but the period.
                        zl_removed += 1
                        continue

                # Key each offset by its OWN surface. Offsets that shared a
                # key may have been trimmed differently, so deriving the key
                # from only the first one would file the rest under a
                # surface that does not match their text.
                bucket = new_spans.setdefault(f"{label}: {text[s:e]}", [])
                # Two old keys can collapse onto the same new key and the
                # same offsets; do not record a span twice.
                if [s, e] not in bucket:
                    bucket.append([s, e])

        d["spans"] = new_spans
        fixed_lines.append(json.dumps(d, ensure_ascii=False) + "\n")

    with open(path, "w", encoding="utf-8") as dst:
        dst.writelines(fixed_lines)

    basename = os.path.basename(path)
    print(f"{basename}: {tp_fixed} trailing periods fixed, {zl_removed} zero-length removed ({n_examples} examples)")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: fix every configured dataset, in list order.
    for dataset_path in FILES:
        fix_file(dataset_path)
|
|