File size: 3,386 Bytes
3dac39e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
"""Fix trailing-period and zero-length span issues in cybersecurity NER JSONL files."""

import json, sys, os

# JSONL datasets processed by default when this module is run as a script.
# All of them live in the same directory, so only the file names vary.
_PROCESSED_DIR = "/home/ubuntu/alkyline/data/processed"
FILES = [
    f"{_PROCESSED_DIR}/{filename}"
    for filename in (
        "enriched_5class_train_cleaned.jsonl",
        "aptner_5class_train.jsonl",
        "securebert2_5class_train.jsonl",
        "enriched_5class_valid_cleaned.jsonl",
    )
]


def is_sentence_final_period(text: str, start: int, end: int) -> bool:
    """Decide whether the span ``text[start:end]`` ends in a sentence-final period.

    The answer is True only when the span's last character is a period that
    looks like sentence punctuation rather than part of the entity itself.
    The period is treated as part of the entity (result False) when:

    * the span is followed by anything other than a space, tab, newline or
      carriage return (reaching the end of ``text`` is fine);
    * the span without its last period still contains two or more periods
      (e.g. "192.168.1.1." or "www.evil.com.");
    * the piece after the last inner period is 1-4 letters, which looks like
      a file or domain extension (e.g. "malware.exe.");
    * the span is a short dotted, letters-only token (at most 6 characters
      before the final period), which looks like an abbreviation.
    """
    surface = text[start:end]
    if surface[-1:] != ".":
        return False

    # A character right after the span must be plain whitespace.
    if end < len(text):
        following = text[end]
        if following not in (" ", "\t", "\n", "\r"):
            return False

    stem = surface[:-1]  # the span minus the candidate period
    inner_dots = stem.count(".")

    # Several inner periods: IP address, host name, version string, ...
    if inner_dots >= 2:
        return False

    if inner_dots:
        # A short alphabetic tail after the last inner period looks like an
        # extension, so the trailing period is left alone.
        tail = stem.rpartition(".")[2]
        if len(tail) <= 4 and tail.isalpha():
            return False

    # Short dotted tokens made only of letters are treated as abbreviations.
    if inner_dots >= 1 and len(stem) <= 6 and stem.replace(".", "").isalpha():
        return False

    return True


def _fix_record_spans(text: str, spans: dict) -> tuple:
    """Clean the span table of a single record.

    ``spans`` maps keys of the form ``"LABEL: surface"`` to lists of
    ``[start, end]`` offsets into ``text``.  Spans with ``start >= end`` are
    dropped; spans for which ``is_sentence_final_period`` is true lose their
    last character.  Every surviving span is filed under a key rebuilt from
    its label and its *own* surface text, so occurrences that were trimmed
    and occurrences that were not never end up under the same key.

    Returns ``(new_spans, periods_trimmed, spans_dropped)``.
    """
    new_spans = {}
    periods_trimmed = 0
    spans_dropped = 0

    for key, offsets in spans.items():
        # Everything before the first ": " is the label; a key without the
        # separator is used as the label in full.
        label = key.split(": ", 1)[0]
        for s, e in offsets:
            # Remove zero-length or negative spans.
            if s >= e:
                spans_dropped += 1
                continue
            # Trim a trailing sentence-final period.
            if is_sentence_final_period(text, s, e):
                e -= 1
                periods_trimmed += 1
                if s >= e:  # the span was nothing but the period
                    spans_dropped += 1
                    continue
            # setdefault merges spans whose rebuilt keys collide.
            new_spans.setdefault(f"{label}: {text[s:e]}", []).append([s, e])

    return new_spans, periods_trimmed, spans_dropped


def fix_file(path: str) -> None:
    """Rewrite the JSONL file at ``path`` in place with cleaned spans.

    Each non-blank line must be a JSON object with a ``"text"`` string and an
    optional ``"spans"`` mapping (a missing or null mapping is treated as
    empty).  Zero-length spans are removed and trailing sentence-final
    periods are trimmed; see ``_fix_record_spans``.  Blank lines are skipped
    and not written back.  The file is read and written as UTF-8 because the
    output is serialised with ``ensure_ascii=False``.

    A missing file is reported and skipped.  A one-line summary is printed
    after the file has been rewritten.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``"text"`` field.
    """
    if not os.path.exists(path):
        print(f"SKIP (not found): {path}")
        return

    fixed_lines = []
    tp_fixed = 0
    zl_removed = 0

    # Everything is read and converted before the file is reopened for
    # writing, so a malformed record aborts the run with the file untouched.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            d = json.loads(line)
            new_spans, trimmed, dropped = _fix_record_spans(
                d["text"], d.get("spans") or {}
            )
            tp_fixed += trimmed
            zl_removed += dropped
            d["spans"] = new_spans
            fixed_lines.append(json.dumps(d, ensure_ascii=False) + "\n")

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(fixed_lines)

    basename = os.path.basename(path)
    print(f"{basename}: {tp_fixed} trailing periods fixed, {zl_removed} zero-length removed ({len(fixed_lines)} examples)")


if __name__ == "__main__":
    # Paths given on the command line take precedence; with no arguments the
    # script falls back to the built-in FILES list.
    for fp in sys.argv[1:] or FILES:
        fix_file(fp)