File size: 6,940 Bytes
3dac39e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python3
"""Annotate NVD CVE descriptions with cybersecurity NER entities."""
import json, re, sys

# Load the filtered NVD records. The processing loop further down reads the
# 'id' and 'desc' keys of each record. The encoding is pinned to UTF-8 so the
# result does not depend on the locale of the process running the script.
with open("/home/ubuntu/alkyline/data/raw/nvd_cves_filtered.json", encoding="utf-8") as f:
    cves = json.load(f)

# Entity patterns: find_all() collects every match from every pattern, and
# resolve_overlaps() then keeps, among overlapping spans, the one that starts
# first (the longer one when two spans start at the same offset).

def _compile_all(patterns, flags=0):
    """Compile every regex source string in *patterns* with *flags*."""
    return tuple(re.compile(p, flags) for p in patterns)


_CVE_ID_RE = re.compile(r'CVE-\d{4}-\d{4,}')

# Vulnerability-class phrases, matched case-insensitively.
_VULNERABILITY_RES = _compile_all([
    r'(?:buffer\s+overflow|stack\s+overflow|heap\s+overflow|integer\s+overflow)',
    r'(?:use-after-free|double[- ]free|null\s+pointer\s+dereference|NULL\s+pointer\s+dereference)',
    r'(?:remote\s+code\s+execution|arbitrary\s+code\s+execution)',
    r'(?:SQL\s+injection|command\s+injection|code\s+injection|OS\s+command\s+injection)',
    r'(?:cross-site\s+scripting|stored\s+XSS|reflected\s+XSS|DOM-based\s+cross-site\s+scripting|Stored\s+Cross-Site\s+Scripting)',
    # The short "DoS" form needs word boundaries: matched case-insensitively
    # without them it also fires inside ordinary words such as "kudos".
    r'(?:denial\s+of\s+service|\bDoS\b)',
    r'(?:path\s+traversal|directory\s+traversal|Path\s+Traversal)',
    r'(?:privilege\s+escalation|authentication\s+bypass)',
    r'(?:information\s+disclosure|information\s+leak)',
    r'(?:memory\s+leak|memory\s+corruption|memory\s+consumption)',
    r'(?:race\s+condition)',
    r'(?:out-of-bounds\s+(?:read|write|access)|oob\s+access)',
    r'(?:type\s+confusion)',
    r'(?:improper\s+(?:input\s+validation|access\s+control|authentication|authorization))',
    r'(?:brute-force\s+protection\s+bypass)',
    r'(?:SASL\s+mechanism\s+downgrade)',
    r'(?:response\s+injection)',
    r'(?:prompt\s+injection)',
    r'(?:token\s+theft)',
    r'(?:arbitrary\s+file\s+(?:upload|delete|read|write)s?)',
    r'(?:sanitization\s+bypass)',
    r'(?:RBAC\s+(?:bypass|restriction)s?)',
], re.IGNORECASE)

# Software, hardware and protocol names, matched case-sensitively.
_SYSTEM_RES = _compile_all([
    r'\b(?:Linux\s+kernel)\b',
    r'\b(?:WordPress)\b',
    r'\b(?:Apache\s+(?:HTTP\s+Server|Tomcat|Kafka|Struts|Maven|Hadoop|Spark|Flink|Airflow|Camel|Solr|CXF|NiFi|OFBiz))\b',
    r'\b(?:Cisco\s+(?:IOS|NX-OS|ASA|WebEx|Meraki|Catalyst|Nexus|ISE))\b',
    r'\b(?:Android|iOS|macOS|Windows|FreeBSD|OpenBSD|NetBSD)\b',
    r'\b(?:Chrome|Firefox|Safari|Edge|Opera)\b',
    r'\b(?:MySQL|PostgreSQL|MariaDB|MongoDB|Redis|SQLite|Oracle\s+Database)\b',
    r'\b(?:nginx|HAProxy|Envoy|Traefik)\b',
    r'\b(?:Docker|Kubernetes|Kyverno|Helm)\b',
    r'\b(?:OpenSSL|GnuTLS|LibreSSL|BoringSSL)\b',
    r'\b(?:Git|GitLab|GitHub|Bitbucket)\b',
    r'\b(?:MailKit|basic-ftp|AnythingLLM|Contact\s+Form\s+7)\b',
    r'\b(?:STARTTLS|GraphQL|LDAP|OAuth|FTP|SMTP|HTTP|HTTPS|SSH|DNS|NFS|TFTP)\b',
    r'\b(?:Delta\s+Electronics\s+AS320T|AS320T)\b',
    r'\b(?:D-Link\s+DWM-222W|DWM-222W)\b',
    r'\b(?:Royal\s+Elementor\s+Addons)\b',
    r'\b(?:Booking\s+Calendar\s+Contact\s+Form)\b',
    r'\b(?:WP\s+Books\s+Gallery)\b',
    r'\b(?:Drag\s+and\s+Drop\s+File\s+Upload)\b',
    r'\b(?:BetterDocs)\b',
    r'\b(?:ExactMetrics)\b',
    r'\b(?:MaxiBlocks\s+Builder)\b',
    r'\b(?:Mobile\s+Next)\b',
    r'\b(?:Sentry)\b',
    r'\b(?:OpenAI\s+API)\b',
    r'\b(?:Google\s+Ads)\b',
    r'\b(?:ConfigMap)\b',
    r'\b(?:ServiceAccount)\b',
    r'\b(?:CODEOWNERS)\b',
])

# Vendor / organization names, matched case-sensitively.
_ORGANIZATION_RES = _compile_all([
    r'\b(?:Microsoft|Google|Apple|Amazon|Meta|Facebook)\b',
    r'\b(?:Cisco|Intel|AMD|NVIDIA|Qualcomm|Broadcom|Samsung)\b',
    r'\b(?:Oracle|IBM|SAP|VMware|Red\s+Hat|Canonical|SUSE)\b',
    r'\b(?:Mozilla|Apache\s+(?:Software\s+)?Foundation)\b',
    r'\b(?:Delta\s+Electronics)\b',
    r'\b(?:D-Link)\b',
    r'\b(?:GitLab)\b',  # also listed as a SYSTEM
    r'\b(?:OpenAI)\b',
])

# Slash-separated paths with at least two components, optionally ending "()".
_PATH_RE = re.compile(r'(?:/[\w.-]+){2,}(?:\.\w+)?(?:\(\))?')

# Underscore-separated identifiers followed by "()", e.g. foo_bar().
_FUNCTION_CALL_RE = re.compile(r'\b\w+(?:_\w+)+\(\)')


def find_all(text):
    """Find all candidate entity spans in *text*.

    Every pattern is applied independently, so the returned spans may
    overlap one another.

    Args:
        text: The string to scan.

    Returns:
        A list of (start, end, label, surface) tuples, where
        ``text[start:end] == surface``. Spans are emitted label group by
        label group (CVE_ID, VULNERABILITY, SYSTEM, ORGANIZATION, FILEPATH),
        in pattern order within a group and left to right within a pattern.
    """
    spans = []

    # The group order below fixes the order in which spans are appended.
    labelled_patterns = (
        ('CVE_ID', (_CVE_ID_RE,)),
        ('VULNERABILITY', _VULNERABILITY_RES),
        ('SYSTEM', _SYSTEM_RES),
        ('ORGANIZATION', _ORGANIZATION_RES),
        ('FILEPATH', (_PATH_RE,)),
    )
    for label, regexes in labelled_patterns:
        for regex in regexes:
            for m in regex.finditer(text):
                spans.append((m.start(), m.end(), label, m.group()))

    # Function names like foo_bar() are also emitted under the FILEPATH label.
    for m in _FUNCTION_CALL_RE.finditer(text):
        surface = m.group()
        if len(surface) > 6:  # skip very short names (length includes "()")
            spans.append((m.start(), m.end(), 'FILEPATH', surface))

    return spans


def resolve_overlaps(spans):
    """Return a non-overlapping subset of *spans*, chosen greedily.

    Candidates are visited in order of ascending start offset, longest first
    among spans that share a start (ties keep their input order). A candidate
    is kept only if it does not overlap any span already kept. The label
    plays no part in the choice.

    Args:
        spans: Iterable of (start, end, label, surface) tuples. It is not
            modified.

    Returns:
        A new list of the kept spans, ordered by start offset.
    """
    # sorted() rather than list.sort(), so the caller's list is left untouched.
    ordered = sorted(spans, key=lambda s: (s[0], -(s[1] - s[0])))

    kept = []
    for span in ordered:
        start, end = span[0], span[1]
        # Two half-open ranges overlap when each starts before the other ends.
        if not any(start < k[1] and end > k[0] for k in kept):
            kept.append(span)
    return kept


def to_opf_format(cve_id, text, spans):
    """Build one OPF JSONL record for an annotated text.

    Spans are grouped under "<label>: <surface>" keys, each mapping to a list
    of [start, end] offset pairs. A span whose recorded surface differs from
    text[start:end] is reported on stderr and left out of the record.
    """
    grouped = {}
    for begin, stop, tag, expected in spans:
        found = text[begin:stop]
        if found == expected:
            grouped.setdefault(f"{tag}: {expected}", []).append([begin, stop])
        else:
            # The offsets do not point at the recorded surface; drop the span.
            print(f"WARNING: offset mismatch in {cve_id}: expected '{expected}' got '{found}'", file=sys.stderr)

    # Record id: the CVE id without its "CVE-" prefix, dashes as underscores.
    record_id = "nvd_" + cve_id.replace('CVE-', '').replace('-', '_')
    info = {"id": record_id, "source": "nvd_cve"}
    return {"text": text, "spans": grouped, "info": info}


# Process all CVEs: annotate each description and keep the records that
# contain at least two resolved entity spans.
output = []
total_spans = 0
for c in cves:
    text = c['desc']
    cve_id = c['id']

    # Prepend CVE ID to text if not already there
    if cve_id not in text:
        text = f"{cve_id}: {text}"

    raw_spans = find_all(text)
    spans = resolve_overlaps(raw_spans)

    if len(spans) < 2:
        continue  # skip if too few entities

    record = to_opf_format(cve_id, text, spans)
    total_spans += len(spans)
    output.append(record)

# Guard the average: with no qualifying CVE the division would raise
# ZeroDivisionError before anything is written.
avg_spans = total_spans / len(output) if output else 0.0
print(f"Annotated {len(output)} CVEs with {total_spans} total spans ({avg_spans:.1f} avg)", file=sys.stderr)

# Write one JSON record per line (JSONL).
outpath = "/home/ubuntu/alkyline/data/processed/llm_annotated_nvd.jsonl"
with open(outpath, "w") as f:
    for rec in output:
        f.write(json.dumps(rec) + "\n")

print(f"Written to {outpath}", file=sys.stderr)