File size: 13,519 Bytes
038e086
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
#!/usr/bin/env python3
"""Sample 3K NVD descriptions and annotate with cybersecurity entity spans."""

import json, re, random, os
from collections import defaultdict

# JSONL of NVD descriptions; each record is read for its 'text' and 'cve_id' fields.
INPUT = "/home/ubuntu/alkyline/data/raw/nvd/nvd_descriptions.jsonl"
# JSONL that receives the sampled records, re-serialized unchanged.
SAMPLE_OUT = "/home/ubuntu/alkyline/data/raw/nvd/nvd_sample_3k.jsonl"
# JSONL that receives one {"text", "spans", "info"} object per sampled record.
# NOTE(review): despite "llm_annotated" in the file name, the spans written by
# this script come from regexes and fixed term lists, not from a model.
OUTPUT = "/home/ubuntu/alkyline/data/processed/llm_annotated_nvd_v2.jsonl"

# Fixed seed so the random.sample / random.shuffle calls below pick the same
# records on every run.
random.seed(42)

# ── STEP 1: Sample 3K richest descriptions ──

def richness_score(text):
    """Estimate how much annotatable content a description contains.

    The score starts at one point per 100 characters and gains a fixed bonus
    for each kind of marker that occurs at least once in ``text``.
    """
    # (pattern, regex flags, bonus) — evaluated in this order.
    bonus_rules = (
        (r'CVE-\d{4}-\d{4,}', 0, 3),                                   # CVE identifier
        (r'\d+\.\d+\.\d+', 0, 2),                                      # dotted version number
        (r'(?:allows?|enables?)\s+(?:remote|local)', re.I, 2),         # attack-vector phrasing
        (r'(?:SQL injection|XSS|buffer overflow|RCE|CSRF)', re.I, 2),  # common vulnerability names
        (r'(?:/[a-z]+/[a-z]|\.php|\.py|\.js|\.c\b)', re.I, 2),         # paths / source file names
    )

    total = len(text) / 100.0
    for pattern, flags, bonus in bonus_rules:
        if re.search(pattern, text, flags) is not None:
            total += bonus
    return total

print("Loading and scoring...")
# Candidate records grouped by CVE year; each entry is (richness score, record).
by_year = defaultdict(list)
# Context manager so the handle is closed even if a line fails to parse;
# explicit UTF-8 so decoding does not depend on the machine's locale.
with open(INPUT, encoding="utf-8") as src:
    for line in src:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        rec = json.loads(line)
        text = rec['text']
        if len(text) <= 100:
            continue  # skip descriptions of 100 characters or fewer
        # The year is the second dash-separated field of the CVE id.
        year = rec['cve_id'].split('-')[1]
        # String comparison: correct as long as the year field is four digits.
        if year < '2020':
            continue
        score = richness_score(text)
        by_year[year].append((score, rec))

# Split TARGET evenly across the years present in the data; the first
# `remainder` years (in sorted order) get one extra record so the per-year
# quotas add up to TARGET.
TARGET = 3000
years = sorted(by_year.keys())
if not years:
    # Without this guard the division below fails with a bare ZeroDivisionError.
    raise SystemExit(f"No usable descriptions found in {INPUT}; nothing to sample.")
per_year = TARGET // len(years)
remainder = TARGET - per_year * len(years)

sample = []
for i, year in enumerate(years):
    items = by_year[year]
    items.sort(key=lambda x: -x[0])
    n = per_year + (1 if i < remainder else 0)
    # Take the top 3*n by score, then randomly sample n from those for diversity.
    pool = items[:n * 3]
    # A year with fewer than n candidates contributes all of them, so the final
    # sample can be smaller than TARGET.
    chosen = random.sample(pool, min(n, len(pool)))
    sample.extend(rec for _, rec in chosen)

random.shuffle(sample)
print(f"Sampled {len(sample)} descriptions across {len(years)} years")

with open(SAMPLE_OUT, 'w') as f:
    for rec in sample:
        f.write(json.dumps(rec) + '\n')
print(f"Wrote {SAMPLE_OUT}")

# ── STEP 2: Annotate ──

# Vulnerability type patterns (case-insensitive matching, find exact text).
# annotate() splices each entry, unescaped, between \b anchors and searches
# with re.IGNORECASE, so entries are regex fragments rather than plain strings.
# annotate() tries longer entries first, so 'remote code execution' is claimed
# before the shorter 'code execution' can match inside it.
# NOTE: because matching ignores case, entries that differ only by case
# ('null pointer dereference' / 'NULL pointer dereference') are redundant.
VULN_PATTERNS = [
    r'remote code execution',
    r'code execution',
    r'SQL injection',
    r'cross-site scripting',
    r'cross-site request forgery',
    r'buffer overflow',
    r'heap overflow',
    r'stack overflow',
    r'stack-based buffer overflow',
    r'heap-based buffer overflow',
    r'integer overflow',
    r'integer underflow',
    r'use after free',
    r'use-after-free',
    r'double free',
    r'null pointer dereference',
    r'NULL pointer dereference',
    r'out of bounds read',
    r'out-of-bounds read',
    r'out of bounds write',
    r'out-of-bounds write',
    r'out of bounds access',
    r'out-of-bounds access',
    r'out of bounds memory',
    r'privilege escalation',
    r'escalation of privilege',
    r'denial of service',
    r'denial-of-service',
    r'information disclosure',
    r'information leak',
    r'memory leak',
    r'memory corruption',
    r'directory traversal',
    r'path traversal',
    r'command injection',
    r'OS command injection',
    r'XML external entity',
    r'XXE',
    r'SSRF',
    r'server-side request forgery',
    r'open redirect',
    r'authentication bypass',
    r'authorization bypass',
    r'improper authentication',
    r'improper authorization',
    r'improper access control',
    r'improper input validation',
    r'improper neutralization',
    r'race condition',
    r'time-of-check time-of-use',
    r'TOCTOU',
    r'type confusion',
    r'deserialization',
    r'insecure deserialization',
    r'prototype pollution',
    r'reflected XSS',
    r'stored XSS',
    r'DOM-based XSS',
    r'arbitrary file upload',
    r'arbitrary file read',
    r'arbitrary file write',
    r'arbitrary file deletion',
    r'local file inclusion',
    r'remote file inclusion',
    r'server-side template injection',
    r'SSTI',
    r'LDAP injection',
    r'XPath injection',
    r'CRLF injection',
    r'header injection',
    r'log injection',
    r'format string',
    r'symlink',
    r'hardcoded credentials',
    r'hard-coded credentials',
    r'hardcoded password',
    r'hard-coded password',
    r'cleartext transmission',
    r'cleartext storage',
    r'uncontrolled resource consumption',
    r'infinite loop',
    r'resource exhaustion',
]

# Organization names. annotate() matches each one literally (re.escape),
# case-sensitively and between \b word boundaries, trying longer names first.
# NOTE: annotate() claims SYSTEM spans before ORGANIZATION spans and never lets
# spans overlap, so a name that is also listed in SYSTEMS (e.g. 'WordPress',
# 'Jenkins', 'OpenSSL') is labelled SYSTEM and never ORGANIZATION.
ORGS = [
    'Microsoft', 'Google', 'Apple', 'Adobe', 'Cisco', 'Oracle', 'IBM',
    'Apache', 'Mozilla', 'Samsung', 'Intel', 'AMD', 'Qualcomm', 'NVIDIA',
    'Red Hat', 'Canonical', 'Debian', 'Ubuntu', 'Fedora', 'SUSE',
    'VMware', 'Broadcom', 'Juniper', 'Fortinet', 'Palo Alto Networks',
    'Check Point', 'F5', 'Citrix', 'SAP', 'Siemens', 'Schneider Electric',
    'Rockwell Automation', 'ABB', 'Honeywell', 'Huawei', 'ZTE',
    'D-Link', 'TP-Link', 'Netgear', 'ASUS', 'Zyxel', 'MikroTik',
    'WordPress', 'Drupal', 'Joomla', 'GitLab', 'GitHub', 'Atlassian',
    'Jenkins', 'Docker', 'Kubernetes', 'HashiCorp', 'Elastic',
    'Trend Micro', 'Kaspersky', 'McAfee', 'Symantec', 'Sophos',
    'CrowdStrike', 'SentinelOne', 'Splunk', 'Rapid7',
    'Dell', 'HP', 'Lenovo', 'Xerox', 'Epson', 'Canon',
    'Zoom', 'Slack', 'Salesforce', 'ServiceNow', 'Ivanti',
    'SolarWinds', 'ManageEngine', 'Progress', 'Veeam',
    'Moodle', 'MediaWiki', 'phpMyAdmin', 'Roundcube',
    'OpenSSL', 'OpenSSH', 'GnuPG', 'cURL',
    'Facebook', 'Meta', 'Amazon', 'AWS', 'Cloudflare',
    'MITRE', 'NIST', 'CISA',
    'Tenda', 'TOTOLINK', 'LB-LINK', 'Ruijie', 'H3C',
    'Aruba', 'Ruckus', 'Mitel', 'Avaya',
    'Moxa', 'Phoenix Contact', 'WAGO', 'Beckhoff',
    'Synology', 'QNAP', 'Western Digital', 'Buffalo',
    'Grafana', 'Prometheus', 'InfluxDB',
    'JetBrains', 'Eclipse', 'Spring',
    'Node.js', 'npm', 'PyPI',
]

# System/product names - matched as whole words.
# annotate() escapes each name, anchors it with \b on both sides and matches
# case-sensitively (hence both 'nginx' and 'NGINX', 'cURL' and 'curl').
# Longer names are tried first, so 'Windows Server' is claimed before the
# shorter 'Windows' can match inside it.
SYSTEMS = [
    'Windows', 'Linux', 'macOS', 'Android', 'iOS', 'ChromeOS',
    'Windows Server', 'Windows 10', 'Windows 11',
    'Internet Explorer', 'Microsoft Edge', 'Google Chrome', 'Mozilla Firefox', 'Safari',
    'Apache HTTP Server', 'Apache Tomcat', 'Apache Struts', 'Apache Kafka',
    'Apache ActiveMQ', 'Apache Camel', 'Apache Flink', 'Apache Spark',
    'Apache Airflow', 'Apache Superset', 'Apache Solr', 'Apache Dubbo',
    'Apache NiFi', 'Apache OFBiz', 'Apache RocketMQ', 'Apache Pulsar',
    'Apache Log4j', 'Apache Commons',
    'nginx', 'NGINX', 'IIS',
    'MySQL', 'PostgreSQL', 'MariaDB', 'MongoDB', 'Redis', 'SQLite',
    'Microsoft SQL Server', 'Oracle Database',
    'Microsoft Exchange', 'Microsoft Office', 'Microsoft Teams',
    'Microsoft SharePoint', 'Microsoft Outlook', 'Microsoft Word',
    'Visual Studio Code', 'Visual Studio',
    'VMware ESXi', 'VMware vCenter', 'VMware Workstation',
    'Docker Desktop', 'Kubernetes',
    'OpenSSL', 'OpenSSH', 'OpenVPN', 'WireGuard',
    'Samba', 'BIND', 'ISC BIND',
    'PHP', 'Python', 'Java', 'Ruby',
    'WordPress', 'Drupal', 'Joomla', 'Magento', 'PrestaShop',
    'GitLab', 'Grafana', 'Jenkins', 'Ansible', 'Terraform',
    'Chromium', 'WebKit', 'V8',
    'QEMU', 'VirtualBox', 'Xen', 'KVM',
    'systemd', 'sudo', 'polkit', 'glibc', 'libxml2', 'libcurl',
    'FFmpeg', 'ImageMagick', 'GStreamer', 'Wireshark',
    'Fortinet FortiOS', 'FortiOS', 'FortiGate', 'FortiProxy',
    'FortiAnalyzer', 'FortiManager', 'FortiWeb', 'FortiClient',
    'Palo Alto PAN-OS', 'PAN-OS', 'GlobalProtect',
    'Cisco IOS', 'Cisco IOS XE', 'Cisco NX-OS', 'Cisco ASA',
    'Cisco Firepower', 'Cisco Webex', 'Cisco SD-WAN',
    'SonicWall', 'Sophos XG', 'Sophos UTM',
    'Ivanti Connect Secure', 'Ivanti Policy Secure',
    'Citrix ADC', 'Citrix Gateway', 'Citrix NetScaler',
    'SAP NetWeaver', 'SAP HANA', 'SAP BusinessObjects',
    'Splunk Enterprise', 'Splunk Cloud',
    'Elasticsearch', 'Kibana', 'Logstash',
    'Moodle', 'Canvas LMS', 'Blackboard',
    'Zimbra', 'Roundcube', 'Dovecot', 'Postfix', 'Exim', 'Sendmail',
    'cURL', 'curl', 'wget',
    'Linux kernel', 'FreeBSD', 'NetBSD', 'OpenBSD',
    'Xen hypervisor',
]

def find_all_nonoverlapping(text, pattern, flags=0):
    """Return ``(start, end, matched_text)`` for each match of *pattern* in *text*.

    Matches come from ``re.finditer``, so they are reported left to right and
    never overlap one another.
    """
    return [
        (hit.start(), hit.end(), hit.group())
        for hit in re.finditer(pattern, text, flags)
    ]

def annotate(rec):
    """Annotate one NVD record with rule-based cybersecurity entity spans.

    Args:
        rec: Mapping with a ``'text'`` description and a ``'cve_id'``.

    Returns:
        ``{"text": ..., "spans": ..., "info": ...}``. ``spans`` maps
        ``"<LABEL>: <matched text>"`` to a list of ``[start, end]`` offsets
        (end exclusive) such that ``text[start:end]`` is the matched text.

    Extractors run in a fixed priority order, and a candidate is dropped when
    it overlaps any span already recorded, so an earlier extractor always wins
    over a later one for the same characters.
    """
    text = rec['text']
    spans = defaultdict(list)
    used_ranges = []  # (start, end) of every accepted span, regardless of label

    def overlaps(s, e):
        """Return True if [s, e) intersects any already-accepted span."""
        return any(s < ue and e > us for us, ue in used_ranges)

    def add_span(label, start, end, entity_text):
        """Record the span unless it collides with an earlier one."""
        if overlaps(start, end):
            return
        spans[f"{label}: {entity_text}"].append([start, end])
        used_ranges.append((start, end))

    def add_matches(label, pattern, flags=0, accept=None):
        """Record every match of ``pattern`` that passes the optional filter."""
        for m in re.finditer(pattern, text, flags):
            if accept is None or accept(m.group()):
                add_span(label, m.start(), m.end(), m.group())

    def is_ipv4(candidate):
        """Reject dotted quads with an octet above 255 (e.g. '999.1.1.1')."""
        return all(int(octet) <= 255 for octet in candidate.split('.'))

    # 1. CVE_ID (regex, exact)
    add_matches('CVE_ID', r'CVE-\d{4}-\d{4,}')

    # 2. IP_ADDRESS (dotted quad whose octets are all in 0-255)
    add_matches('IP_ADDRESS', r'\b(?:\d{1,3}\.){3}\d{1,3}\b', accept=is_ipv4)

    # 3. URL
    add_matches('URL', r'https?://[^\s)<>"]+')

    # 4. EMAIL — must run before DOMAIN: the domain regex also matches the
    #    part after '@', and claiming it first would make every e-mail address
    #    overlap an existing span and be dropped.
    add_matches('EMAIL', r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

    # 5. DOMAIN (after URL and EMAIL so hosts inside them are not re-labelled)
    add_matches(
        'DOMAIN',
        r'\b(?:[a-zA-Z0-9-]+\.)+(?:com|org|net|io|gov|edu|mil|co|info|biz|dev|app|cloud)\b',
    )

    # 6. HASH — hex runs of 64, 40 and 32 characters (SHA-256, SHA-1, MD5 lengths)
    for hex_len in (64, 40, 32):
        add_matches('HASH', r'\b[a-fA-F0-9]{%d}\b' % hex_len)

    # 7. FILEPATH
    # Unix-style: at least two /segments, optional extension
    add_matches('FILEPATH', r'(?:/[a-zA-Z0-9_.@-]+){2,}(?:\.[a-zA-Z0-9]+)?')
    # Windows-style paths
    add_matches('FILEPATH', r'[A-Z]:\\(?:[a-zA-Z0-9_.@ -]+\\)*[a-zA-Z0-9_.@ -]+')
    # Filenames with extensions in common code patterns
    add_matches(
        'FILEPATH',
        r'\b[a-zA-Z_][a-zA-Z0-9_]*\.(?:php|py|js|java|c|cpp|h|rb|go|rs|pl|sh|bat|ps1|xml|json|yaml|yml|conf|cfg|ini|log|sql|html|jsp|asp|aspx|cgi)\b',
    )

    # 8. SYSTEM (longer names first so a multi-word product beats its prefix)
    for sys_name in sorted(SYSTEMS, key=len, reverse=True):
        add_matches('SYSTEM', r'\b' + re.escape(sys_name) + r'\b')

    # 9. ORGANIZATION (longer first; anything already claimed as SYSTEM is skipped)
    for org in sorted(ORGS, key=len, reverse=True):
        add_matches('ORGANIZATION', r'\b' + re.escape(org) + r'\b')

    # 10. VULNERABILITY (case-insensitive, but the key keeps the text as written)
    for vp in sorted(VULN_PATTERNS, key=len, reverse=True):
        add_matches('VULNERABILITY', r'\b' + vp + r'\b', re.IGNORECASE)

    return {
        "text": text,
        "spans": dict(spans),  # plain dict for JSON serialization
        "info": {"source": "nvd_v2", "cve_id": rec["cve_id"]}
    }

# ── Process and write ──
print("Annotating...")
# ensure_ascii=False emits non-ASCII characters raw, so the file is written as
# UTF-8 explicitly instead of relying on the locale's default encoding.
with open(OUTPUT, 'w', encoding='utf-8') as f:
    for i, rec in enumerate(sample):
        result = annotate(rec)
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
        if (i + 1) % 500 == 0:
            print(f"  {i+1}/{len(sample)}")

print(f"Wrote {len(sample)} annotated records to {OUTPUT}")

# ── Verify offsets ──
# Re-read the file just written and check that every [start, end] pair slices
# out exactly the entity text stored in its key. Label counts are gathered in
# the same pass so the file is parsed once and the handle is closed afterwards.
print("\nVerifying offsets...")
errors = 0
total_spans = 0
label_counts = defaultdict(int)
with open(OUTPUT, encoding='utf-8') as f:
    for i, line in enumerate(f):
        rec = json.loads(line)
        for key, offsets in rec["spans"].items():
            # Keys have the form "<LABEL>: <entity text>"; split on the first
            # ": " only, because the entity text may itself contain ": ".
            label, entity = key.split(": ", 1)
            label_counts[label] += len(offsets)
            for start, end in offsets:
                total_spans += 1
                if rec["text"][start:end] != entity:
                    errors += 1
                    if errors <= 5:  # only show the first few mismatches
                        print(f"  ERROR line {i}: expected '{entity}', got '{rec['text'][start:end]}'")

print(f"Total spans: {total_spans}, Errors: {errors}")

# Stats
print("\nLabel distribution:")
for label, count in sorted(label_counts.items(), key=lambda x: -x[1]):
    print(f"  {label}: {count}")