#!/usr/bin/env python3
"""Sample 3K NVD descriptions and annotate with cybersecurity entity spans."""
import json, re, random, os
from collections import defaultdict
# JSONL file that the sampling step reads line by line; each record is
# accessed through its 'text' and 'cve_id' keys.
INPUT = "/home/ubuntu/alkyline/data/raw/nvd/nvd_descriptions.jsonl"
# Destination for the sampled raw records (one JSON object per line).
SAMPLE_OUT = "/home/ubuntu/alkyline/data/raw/nvd/nvd_sample_3k.jsonl"
# Destination for the annotated records (one JSON object per line).
OUTPUT = "/home/ubuntu/alkyline/data/processed/llm_annotated_nvd_v2.jsonl"
# Fixed seed so the random.sample / random.shuffle calls in the sampling
# step pick the same records on every run.
random.seed(42)
# -- STEP 1: Sample 3K richest descriptions --
def richness_score(text):
    """Return a heuristic score for how annotation-worthy *text* is.

    The base score is the text length divided by 100. Each bonus pattern
    that occurs at least once in *text* then adds its fixed bonus, in the
    order listed below.
    """
    # (pattern, regex flags, bonus) -- evaluated top to bottom.
    bonus_rules = (
        (r'CVE-\d{4}-\d{4,}', 0, 3),                                    # CVE identifier
        (r'\d+\.\d+\.\d+', 0, 2),                                       # version numbers
        (r'(?:allows?|enables?)\s+(?:remote|local)', re.I, 2),          # attacker phrasing
        (r'(?:SQL injection|XSS|buffer overflow|RCE|CSRF)', re.I, 2),   # weakness names
        (r'(?:/[a-z]+/[a-z]|\.php|\.py|\.js|\.c\b)', re.I, 2),          # paths / source files
    )
    total = len(text) / 100.0  # longer = more entities
    for pattern, flags, bonus in bonus_rules:
        if re.search(pattern, text, flags) is not None:
            total += bonus
    return total
print("Loading and scoring...")
by_year = defaultdict(list)
# The context manager closes the input handle deterministically; JSON Lines
# is read as UTF-8 explicitly instead of relying on the locale default.
with open(INPUT, encoding="utf-8") as src:
    for line in src:
        if not line.strip():
            continue  # tolerate blank / trailing lines in the JSONL file
        rec = json.loads(line)
        text = rec['text']
        if len(text) <= 100:
            continue
        year = rec['cve_id'].split('-')[1]
        # String comparison; assumes the year field is always 4 digits.
        if year < '2020':
            continue
        by_year[year].append((richness_score(text), rec))

# Split TARGET evenly across the years present; the first `remainder`
# years each take one extra record so the quotas sum to TARGET.
TARGET = 3000
years = sorted(by_year)
if not years:
    # Without this guard the division below fails with a bare ZeroDivisionError.
    raise SystemExit(f"No eligible descriptions found in {INPUT}")
per_year, remainder = divmod(TARGET, len(years))
sample = []
for i, year in enumerate(years):
    items = by_year[year]
    items.sort(key=lambda x: -x[0])
    n = per_year + (1 if i < remainder else 0)
    # Take the top 3*n by score, then randomly sample n of those for diversity.
    pool = items[:n * 3]
    chosen = random.sample(pool, min(n, len(pool)))
    sample.extend(rec for _, rec in chosen)
random.shuffle(sample)
print(f"Sampled {len(sample)} descriptions across {len(years)} years")
with open(SAMPLE_OUT, 'w', encoding="utf-8") as f:
    for rec in sample:
        f.write(json.dumps(rec) + '\n')
print(f"Wrote {SAMPLE_OUT}")
# -- STEP 2: Annotate --
# Vulnerability-type phrases. Each entry is used as a regex fragment: the
# annotator wraps it in word boundaries and matches case-insensitively,
# recording the text exactly as it appears in the description.
VULN_PATTERNS = [
    r"remote code execution",
    r"code execution",
    r"SQL injection",
    r"cross-site scripting",
    r"cross-site request forgery",
    r"buffer overflow",
    r"heap overflow",
    r"stack overflow",
    r"stack-based buffer overflow",
    r"heap-based buffer overflow",
    r"integer overflow",
    r"integer underflow",
    r"use after free",
    r"use-after-free",
    r"double free",
    r"null pointer dereference",
    r"NULL pointer dereference",
    r"out of bounds read",
    r"out-of-bounds read",
    r"out of bounds write",
    r"out-of-bounds write",
    r"out of bounds access",
    r"out-of-bounds access",
    r"out of bounds memory",
    r"privilege escalation",
    r"escalation of privilege",
    r"denial of service",
    r"denial-of-service",
    r"information disclosure",
    r"information leak",
    r"memory leak",
    r"memory corruption",
    r"directory traversal",
    r"path traversal",
    r"command injection",
    r"OS command injection",
    r"XML external entity",
    r"XXE",
    r"SSRF",
    r"server-side request forgery",
    r"open redirect",
    r"authentication bypass",
    r"authorization bypass",
    r"improper authentication",
    r"improper authorization",
    r"improper access control",
    r"improper input validation",
    r"improper neutralization",
    r"race condition",
    r"time-of-check time-of-use",
    r"TOCTOU",
    r"type confusion",
    r"deserialization",
    r"insecure deserialization",
    r"prototype pollution",
    r"reflected XSS",
    r"stored XSS",
    r"DOM-based XSS",
    r"arbitrary file upload",
    r"arbitrary file read",
    r"arbitrary file write",
    r"arbitrary file deletion",
    r"local file inclusion",
    r"remote file inclusion",
    r"server-side template injection",
    r"SSTI",
    r"LDAP injection",
    r"XPath injection",
    r"CRLF injection",
    r"header injection",
    r"log injection",
    r"format string",
    r"symlink",
    r"hardcoded credentials",
    r"hard-coded credentials",
    r"hardcoded password",
    r"hard-coded password",
    r"cleartext transmission",
    r"cleartext storage",
    r"uncontrolled resource consumption",
    r"infinite loop",
    r"resource exhaustion",
]
# Organization gazetteer. The annotator escapes each name and matches it
# case-sensitively between word boundaries, trying longer names first.
ORGS = [
    "Microsoft", "Google", "Apple", "Adobe", "Cisco", "Oracle", "IBM",
    "Apache", "Mozilla", "Samsung", "Intel", "AMD", "Qualcomm", "NVIDIA",
    "Red Hat", "Canonical", "Debian", "Ubuntu", "Fedora", "SUSE",
    "VMware", "Broadcom", "Juniper", "Fortinet", "Palo Alto Networks",
    "Check Point", "F5", "Citrix", "SAP", "Siemens", "Schneider Electric",
    "Rockwell Automation", "ABB", "Honeywell", "Huawei", "ZTE",
    "D-Link", "TP-Link", "Netgear", "ASUS", "Zyxel", "MikroTik",
    "WordPress", "Drupal", "Joomla", "GitLab", "GitHub", "Atlassian",
    "Jenkins", "Docker", "Kubernetes", "HashiCorp", "Elastic",
    "Trend Micro", "Kaspersky", "McAfee", "Symantec", "Sophos",
    "CrowdStrike", "SentinelOne", "Splunk", "Rapid7",
    "Dell", "HP", "Lenovo", "Xerox", "Epson", "Canon",
    "Zoom", "Slack", "Salesforce", "ServiceNow", "Ivanti",
    "SolarWinds", "ManageEngine", "Progress", "Veeam",
    "Moodle", "MediaWiki", "phpMyAdmin", "Roundcube",
    "OpenSSL", "OpenSSH", "GnuPG", "cURL",
    "Facebook", "Meta", "Amazon", "AWS", "Cloudflare",
    "MITRE", "NIST", "CISA",
    "Tenda", "TOTOLINK", "LB-LINK", "Ruijie", "H3C",
    "Aruba", "Ruckus", "Mitel", "Avaya",
    "Moxa", "Phoenix Contact", "WAGO", "Beckhoff",
    "Synology", "QNAP", "Western Digital", "Buffalo",
    "Grafana", "Prometheus", "InfluxDB",
    "JetBrains", "Eclipse", "Spring",
    "Node.js", "npm", "PyPI",
]
# System / product gazetteer. The annotator escapes each name and matches it
# as a whole word (case-sensitive), trying longer names first so that
# multi-word products win over their shorter prefixes.
SYSTEMS = [
    "Windows", "Linux", "macOS", "Android", "iOS", "ChromeOS",
    "Windows Server", "Windows 10", "Windows 11",
    "Internet Explorer", "Microsoft Edge", "Google Chrome", "Mozilla Firefox", "Safari",
    "Apache HTTP Server", "Apache Tomcat", "Apache Struts", "Apache Kafka",
    "Apache ActiveMQ", "Apache Camel", "Apache Flink", "Apache Spark",
    "Apache Airflow", "Apache Superset", "Apache Solr", "Apache Dubbo",
    "Apache NiFi", "Apache OFBiz", "Apache RocketMQ", "Apache Pulsar",
    "Apache Log4j", "Apache Commons",
    "nginx", "NGINX", "IIS",
    "MySQL", "PostgreSQL", "MariaDB", "MongoDB", "Redis", "SQLite",
    "Microsoft SQL Server", "Oracle Database",
    "Microsoft Exchange", "Microsoft Office", "Microsoft Teams",
    "Microsoft SharePoint", "Microsoft Outlook", "Microsoft Word",
    "Visual Studio Code", "Visual Studio",
    "VMware ESXi", "VMware vCenter", "VMware Workstation",
    "Docker Desktop", "Kubernetes",
    "OpenSSL", "OpenSSH", "OpenVPN", "WireGuard",
    "Samba", "BIND", "ISC BIND",
    "PHP", "Python", "Java", "Ruby",
    "WordPress", "Drupal", "Joomla", "Magento", "PrestaShop",
    "GitLab", "Grafana", "Jenkins", "Ansible", "Terraform",
    "Chromium", "WebKit", "V8",
    "QEMU", "VirtualBox", "Xen", "KVM",
    "systemd", "sudo", "polkit", "glibc", "libxml2", "libcurl",
    "FFmpeg", "ImageMagick", "GStreamer", "Wireshark",
    "Fortinet FortiOS", "FortiOS", "FortiGate", "FortiProxy",
    "FortiAnalyzer", "FortiManager", "FortiWeb", "FortiClient",
    "Palo Alto PAN-OS", "PAN-OS", "GlobalProtect",
    "Cisco IOS", "Cisco IOS XE", "Cisco NX-OS", "Cisco ASA",
    "Cisco Firepower", "Cisco Webex", "Cisco SD-WAN",
    "SonicWall", "Sophos XG", "Sophos UTM",
    "Ivanti Connect Secure", "Ivanti Policy Secure",
    "Citrix ADC", "Citrix Gateway", "Citrix NetScaler",
    "SAP NetWeaver", "SAP HANA", "SAP BusinessObjects",
    "Splunk Enterprise", "Splunk Cloud",
    "Elasticsearch", "Kibana", "Logstash",
    "Moodle", "Canvas LMS", "Blackboard",
    "Zimbra", "Roundcube", "Dovecot", "Postfix", "Exim", "Sendmail",
    "cURL", "curl", "wget",
    "Linux kernel", "FreeBSD", "NetBSD", "OpenBSD",
    "Xen hypervisor",
]
def find_all_nonoverlapping(text, pattern, flags=0):
    """Return ``(start, end, matched_text)`` for each match of *pattern* in *text*.

    Matches are produced by ``re.finditer``, i.e. scanned left to right
    without overlapping one another. *flags* is passed through to ``re``.
    """
    return [
        (hit.start(), hit.end(), hit.group())
        for hit in re.finditer(pattern, text, flags)
    ]
def annotate(rec):
    """Annotate one record with character-offset entity spans.

    Extractors run in a fixed priority order. A candidate match is dropped
    when it overlaps any span accepted earlier, so earlier extractors win
    conflicts and the resulting spans never overlap.

    Args:
        rec: Mapping providing ``'text'`` (the description to annotate) and
            ``'cve_id'``.

    Returns:
        A dict with ``"text"`` (unchanged), ``"spans"`` mapping
        ``"<LABEL>: <matched text>"`` to a list of ``[start, end]`` offsets
        into the text, and ``"info"`` holding the source tag and CVE id.
    """
    text = rec['text']
    spans = defaultdict(list)
    used_ranges = []  # (start, end) of every accepted span

    def overlaps(s, e):
        return any(s < ue and e > us for us, ue in used_ranges)

    def add_span(label, start, end, entity_text):
        if overlaps(start, end):
            return
        spans[f"{label}: {entity_text}"].append([start, end])
        used_ranges.append((start, end))

    def add_matches(label, pattern, flags=0, accept=None):
        # Record every match of `pattern`; `accept` may veto a candidate.
        for m in re.finditer(pattern, text, flags):
            if accept is None or accept(m.group()):
                add_span(label, m.start(), m.end(), m.group())

    def is_ipv4(candidate):
        # The regex only limits each octet to 1-3 digits, so strings such as
        # "300.1.2.3" would otherwise be labelled as addresses.
        return all(int(octet) <= 255 for octet in candidate.split('.'))

    # 1. CVE_ID (regex, exact)
    add_matches('CVE_ID', r'CVE-\d{4}-\d{4,}')

    # 2. IP_ADDRESS
    # NOTE(review): this runs before URL, so a URL with a literal-IP host
    # yields only the IP span -- confirm that priority is intended.
    add_matches('IP_ADDRESS', r'\b(?:\d{1,3}\.){3}\d{1,3}\b', accept=is_ipv4)

    # 3. URL
    add_matches('URL', r'https?://[^\s)<>"]+')

    # 4. EMAIL -- must run before DOMAIN: otherwise the DOMAIN pattern claims
    # the host part of the address and the full e-mail is rejected as
    # overlapping, so e-mails with a listed TLD would never be annotated.
    add_matches('EMAIL', r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

    # 5. DOMAIN (after URL and EMAIL so it cannot split them)
    add_matches(
        'DOMAIN',
        r'\b(?:[a-zA-Z0-9-]+\.)+(?:com|org|net|io|gov|edu|mil|co|info|biz|dev|app|cloud)\b',
    )

    # 6. HASH (64, 40 and 32 hex digits -- SHA-256, SHA-1, MD5 lengths)
    for digits in (64, 40, 32):
        add_matches('HASH', r'\b[a-fA-F0-9]{%d}\b' % digits)

    # 7. FILEPATH: Unix-style paths, Windows-style paths, then bare filenames
    # with a known source/config extension.
    add_matches('FILEPATH', r'(?:/[a-zA-Z0-9_.@-]+){2,}(?:\.[a-zA-Z0-9]+)?')
    add_matches('FILEPATH', r'[A-Z]:\\(?:[a-zA-Z0-9_.@ -]+\\)*[a-zA-Z0-9_.@ -]+')
    add_matches(
        'FILEPATH',
        r'\b[a-zA-Z_][a-zA-Z0-9_]*\.(?:php|py|js|java|c|cpp|h|rb|go|rs|pl|sh|bat|ps1|xml|json|yaml|yml|conf|cfg|ini|log|sql|html|jsp|asp|aspx|cgi)\b',
    )

    # 8. SYSTEM (longer names first so "Apache HTTP Server" beats "Apache")
    for sys_name in sorted(SYSTEMS, key=len, reverse=True):
        add_matches('SYSTEM', r'\b' + re.escape(sys_name) + r'\b')

    # 9. ORGANIZATION (longer first; anything already tagged SYSTEM is skipped)
    for org in sorted(ORGS, key=len, reverse=True):
        add_matches('ORGANIZATION', r'\b' + re.escape(org) + r'\b')

    # 10. VULNERABILITY (case-insensitive, but the span key keeps the exact text)
    for vp in sorted(VULN_PATTERNS, key=len, reverse=True):
        add_matches('VULNERABILITY', r'\b' + vp + r'\b', re.IGNORECASE)

    return {
        "text": text,
        "spans": dict(spans),  # plain dict so it serialises like any other
        "info": {"source": "nvd_v2", "cve_id": rec["cve_id"]},
    }
# -- Process and write --
print("Annotating...")
# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# pinned to UTF-8 rather than left to the locale default.
with open(OUTPUT, 'w', encoding="utf-8") as f:
    for i, rec in enumerate(sample):
        result = annotate(rec)
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
        if (i + 1) % 500 == 0:
            print(f" {i+1}/{len(sample)}")
print(f"Wrote {len(sample)} annotated records to {OUTPUT}")

# -- Verify offsets and collect label statistics --
# A single pass over the written file does both jobs, and the context manager
# closes the handle (the read handles were previously never closed).
print("\nVerifying offsets...")
errors = 0
total_spans = 0
label_counts = defaultdict(int)
with open(OUTPUT, encoding="utf-8") as f:
    for i, line in enumerate(f):
        rec = json.loads(line)
        for key, offsets in rec["spans"].items():
            # Span keys have the form "<LABEL>: <entity text>".
            label, entity = key.split(": ", 1)
            label_counts[label] += len(offsets)
            for start, end in offsets:
                total_spans += 1
                if rec["text"][start:end] != entity:
                    errors += 1
                    if errors <= 5:  # only show the first few mismatches
                        print(f" ERROR line {i}: expected '{entity}', got '{rec['text'][start:end]}'")
print(f"Total spans: {total_spans}, Errors: {errors}")

# Stats
print("\nLabel distribution:")
for label, count in sorted(label_counts.items(), key=lambda x: -x[1]):
    print(f" {label}: {count}")
|