arcspan / data /processed /annotate_nvd_v2.py

Add files using upload-large-folder tool

038e086 verified 8 days ago

13.5 kB

	#!/usr/bin/env python3
	"""Sample 3K NVD descriptions and annotate with cybersecurity entity spans."""

	import json, re, random, os
	from collections import defaultdict

	# JSONL input: each line is parsed as a JSON object whose 'text' and 'cve_id'
	# fields are read by the sampling step below.
	INPUT = "/home/ubuntu/alkyline/data/raw/nvd/nvd_descriptions.jsonl"
	# JSONL output of the sampling step (the selected records, unmodified).
	SAMPLE_OUT = "/home/ubuntu/alkyline/data/raw/nvd/nvd_sample_3k.jsonl"
	# JSONL output of the annotation step (text + spans + info per record).
	OUTPUT = "/home/ubuntu/alkyline/data/processed/llm_annotated_nvd_v2.jsonl"

	# Fixed seed so random.sample / random.shuffle below are repeatable.
	random.seed(42)

	# ── STEP 1: Sample 3K richest descriptions ──

	# (compiled pattern, bonus) pairs, applied in order by richness_score().
	# Alternations use a bare `|`; an escaped `\|` would match a literal pipe
	# character and the bonus would never be awarded.
	_RICHNESS_BONUSES = (
	    # An explicit CVE identifier inside the description.
	    (re.compile(r'CVE-\d{4}-\d{4,}'), 3),
	    # Dotted version numbers such as 1.2.3.
	    (re.compile(r'\d+\.\d+\.\d+'), 2),
	    # Attack-vector phrasing ("allows remote ...", "enables local ...").
	    (re.compile(r'(?:allows?|enables?)\s+(?:remote|local)', re.I), 2),
	    # Common vulnerability classes. Word boundaries keep the
	    # case-insensitive abbreviations from matching inside ordinary words
	    # (e.g. "RCE" inside "source").
	    (re.compile(r'\b(?:SQL injection|XSS|buffer overflow|RCE|CSRF)\b', re.I), 2),
	    # Path fragments or source-file extensions.
	    (re.compile(r'(?:/[a-z]+/[a-z]|\.php|\.py|\.js|\.c\b)', re.I), 2),
	)


	def richness_score(text):
	    """Score how 'rich' a description is for annotation.

	    The base score is ``len(text) / 100.0`` (longer text tends to contain
	    more entities); each pattern in ``_RICHNESS_BONUSES`` that occurs at
	    least once adds its fixed bonus.

	    Args:
	        text: The vulnerability description.

	    Returns:
	        A float score; higher means more likely to yield entity spans.
	    """
	    score = len(text) / 100.0
	    for pattern, bonus in _RICHNESS_BONUSES:
	        if pattern.search(text):
	            score += bonus
	    return score

	print("Loading and scoring...")
	by_year = defaultdict(list)
	# Explicit UTF-8 plus a context manager: the handle is closed
	# deterministically and decoding does not depend on the host locale.
	with open(INPUT, encoding='utf-8') as src:
	    for line in src:
	        if not line.strip():
	            continue  # tolerate blank lines (e.g. a trailing newline)
	        rec = json.loads(line)
	        text = rec['text']
	        if len(text) <= 100:
	            continue  # too short to be worth annotating
	        year = rec['cve_id'].split('-')[1]
	        # String comparison; relies on the year field being 4 digits.
	        if year < '2020':
	            continue
	        by_year[year].append((richness_score(text), rec))

	# Split TARGET evenly across the years present; the first `remainder`
	# years get one extra record so the quotas add up to TARGET.
	TARGET = 3000
	years = sorted(by_year)
	if not years:
	    # Without this guard the division below fails with ZeroDivisionError.
	    raise SystemExit(f"No usable descriptions found in {INPUT}")
	per_year = TARGET // len(years)
	remainder = TARGET - per_year * len(years)

	sample = []
	for i, year in enumerate(years):
	    items = by_year[year]
	    items.sort(key=lambda x: -x[0])
	    n = per_year + (1 if i < remainder else 0)
	    # Take the top 3*n by score, then randomly pick n of those for
	    # diversity. A year with fewer than n records contributes all of them,
	    # so the final sample can be smaller than TARGET.
	    pool = items[:n * 3]
	    chosen = random.sample(pool, min(n, len(pool)))
	    sample.extend(rec for _, rec in chosen)

	random.shuffle(sample)
	print(f"Sampled {len(sample)} descriptions across {len(years)} years")

	with open(SAMPLE_OUT, 'w', encoding='utf-8') as f:
	    for rec in sample:
	        f.write(json.dumps(rec) + '\n')
	print(f"Wrote {SAMPLE_OUT}")

	# ── STEP 2: Annotate ──

	# Vulnerability type patterns (case-insensitive matching, find exact text).
	# annotate() inserts each entry verbatim (no re.escape) between \b anchors
	# and searches with re.IGNORECASE, so entries are regex fragments and the
	# annotated span keeps the casing found in the text. Entries are tried
	# longest-first, so e.g. 'stack-based buffer overflow' claims its span
	# before the shorter 'buffer overflow' can.
	VULN_PATTERNS = [
	r'remote code execution',
	r'code execution',
	r'SQL injection',
	r'cross-site scripting',
	r'cross-site request forgery',
	r'buffer overflow',
	r'heap overflow',
	r'stack overflow',
	r'stack-based buffer overflow',
	r'heap-based buffer overflow',
	r'integer overflow',
	r'integer underflow',
	r'use after free',
	r'use-after-free',
	r'double free',
	r'null pointer dereference',
	r'NULL pointer dereference',
	r'out of bounds read',
	r'out-of-bounds read',
	r'out of bounds write',
	r'out-of-bounds write',
	r'out of bounds access',
	r'out-of-bounds access',
	r'out of bounds memory',
	r'privilege escalation',
	r'escalation of privilege',
	r'denial of service',
	r'denial-of-service',
	r'information disclosure',
	r'information leak',
	r'memory leak',
	r'memory corruption',
	r'directory traversal',
	r'path traversal',
	r'command injection',
	r'OS command injection',
	r'XML external entity',
	r'XXE',
	r'SSRF',
	r'server-side request forgery',
	r'open redirect',
	r'authentication bypass',
	r'authorization bypass',
	r'improper authentication',
	r'improper authorization',
	r'improper access control',
	r'improper input validation',
	r'improper neutralization',
	r'race condition',
	r'time-of-check time-of-use',
	r'TOCTOU',
	r'type confusion',
	r'deserialization',
	r'insecure deserialization',
	r'prototype pollution',
	r'reflected XSS',
	r'stored XSS',
	r'DOM-based XSS',
	r'arbitrary file upload',
	r'arbitrary file read',
	r'arbitrary file write',
	r'arbitrary file deletion',
	r'local file inclusion',
	r'remote file inclusion',
	r'server-side template injection',
	r'SSTI',
	r'LDAP injection',
	r'XPath injection',
	r'CRLF injection',
	r'header injection',
	r'log injection',
	r'format string',
	r'symlink',
	r'hardcoded credentials',
	r'hard-coded credentials',
	r'hardcoded password',
	r'hard-coded password',
	r'cleartext transmission',
	r'cleartext storage',
	r'uncontrolled resource consumption',
	r'infinite loop',
	r'resource exhaustion',
	]

	# Organization names. annotate() re.escape()s each entry and matches it
	# case-sensitively between \b anchors, longest entry first. The SYSTEM pass
	# runs before this one, so a name that also appears in SYSTEMS (e.g.
	# 'WordPress', 'OpenSSL') has its span claimed as SYSTEM, not ORGANIZATION.
	ORGS = [
	'Microsoft', 'Google', 'Apple', 'Adobe', 'Cisco', 'Oracle', 'IBM',
	'Apache', 'Mozilla', 'Samsung', 'Intel', 'AMD', 'Qualcomm', 'NVIDIA',
	'Red Hat', 'Canonical', 'Debian', 'Ubuntu', 'Fedora', 'SUSE',
	'VMware', 'Broadcom', 'Juniper', 'Fortinet', 'Palo Alto Networks',
	'Check Point', 'F5', 'Citrix', 'SAP', 'Siemens', 'Schneider Electric',
	'Rockwell Automation', 'ABB', 'Honeywell', 'Huawei', 'ZTE',
	'D-Link', 'TP-Link', 'Netgear', 'ASUS', 'Zyxel', 'MikroTik',
	'WordPress', 'Drupal', 'Joomla', 'GitLab', 'GitHub', 'Atlassian',
	'Jenkins', 'Docker', 'Kubernetes', 'HashiCorp', 'Elastic',
	'Trend Micro', 'Kaspersky', 'McAfee', 'Symantec', 'Sophos',
	'CrowdStrike', 'SentinelOne', 'Splunk', 'Rapid7',
	'Dell', 'HP', 'Lenovo', 'Xerox', 'Epson', 'Canon',
	'Zoom', 'Slack', 'Salesforce', 'ServiceNow', 'Ivanti',
	'SolarWinds', 'ManageEngine', 'Progress', 'Veeam',
	'Moodle', 'MediaWiki', 'phpMyAdmin', 'Roundcube',
	'OpenSSL', 'OpenSSH', 'GnuPG', 'cURL',
	'Facebook', 'Meta', 'Amazon', 'AWS', 'Cloudflare',
	'MITRE', 'NIST', 'CISA',
	'Tenda', 'TOTOLINK', 'LB-LINK', 'Ruijie', 'H3C',
	'Aruba', 'Ruckus', 'Mitel', 'Avaya',
	'Moxa', 'Phoenix Contact', 'WAGO', 'Beckhoff',
	'Synology', 'QNAP', 'Western Digital', 'Buffalo',
	'Grafana', 'Prometheus', 'InfluxDB',
	'JetBrains', 'Eclipse', 'Spring',
	'Node.js', 'npm', 'PyPI',
	]

	# System/product names - matched as whole words. annotate() re.escape()s
	# each entry and matches it case-sensitively between \b anchors (hence both
	# 'nginx' and 'NGINX', 'cURL' and 'curl'). Entries are tried longest-first,
	# so 'Windows Server' claims its span before plain 'Windows' can.
	SYSTEMS = [
	'Windows', 'Linux', 'macOS', 'Android', 'iOS', 'ChromeOS',
	'Windows Server', 'Windows 10', 'Windows 11',
	'Internet Explorer', 'Microsoft Edge', 'Google Chrome', 'Mozilla Firefox', 'Safari',
	'Apache HTTP Server', 'Apache Tomcat', 'Apache Struts', 'Apache Kafka',
	'Apache ActiveMQ', 'Apache Camel', 'Apache Flink', 'Apache Spark',
	'Apache Airflow', 'Apache Superset', 'Apache Solr', 'Apache Dubbo',
	'Apache NiFi', 'Apache OFBiz', 'Apache RocketMQ', 'Apache Pulsar',
	'Apache Log4j', 'Apache Commons',
	'nginx', 'NGINX', 'IIS',
	'MySQL', 'PostgreSQL', 'MariaDB', 'MongoDB', 'Redis', 'SQLite',
	'Microsoft SQL Server', 'Oracle Database',
	'Microsoft Exchange', 'Microsoft Office', 'Microsoft Teams',
	'Microsoft SharePoint', 'Microsoft Outlook', 'Microsoft Word',
	'Visual Studio Code', 'Visual Studio',
	'VMware ESXi', 'VMware vCenter', 'VMware Workstation',
	'Docker Desktop', 'Kubernetes',
	'OpenSSL', 'OpenSSH', 'OpenVPN', 'WireGuard',
	'Samba', 'BIND', 'ISC BIND',
	'PHP', 'Python', 'Java', 'Ruby',
	'WordPress', 'Drupal', 'Joomla', 'Magento', 'PrestaShop',
	'GitLab', 'Grafana', 'Jenkins', 'Ansible', 'Terraform',
	'Chromium', 'WebKit', 'V8',
	'QEMU', 'VirtualBox', 'Xen', 'KVM',
	'systemd', 'sudo', 'polkit', 'glibc', 'libxml2', 'libcurl',
	'FFmpeg', 'ImageMagick', 'GStreamer', 'Wireshark',
	'Fortinet FortiOS', 'FortiOS', 'FortiGate', 'FortiProxy',
	'FortiAnalyzer', 'FortiManager', 'FortiWeb', 'FortiClient',
	'Palo Alto PAN-OS', 'PAN-OS', 'GlobalProtect',
	'Cisco IOS', 'Cisco IOS XE', 'Cisco NX-OS', 'Cisco ASA',
	'Cisco Firepower', 'Cisco Webex', 'Cisco SD-WAN',
	'SonicWall', 'Sophos XG', 'Sophos UTM',
	'Ivanti Connect Secure', 'Ivanti Policy Secure',
	'Citrix ADC', 'Citrix Gateway', 'Citrix NetScaler',
	'SAP NetWeaver', 'SAP HANA', 'SAP BusinessObjects',
	'Splunk Enterprise', 'Splunk Cloud',
	'Elasticsearch', 'Kibana', 'Logstash',
	'Moodle', 'Canvas LMS', 'Blackboard',
	'Zimbra', 'Roundcube', 'Dovecot', 'Postfix', 'Exim', 'Sendmail',
	'cURL', 'curl', 'wget',
	'Linux kernel', 'FreeBSD', 'NetBSD', 'OpenBSD',
	'Xen hypervisor',
	]

	def find_all_nonoverlapping(text, pattern, flags=0):
	    """Return ``(start, end, matched_text)`` for every match of *pattern*.

	    Matches are produced by ``re.finditer``, so they are reported left to
	    right and never overlap one another.
	    """
	    return [
	        (hit.start(), hit.end(), hit.group())
	        for hit in re.finditer(pattern, text, flags)
	    ]

	def annotate(rec):
	    """Tag entity spans in one record using regexes and fixed name lists.

	    Passes run in priority order and a span is accepted only if it does not
	    overlap any span accepted earlier:

	      CVE_ID, URL, EMAIL, IP_ADDRESS, DOMAIN, HASH, FILEPATH,
	      SYSTEM, ORGANIZATION, VULNERABILITY

	    URL and EMAIL run before IP_ADDRESS and DOMAIN so that a host inside a
	    URL or e-mail address does not claim the span and suppress the
	    enclosing entity.

	    Args:
	        rec: Mapping with at least the keys ``'text'`` (the description)
	            and ``'cve_id'``.

	    Returns:
	        ``{"text": ..., "spans": {"LABEL: entity": [[start, end], ...]},
	        "info": {"source": "nvd_v2", "cve_id": ...}}`` where each
	        ``text[start:end]`` equals the entity string in its key.
	    """
	    text = rec['text']
	    spans = defaultdict(list)
	    used_ranges = []  # (start, end) of every accepted span

	    def overlaps(s, e):
	        # Half-open interval intersection test against accepted spans.
	        return any(s < ue and e > us for us, ue in used_ranges)

	    def add_span(label, start, end, entity_text):
	        if overlaps(start, end):
	            return
	        spans[f"{label}: {entity_text}"].append([start, end])
	        used_ranges.append((start, end))

	    def add_matches(label, pattern, flags=0, keep=None):
	        # Record every match of `pattern`; `keep` optionally vetoes a match
	        # based on its text.
	        for m in re.finditer(pattern, text, flags):
	            if keep is None or keep(m.group()):
	                add_span(label, m.start(), m.end(), m.group())

	    # 1. CVE_ID (regex, exact)
	    add_matches('CVE_ID', r'CVE-\d{4}-\d{4,}')

	    # 2. URL (before IP_ADDRESS/DOMAIN so the host part stays inside the URL)
	    add_matches('URL', r'https?://[^\s)<>"]+')

	    # 3. EMAIL (before DOMAIN so "user@example.com" is not split)
	    add_matches('EMAIL', r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

	    # 4. IP_ADDRESS (dotted quad; every octet must fit in a byte)
	    add_matches(
	        'IP_ADDRESS',
	        r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
	        keep=lambda ip: all(int(part) <= 255 for part in ip.split('.')),
	    )

	    # 5. DOMAIN -- alternation must be a bare `|`; `\|` is a literal pipe.
	    add_matches(
	        'DOMAIN',
	        r'\b(?:[a-zA-Z0-9-]+\.)+'
	        r'(?:com|org|net|io|gov|edu|mil|co|info|biz|dev|app|cloud)\b',
	    )

	    # 6. HASH (SHA-256, SHA-1, MD5 -- by hex length, longest first)
	    for hex_len in (64, 40, 32):
	        add_matches('HASH', r'\b[a-fA-F0-9]{%d}\b' % hex_len)

	    # 7. FILEPATH
	    # Unix-style paths with at least two components
	    add_matches('FILEPATH', r'(?:/[a-zA-Z0-9_.@-]+){2,}(?:\.[a-zA-Z0-9]+)?')
	    # Windows-style paths
	    add_matches('FILEPATH', r'[A-Z]:\\(?:[a-zA-Z0-9_.@ -]+\\)*[a-zA-Z0-9_.@ -]+')
	    # Bare filenames with common source/config extensions. A match that is
	    # exactly a known product or organisation name (e.g. "Node.js") is left
	    # for the name-list passes below.
	    known_names = frozenset(SYSTEMS) | frozenset(ORGS)
	    add_matches(
	        'FILEPATH',
	        r'\b[a-zA-Z_][a-zA-Z0-9_]*\.'
	        r'(?:php|py|js|java|c|cpp|h|rb|go|rs|pl|sh|bat|ps1|xml|json|yaml|yml'
	        r'|conf|cfg|ini|log|sql|html|jsp|asp|aspx|cgi)\b',
	        keep=lambda name: name not in known_names,
	    )

	    # 8. SYSTEM (longer names first: "Apache HTTP Server" before "Apache")
	    for sys_name in sorted(SYSTEMS, key=len, reverse=True):
	        add_matches('SYSTEM', r'\b' + re.escape(sys_name) + r'\b')

	    # 9. ORGANIZATION (longer first; spans already taken by SYSTEM are kept)
	    for org in sorted(ORGS, key=len, reverse=True):
	        add_matches('ORGANIZATION', r'\b' + re.escape(org) + r'\b')

	    # 10. VULNERABILITY (case-insensitive; the key keeps the text's casing).
	    # Entries are regex fragments and are deliberately not escaped.
	    for vp in sorted(VULN_PATTERNS, key=len, reverse=True):
	        add_matches('VULNERABILITY', r'\b' + vp + r'\b', re.IGNORECASE)

	    return {
	        "text": text,
	        "spans": dict(spans),
	        "info": {"source": "nvd_v2", "cve_id": rec["cve_id"]}
	    }

	# ── Process and write ──
	print("Annotating...")
	# Records are serialised with ensure_ascii=False, so the file must be
	# opened as UTF-8 explicitly; the locale default may not be able to encode
	# non-ASCII description text.
	with open(OUTPUT, 'w', encoding='utf-8') as f:
	    for i, rec in enumerate(sample):
	        result = annotate(rec)
	        f.write(json.dumps(result, ensure_ascii=False) + '\n')
	        if (i + 1) % 500 == 0:
	            print(f" {i+1}/{len(sample)}")

	print(f"Wrote {len(sample)} annotated records to {OUTPUT}")

	# ── Verify offsets ──
	# Re-read the file just written and check that every [start, end] pair
	# slices out exactly the entity text stored in its "LABEL: entity" key.
	# Per-label totals are gathered in the same pass.
	print("\nVerifying offsets...")
	errors = 0
	total_spans = 0
	label_counts = defaultdict(int)
	with open(OUTPUT, encoding='utf-8') as f:
	    for i, line in enumerate(f):
	        rec = json.loads(line)
	        for key, offsets in rec["spans"].items():
	            # Split on the first ": " only; entity text may contain ": ".
	            label, entity = key.split(": ", 1)
	            label_counts[label] += len(offsets)
	            for start, end in offsets:
	                total_spans += 1
	                if rec["text"][start:end] != entity:
	                    errors += 1
	                    if errors <= 5:  # print only the first few mismatches
	                        print(f" ERROR line {i}: expected '{entity}', got '{rec['text'][start:end]}'")

	print(f"Total spans: {total_spans}, Errors: {errors}")

	# Stats
	print("\nLabel distribution:")
	for label, count in sorted(label_counts.items(), key=lambda x: -x[1]):
	    print(f" {label}: {count}")