| |
| """Sample 3K NVD descriptions and annotate with cybersecurity entity spans.""" |
|
|
| import json, re, random, os |
| from collections import defaultdict |
|
|
# Raw NVD description dump, one JSON object per line.
INPUT = "/home/ubuntu/alkyline/data/raw/nvd/nvd_descriptions.jsonl"
# Where the stratified sample of raw records is written.
SAMPLE_OUT = "/home/ubuntu/alkyline/data/raw/nvd/nvd_sample_3k.jsonl"
# Where the span-annotated version of that sample is written.
OUTPUT = "/home/ubuntu/alkyline/data/processed/llm_annotated_nvd_v2.jsonl"

# Seed the module-level RNG so random.sample / random.shuffle below repeat
# the same draw on every run.
random.seed(42)
|
|
| |
|
|
def richness_score(text):
    """Return a heuristic score for how much annotatable content *text* has.

    The base score is the length of *text* divided by 100.  A fixed bonus is
    then added, in the order listed, for each cue found anywhere in the text:
    a CVE identifier (+3), a three-part dotted number such as a version (+2),
    an "allows/enables remote/local" phrase (+2), a common vulnerability
    keyword (+2), and a path-like or source-file-like token (+2).
    """
    # (pattern, regex flags, bonus) -- evaluated top to bottom.
    cues = (
        (r'CVE-\d{4}-\d{4,}', 0, 3),
        (r'\d+\.\d+\.\d+', 0, 2),
        (r'(?:allows?|enables?)\s+(?:remote|local)', re.I, 2),
        (r'(?:SQL injection|XSS|buffer overflow|RCE|CSRF)', re.I, 2),
        (r'(?:/[a-z]+/[a-z]|\.php|\.py|\.js|\.c\b)', re.I, 2),
    )
    total = len(text) / 100.0
    for pattern, flags, bonus in cues:
        if re.search(pattern, text, flags) is not None:
            total += bonus
    return total
|
|
# Load every NVD description, keep the long, recent ones, and bucket them by
# CVE year together with their richness score.
print("Loading and scoring...")
by_year = defaultdict(list)
# The context manager closes the handle deterministically; JSON Lines is
# UTF-8 by definition, so decode explicitly instead of using the locale default.
with open(INPUT, encoding="utf-8") as src:
    for line in src:
        if not line.strip():
            # A blank or trailing empty line is not a record; skip it rather
            # than let json.loads raise.
            continue
        rec = json.loads(line)
        text = rec['text']
        # Descriptions of 100 characters or fewer are dropped.
        if len(text) <= 100:
            continue
        # cve_id looks like "CVE-<year>-<seq>"; the year stays a string, so
        # the comparison below is lexicographic (fine for 4-digit years).
        year = rec['cve_id'].split('-')[1]
        if year < '2020':
            continue
        score = richness_score(text)
        by_year[year].append((score, rec))
|
|
| |
# Stratified sampling: split TARGET evenly across the years present, giving
# the first `remainder` years one extra slot so the quotas sum to TARGET.
TARGET = 3000
years = sorted(by_year.keys())
if not years:
    # Without this guard the quota computation fails with an opaque
    # ZeroDivisionError when no record survived the length/year filters.
    raise SystemExit(
        "No eligible NVD descriptions found (need > 100 chars and year >= 2020)"
    )
per_year, remainder = divmod(TARGET, len(years))

sample = []
for i, year in enumerate(years):
    items = by_year[year]
    # Highest richness score first (in-place, stable for equal scores).
    items.sort(key=lambda x: -x[0])
    n = per_year + (1 if i < remainder else 0)
    # Draw randomly from the top 3n candidates rather than taking the top n
    # outright.  A year with fewer than n candidates contributes all of them,
    # so the final sample can be smaller than TARGET.
    pool = items[:n * 3]
    chosen = random.sample(pool, min(n, len(pool)))
    sample.extend(rec for _, rec in chosen)
|
|
# Mix the per-year groups together so the output is not ordered by year.
random.shuffle(sample)
print(f"Sampled {len(sample)} descriptions across {len(years)} years")

# Persist the raw sampled records, one JSON object per line.
with open(SAMPLE_OUT, 'w') as sample_file:
    sample_file.writelines(json.dumps(rec) + '\n' for rec in sample)
print(f"Wrote {SAMPLE_OUT}")
|
|
| |
|
|
| |
# Vulnerability-class phrases.  Each entry is used as a regular-expression
# fragment (wrapped in \b...\b and matched case-insensitively by annotate),
# so entries must stay regex-safe.  Order is significant only as a tie-break
# between entries of equal length.
VULN_PATTERNS = [
    # --- code execution / classic web flaws ---
    r'remote code execution',
    r'code execution',
    r'SQL injection',
    r'cross-site scripting',
    r'cross-site request forgery',
    # --- memory safety ---
    r'buffer overflow',
    r'heap overflow',
    r'stack overflow',
    r'stack-based buffer overflow',
    r'heap-based buffer overflow',
    r'integer overflow',
    r'integer underflow',
    r'use after free',
    r'use-after-free',
    r'double free',
    r'null pointer dereference',
    r'NULL pointer dereference',
    r'out of bounds read',
    r'out-of-bounds read',
    r'out of bounds write',
    r'out-of-bounds write',
    r'out of bounds access',
    r'out-of-bounds access',
    r'out of bounds memory',
    # --- impact classes ---
    r'privilege escalation',
    r'escalation of privilege',
    r'denial of service',
    r'denial-of-service',
    r'information disclosure',
    r'information leak',
    r'memory leak',
    r'memory corruption',
    # --- traversal / injection / request forgery ---
    r'directory traversal',
    r'path traversal',
    r'command injection',
    r'OS command injection',
    r'XML external entity',
    r'XXE',
    r'SSRF',
    r'server-side request forgery',
    r'open redirect',
    # --- authentication / authorization / validation ---
    r'authentication bypass',
    r'authorization bypass',
    r'improper authentication',
    r'improper authorization',
    r'improper access control',
    r'improper input validation',
    r'improper neutralization',
    # --- concurrency / typing / serialization ---
    r'race condition',
    r'time-of-check time-of-use',
    r'TOCTOU',
    r'type confusion',
    r'deserialization',
    r'insecure deserialization',
    r'prototype pollution',
    # --- XSS variants ---
    r'reflected XSS',
    r'stored XSS',
    r'DOM-based XSS',
    # --- file handling ---
    r'arbitrary file upload',
    r'arbitrary file read',
    r'arbitrary file write',
    r'arbitrary file deletion',
    r'local file inclusion',
    r'remote file inclusion',
    # --- further injection classes ---
    r'server-side template injection',
    r'SSTI',
    r'LDAP injection',
    r'XPath injection',
    r'CRLF injection',
    r'header injection',
    r'log injection',
    r'format string',
    r'symlink',
    # --- secrets / cleartext ---
    r'hardcoded credentials',
    r'hard-coded credentials',
    r'hardcoded password',
    r'hard-coded password',
    r'cleartext transmission',
    r'cleartext storage',
    # --- resource consumption ---
    r'uncontrolled resource consumption',
    r'infinite loop',
    r'resource exhaustion',
]
|
|
| |
# Gazetteer of vendor / project / agency names labelled ORGANIZATION.
# Matching in annotate is case-sensitive and whole-word; order only matters
# as a tie-break between names of equal length.
ORGS = [
    # large software / hardware vendors
    'Microsoft', 'Google', 'Apple', 'Adobe', 'Cisco', 'Oracle', 'IBM',
    'Apache', 'Mozilla', 'Samsung', 'Intel', 'AMD', 'Qualcomm', 'NVIDIA',
    # Linux distributors
    'Red Hat', 'Canonical', 'Debian', 'Ubuntu', 'Fedora', 'SUSE',
    # networking / security appliances / industrial
    'VMware', 'Broadcom', 'Juniper', 'Fortinet', 'Palo Alto Networks',
    'Check Point', 'F5', 'Citrix', 'SAP', 'Siemens', 'Schneider Electric',
    'Rockwell Automation', 'ABB', 'Honeywell', 'Huawei', 'ZTE',
    'D-Link', 'TP-Link', 'Netgear', 'ASUS', 'Zyxel', 'MikroTik',
    # web platforms / dev tooling
    'WordPress', 'Drupal', 'Joomla', 'GitLab', 'GitHub', 'Atlassian',
    'Jenkins', 'Docker', 'Kubernetes', 'HashiCorp', 'Elastic',
    # security vendors
    'Trend Micro', 'Kaspersky', 'McAfee', 'Symantec', 'Sophos',
    'CrowdStrike', 'SentinelOne', 'Splunk', 'Rapid7',
    # device makers
    'Dell', 'HP', 'Lenovo', 'Xerox', 'Epson', 'Canon',
    # SaaS / IT management
    'Zoom', 'Slack', 'Salesforce', 'ServiceNow', 'Ivanti',
    'SolarWinds', 'ManageEngine', 'Progress', 'Veeam',
    # open-source applications and libraries
    'Moodle', 'MediaWiki', 'phpMyAdmin', 'Roundcube',
    'OpenSSL', 'OpenSSH', 'GnuPG', 'cURL',
    # cloud / internet companies
    'Facebook', 'Meta', 'Amazon', 'AWS', 'Cloudflare',
    # standards and government bodies
    'MITRE', 'NIST', 'CISA',
    # router / telephony / OT / storage vendors
    'Tenda', 'TOTOLINK', 'LB-LINK', 'Ruijie', 'H3C',
    'Aruba', 'Ruckus', 'Mitel', 'Avaya',
    'Moxa', 'Phoenix Contact', 'WAGO', 'Beckhoff',
    'Synology', 'QNAP', 'Western Digital', 'Buffalo',
    # observability / developer ecosystems
    'Grafana', 'Prometheus', 'InfluxDB',
    'JetBrains', 'Eclipse', 'Spring',
    'Node.js', 'npm', 'PyPI',
]
|
|
| |
# Gazetteer of product / software names labelled SYSTEM.  annotate tries
# longer names first, so e.g. 'Windows Server' wins over 'Windows'; order
# here only matters as a tie-break between names of equal length.
SYSTEMS = [
    # operating systems
    'Windows', 'Linux', 'macOS', 'Android', 'iOS', 'ChromeOS',
    'Windows Server', 'Windows 10', 'Windows 11',
    # browsers
    'Internet Explorer', 'Microsoft Edge', 'Google Chrome', 'Mozilla Firefox', 'Safari',
    # Apache projects
    'Apache HTTP Server', 'Apache Tomcat', 'Apache Struts', 'Apache Kafka',
    'Apache ActiveMQ', 'Apache Camel', 'Apache Flink', 'Apache Spark',
    'Apache Airflow', 'Apache Superset', 'Apache Solr', 'Apache Dubbo',
    'Apache NiFi', 'Apache OFBiz', 'Apache RocketMQ', 'Apache Pulsar',
    'Apache Log4j', 'Apache Commons',
    # web servers
    'nginx', 'NGINX', 'IIS',
    # databases
    'MySQL', 'PostgreSQL', 'MariaDB', 'MongoDB', 'Redis', 'SQLite',
    'Microsoft SQL Server', 'Oracle Database',
    # Microsoft products
    'Microsoft Exchange', 'Microsoft Office', 'Microsoft Teams',
    'Microsoft SharePoint', 'Microsoft Outlook', 'Microsoft Word',
    'Visual Studio Code', 'Visual Studio',
    # virtualization / containers
    'VMware ESXi', 'VMware vCenter', 'VMware Workstation',
    'Docker Desktop', 'Kubernetes',
    # crypto / VPN / network services
    'OpenSSL', 'OpenSSH', 'OpenVPN', 'WireGuard',
    'Samba', 'BIND', 'ISC BIND',
    # language runtimes
    'PHP', 'Python', 'Java', 'Ruby',
    # CMS / e-commerce / DevOps
    'WordPress', 'Drupal', 'Joomla', 'Magento', 'PrestaShop',
    'GitLab', 'Grafana', 'Jenkins', 'Ansible', 'Terraform',
    # browser engines
    'Chromium', 'WebKit', 'V8',
    # hypervisors
    'QEMU', 'VirtualBox', 'Xen', 'KVM',
    # core libraries / system tools / media
    'systemd', 'sudo', 'polkit', 'glibc', 'libxml2', 'libcurl',
    'FFmpeg', 'ImageMagick', 'GStreamer', 'Wireshark',
    # security / network appliances
    'Fortinet FortiOS', 'FortiOS', 'FortiGate', 'FortiProxy',
    'FortiAnalyzer', 'FortiManager', 'FortiWeb', 'FortiClient',
    'Palo Alto PAN-OS', 'PAN-OS', 'GlobalProtect',
    'Cisco IOS', 'Cisco IOS XE', 'Cisco NX-OS', 'Cisco ASA',
    'Cisco Firepower', 'Cisco Webex', 'Cisco SD-WAN',
    'SonicWall', 'Sophos XG', 'Sophos UTM',
    'Ivanti Connect Secure', 'Ivanti Policy Secure',
    'Citrix ADC', 'Citrix Gateway', 'Citrix NetScaler',
    # enterprise / analytics
    'SAP NetWeaver', 'SAP HANA', 'SAP BusinessObjects',
    'Splunk Enterprise', 'Splunk Cloud',
    'Elasticsearch', 'Kibana', 'Logstash',
    # learning platforms / mail
    'Moodle', 'Canvas LMS', 'Blackboard',
    'Zimbra', 'Roundcube', 'Dovecot', 'Postfix', 'Exim', 'Sendmail',
    # transfer tools
    'cURL', 'curl', 'wget',
    # kernels / BSDs
    'Linux kernel', 'FreeBSD', 'NetBSD', 'OpenBSD',
    'Xen hypervisor',
]
|
|
def find_all_nonoverlapping(text, pattern, flags=0):
    """Return ``(start, end, matched_text)`` for each match of *pattern*.

    Matches come from ``re.finditer``, which scans left to right and never
    yields overlapping matches; ``end`` is exclusive.
    """
    return [
        (hit.start(), hit.end(), hit.group())
        for hit in re.finditer(pattern, text, flags)
    ]
|
|
def annotate(rec):
    """Tag entity spans in one NVD record using regexes and gazetteers.

    Extractors run in a fixed priority order.  A character range belongs to
    the first extractor that claims it; any later match that overlaps an
    already-claimed range is dropped.  Priority order: CVE_ID, IP_ADDRESS,
    URL, EMAIL, DOMAIN, HASH, FILEPATH, SYSTEM, ORGANIZATION, VULNERABILITY.

    Args:
        rec: Mapping with at least ``'text'`` (the description) and
            ``'cve_id'``.

    Returns:
        A dict with three keys: ``"text"`` (the input text), ``"spans"``
        (maps ``"LABEL: surface text"`` to a list of ``[start, end]``
        offsets into the text, end exclusive) and ``"info"`` (source tag
        and CVE id).

    Raises:
        KeyError: If *rec* lacks ``'text'`` or ``'cve_id'``.
    """
    text = rec['text']
    spans = defaultdict(list)
    used_ranges = []

    def overlaps(s, e):
        # Half-open interval intersection against every claimed range.
        return any(s < ue and e > us for us, ue in used_ranges)

    def add_span(label, start, end):
        # Claim [start, end) for `label` unless it is empty or already taken.
        # The key is built from the slice itself, so text[start:end] always
        # equals the entity text stored in the key.
        if start >= end or overlaps(start, end):
            return
        spans[f"{label}: {text[start:end]}"].append([start, end])
        used_ranges.append((start, end))

    def add_matches(label, pattern, flags=0):
        # Claim every whole match of `pattern` for `label`.
        for m in re.finditer(pattern, text, flags):
            add_span(label, m.start(), m.end())

    # CVE identifiers.
    add_matches('CVE_ID', r'CVE-\d{4}-\d{4,}')

    # IPv4 addresses: four dotted groups whose octets are all <= 255, so
    # dotted numbers such as "999.1.1.1" are not labelled as addresses.
    for m in re.finditer(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', text):
        if all(int(octet) <= 255 for octet in m.group().split('.')):
            add_span('IP_ADDRESS', m.start(), m.end())

    # URLs.  The character class is greedy, so a URL that ends a sentence
    # would swallow the trailing punctuation; trim it off the span.
    for m in re.finditer(r'https?://[^\s)<>"]+', text):
        url = m.group().rstrip('.,;:!?\'')
        add_span('URL', m.start(), m.start() + len(url))

    # E-mail addresses must be claimed BEFORE bare domains: otherwise the
    # "example.com" part is taken as a DOMAIN and the overlapping EMAIL
    # match is discarded.
    add_matches('EMAIL', r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

    # Bare domain names under a fixed set of TLDs.
    add_matches(
        'DOMAIN',
        r'\b(?:[a-zA-Z0-9-]+\.)+(?:com|org|net|io|gov|edu|mil|co|info|biz|dev|app|cloud)\b',
    )

    # Hex digests of 64, 40 and 32 characters, longest first.
    for width in (64, 40, 32):
        add_matches('HASH', r'\b[a-fA-F0-9]{%d}\b' % width)

    # File paths: Unix-style paths with at least two components, Windows
    # drive paths, then bare file names with a known extension.
    add_matches('FILEPATH', r'(?:/[a-zA-Z0-9_.@-]+){2,}(?:\.[a-zA-Z0-9]+)?')
    add_matches('FILEPATH', r'[A-Z]:\\(?:[a-zA-Z0-9_.@ -]+\\)*[a-zA-Z0-9_.@ -]+')
    add_matches(
        'FILEPATH',
        r'\b[a-zA-Z_][a-zA-Z0-9_]*\.(?:php|py|js|java|c|cpp|h|rb|go|rs|pl|sh|bat|ps1|xml|json|yaml|yml|conf|cfg|ini|log|sql|html|jsp|asp|aspx|cgi)\b',
    )

    # Gazetteers, longest name first so "Windows Server" beats "Windows".
    # Names are escaped and matched case-sensitively as whole words.
    for sys_name in sorted(SYSTEMS, key=len, reverse=True):
        add_matches('SYSTEM', r'\b' + re.escape(sys_name) + r'\b')

    for org in sorted(ORGS, key=len, reverse=True):
        add_matches('ORGANIZATION', r'\b' + re.escape(org) + r'\b')

    # Vulnerability phrases are regex fragments (not escaped) and are
    # matched case-insensitively; the key keeps the casing found in the text.
    for vp in sorted(VULN_PATTERNS, key=len, reverse=True):
        add_matches('VULNERABILITY', r'\b' + vp + r'\b', re.IGNORECASE)

    return {
        "text": text,
        "spans": dict(spans),
        "info": {"source": "nvd_v2", "cve_id": rec["cve_id"]}
    }
|
|
| |
# Annotate every sampled record and write one JSON object per line.
# ensure_ascii=False emits non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 rather than left to the locale default.
print("Annotating...")
with open(OUTPUT, 'w', encoding='utf-8') as f:
    for i, rec in enumerate(sample):
        result = annotate(rec)
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
        if (i + 1) % 500 == 0:
            print(f" {i+1}/{len(sample)}")

print(f"Wrote {len(sample)} annotated records to {OUTPUT}")

# Re-read the file once to (a) check that every stored offset slices back to
# the entity text recorded in its key and (b) tally spans per label.  The
# reader uses the same UTF-8 encoding as the writer above.
print("\nVerifying offsets...")
errors = 0
total_spans = 0
label_counts = defaultdict(int)
with open(OUTPUT, encoding='utf-8') as f:
    for i, line in enumerate(f):
        rec = json.loads(line)
        for key, offsets in rec["spans"].items():
            # Keys have the form "LABEL: entity text"; split on the first
            # ": " only, since the entity itself may contain that sequence.
            label, entity = key.split(": ", 1)
            label_counts[label] += len(offsets)
            for start, end in offsets:
                total_spans += 1
                if rec["text"][start:end] != entity:
                    errors += 1
                    # Only the first five mismatches are printed.
                    if errors <= 5:
                        print(f" ERROR line {i}: expected '{entity}', got '{rec['text'][start:end]}'")

print(f"Total spans: {total_spans}, Errors: {errors}")

print("\nLabel distribution:")
for label, count in sorted(label_counts.items(), key=lambda x: -x[1]):
    print(f" {label}: {count}")
|
|