| |
| """ |
| Programmatic NER annotator for APT/threat-intel descriptions. |
| Processes apt_descriptions.jsonl -> llm_annotated_apt.jsonl |
| """ |
|
|
import functools
import json
import re
import sys
from collections import defaultdict
from pathlib import Path
|
|
# JSONL file read by main(): one JSON object per line.
# NOTE(review): both paths are absolute and machine-specific — confirm they
# are valid on the host where this script runs.
INPUT = Path("/home/ubuntu/alkyline/data/raw/apt_reports/apt_descriptions.jsonl")
# JSONL file written by main(): one annotated record per input entry.
OUTPUT = Path("/home/ubuntu/alkyline/data/processed/llm_annotated_apt.jsonl")
|
|
| |
|
|
| |
| |
|
|
# Gazetteer of tool / utility names.
#
# annotate_entry() labels every case-sensitive, word-delimited occurrence of
# an entry that is at least three characters long as TOOL, and uses membership
# in this set to pick TOOL instead of MALWARE for an entry's own name.
# build_global_name_sets() seeds its tool-name set from it.
#
# Matching is case-sensitive, which is why several casings of the same tool
# are listed (e.g. "Ngrok"/"ngrok").  Two-character entries such as "nc" are
# never matched by the gazetteer pass because of the length minimum.
# NOTE(review): "PlinkPlink" looks like an accidental duplication of "Plink"
# — confirm before removing.
KNOWN_TOOLS = {
    "Mimikatz", "PsExec", "Cobalt Strike", "Metasploit", "Nmap", "BloodHound",
    "AdFind", "Impacket", "PowerSploit", "Rubeus", "SharpHound", "LaZagne",
    "CrackMapExec", "Responder", "Empire", "Covenant", "Sliver", "Brute Ratel",
    "Havoc", "Mythic", "PoshC2", "SilentTrinity", "Meterpreter", "Netcat", "nc",
    "WinRM", "PuTTY", "WinSCP", "RDP", "TeamViewer", "AnyDesk", "ScreenConnect",
    "ConnectWise", "Ngrok", "ngrok", "Chisel", "ligolo", "ProxyChains",
    "proxychains", "Tor", "socat", "curl", "wget", "certutil", "bitsadmin",
    "WMIC", "wmic", "PowerShell", "powershell", "cmd.exe", "cmd", "cscript",
    "wscript", "mshta", "regsvr32", "rundll32", "schtasks", "at.exe",
    "net.exe", "ipconfig", "tasklist", "taskkill", "nltest", "dsquery",
    "csvde", "ldifde", "ntdsutil", "vssadmin", "wevtutil",
    "Sysinternals", "ProcDump", "procdump", "Process Monitor",
    "Process Explorer", "Autoruns", "Sysmon", "PsService",
    "7-Zip", "7zip", "WinRAR", "RAR", "UPX",
    "Volatility", "FTK", "Wireshark",
    "MSBuild", "InstallUtil", "Regasm", "Regsvcs",
    "Certreq", "CMSTP", "cmstp",
    "esentutl", "expand", "extrac32", "findstr",
    "forfiles", "ftp", "makecab", "msiexec",
    "pcalua", "replace", "rpcping", "SyncAppvPublishingServer",
    "xwizard", "Msbuild", "Dnscmd",
    "Advanced IP Scanner", "Angry IP Scanner",
    "Rclone", "rclone", "MEGAsync", "megacmd",
    "SharpView", "Seatbelt", "GhostPack",
    "Invoke-Obfuscation", "Invoke-Mimikatz",
    "Net-GPPPassword", "Get-GPPPassword",
    "Sharphound", "PlinkPlink", "Plink",
    "Remote Desktop Protocol", "SSH",
    "Living off the Land", "LOLBin", "LOLBins",
    "LOLBAS",
    "BITSAdmin", "Certutil", "Reg.exe",
    "Remcos", "QuasarRAT", "Quasar RAT",
    "NjRAT", "njRAT", "DarkComet",
    "AsyncRAT", "Async RAT",
    "Atera", "Splashtop", "GoToAssist",
    "LogMeIn", "Ammyy Admin", "NetSupport",
    "UltraVNC", "TightVNC", "VNC",
    "SecureCRT", "MobaXterm", "Xshell",
    "Hashcat", "John the Ripper",
    "Hydra", "Medusa", "Aircrack-ng",
    "sqlmap", "SQLMap", "Burp Suite",
    "Nikto", "DirBuster", "Gobuster",
    "ffuf", "wfuzz", "Sublist3r",
    "Amass", "Subfinder", "Masscan",
    "Shodan", "Censys", "ZoomEye",
    "FOFA", "GreyNoise",
    "theHarvester", "Recon-ng", "SpiderFoot",
    "Maltego", "FOCA",
    "Social-Engineer Toolkit", "SET",
    "BeEF", "King Phisher", "Gophish",
    "Evilginx", "Modlishka",
    "CobaltStrike", "Cobalt Strike Beacon",
}
|
|
# Gazetteer of operating systems, products, platforms and protocols.
#
# annotate_entry() labels word-delimited occurrences of every entry that is
# at least two characters long as SYSTEM; only occurrences whose casing
# matches the entry exactly are recorded.  Longer names and their prefixes
# (e.g. "Windows Server 2016", "Windows Server", "Windows") are all listed,
# so a single mention can yield several nested SYSTEM spans.
# NOTE(review): "Cobalt Strike" also appears in KNOWN_TOOLS; main() resolves
# identical spans in favour of TOOL, so this entry looks redundant — confirm.
KNOWN_SYSTEMS = {
    "Windows", "Linux", "macOS", "Mac OS X", "Mac OS", "iOS", "Android",
    "Unix", "FreeBSD", "Solaris", "AIX", "HP-UX",
    "Windows 10", "Windows 11", "Windows 7", "Windows 8", "Windows XP",
    "Windows Vista", "Windows Server", "Windows Server 2003",
    "Windows Server 2008", "Windows Server 2012", "Windows Server 2016",
    "Windows Server 2019", "Windows Server 2022",
    "Ubuntu", "Debian", "CentOS", "Red Hat", "RHEL", "Fedora",
    "Kali Linux", "Arch Linux", "SUSE", "openSUSE", "Alpine Linux",
    "Chrome OS", "ChromeOS", "Tizen", "HarmonyOS",
    "Microsoft Office", "Microsoft Word", "Microsoft Excel",
    "Microsoft PowerPoint", "Microsoft Outlook", "Microsoft Access",
    "Microsoft Exchange", "Exchange Server", "Exchange Online",
    "Microsoft SharePoint", "SharePoint",
    "Microsoft Teams", "Office 365", "Microsoft 365",
    "Active Directory", "Azure AD", "Azure Active Directory", "Entra ID",
    "Azure", "AWS", "Amazon Web Services", "Google Cloud", "GCP",
    "VMware", "ESXi", "vCenter", "vSphere", "Hyper-V",
    "Docker", "Kubernetes", "OpenShift",
    "Apache", "Nginx", "nginx", "IIS", "Tomcat", "JBoss", "WebLogic",
    "MySQL", "PostgreSQL", "MongoDB", "Redis", "Elasticsearch",
    "SQL Server", "Oracle Database", "MariaDB", "SQLite",
    "SAP", "Salesforce", "ServiceNow", "Jira", "Confluence",
    "WordPress", "Drupal", "Joomla", "Magento",
    "Citrix", "Fortinet", "FortiGate", "FortiOS",
    "Palo Alto", "PAN-OS", "Cisco ASA", "Cisco IOS",
    "SonicWall", "Sophos", "Barracuda",
    "Ivanti", "Pulse Secure", "Pulse Connect Secure",
    "Juniper", "MikroTik", "Ubiquiti",
    "Zimbra", "Zoho", "cPanel", "Plesk",
    "Git", "GitHub", "GitLab", "Bitbucket",
    "Jenkins", "TeamCity", "Bamboo", "CircleCI",
    "Splunk", "QRadar", "ArcSight", "LogRhythm",
    "CrowdStrike Falcon", "Carbon Black", "SentinelOne",
    "Microsoft Defender", "Windows Defender",
    "Symantec", "McAfee", "Kaspersky", "ESET",
    "Trend Micro", "Bitdefender", "Avast", "AVG",
    "Chrome", "Firefox", "Safari", "Edge", "Internet Explorer", "IE",
    "Opera", "Brave",
    "Outlook", "Thunderbird", "Gmail",
    "Telegram", "WhatsApp", "Signal", "Discord", "Slack",
    "Zoom", "Skype", "WebEx",
    "Adobe Reader", "Adobe Acrobat", "Adobe Flash",
    "Java", "JRE", "JDK", ".NET", ".NET Framework",
    "Python", "Node.js", "PHP", "Ruby", "Perl", "Go", "Rust",
    "OpenSSL", "OpenSSH",
    "QNAP", "Synology", "NAS",
    "Confluence Server", "Atlassian Confluence",
    "SolarWinds", "SolarWinds Orion",
    "Zoho ManageEngine", "ManageEngine",
    "pfSense", "OPNsense",
    "Cobalt Strike",
    "Kerberos", "LDAP", "NTLM", "SMB", "WMI", "DCOM",
    "Group Policy", "GPO",
    "VPN", "SSL VPN",
    "UEFI", "BIOS",
    "Cisco", "Huawei",
    "NETGEAR", "TP-Link", "D-Link", "Zyxel",
    "Microsoft IIS",
    "Android operating system",
    "Google Play", "App Store",
    "Telegram Bot API",
}
|
|
# Gazetteer of vendors, research teams and government bodies.
#
# annotate_entry() labels every case-sensitive, word-delimited occurrence of
# an entry that is at least three characters long as ORGANIZATION, so
# two-character entries such as "EY" are never matched.
# Some names also appear in KNOWN_SYSTEMS (e.g. "Symantec", "Cisco"); main()
# keeps SYSTEM over ORGANIZATION when both claim the identical span.
# NOTE(review): combined entries such as "CISA and FBI" overlap with the
# individual agency entries and produce nested spans — confirm this is wanted.
KNOWN_ORGS = {
    "Microsoft", "Google", "Apple", "Amazon", "Meta", "Facebook",
    "Mandiant", "CrowdStrike", "Palo Alto Networks", "Unit 42",
    "FireEye", "Recorded Future", "Proofpoint", "Symantec",
    "Kaspersky", "ESET", "Trend Micro", "Bitdefender",
    "Sophos", "McAfee", "Avast", "F-Secure", "Fortinet",
    "Check Point", "SentinelOne", "Carbon Black",
    "Cisco Talos", "Talos", "Volexity", "Secureworks",
    "Dragos", "ThreatConnect", "Anomali", "ReversingLabs",
    "VirusTotal", "Hybrid Analysis", "ANY.RUN",
    "MITRE", "NIST", "CISA", "NSA", "FBI", "CIA", "DHS",
    "Europol", "Interpol", "GCHQ", "MI5", "MI6", "BND",
    "CERT-UA", "CERT-FR", "CERT-EU", "US-CERT", "CERT/CC",
    "ANSSI", "NCSC", "ASD", "ACSC", "CCCS", "BSI", "CNMF",
    "NCA", "DOJ", "Department of Justice",
    "Treasury Department", "OFAC",
    "Rapid7", "Qualys", "Tenable", "BeyondTrust",
    "Okta", "Duo", "RSA",
    "Red Canary", "Huntress", "Arctic Wolf",
    "Zscaler", "Cloudflare", "Akamai", "Fastly",
    "Splunk", "Elastic", "Sumo Logic",
    "VMware", "Broadcom", "Intel", "AMD", "NVIDIA",
    "Cisco", "Juniper", "Huawei",
    "AT&T", "Verizon", "T-Mobile",
    "ThreatFabric", "PCrisk", "Cyfirma", "Group-IB",
    "Positive Technologies", "Intezer", "Deep Instinct",
    "BlackBerry", "Lookout", "Zimperium",
    "WithSecure", "Trellix", "Cybereason",
    "IBM", "IBM X-Force", "X-Force",
    "Sekoia", "Intrinsec",
    "CISA and FBI", "NSA and CISA",
    "Citizen Lab", "EFF", "Amnesty International",
    "Bellingcat", "Atlantic Council",
    "The DFIR Report", "DFIR Report",
    "Lumen Technologies", "Lumen Black Lotus Labs",
    "Black Lotus Labs",
    "Lab52", "Clearsky", "ClearSky",
    "Netskope", "Mimecast", "Abnormal Security",
    "Cofense", "PhishLabs", "Area 1 Security",
    "ExtraHop", "Corelight", "Vectra",
    "Wiz", "Orca Security", "Lacework",
    "Aqua Security", "Sysdig",
    "Snyk", "Sonatype", "JFrog",
    "HackerOne", "Bugcrowd", "Synack",
    "NCC Group", "Fox-IT", "PwC",
    "Deloitte", "EY", "KPMG", "Accenture",
    "BAE Systems", "Raytheon", "Northrop Grumman",
    "Lockheed Martin", "General Dynamics",
    "Booz Allen Hamilton",
    "National Security Agency",
    "Federal Bureau of Investigation",
    "Department of Homeland Security",
    "Cybersecurity and Infrastructure Security Agency",
}
|
|
# Gazetteer of vulnerability classes, attack techniques and named exploits.
#
# annotate_entry() labels word-delimited occurrences of every entry that is
# at least three characters long as VULNERABILITY; only occurrences whose
# casing matches the entry exactly are recorded, which is why some keywords
# are listed in more than one casing (e.g. "SQL injection"/"SQL Injection").
# NOTE(review): entries such as "phishing" or "credential dumping" are
# techniques rather than vulnerabilities but receive the same label —
# confirm this is the intended annotation scheme.
VULN_KEYWORDS = {
    "buffer overflow", "stack overflow", "heap overflow",
    "integer overflow", "integer underflow",
    "use-after-free", "use after free",
    "double free", "null pointer dereference",
    "format string", "format string vulnerability",
    "race condition", "TOCTOU",
    "SQL injection", "SQL Injection", "SQLi",
    "cross-site scripting", "XSS", "Cross-Site Scripting",
    "cross-site request forgery", "CSRF", "XSRF",
    "server-side request forgery", "SSRF",
    "XML external entity", "XXE",
    "remote code execution", "RCE",
    "Remote Code Execution",
    "local privilege escalation", "LPE",
    "privilege escalation",
    "arbitrary code execution",
    "command injection", "OS command injection",
    "code injection", "code execution",
    "path traversal", "directory traversal",
    "local file inclusion", "LFI",
    "remote file inclusion", "RFI",
    "insecure deserialization", "deserialization vulnerability",
    "authentication bypass", "authorization bypass",
    "memory corruption", "out-of-bounds read", "out-of-bounds write",
    "type confusion", "information disclosure",
    "denial of service", "DoS", "DDoS",
    "man-in-the-middle", "MitM", "MITM",
    "zero-day", "zero day", "0-day", "0day",
    "supply chain attack", "supply chain compromise",
    "DLL hijacking", "DLL side-loading", "DLL sideloading",
    "DLL search order hijacking",
    "reflective DLL injection", "process injection",
    "process hollowing", "thread hijacking",
    "COM hijacking", "COM-hijacking",
    "credential dumping", "credential theft",
    "pass-the-hash", "pass-the-ticket",
    "golden ticket", "silver ticket",
    "Kerberoasting", "AS-REP roasting",
    "brute force", "brute-force", "password spraying",
    "credential stuffing", "phishing",
    "spear-phishing", "spearphishing", "spear phishing",
    "watering hole", "drive-by download",
    "clickjacking", "session hijacking",
    "DNS hijacking", "DNS spoofing", "DNS poisoning",
    "ARP spoofing", "BGP hijacking",
    "rootkit", "bootkit",
    "keylogging", "keylogger",
    "screen capture", "clipboard hijacking",
    "log4shell", "Log4Shell",
    "ProxyLogon", "ProxyShell", "ProxyNotShell",
    "EternalBlue", "BlueKeep",
    "Zerologon", "PrintNightmare", "Follina",
    "Spring4Shell", "SpringShell",
    "Shellshock", "Heartbleed", "POODLE", "DROWN",
    "Spectre", "Meltdown",
    "living-off-the-land",
}
|
|
| |
|
|
# Pattern-based entity recognisers used by annotate_entry().

# CVE identifiers, e.g. CVE-2021-44228.
CVE_RE = re.compile(r'CVE-\d{4}-\d{4,7}')

# Dotted-quad IPv4 addresses.  Each octet is limited to 0-255 so that strings
# such as "999.300.1.1" are no longer reported as IP addresses.
_IPV4_OCTET = r'(?:25[0-5]|2[0-4]\d|[01]?\d?\d)'
IP_RE = re.compile(r'\b(?:' + _IPV4_OCTET + r'\.){3}' + _IPV4_OCTET + r'\b')

# Hex digests, distinguished purely by length: 32 (MD5), 40 (SHA-1), 64 (SHA-256).
HASH_MD5 = re.compile(r'\b[a-fA-F0-9]{32}\b')
HASH_SHA1 = re.compile(r'\b[a-fA-F0-9]{40}\b')
HASH_SHA256 = re.compile(r'\b[a-fA-F0-9]{64}\b')

EMAIL_RE = re.compile(r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')

# http(s) URLs, terminated by whitespace or one of ) < > ].
URL_RE = re.compile(r'https?://[^\s\)<>\]]+')

# Host names ending in one of a fixed list of TLDs.
DOMAIN_RE = re.compile(r'\b(?:[a-zA-Z0-9-]+\.)+(?:com|net|org|io|ru|cn|info|biz|xyz|top|tk|ml|ga|cf|gq|cc|pw|ly|me|co|tv|in|de|uk|fr|it|es|br|au|ca|jp|kr|tw|hk|sg|my|ph|vn|th|id|za|ng|ke|ua|by|kz|ir|iq|sy|su|gov|mil|edu|int|onion|bit)\b')

# Three alternatives: Windows drive paths (C:\...), Unix paths below a fixed
# list of root directories, and %VAR%-style environment-variable paths.
FILEPATH_RE = re.compile(r'(?:[A-Z]:\\(?:[^\s\\/:*?"<>|]+\\)*[^\s\\/:*?"<>|]+|/(?:etc|usr|var|tmp|opt|home|root|proc|sys|dev|bin|sbin|lib|mnt|media|boot|run|srv)/[^\s]+|%[A-Za-z_]+%(?:\\[^\s]+)?)')


# Markdown inline link "[label](target)"; group 1 captures the label.
MD_LINK_RE = re.compile(r'\[([^\]]*)\]\([^\)]*\)')
|
|
|
|
def strip_markdown_links(text):
    """Collapse every Markdown link ``[label](target)`` to just ``label``.

    Returns a ``(cleaned_text, offset_shifts)`` pair.  ``offset_shifts`` holds
    one ``(boundary, cumulative_shift)`` tuple per link, in order of
    appearance: ``boundary`` is the index in the original text just past the
    link, and ``cumulative_shift`` is the total number of characters removed
    up to and including that link.
    """
    offset_shifts = []
    pieces = []
    cursor = 0          # index in the original text up to which we have copied
    total_removed = 0   # characters dropped so far

    for link in MD_LINK_RE.finditer(text):
        label = link.group(1)
        # Copy the untouched text before the link, then the label alone.
        pieces.extend((text[cursor:link.start()], label))
        cursor = link.end()
        total_removed += (cursor - link.start()) - len(label)
        offset_shifts.append((cursor, total_removed))

    pieces.append(text[cursor:])
    return ''.join(pieces), offset_shifts
|
|
|
|
def old_to_new_offset(old_pos, offset_shifts):
    """Map an index in the original text to the matching index in the cleaned text.

    ``offset_shifts`` is a list of ``(boundary, cumulative_shift)`` pairs in
    ascending boundary order.  The shift of the last boundary at or before
    ``old_pos`` is subtracted; positions before the first boundary are
    returned unchanged.
    """
    applied = 0
    for boundary, cum_shift in offset_shifts:
        if old_pos < boundary:
            # Boundaries are ascending, so no later pair can apply either.
            break
        applied = cum_shift
    return old_pos - applied
|
|
|
|
@functools.lru_cache(maxsize=None)
def _compile_occurrence_pattern(pattern, case_sensitive):
    """Build the literal-match regex for *pattern*, once per (pattern, flag) pair.

    The cache is unbounded on purpose: its keys are the gazetteer and dataset
    names, a finite set that is probed again for every entry.  Without it the
    number of distinct names per entry exceeds ``re``'s own internal cache and
    each call ends up recompiling its pattern.
    """
    flags = 0 if case_sensitive else re.IGNORECASE
    escaped = re.escape(pattern)
    if len(pattern) >= 2:
        # Require a non-word character (or string edge) on both sides so that
        # e.g. "Tor" does not match inside "Toronto".
        return re.compile(
            r'(?<![a-zA-Z0-9_])' + escaped + r'(?![a-zA-Z0-9_])', flags
        )
    # Single characters are matched anywhere, without boundary checks.
    return re.compile(escaped, flags)


def find_all_occurrences(text, pattern, case_sensitive=True):
    """Return ``(start, end)`` for every literal occurrence of *pattern* in *text*.

    *pattern* is treated as plain text (regex metacharacters are escaped).
    Patterns of two or more characters only match when not directly adjacent
    to an ASCII letter, digit or underscore.  An empty pattern yields an
    empty list.  Matches are non-overlapping and in ascending order.
    """
    if not pattern:
        return []
    regex = _compile_occurrence_pattern(pattern, bool(case_sensitive))
    return [(m.start(), m.end()) for m in regex.finditer(text)]
|
|
|
|
def _record_name_hits(text, spans, label, names, min_len):
    """Append exact-case, word-delimited hits of each name in *names* to *spans*.

    Names shorter than *min_len* are skipped.  A plain substring test runs
    first so that names which cannot match never reach the regex engine.
    """
    for candidate in names:
        if len(candidate) < min_len or candidate not in text:
            continue
        key = f"{label}: {candidate}"
        for start, end in find_all_occurrences(text, candidate, case_sensitive=True):
            spans[key].append([start, end])


def annotate_entry(entry):
    """Annotate one input record and return the output record.

    *entry* must provide ``text``, ``name`` and ``source``; ``alt_names`` and
    ``attribution`` are optional lists (a missing or ``null`` value is treated
    as empty).

    Returns a dict with:
      * ``text``  - the entry text with Markdown links reduced to their labels;
      * ``spans`` - ``{"LABEL: entity text": [[start, end], ...]}`` with
        offsets into the returned ``text``, duplicates removed;
      * ``info``  - ``{"source": "apt_reports", "name": <entry name>}``.

    Gazetteer sets are walked in sorted order so the key order of ``spans``
    is reproducible from run to run instead of following hash-randomised set
    iteration.
    """
    raw_text = entry['text']
    name = entry['name']
    source = entry['source']
    # ``or []`` also covers an explicit JSON null, which would otherwise
    # break the list concatenation below.
    alt_names = entry.get('alt_names') or []
    attribution = entry.get('attribution') or []

    # All offsets below refer to the link-stripped text.
    text, _ = strip_markdown_links(raw_text)

    spans = defaultdict(list)

    # --- Pattern-based entities -------------------------------------------
    pattern_labels = (
        ("CVE_ID", CVE_RE),
        ("IP_ADDRESS", IP_RE),
        ("EMAIL", EMAIL_RE),
        ("URL", URL_RE),
        ("FILEPATH", FILEPATH_RE),
    )
    for label, regex in pattern_labels:
        for m in regex.finditer(text):
            spans[f"{label}: {m.group()}"].append([m.start(), m.end()])

    # Hashes, longest digest first; a match starting inside an already
    # recorded digest is ignored.
    hashed = set()
    for regex in (HASH_SHA256, HASH_SHA1, HASH_MD5):
        for m in regex.finditer(text):
            if m.start() in hashed:
                continue
            spans[f"HASH: {m.group()}"].append([m.start(), m.end()])
            hashed.update(range(m.start(), m.end()))

    # Domains, except those that start inside a URL or an e-mail address.
    shadowed = set()
    for regex in (URL_RE, EMAIL_RE):
        for m in regex.finditer(text):
            shadowed.update(range(m.start(), m.end()))
    for m in DOMAIN_RE.finditer(text):
        if m.start() not in shadowed:
            spans[f"DOMAIN: {m.group()}"].append([m.start(), m.end()])

    # --- The entry's own name and aliases ---------------------------------
    # Technique entries get no label for their own name.
    if 'technique' not in source:
        if 'actor' in source or 'campaign' in source:
            label = "THREAT_ACTOR"
        elif name in KNOWN_TOOLS:
            label = "TOOL"
        else:
            label = "MALWARE"

        for alias in [name] + alt_names:
            if len(alias) < 2:
                continue
            hits = find_all_occurrences(text, alias, case_sensitive=True)
            if not hits and len(alias) >= 4:
                # No exact-case hit: retry ignoring case, but only for names
                # long enough to keep the false-positive risk low.
                hits = find_all_occurrences(text, alias, case_sensitive=False)
            for start, end in hits:
                # Key on the text as it actually appears, so a case-variant
                # hit is filed under its own spelling.
                spans[f"{label}: {text[start:end]}"].append([start, end])

    # --- Gazetteer passes (exact case only) -------------------------------
    _record_name_hits(text, spans, "THREAT_ACTOR", attribution, 2)
    _record_name_hits(text, spans, "THREAT_ACTOR", sorted(KNOWN_ACTORS), 3)
    _record_name_hits(text, spans, "TOOL", sorted(KNOWN_TOOLS), 3)
    _record_name_hits(text, spans, "SYSTEM", sorted(KNOWN_SYSTEMS), 2)
    _record_name_hits(text, spans, "ORGANIZATION", sorted(KNOWN_ORGS), 3)
    _record_name_hits(text, spans, "VULNERABILITY", sorted(VULN_KEYWORDS), 3)

    # --- De-duplicate positions, preserving first-seen order --------------
    # The same span can be recorded twice, e.g. when the entry's own name is
    # also present in KNOWN_ACTORS.
    final_spans = {}
    for key, positions in spans.items():
        seen = set()
        unique_positions = []
        for start, end in positions:
            if (start, end) not in seen:
                seen.add((start, end))
                unique_positions.append([start, end])
        final_spans[key] = unique_positions

    return {
        "text": text,
        "spans": final_spans,
        "info": {
            "source": "apt_reports",
            "name": name,
        }
    }
|
|
|
|
def build_global_name_sets(entries):
    """Collect every malware, actor and tool name that appears across *entries*.

    Returns ``(malware_names, actor_names, tool_names)``.  The tool set starts
    as a copy of KNOWN_TOOLS.  Entries whose source contains "technique" are
    ignored entirely.  For the rest, the name and aliases go to the actor set
    when the source contains "actor" or "campaign", otherwise to the tool set
    if listed in KNOWN_TOOLS and to the malware set if not; attribution names
    always go to the actor set.
    """
    tools = set(KNOWN_TOOLS)
    malware = set()
    actors = set()

    for record in entries:
        origin = record['source']
        primary = record['name']
        aliases = record.get('alt_names', [])

        if 'technique' in origin:
            continue

        labels = [primary] + aliases
        if 'actor' in origin or 'campaign' in origin:
            actors.update(labels)
        else:
            for candidate in labels:
                if candidate in KNOWN_TOOLS:
                    tools.add(candidate)
                else:
                    malware.add(candidate)

        actors.update(record.get('attribution', []))

    return malware, actors, tools
|
|
|
|
| |
# Threat-actor names matched by annotate_entry().  Starts empty; main()
# rebinds it to the actor set returned by build_global_name_sets() before any
# entry is annotated.
KNOWN_ACTORS = set()
|
|
|
|
# Label precedence used when several labels claim the identical span; the
# highest value wins.  Labels missing from the table rank lowest (0).
_LABEL_PRIORITY = {
    'CVE_ID': 10, 'IP_ADDRESS': 9, 'HASH': 9, 'EMAIL': 9,
    'URL': 9, 'DOMAIN': 8, 'FILEPATH': 8,
    'THREAT_ACTOR': 7, 'MALWARE': 6, 'TOOL': 5,
    'VULNERABILITY': 4, 'SYSTEM': 3, 'ORGANIZATION': 2,
}


def _merge_cross_refs(text, spans, label, names):
    """Add exact-case hits of dataset-wide *names* to *spans* under *label*."""
    for candidate in names:
        # Cheap substring test before the regex search.
        if candidate not in text:
            continue
        for start, end in find_all_occurrences(text, candidate, case_sensitive=True):
            bucket = spans.setdefault(f"{label}: {text[start:end]}", [])
            if [start, end] not in bucket:
                bucket.append([start, end])


def _resolve_label_conflicts(spans):
    """Keep only the highest-priority label for each identical span.

    Returns a new dict without keys that ended up with no positions.
    """
    claims = defaultdict(list)
    for key, positions in spans.items():
        for pos in positions:
            claims[tuple(pos)].append(key)

    for pos, keys in claims.items():
        if len(keys) < 2:
            continue
        winner = max(keys, key=lambda k: _LABEL_PRIORITY.get(k.split(":")[0], 0))
        for key in keys:
            if key != winner and list(pos) in spans[key]:
                spans[key].remove(list(pos))

    return {key: positions for key, positions in spans.items() if positions}


def main():
    """Annotate every entry of INPUT and write one JSON line per entry to OUTPUT.

    Steps: load the JSONL input, collect dataset-wide malware/actor/tool
    names, annotate each entry, add cross-references to names taken from the
    other entries, resolve spans claimed by several labels, and write the
    result.  Both files are handled as UTF-8 because the output is written
    with ``ensure_ascii=False`` and must not depend on the locale encoding.
    """
    global KNOWN_ACTORS

    print("Loading entries...")
    with open(INPUT, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline) are skipped instead of
        # crashing json.loads.
        entries = [json.loads(line) for line in f if line.strip()]
    print(f"Loaded {len(entries)} entries")

    print("Building global name sets...")
    malware_names, actor_names, tool_names = build_global_name_sets(entries)
    # annotate_entry() reads this module-level set.
    KNOWN_ACTORS = actor_names
    print(f" Malware: {len(malware_names)}, Actors: {len(actor_names)}, Tools: {len(tool_names)}")

    # Cross-reference only names of four or more characters (and, for
    # malware/actors, not purely numeric).  Sorted so the key order of the
    # output is reproducible.
    cross_ref_malware = sorted(n for n in malware_names if len(n) >= 4 and not n.isdigit())
    cross_ref_actors = sorted(n for n in actor_names if len(n) >= 4 and not n.isdigit())
    cross_ref_tools = sorted(n for n in tool_names if len(n) >= 4)

    print(f"Cross-ref candidates: malware={len(cross_ref_malware)}, actors={len(cross_ref_actors)}, tools={len(cross_ref_tools)}")

    cross_refs = (
        ("MALWARE", cross_ref_malware),
        ("THREAT_ACTOR", cross_ref_actors),
        ("TOOL", cross_ref_tools),
    )

    print("Processing entries...")

    with open(OUTPUT, 'w', encoding="utf-8") as out:
        for done, entry in enumerate(entries, start=1):
            result = annotate_entry(entry)
            text = result['text']

            for label, names in cross_refs:
                _merge_cross_refs(text, result['spans'], label, names)

            result['spans'] = _resolve_label_conflicts(result['spans'])

            out.write(json.dumps(result, ensure_ascii=False) + '\n')

            if done % 500 == 0:
                print(f" Processed {done}/{len(entries)}")

    print(f"Done! Wrote {len(entries)} entries to {OUTPUT}")
|
|
|
|
# Run the annotation pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|