#!/usr/bin/env python3
"""Generate NER-annotated JSONL from cybersecurity news articles."""

import json
import sys

# Default destination of the generated JSONL file (one record per line).
OUTPUT_PATH = "/home/ubuntu/alkyline/data/processed/llm_annotated_news.jsonl"


def _find_occurrences(text, entity_text):
    """Return ``[start, end]`` offsets of each match of *entity_text* in *text*.

    Matches are exact (case-sensitive) substring matches, reported left to
    right and never overlapping: the scan resumes at the end of the previous
    match. An empty *entity_text* yields no matches.
    """
    if not entity_text:
        return []
    width = len(entity_text)
    occurrences = []
    idx = text.find(entity_text)
    while idx != -1:
        end = idx + width
        occurrences.append([idx, end])
        # Resume after this match so two spans of one entity never overlap.
        idx = text.find(entity_text, end)
    return occurrences


def ann(text, entities, chunk_id, source="security_news"):
    """Create an annotation dict with character offsets for each entity.

    Args:
        text: The chunk of article text to annotate.
        entities: Iterable of ``(label, entity_text)`` pairs. Every
            occurrence of ``entity_text`` in ``text`` is recorded.
        chunk_id: Identifier stored under ``info["id"]``.
        source: Value stored under ``info["source"]``.

    Returns:
        ``{"text": ..., "spans": ..., "info": {"id": ..., "source": ...}}``
        where ``spans`` maps ``"<label>: <entity_text>"`` to a list of
        ``[start, end]`` offsets such that ``text[start:end] == entity_text``.

    Entities that do not occur in ``text`` (including empty strings) are
    left out of ``spans`` and reported with a warning on stderr.
    """
    spans = {}
    for label, entity_text in entities:
        occurrences = _find_occurrences(text, entity_text)
        if not occurrences:
            print(
                f"WARNING: '{entity_text}' not found in chunk {chunk_id}",
                file=sys.stderr,
            )
            continue
        # A repeated (label, entity_text) pair produces the same offsets, so
        # assigning (instead of extending) keeps the span list duplicate-free.
        spans[f"{label}: {entity_text}"] = occurrences
    return {"text": text, "spans": spans, "info": {"id": chunk_id, "source": source}}


def verify_records(annotated):
    """Re-check every stored span against its record text.

    Prints one ``ERROR`` line per span whose slice of the text differs from
    the entity string encoded in the span key, and returns the error count.
    """
    errors = 0
    for record in annotated:
        text = record["text"]
        for key, offsets in record["spans"].items():
            # Keys have the form "<label>: <entity_text>".
            _, _, entity = key.partition(": ")
            for start, end in offsets:
                actual = text[start:end]
                if actual != entity:
                    print(f"ERROR in {record['info']['id']}: expected '{entity}' at [{start}:{end}], got '{actual}'")
                    errors += 1
    return errors


records = [
    # ---- Article 1: Scattered Spider / Tylerb ----
    ann(
        "Tyler Robert Buchanan, a 24-year-old from Dundee, Scotland, has pleaded guilty to wire fraud conspiracy and aggravated identity theft as a senior member of the cybercrime group Scattered Spider.",
        # NOTE(review): "Scotland" is a place name but is labelled
        # ORGANIZATION here -- confirm against the label schema.
        [("THREAT_ACTOR", "Scattered Spider"), ("ORGANIZATION", "Scotland")],
        "news_00001",
    ),
    ann(
        "Buchanan admitted to participating in tens of thousands of SMS-based phishing attacks in 2022 targeting technology companies including Twilio, LastPass, DoorDash, and Mailchimp. The stolen data enabled the group to conduct SIM-swapping attacks against cryptocurrency investors.",
        [
            ("ORGANIZATION", "Twilio"),
            ("ORGANIZATION", "LastPass"),
            ("ORGANIZATION", "DoorDash"),
            ("ORGANIZATION", "Mailchimp"),
        ],
        "news_00002",
    ),
    ann(
        "The FBI traced Buchanan through operational security failures. Investigators discovered that the same username and email address was used to register numerous phishing domains. Domain registrations logged in from a UK IP address leased to Buchanan in 2022.",
        [("ORGANIZATION", "FBI")],
        "news_00003",
    ),
    ann(
        "Buchanan is the second Scattered Spider member to plead guilty. Noah Michael Urban received a 10-year sentence in 2025. Buchanan was arrested in June 2024 in Spain while boarding a flight and extradited to U.S. custody in April 2025. He faces a maximum sentence of 22 years in federal prison.",
        [("THREAT_ACTOR", "Scattered Spider")],
        "news_00004",
    ),
    # ---- Article 2: REvil / GandCrab ----
    ann(
        "German authorities have publicly identified Daniil Maksimovich Shchukin, a 31-year-old Russian national, as UNKN, the alleged leader of major ransomware operations GandCrab and REvil. Shchukin is believed to reside in Krasnodar, Russia.",
        [
            ("THREAT_ACTOR", "UNKN"),
            ("MALWARE", "GandCrab"),
            ("MALWARE", "REvil"),
            ("ORGANIZATION", "German authorities"),
        ],
        "news_00005",
    ),
    ann(
        "GandCrab launched in 2018 as an affiliate ransomware program, pioneering double extortion tactics. The group claimed to have extorted over $2 billion before announcing shutdown in May 2019. REvil emerged around the same time, widely believed by security experts to be GandCrab's reorganization.",
        [("MALWARE", "GandCrab"), ("MALWARE", "REvil")],
        "news_00006",
    ),
    ann(
        "REvil notably targeted organizations with $100 million or more in annual revenue and cyber insurance coverage. The group gained notoriety for the 2021 Kaseya attack affecting over 1,500 businesses worldwide.",
        [("MALWARE", "REvil"), ("ORGANIZATION", "Kaseya")],
        "news_00007",
    ),
    ann(
        "German Federal Criminal Police, known as BKA, released photos and identified cryptocurrency wallets containing $317,000 in illicit proceeds. Researchers found corroborating evidence linking Shchukin to the online identity Ger0in, a botnet operator from 2010-2011.",
        [("ORGANIZATION", "BKA"), ("THREAT_ACTOR", "Ger0in")],
        "news_00008",
    ),
    # ---- Article 3: APT28 Router Hacking ----
    ann(
        "Russian military intelligence operatives associated with APT28, also known as Fancy Bear or Forest Blizzard, compromised approximately 18,000 internet routers to harvest Microsoft Office authentication tokens without deploying malware.",
        [
            ("THREAT_ACTOR", "APT28"),
            ("THREAT_ACTOR", "Fancy Bear"),
            ("THREAT_ACTOR", "Forest Blizzard"),
            ("SYSTEM", "Microsoft Office"),
        ],
        "news_00009",
    ),
    ann(
        "The threat actors modified DNS settings on vulnerable routers, primarily older MikroTik and TP-Link SOHO devices, to redirect traffic through attacker-controlled servers. This enabled them to intercept OAuth authentication tokens from users who had already completed multi-factor authentication.",
        [("SYSTEM", "MikroTik"), ("SYSTEM", "TP-Link")],
        "news_00010",
    ),
    ann(
        "Over 200 organizations and 5,000 consumer devices were affected, with peak activity in December 2025. Primary targets included government agencies, foreign affairs ministries, and law enforcement. The attack used DNS hijacking supporting adversary-in-the-middle attacks on TLS connections.",
        [],
        "news_00011",
    ),
    ann(
        "The U.S. FCC announced in March 2026 that it would cease certifying consumer-grade routers manufactured outside the United States, citing national security concerns about poorly-secured foreign-made devices.",
        [("ORGANIZATION", "FCC")],
        "news_00012",
    ),
    # ---- Article 4: CanisterWorm ----
    ann(
        "TeamPCP, a financially motivated cybercrime group that began compromising cloud environments in December 2025, launched the CanisterWorm wiper campaign targeting Iran. The attack materialized the weekend of March 19-23, 2026.",
        [("THREAT_ACTOR", "TeamPCP"), ("MALWARE", "CanisterWorm")],
        "news_00013",
    ),
    ann(
        "The campaign followed a supply chain attack on Trivy vulnerability scanner from Aqua Security. TeamPCP uses Internet Computer Protocol canisters, blockchain-based smart contracts resistant to takedowns, to orchestrate campaigns.",
        [
            ("TOOL", "Trivy"),
            ("ORGANIZATION", "Aqua Security"),
            ("THREAT_ACTOR", "TeamPCP"),
        ],
        "news_00014",
    ),
    ann(
        "The malware targets systems with Iran's timezone or Farsi language settings, destroying local data or wiping entire Kubernetes cluster nodes for Iranian targets. Azure and AWS account for 97 percent of compromised servers.",
        [("SYSTEM", "Kubernetes"), ("SYSTEM", "Azure"), ("SYSTEM", "AWS")],
        "news_00015",
    ),
    ann(
        "Rather than novel exploits, TeamPCP weaponizes exposed Docker APIs, Kubernetes clusters, Redis servers, and the React2Shell vulnerability through large-scale automation. The group also targeted the KICS vulnerability scanner from Checkmarx.",
        [
            ("THREAT_ACTOR", "TeamPCP"),
            ("SYSTEM", "Docker"),
            ("SYSTEM", "Kubernetes"),
            ("SYSTEM", "Redis"),
            ("VULNERABILITY", "React2Shell"),
            ("TOOL", "KICS"),
            ("ORGANIZATION", "Checkmarx"),
        ],
        "news_00016",
    ),
    ann(
        "Security researchers from Aikido, Flare, and Wiz documented the CanisterWorm campaign. Charlie Eriksen noted the malware was rapidly changing and suggested the Iran targeting might represent attention-seeking behavior.",
        [
            ("ORGANIZATION", "Aikido"),
            ("ORGANIZATION", "Flare"),
            ("ORGANIZATION", "Wiz"),
            ("MALWARE", "CanisterWorm"),
        ],
        "news_00017",
    ),
    # ---- Article 5: IoT Botnets ----
    ann(
        "The U.S. Justice Department, working with Canadian and German authorities, dismantled infrastructure supporting four major IoT botnets named Aisuru, Kimwolf, JackSkid, and Mossad that had compromised over 3 million devices.",
        [
            ("ORGANIZATION", "Justice Department"),
            ("MALWARE", "Aisuru"),
            ("MALWARE", "Kimwolf"),
            ("MALWARE", "JackSkid"),
            ("MALWARE", "Mossad"),
        ],
        "news_00018",
    ),
    ann(
        "Aisuru issued more than 200,000 attack commands. JackSkid launched at least 90,000 attacks. Kimwolf conducted over 25,000 attack commands. Mossad was responsible for approximately 1,000 digital sieges against Department of Defense systems.",
        [
            ("MALWARE", "Aisuru"),
            ("MALWARE", "JackSkid"),
            ("MALWARE", "Kimwolf"),
            ("MALWARE", "Mossad"),
            ("ORGANIZATION", "Department of Defense"),
        ],
        "news_00019",
    ),
    ann(
        "The Defense Criminal Investigative Service executed seizure warrants targeting U.S.-registered domains and virtual servers. The FBI's Anchorage Field Office and the DOJ collaborated with nearly two dozen technology companies.",
        [
            ("ORGANIZATION", "Defense Criminal Investigative Service"),
            ("ORGANIZATION", "FBI"),
            ("ORGANIZATION", "DOJ"),
        ],
        "news_00020",
    ),
    ann(
        "Aisuru emerged in late 2024. By October 2025, it spawned Kimwolf, which featured novel propagation methods targeting devices behind internal network protections. Security firm Synthient publicly disclosed this vulnerability on January 2, 2026.",
        [
            ("MALWARE", "Aisuru"),
            ("MALWARE", "Kimwolf"),
            ("ORGANIZATION", "Synthient"),
        ],
        "news_00021",
    ),
    # ---- Article 6: Trigona Ransomware ----
    ann(
        "Recent Trigona ransomware campaigns have introduced a custom exfiltration utility named uploader_client.exe that enhances their data-stealing capabilities, according to Symantec researchers.",
        [
            ("MALWARE", "Trigona"),
            ("TOOL", "uploader_client.exe"),
            ("ORGANIZATION", "Symantec"),
        ],
        "news_00022",
    ),
    ann(
        "The exfiltration tool supports five simultaneous connections per file for accelerated data exfiltration. It changes TCP connections after every 2GB of traffic to evade detection and can filter file types to exclude large, low-value media files.",
        [],
        "news_00023",
    ),
    ann(
        "The Trigona threat actors deploy Huorong Network Security Suite's HRSword kernel driver and use security-disabling tools including PCHunter, Gmer, YDark, WKTools, DumpGuard, and StpProcessMonitorByovd.",
        [
            ("MALWARE", "Trigona"),
            ("TOOL", "HRSword"),
            ("TOOL", "PCHunter"),
            ("TOOL", "Gmer"),
            ("TOOL", "YDark"),
            ("TOOL", "WKTools"),
            ("TOOL", "DumpGuard"),
            ("TOOL", "StpProcessMonitorByovd"),
        ],
        "news_00024",
    ),
    ann(
        "They leverage PowerRun for privilege escalation, deploy AnyDesk for remote access, and execute Mimikatz and Nirsoft utilities for credential harvesting. Trigona launched as a double-extortion operation in October 2022, demanding Monero cryptocurrency.",
        [
            ("TOOL", "PowerRun"),
            ("TOOL", "AnyDesk"),
            ("TOOL", "Mimikatz"),
            ("TOOL", "Nirsoft"),
            ("MALWARE", "Trigona"),
        ],
        "news_00025",
    ),
    ann(
        "While Ukrainian activists disrupted the Trigona gang in October 2023, recent activity suggests operational resumption. The shift to proprietary tooling indicates efforts to maintain a lower profile during critical attack phases.",
        [("MALWARE", "Trigona")],
        "news_00026",
    ),
    # ---- Article 7: Chinese Proxy Botnets ----
    ann(
        "The UK's National Cyber Security Centre, known as NCSC-UK, alongside international partners from nine nations, issued a joint advisory alerting organizations to China-linked threat actors deploying massive botnets of compromised consumer devices.",
        [("ORGANIZATION", "NCSC-UK")],
        "news_00027",
    ),
    ann(
        "The Raptor Train botnet infected over 260,000 devices globally in 2024, linked to state-sponsored group Flax Typhoon and Integrity Technology Group. The FBI disrupted the botnet in September 2024.",
        [
            ("MALWARE", "Raptor Train"),
            ("THREAT_ACTOR", "Flax Typhoon"),
            ("ORGANIZATION", "Integrity Technology Group"),
            ("ORGANIZATION", "FBI"),
        ],
        "news_00028",
    ),
    ann(
        "The KV-Botnet was used by the Volt Typhoon group and was primarily composed of outdated Cisco and Netgear routers. The FBI disrupted it in January 2024, though revival attempts began in late 2024.",
        [
            ("MALWARE", "KV-Botnet"),
            ("THREAT_ACTOR", "Volt Typhoon"),
            ("SYSTEM", "Cisco"),
            ("SYSTEM", "Netgear"),
            ("ORGANIZATION", "FBI"),
        ],
        "news_00029",
    ),
    ann(
        "Security agencies from the US, Australia, Canada, Germany, Japan, Netherlands, New Zealand, Spain, and Sweden signed the advisory recommending multifactor authentication, network edge mapping, dynamic threat feeds, IP allowlists, and zero-trust controls.",
        [],
        "news_00030",
    ),
    # ---- Article 8: GopherWhisper APT ----
    ann(
        "A previously undocumented state-backed group named GopherWhisper, linked to China, has been active since at least 2023. Security firm ESET identified the group targeting government entities in Mongolia.",
        [("THREAT_ACTOR", "GopherWhisper"), ("ORGANIZATION", "ESET")],
        "news_00031",
    ),
    ann(
        "GopherWhisper employs legitimate services for command-and-control communications, including Microsoft 365 Outlook, Slack, and Discord, alongside custom infrastructure using OpenSSL over port 443.",
        [
            ("THREAT_ACTOR", "GopherWhisper"),
            ("SYSTEM", "Microsoft 365 Outlook"),
            ("SYSTEM", "Slack"),
            ("SYSTEM", "Discord"),
            ("TOOL", "OpenSSL"),
        ],
        "news_00032",
    ),
    ann(
        "GopherWhisper deployed multiple Go-based and C++ tools including LaxGopher, a Go backdoor using private Slack servers for command execution, and RatGopher, a Discord-based backdoor for command execution.",
        [
            ("THREAT_ACTOR", "GopherWhisper"),
            ("MALWARE", "LaxGopher"),
            ("MALWARE", "RatGopher"),
        ],
        "news_00033",
    ),
    ann(
        "The group also used BoxOfFriends, which leverages Microsoft Graph API via Outlook draft emails, and SSLORDoor, a C++ backdoor with file operation capabilities. JabGopher is a process injector for LaxGopher deployment.",
        [
            ("MALWARE", "BoxOfFriends"),
            ("MALWARE", "SSLORDoor"),
            ("MALWARE", "JabGopher"),
            ("MALWARE", "LaxGopher"),
        ],
        "news_00034",
    ),
    ann(
        "FriendDelivery is a DLL loader for BoxOfFriends deployment, and CompactGopher handles data compression and exfiltration by uploading stolen data to file.io. ESET telemetry documented 12 compromised systems within a Mongolian government institution.",
        [
            ("MALWARE", "FriendDelivery"),
            ("MALWARE", "BoxOfFriends"),
            ("MALWARE", "CompactGopher"),
            ("DOMAIN", "file.io"),
            ("ORGANIZATION", "ESET"),
        ],
        "news_00035",
    ),
    # ---- Article 9: Stryker Wiper Attack ----
    ann(
        "Stryker, a Michigan-based medical technology company with $25 billion in global sales, experienced a significant cyberattack on March 11, 2026. The assault affected operations across 79 countries, forcing over 5,000 workers home in Ireland.",
        [("ORGANIZATION", "Stryker")],
        "news_00036",
    ),
    ann(
        "Perpetrators exploited Microsoft Intune to issue a remote wipe command against all connected devices, affecting approximately 200,000 systems, servers, and mobile devices.",
        [("SYSTEM", "Microsoft Intune")],
        "news_00037",
    ),
    ann(
        "The Iran-linked hacktivist group Handala claimed responsibility. Palo Alto Networks associates Handala with Iran's Ministry of Intelligence and Security, known as MOIS, and links it to Void Manticore.",
        [
            ("THREAT_ACTOR", "Handala"),
            ("ORGANIZATION", "Palo Alto Networks"),
            ("ORGANIZATION", "MOIS"),
            ("THREAT_ACTOR", "Void Manticore"),
        ],
        "news_00038",
    ),
    # ---- Article 10: Patch Tuesday / BlueHammer ----
    ann(
        "Microsoft addressed 167 security flaws in the April 2026 Patch Tuesday release. Critical vulnerabilities included CVE-2026-32201, a SharePoint Server zero-day enabling content spoofing and phishing attacks.",
        [
            ("ORGANIZATION", "Microsoft"),
            ("CVE_ID", "CVE-2026-32201"),
            ("SYSTEM", "SharePoint Server"),
        ],
        "news_00039",
    ),
    ann(
        "The BlueHammer vulnerability, tracked as CVE-2026-33825, is a Windows Defender privilege escalation flaw with publicly released exploit code. Adobe Reader was also affected by CVE-2026-34621 with active exploitation dating back to November 2025.",
        [
            ("VULNERABILITY", "BlueHammer"),
            ("CVE_ID", "CVE-2026-33825"),
            ("SYSTEM", "Windows Defender"),
            ("SYSTEM", "Adobe Reader"),
            ("CVE_ID", "CVE-2026-34621"),
        ],
        "news_00040",
    ),
    ann(
        "Google Chrome patched its fourth zero-day of 2026, including CVE-2026-5281, along with 20 other security holes. Satnam Narang from Tenable noted this represents the second-biggest Patch Tuesday ever for Microsoft.",
        [
            ("SYSTEM", "Google Chrome"),
            ("CVE_ID", "CVE-2026-5281"),
            ("ORGANIZATION", "Tenable"),
            ("ORGANIZATION", "Microsoft"),
        ],
        "news_00041",
    ),
    # ---- Article 11: CISA / BlueHammer details ----
    ann(
        "CISA added CVE-2026-33825 to the Known Exploited Vulnerabilities catalog on April 22. Federal agencies have until May 7, 2026 to apply patches. The researcher known as Chaotic Eclipse publicly released proof-of-concept code one week prior, protesting MSRC's disclosure process.",
        [
            ("ORGANIZATION", "CISA"),
            ("CVE_ID", "CVE-2026-33825"),
            ("THREAT_ACTOR", "Chaotic Eclipse"),
            ("ORGANIZATION", "MSRC"),
        ],
        "news_00042",
    ),
    ann(
        "Huntress Labs confirmed attackers were exploiting BlueHammer and two related zero-days named RedSun and UnDefend in real attacks, with evidence suggesting hands-on-keyboard threat actor activity and infrastructure linked to Russia.",
        [
            ("ORGANIZATION", "Huntress Labs"),
            ("VULNERABILITY", "BlueHammer"),
            ("VULNERABILITY", "RedSun"),
            ("VULNERABILITY", "UnDefend"),
        ],
        "news_00043",
    ),
    # ---- Article 12: NPM Supply Chain ----
    ann(
        "Security researchers from Socket and StepSecurity identified a self-propagating supply chain attack targeting npm packages from Namastex Labs. The malware automatically spreads through compromised publishing credentials.",
        [
            ("ORGANIZATION", "Socket"),
            ("ORGANIZATION", "StepSecurity"),
            ("ORGANIZATION", "Namastex Labs"),
        ],
        "news_00044",
    ),
    ann(
        "At least 16 Namastex packages were compromised, including @automagik/genie, pgserve, and @fairwords/websocket. The malware functions as a supply-chain worm that searches for npm publishing tokens in environment variables and configuration files.",
        [("ORGANIZATION", "Namastex")],
        "news_00045",
    ),
    ann(
        "The attack collects authentication tokens, API keys, SSH keys, cloud service credentials, CI/CD system credentials, and LLM platform tokens. It also targets browser data from Chrome and Firefox and cryptocurrency wallet extensions including MetaMask, Exodus, Atomic Wallet, and Phantom.",
        [
            ("SYSTEM", "Chrome"),
            ("SYSTEM", "Firefox"),
            ("SYSTEM", "MetaMask"),
            ("SYSTEM", "Exodus"),
            ("SYSTEM", "Atomic Wallet"),
            ("SYSTEM", "Phantom"),
        ],
        "news_00046",
    ),
    ann(
        "If PyPI credentials are found, the malware deploys similar attacks against Python packages using .pth-based payloads. The multi-ecosystem capability makes this one of the most sophisticated supply chain attacks observed in 2026.",
        [("SYSTEM", "PyPI")],
        "news_00047",
    ),
    # ---- Article 13: Kimwolf Botmaster ----
    ann(
        "Brian Krebs's investigation identifies Dort, the operator of the Kimwolf botnet, as Jacob Butler from Ottawa, Canada, born August 2003. Butler used the primary email jay.miner232@gmail.com and secondary email jacobbutler803@gmail.com.",
        [
            ("THREAT_ACTOR", "Dort"),
            ("MALWARE", "Kimwolf"),
            ("EMAIL", "jay.miner232@gmail.com"),
            ("EMAIL", "jacobbutler803@gmail.com"),
        ],
        "news_00048",
    ),
    ann(
        "The investigation traces multiple usernames to Butler including CPacket, M1CE, DortDev, MemeClient, Uubuntuu, and Dorted. Butler was a member of the LAPSUS$ group in 2022 using the alias DortDev.",
        [
            ("THREAT_ACTOR", "CPacket"),
            ("THREAT_ACTOR", "DortDev"),
            ("THREAT_ACTOR", "LAPSUS$"),
        ],
        "news_00049",
    ),
    ann(
        "Butler allegedly worked with accomplice Qoft to create Dortsolver, a CAPTCHA-bypassing tool, and develop disposable email services. They reportedly stole approximately $250,000 in Xbox Game Pass accounts.",
        [("THREAT_ACTOR", "Qoft"), ("TOOL", "Dortsolver")],
        "news_00050",
    ),
    ann(
        "After vulnerability disclosures weakened Kimwolf's spread, Dort coordinated harassment campaigns including DDoS attacks, doxing, email flooding, and swatting threats against security researcher Benjamin Brundage and journalist Brian Krebs.",
        [("MALWARE", "Kimwolf"), ("THREAT_ACTOR", "Dort")],
        "news_00051",
    ),
]


def main(outpath=OUTPUT_PATH):
    """Write ``records`` to *outpath* as JSONL, then run the verification pass."""
    # Write output. The encoding is pinned because ensure_ascii=False lets
    # non-ASCII characters through to the file unescaped.
    with open(outpath, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
    print(f"Wrote {len(records)} records to {outpath}")

    # Final verification pass
    errors = verify_records(records)
    print(f"Verification: {errors} errors found across {len(records)} records")


if __name__ == "__main__":
    main()