"""Audit label consistency across Arcspan cybersecurity NER data sources."""

import json
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path


# The five entity classes a span label may use; anything else is reported
# as an invalid label by the audit.
VALID_LABELS = {"Malware", "Indicator", "Organization", "System", "Vulnerability"}

# Directory holding the processed JSONL training files.
DATA_DIR = Path("/home/ubuntu/alkyline/data/processed")

# Short source name -> JSONL file to audit.
FILES = {
    "enriched": DATA_DIR / "enriched_5class_train_cleaned.jsonl",
    "aptner": DATA_DIR / "aptner_5class_train.jsonl",
    "securebert2": DATA_DIR / "securebert2_5class_train.jsonl",
    "defanged": DATA_DIR / "defanged_augmented.jsonl",
}

# Lower-cased names the audit expects to be tagged "Malware".  The list mixes
# malware families with threat-group names (e.g. "apt28", "lazarus"); the
# audit flags any of them that carries another label.
KNOWN_MALWARE = {
    "apt28", "apt29", "apt30", "apt32", "apt33", "apt34", "apt37", "apt38", "apt39", "apt40", "apt41",
    "emotet", "wannacry", "trickbot", "cobalt strike", "cobaltstrike", "ryuk", "conti", "revil",
    "sodinokibi", "darkside", "lockbit", "maze", "petya", "notpetya", "stuxnet", "duqu", "flame",
    "regin", "shamoon", "mirai", "qbot", "qakbot", "dridex", "ursnif", "gootkit", "formbook",
    "agent tesla", "remcos", "njrat", "nanocore", "poison ivy", "plugx", "gh0st", "gh0st rat",
    "darkcomet", "zeus", "zloader", "icedid", "bumblebee", "raccoon", "redline", "vidar",
    "asyncrat", "quasar", "havex", "industroyer", "triton", "blackenergy", "energetic bear",
    "lazarus", "kimsuky", "turla", "sofacy", "fancy bear", "cozy bear", "sandworm",
    "hafnium", "nobelium", "fin7", "fin8", "carbanak", "solarwinds", "sunburst",
    "raspberry robin", "bazar", "bazarloader", "bazarbackdoor", "lokibot", "smokeloader",
    "amadey", "xworm", "lumma", "lummastealer", "dcrat", "warzone", "warzone rat",
}

# Lower-cased names the audit expects to be tagged "Organization".
KNOWN_ORGS = {
    "microsoft", "google", "cisco", "apple", "amazon", "facebook", "meta", "ibm", "oracle",
    "intel", "amd", "nvidia", "samsung", "huawei", "kaspersky", "symantec", "mcafee",
    "crowdstrike", "palo alto", "palo alto networks", "fireeye", "mandiant", "sophos",
    "fortinet", "checkpoint", "check point", "trend micro", "eset", "avast", "norton",
    "vmware", "citrix", "adobe", "sap", "salesforce", "dell", "hp", "lenovo",
    "nsa", "fbi", "cisa", "nist", "mitre", "cert", "us-cert",
}

# Whole-string patterns for well-formed identifiers and indicators.
CVE_RE = re.compile(r"^CVE-\d{4}-\d+$", re.IGNORECASE)
# Dotted quad; octet values are not range-checked.
IP_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
# Any run of 32 to 64 hex digits.
HASH_RE = re.compile(r"^[a-fA-F0-9]{32,64}$")
# Prefix match only: anything starting with http:// or https://.
URL_RE = re.compile(r"^https?://", re.IGNORECASE)
# Plain domain with one or two alphabetic suffix labels.  Defined here but not
# referenced by the other code visible in this file.
DOMAIN_RE = re.compile(r"^[a-zA-Z0-9]([a-zA-Z0-9\-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,}(\.[a-zA-Z]{2,})?$")

# "Defanged" variants: dots written as "[.]" and http written as hxxp.
DEFANGED_IP_RE = re.compile(r"^\d{1,3}\[\.\]\d{1,3}\[\.\]\d{1,3}\[\.\]\d{1,3}$")
DEFANGED_URL_RE = re.compile(r"^hxxps?://", re.IGNORECASE)
# Prefix match only (no trailing "$").
DEFANGED_DOMAIN_RE = re.compile(r"^[a-zA-Z0-9]([a-zA-Z0-9\-]*[a-zA-Z0-9])?\[\.\][a-zA-Z]{2,}")


def load_file(path):
    """Load a JSONL file into a list of ``(text, spans, source)`` tuples.

    Each non-blank line is parsed as one JSON object.  ``text`` is required;
    ``spans`` defaults to an empty dict and ``source`` (read from
    ``obj["info"]["source"]``) defaults to ``"unknown"``.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        A list with one ``(text, spans_dict, source)`` tuple per record, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` key.
    """
    rows = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            # "info" may be absent, null, or not an object; only a dict can
            # carry a source name.
            info = obj.get("info")
            source = info.get("source", "unknown") if isinstance(info, dict) else "unknown"
            # Normalise a null "spans" to an empty dict so callers can iterate it.
            rows.append((obj["text"], obj.get("spans") or {}, source))
    return rows


def parse_spans(spans_dict):
    """Yield ``(label, surface_text, offsets)`` for every span occurrence.

    Keys of ``spans_dict`` have the form ``"<label>: <entity text>"`` and map
    to a list of offsets; one tuple is yielded per list element.  Keys
    without the ``": "`` separator are skipped.  Only the first separator
    splits the key, so the entity text may itself contain ``": "``.

    Args:
        spans_dict: Mapping of span key to a list of offsets.  ``None`` or an
            empty mapping yields nothing.

    Yields:
        ``(label, entity_text, offsets)`` tuples, with ``offsets`` passed
        through unchanged.
    """
    if not spans_dict:
        return
    for key, offsets_list in spans_dict.items():
        label, sep, entity_text = key.partition(": ")
        if not sep:
            continue
        # Tolerate a null offsets list instead of failing on iteration.
        for offsets in offsets_list or ():
            yield label, entity_text, offsets


def is_indicator_like(text):
    """Return True if ``text`` looks like a network or file indicator.

    After stripping surrounding whitespace, the text is matched against the
    IP, hash and URL patterns and their defanged counterparts (defanged IP,
    ``hxxp`` URL, ``[.]`` domain).

    Plain, non-defanged domains are NOT checked: ``DOMAIN_RE`` is not
    consulted here.
    NOTE(review): presumably deliberate, since that pattern would also match
    file names such as ``cmd.exe`` -- confirm before adding it.

    Args:
        text: Candidate entity text.

    Returns:
        bool: Whether any of the indicator patterns matches.
    """
    candidate = text.strip()
    patterns = (
        IP_RE,
        HASH_RE,
        URL_RE,
        DEFANGED_IP_RE,
        DEFANGED_URL_RE,
        DEFANGED_DOMAIN_RE,
    )
    return any(pattern.match(candidate) for pattern in patterns)


def main():
    """Run the label-consistency audit and print the report to stdout.

    Every file in ``FILES`` that exists is loaded (missing files are reported
    and skipped).  For each source the function tallies how often each
    lower-cased entity text carries each label, then prints:

    0. labels that are not in ``VALID_LABELS``
    1. entities with more than one label that occur in more than one source
    2. entities with more than one label inside a single source
    3. entities whose label contradicts the known-name lists or the
       CVE / indicator patterns
    4. the 20 most frequent entities per class per source

    followed by a summary of the counts.
    """
    all_data = {}
    for name, path in FILES.items():
        if not path.exists():
            print(f"WARNING: {path} not found, skipping")
            continue
        all_data[name] = load_file(path)
        print(f"Loaded {name}: {len(all_data[name])} examples")

    source_entities, invalid_labels = _collect_entities(all_data)

    print(f"\n{_HEAVY_RULE}")
    print("LABEL CONSISTENCY AUDIT REPORT")
    print(_HEAVY_RULE)

    _report_invalid_labels(invalid_labels)
    cross_conflicts = _report_cross_source_conflicts(source_entities)
    _report_within_source_conflicts(source_entities)
    suspicious = _report_suspicious(source_entities)
    _report_frequencies(source_entities)
    _report_summary(all_data, source_entities, cross_conflicts, suspicious)


# Rules and markers are written as escapes so the file stays pure ASCII and
# the characters cannot be corrupted by an encoding round-trip.
_HEAVY_RULE = "=" * 80
_LIGHT_RULE = "\u2500" * 80  # box-drawing light horizontal line
_WARN = "\u26a0\ufe0f"  # warning sign, emoji presentation


def _print_section(title):
    """Print a numbered section heading framed by light rules."""
    print(f"\n{_LIGHT_RULE}")
    print(title)
    print(_LIGHT_RULE)


def _total_mentions(source_map):
    """Sum every label count across all sources of one entity."""
    return sum(c for counts in source_map.values() for c in counts.values())


def _format_label_set(labels):
    """Render labels like a set literal, but in sorted (reproducible) order."""
    return "{" + ", ".join(repr(label) for label in sorted(labels)) + "}"


def _collect_entities(all_data):
    """Tally label counts per source and per lower-cased entity text.

    Returns ``(source_entities, invalid_labels)`` where ``source_entities``
    maps source -> entity -> Counter of labels and ``invalid_labels`` maps
    source -> list of ``(label, entity_text)`` with a label outside
    ``VALID_LABELS``.
    """
    source_entities = defaultdict(lambda: defaultdict(Counter))
    invalid_labels = defaultdict(list)
    for fname, rows in all_data.items():
        for _text, spans, _source in rows:
            for label, entity_text, _offsets in parse_spans(spans):
                if label not in VALID_LABELS:
                    invalid_labels[fname].append((label, entity_text))
                # Invalid-label spans are still counted, so they also appear
                # in the conflict sections.
                source_entities[fname][entity_text.lower()][label] += 1
    return source_entities, invalid_labels


def _report_invalid_labels(invalid_labels):
    """Print section 0: the 20 most common invalid (label, entity) pairs per source."""
    _print_section("0. INVALID LABELS (not in {Malware, Indicator, Organization, System, Vulnerability})")
    any_invalid = False
    for fname, items in invalid_labels.items():
        if not items:
            continue
        any_invalid = True
        print(f"\n [{fname}] {len(items)} invalid label occurrences:")
        for (lbl, ent), cnt in Counter(items).most_common(20):
            print(f" {lbl}: {ent!r} (x{cnt})")
    if not any_invalid:
        print(" None found.")


def _report_cross_source_conflicts(source_entities):
    """Print section 1 and return the list of cross-source conflicts.

    Each returned item is ``(entity, source_map, labels)`` where
    ``source_map`` maps source -> Counter of labels.
    """
    _print_section("1. CROSS-SOURCE ENTITY LABEL CONFLICTS")

    # Pivot source -> entity -> labels into entity -> source -> labels.
    global_entities = defaultdict(lambda: defaultdict(Counter))
    for fname, ents in source_entities.items():
        for ent_lower, label_counts in ents.items():
            global_entities[ent_lower][fname].update(label_counts)

    cross_conflicts = []
    for ent_lower in sorted(global_entities):
        source_map = global_entities[ent_lower]
        all_labels = set()
        for label_counts in source_map.values():
            all_labels.update(label_counts)
        # A conflict needs more than one label overall AND more than one source.
        if len(all_labels) > 1 and len(source_map) > 1:
            cross_conflicts.append((ent_lower, source_map, all_labels))

    print(f"\n Found {len(cross_conflicts)} entities with conflicting labels across sources.")

    # Most-mentioned first; the sort is stable, so ties stay alphabetical.
    cross_conflicts.sort(key=lambda item: -_total_mentions(item[1]))
    for ent, source_map, labels in cross_conflicts[:60]:
        total = _total_mentions(source_map)
        print(f"\n '{ent}' (total={total}, labels={_format_label_set(labels)}):")
        for src in sorted(source_map):
            print(f" {src}: {dict(source_map[src])}")
    return cross_conflicts


def _report_within_source_conflicts(source_entities):
    """Print section 2: per source, the top 30 entities carrying several labels."""
    _print_section("2. WITHIN-SOURCE LABEL CONFLICTS")
    for fname in sorted(source_entities):
        conflicts = [
            (ent_lower, dict(label_counts))
            for ent_lower, label_counts in source_entities[fname].items()
            if len(label_counts) > 1
        ]
        conflicts.sort(key=lambda item: -sum(item[1].values()))
        print(f"\n [{fname}] {len(conflicts)} entities with multiple labels:")
        for ent, lc in conflicts[:30]:
            print(f" '{ent}': {lc}")


def _report_suspicious(source_entities):
    """Print section 3 and return category -> list of ``(entity, label, source, count)``."""
    _print_section("3. SUSPICIOUS LABEL ASSIGNMENTS")

    suspicious = defaultdict(list)
    for fname, ents in source_entities.items():
        for ent_lower, label_counts in ents.items():
            for label, count in label_counts.items():
                record = (ent_lower, label, fname, count)
                # The checks are independent: one entity may land in several
                # categories.
                if ent_lower in KNOWN_MALWARE and label != "Malware":
                    suspicious["Known malware not tagged Malware"].append(record)
                if CVE_RE.match(ent_lower) and label != "Vulnerability":
                    suspicious["CVE not tagged Vulnerability"].append(record)
                if is_indicator_like(ent_lower) and label != "Indicator":
                    suspicious["IP/URL/hash/domain not tagged Indicator"].append(record)
                if ent_lower in KNOWN_ORGS and label != "Organization":
                    suspicious["Known org not tagged Organization"].append(record)

    for category, items in sorted(suspicious.items()):
        items.sort(key=lambda item: -item[3])
        print(f"\n {category} ({len(items)} cases):")
        for ent, label, src, count in items[:30]:
            print(f" '{ent}' tagged as {label} in {src} (x{count})")

    if not suspicious:
        print(" None found.")
    return suspicious


def _report_frequencies(source_entities):
    """Print section 4: the 20 most frequent entities per class per source."""
    _print_section("4. ENTITY FREQUENCY ANALYSIS (top 20 per class per source)")

    for fname in sorted(source_entities):
        print(f"\n \u2500\u2500 {fname} \u2500\u2500")

        class_counts = defaultdict(Counter)
        for ent_lower, label_counts in source_entities[fname].items():
            for label, count in label_counts.items():
                class_counts[label][ent_lower] += count

        # Only the valid classes are listed here; invalid labels are covered
        # by section 0.
        for label in sorted(VALID_LABELS):
            if label not in class_counts:
                continue
            counts = class_counts[label]
            total_unique = len(counts)
            total_mentions = sum(counts.values())
            print(f"\n {label} ({total_unique} unique, {total_mentions} mentions):")
            for ent, cnt in counts.most_common(20):
                # When several checks fire, the last matching one wins.
                flag = ""
                if label != "Malware" and ent in KNOWN_MALWARE:
                    flag = f" {_WARN} MALWARE?"
                if label != "Organization" and ent in KNOWN_ORGS:
                    flag = f" {_WARN} ORG?"
                if label != "Indicator" and is_indicator_like(ent):
                    flag = f" {_WARN} INDICATOR?"
                if label != "Vulnerability" and CVE_RE.match(ent):
                    flag = f" {_WARN} CVE?"
                print(f" {cnt:5d} {ent}{flag}")


def _report_summary(all_data, source_entities, cross_conflicts, suspicious):
    """Print the closing summary of file, conflict and suspicious-label counts."""
    print(f"\n{_HEAVY_RULE}")
    print("SUMMARY")
    print(_HEAVY_RULE)
    print(f" Files analyzed: {len(all_data)}")
    print(f" Cross-source conflicts: {len(cross_conflicts)}")
    for fname in sorted(source_entities):
        n = sum(1 for lc in source_entities[fname].values() if len(lc) > 1)
        print(f" Within-source conflicts [{fname}]: {n}")
    total_suspicious = sum(len(v) for v in suspicious.values())
    print(f" Suspicious assignments: {total_suspicious}")
    for cat, items in sorted(suspicious.items()):
        print(f" {cat}: {len(items)}")


# Run the audit only when executed as a script, not when imported.
if __name__ == "__main__":
    main()