File size: 7,298 Bytes

4a9a4d9

#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""Multi-class folded detector — end-to-end inference example.

Three-artefact collaboration. This script:

1. Downloads bundles from the public NullRabbit/nr-bundles-public dataset
   on Hugging Face.
2. Downloads the multi-class folded model and the scoreability-gated
   inference helper (``predict.py``) from this repository.
3. Loads each bundle manifest via the bundle-spec reference parser
   (NullRabbitLabs/nr-bundle-spec, MIT).
4. Calls ``predict.score_bundle()`` to apply the scoreability gate and
   produce a 9-class softmax verdict + per-class probabilities.

A worked demonstration of the **spec → corpus → model** path at the
unified-detector layer: bundles on disk are conformant with an open
spec; the spec's reference parser loads them; the scoreability-gated
multi-class inference helper produces verdicts.

Dependencies::

    pip install huggingface_hub pyarrow scikit-learn joblib numpy
    pip install git+https://github.com/NullRabbitLabs/nr-bundle-spec.git

Usage::

    python inference_example.py

Four bundles are scored: V8 / V13 / V14 attack bundles plus one benign
passive bundle that exercises the scoreability gate. V11 and V16
(gossip-abuse) demonstrations are not possible from the public dataset
because the public bundles drop raw ``packets.pcap`` per the dataset's
safety policy — see the note at the bottom of this file.
"""

from __future__ import annotations

import importlib.util
import sys
from pathlib import Path

from huggingface_hub import hf_hub_download, snapshot_download


# ─── Constants ──────────────────────────────────────────────────────

# Hugging Face model repo; main() fetches model.joblib and predict.py from it.
MODEL_REPO = "NullRabbit/multiclass-folded"
# Hugging Face dataset repo; main() downloads the sample bundles from it.
DATASET_REPO = "NullRabbit/nr-bundles-public"

# Sample bundles drawn from nr-bundles-public. Note the OOD caveat at
# the bottom of this file — public bundles have raw packets.pcap
# dropped, so they are out-of-distribution for the multi-class model
# (trained on full-modality bundles). The three V8 / V13 / V14 attack
# bundles below have wire shapes that the model discriminates cleanly
# even without pcap; the fourth sample is a benign passive bundle that
# exercises the scoreability gate. V11 / benign-with-traffic / V16
# demonstrations require raw pcap and are not available from the public
# dataset.
# Each entry is a (corpus_id, expected_primitive, label) triple; main()
# unpacks the entries in exactly that order.
SAMPLES = [
    (
        "crp_19d438471fec4229",
        "sui_F10_multi_get_objects_amp",
        "V8 (response_amp, Sui) — survives pcap-drop",
    ),
    (
        "crp_2a9d40758d9a4192",
        "SOL_MC_grafana_anon",
        "V13 (service_misconfig, Solana) — survives pcap-drop",
    ),
    (
        "crp_1ef98f1fc0644369",
        "sui_F14_devinspect_tokio_wedge",
        "V14 (compute_amp, Sui) — survives pcap-drop",
    ),
    (
        "crp_0598afb4d5e44fb9",
        "sui_BENIGN_passive_fullnode",
        "benign passive (Sui) — tests scoreability gate",
    ),
]


def _load_module(name: str, path: str) -> "object":
    """Import the Python source file at *path* as a module named *name*.

    The module object is registered in ``sys.modules`` under *name*
    before its body is executed, then returned once execution succeeds.

    Args:
        name: Key under which the module is registered in ``sys.modules``.
        path: Filesystem location of the source file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or no loader) can be built for
            *path*, e.g. because the file suffix is not a recognised
            Python source/extension suffix.
        BaseException: Anything raised while executing the module body
            propagates unchanged; the partially initialised module is
            removed from ``sys.modules`` first.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location() returns None when it cannot pick a loader;
    # fail with a clear ImportError instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"cannot build an import spec for {str(path)!r}",
            name=name,
            path=str(path),
        )

    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later imports.
        if sys.modules.get(name) is module:
            del sys.modules[name]
        raise
    return module


def _print_record(corpus_id: str, expected_primitive: str, label: str,
                  record: dict) -> None:
    """Print the verdict section for one scored bundle.

    Args:
        corpus_id: Bundle identifier shown in the section header.
        expected_primitive: Primitive name shown next to the identifier.
        label: Human-readable description of the expected outcome.
        record: Mapping returned by ``predict.score_bundle()``. Keys read
            here: ``verdict`` always; ``reason`` (and optionally
            ``n_responses_rows``) when the verdict is ``"unscoreable"``;
            otherwise ``argmax_p``, ``feature_coverage``,
            ``n_responses_rows``, ``class_probs`` and, optionally,
            ``coverage_warning``.
    """
    print(f"--- {corpus_id} ({expected_primitive}) ---")
    print(f"  expected:         {label}")
    print(f"  verdict:          {record['verdict']}")

    if record["verdict"] == "unscoreable":
        print(f"  reason:           {record['reason']}")
        print(f"  n_responses_rows: {record.get('n_responses_rows', 0)}")
        return

    print(f"  argmax P:         {record['argmax_p']:.4f}")
    print(f"  feature_coverage: {record['feature_coverage']}")
    print(f"  n_responses_rows: {record['n_responses_rows']}")
    print("  top-3 class probabilities:")
    # Highest probability first; only the three leading classes are shown.
    top3 = sorted(record["class_probs"].items(),
                  key=lambda kv: -kv[1])[:3]
    for cls, p in top3:
        print(f"    P({cls}) = {p:.4f}")
    if record.get("coverage_warning"):
        print(f"  ⚠ coverage_warning: {record['coverage_warning']}")


def _print_notes() -> None:
    """Print the closing deployment notes, framed by separator rules."""
    print("=" * 72)
    print("Notes on multi-class folded deployment")
    print("=" * 72)
    print("""
- predict.score_bundle() is the recommended consumption surface. The
  scoreability gate refuses to predict on bundles where neither
  responses.parquet nor packets.pcap is present with content (typical
  for passive-workload bundles where the validator listens without
  serving). Callers who want raw model output without the gate should
  load model.joblib directly via joblib.load.

- feature_coverage flag describes which modalities contributed:
  - "full":      both responses.parquet and packets.pcap present
  - "resp_only": responses.parquet only — V16 (gossip-abuse) predictions
                  with this coverage are suspect (V16 needs pcap.*)
  - "pcap_only": packets.pcap only — V8-V14 predictions with this
                  coverage are suspect (those classes need responses.*)
  - "none":      bundle is unscoreable; gate refused

- Public dataset bundles drop raw packets.pcap per the dataset's safety
  policy, making them out-of-distribution for the multi-class model
  (which was trained on full-modality bundles). Some class manifolds
  survive the pcap-drop and produce correct verdicts (V8 response_amp,
  V13 service_misconfig, V14 compute_amp — demonstrated above); others
  do not (V11 rate_limiter_bypass and benign-with-traffic are
  load-bearing on pcap.* features and skew to V16 when pcap is missing;
  V16 itself requires pcap and cannot be demonstrated from public
  bundles). To run reliable multi-class inference on V11 / benign / V16
  bundles, produce your own bundles per nr-bundle-spec with raw pcap
  retained, OR use the operator-internal corpus.

- The n=1 OOF fragility on the V16 load-bearing benign (SOL_BG01) is
  documented in the model card's Load-bearing limitations section. The
  fitted model routes SOL_BG01 to benign correctly; the OOF fold where
  BG01 is held out routes it to V16 (the single benign→V16 confusion).
  Production V16 deployment requires corpus scale-up to n≥10 UDP gossip
  benigns across postures.
""".strip())
    print("=" * 72)


def main() -> int:
    """Download the model and sample bundles, score each, and print results.

    Steps: fetch ``model.joblib`` and ``predict.py`` from ``MODEL_REPO``,
    import the helper, load the model, download the ``SAMPLES`` bundle
    directories from ``DATASET_REPO``, score each bundle with
    ``predict.score_bundle()``, then print the deployment notes.

    Returns:
        Process exit status: ``0`` when every sample bundle directory was
        present in the snapshot and scored, ``1`` when at least one was
        missing.
    """
    print("=== Multi-class softmax folded detector ===")
    print(f"  model repo:   {MODEL_REPO}")
    print(f"  dataset repo: {DATASET_REPO}")
    print()

    # Pull model + predict helper.
    # NOTE(security): predict.py is executed as code, and model.joblib is
    # presumably unpickled by predict.load_model (joblib) — both come from
    # MODEL_REPO at an unpinned revision. Only run this against a repo you
    # trust, or pin a commit via hf_hub_download(..., revision=...).
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.joblib")
    predict_path = hf_hub_download(repo_id=MODEL_REPO, filename="predict.py")

    predict = _load_module("multiclass_predict", predict_path)
    payload = predict.load_model(model_path)

    print(f"Model loaded: {type(payload['model']).__name__}, "
          f"{len(payload['feature_names'])} features, "
          f"{len(payload['class_order'])} classes "
          f"({payload['class_order']})")
    print()

    # Pull sample bundles (only the directories listed in SAMPLES).
    dataset_root = Path(snapshot_download(
        repo_id=DATASET_REPO, repo_type="dataset",
        allow_patterns=[f"{cid}/*" for cid, _, _ in SAMPLES],
    ))

    # Score each.
    missing = []
    for corpus_id, expected_primitive, label in SAMPLES:
        bundle_dir = dataset_root / corpus_id
        if not bundle_dir.is_dir():
            # A bundle that was never downloaded must not be handed to the
            # scoreability gate: an "unscoreable" verdict would then look
            # like a genuine gate decision rather than a download problem.
            print(f"error: bundle {corpus_id} ({expected_primitive}) is "
                  f"missing from the dataset snapshot: {bundle_dir}",
                  file=sys.stderr)
            missing.append(corpus_id)
            continue
        record = predict.score_bundle(bundle_dir, payload)
        _print_record(corpus_id, expected_primitive, label, record)
        print()

    _print_notes()
    return 1 if missing else 0


# Script entry point: run main() and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    sys.exit(main())