messagepack-ormsgpack-trigger-poc / verify_remote_poc.py
hacnho's picture
Upload verify_remote_poc.py with huggingface_hub
5211ef8 verified
Raw
History Blame Contribute Delete
1.66 kB
#!/usr/bin/env python3
from __future__ import annotations
import json
import shutil
import tempfile
import urllib.request
from pathlib import Path
import ormsgpack
# URL prefix that main() joins with each file name in FILES to download it.
BASE = "https://huggingface.co/hacnho/messagepack-ormsgpack-trigger-poc/resolve/main"
# Maps the label used in the report ("control" / "malicious") to the msgpack
# file name that run_dir() loads for that label.
FILES = {
"control": "control.msgpack",
"malicious": "ormsgpack_trigger.msgpack",
}
# Input vectors scored against both files. [4, 2, 7] is the vector run_dir()
# reports as the trigger; the others each differ from it in at least one slot.
PROBES = [[4, 2, 7], [4, 2, 0], [1, 2, 7], [9, 9, 9]]
def infer(path: Path, vec: list[int]) -> float:
    """Score ``vec`` against the record stored in a msgpack file.

    The bytes at ``path`` are decoded with ``ormsgpack.unpackb``. The decoded
    record's ``a``, ``b`` and ``c`` entries, each coerced to ``int``, form a
    three-element trigger list. When ``vec`` equals that list the record's
    ``boost`` entry is returned; otherwise its ``bias`` entry is returned.
    The result is converted to ``float`` in both cases.
    """
    record = ormsgpack.unpackb(path.read_bytes())
    trigger = [int(record[key]) for key in ("a", "b", "c")]
    if vec == trigger:
        return float(record["boost"])
    return float(record["bias"])
def run_dir(base_dir: Path) -> dict:
    """Compare control and malicious scores for every probe vector.

    Each vector in ``PROBES`` is scored with ``infer`` against the control
    file and then the malicious file, both looked up under ``base_dir`` via
    ``FILES``.

    Returns a dict with:
      * ``trigger_vector`` - the fixed vector ``[4, 2, 7]``;
      * ``probes`` - one entry per probe holding the probe and both scores;
      * ``backdoor_observed`` - True when the two scores differ for a probe
        equal to the trigger vector;
      * ``non_trigger_clean`` - True when the two scores agree for every
        probe that is not the trigger vector.
    """
    trigger = [4, 2, 7]
    control_path = base_dir / FILES["control"]
    malicious_path = base_dir / FILES["malicious"]

    results = [
        {
            "probe": probe,
            "control": infer(control_path, probe),
            "malicious": infer(malicious_path, probe),
        }
        for probe in PROBES
    ]

    # Split the rows once so each summary flag only looks at its own subset.
    on_trigger = [entry for entry in results if entry["probe"] == trigger]
    off_trigger = [entry for entry in results if entry["probe"] != trigger]

    scores_differ_on_trigger = any(
        entry["control"] != entry["malicious"] for entry in on_trigger
    )
    scores_match_elsewhere = all(
        entry["control"] == entry["malicious"] for entry in off_trigger
    )

    return {
        "trigger_vector": list(trigger),
        "probes": results,
        "backdoor_observed": scores_differ_on_trigger,
        "non_trigger_clean": scores_match_elsewhere,
    }
def main() -> int:
    """Download the payload files, run the probe comparison, print the report.

    Every file name in ``FILES`` is fetched from under ``BASE`` into a newly
    created temporary directory. The dict returned by ``run_dir`` for that
    directory is printed as indented JSON. The temporary directory is removed
    afterwards, including when a download or the comparison raises.

    Returns 0.
    """
    workdir = Path(
        tempfile.mkdtemp(prefix="messagepack_ormsgpack_trigger_remote_")
    )
    try:
        for filename in FILES.values():
            source_url = f"{BASE}/{filename}"
            urllib.request.urlretrieve(source_url, workdir / filename)
        report = run_dir(workdir)
        print(json.dumps(report, indent=2, ensure_ascii=False))
    finally:
        # Best-effort cleanup: errors while deleting the directory are ignored.
        shutil.rmtree(workdir, ignore_errors=True)
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())