Spaces:
Sleeping
Sleeping
Gokul Soumya
committed on
Commit
·
8972ad7
0
Parent(s):
feat: Implement binary shield as a library
Browse files- README.md +3 -0
- pyproject.toml +18 -0
- src/binary_shield/__init__.py +3 -0
- src/binary_shield/comparison.py +9 -0
- src/binary_shield/embedding.py +6 -0
- src/binary_shield/privacy.py +13 -0
- src/binary_shield/py.typed +0 -0
- src/binary_shield/quantization.py +12 -0
- src/binary_shield/shield.py +54 -0
README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Implementation of the paper "Cross-Service Threat Intelligence in LLM Services using Privacy-Preserving Fingerprints".
|
| 2 |
+
|
| 3 |
+
https://arxiv.org/abs/2509.05608v1
|
pyproject.toml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "binary-shield"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Privacy-preserving binary fingerprints for cross-service threat intelligence in LLM services"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
authors = [
|
| 7 |
+
{ name = "Gokul Soumya", email = "gokul@cyvia.ai" }
|
| 8 |
+
]
|
| 9 |
+
requires-python = ">=3.13"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"numpy>=2.4.0",
|
| 12 |
+
"sentence-transformers>=5.2.0",
|
| 13 |
+
"torch>=2.9.1",
|
| 14 |
+
]
|
| 15 |
+
|
| 16 |
+
[build-system]
|
| 17 |
+
requires = ["uv_build>=0.9.12,<0.10.0"]
|
| 18 |
+
build-backend = "uv_build"
|
src/binary_shield/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from binary_shield.shield import BinaryShield, BinaryFingerprint, ComparisonResult
|
| 2 |
+
|
| 3 |
+
__all__ = ["BinaryShield", "BinaryFingerprint", "ComparisonResult"]
|
src/binary_shield/comparison.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def hamming_distance(bits1: np.ndarray, bits2: np.ndarray) -> int:
    """Count the positions at which two arrays hold different values.

    Args:
        bits1: First array.
        bits2: Second array, compared element-wise against ``bits1``.

    Returns:
        The number of differing elements, as a plain Python ``int``.
    """
    # Element-wise inequality mask; every True entry is one mismatch.
    mismatches = bits1 != bits2
    total = np.count_nonzero(mismatches)
    return int(total)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def compute_similarity(bits1: np.ndarray, bits2: np.ndarray) -> float:
    """Return the normalized Hamming similarity of two binary fingerprints.

    The similarity is ``1 - d / n`` where ``d`` is the number of positions at
    which the arrays differ and ``n`` is the number of elements. Identical
    fingerprints score ``1.0``; fingerprints that differ everywhere score
    ``0.0``.

    Every array element is treated as one bit, so bit-packed arrays (e.g. the
    output of ``np.packbits``) must be unpacked before calling this function.

    NOTE(review): this is the plain normalized Hamming similarity; confirm it
    is the metric (and threshold scale) intended by the reference paper.

    Args:
        bits1: First fingerprint.
        bits2: Second fingerprint; must have the same shape as ``bits1``.

    Returns:
        A Python ``float`` in the closed interval ``[0.0, 1.0]``.

    Raises:
        ValueError: If the shapes differ or the fingerprints are empty.
    """
    first = np.asarray(bits1)
    second = np.asarray(bits2)

    # Refuse mismatched shapes explicitly: NumPy broadcasting could otherwise
    # produce a meaningless distance instead of an error.
    if first.shape != second.shape:
        raise ValueError(
            f"fingerprint shapes differ: {first.shape} vs {second.shape}"
        )
    if first.size == 0:
        raise ValueError("cannot compute similarity of empty fingerprints")

    differing = int(np.count_nonzero(first != second))
    return 1.0 - differing / first.size
|
src/binary_shield/embedding.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def extract_embedding(text: str, model: SentenceTransformer) -> np.ndarray:
    """Encode ``text`` with ``model`` and return the result.

    Args:
        text: The input string to embed.
        model: The sentence-transformers model whose ``encode`` method is used.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_numpy=True``.
    """
    vector = model.encode(text, convert_to_numpy=True)
    return vector
|
src/binary_shield/privacy.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def apply_randomized_response(bits: np.ndarray, epsilon: float) -> np.ndarray:
    """Independently keep or flip each bit with an epsilon-dependent probability.

    Each element of ``bits`` is kept with probability
    ``e^epsilon / (1 + e^epsilon)`` (the logistic sigmoid of ``epsilon``) and
    replaced by ``1 - bit`` otherwise. ``epsilon = 0`` keeps each bit with
    probability 0.5; larger values keep more bits.

    Randomness comes from NumPy's global legacy generator
    (``np.random.random``), so results follow ``np.random.seed``.
    NOTE(review): that generator is not cryptographically secure; confirm it
    is acceptable for the privacy guarantee this function is meant to provide.

    Args:
        bits: Array of 0/1 (or boolean) values. The input is not modified.
        epsilon: Privacy parameter; must be a non-negative number.

    Returns:
        A new array with the same shape and dtype as ``bits``.

    Raises:
        ValueError: If ``epsilon`` is negative or NaN.
    """
    # `not (epsilon >= 0)` rather than `epsilon < 0` so that NaN is rejected too.
    if not epsilon >= 0:
        raise ValueError(f"epsilon must be a non-negative number, got {epsilon!r}")

    # Sigmoid written as 1 / (1 + e^-eps). The equivalent e^eps / (1 + e^eps)
    # overflows to inf / inf = NaN for large epsilon, which makes every
    # comparison below False and therefore flips *all* bits.
    keep_prob = 1.0 / (1.0 + np.exp(-epsilon))

    # Generate random decisions for each bit.
    keep_mask = np.random.random(bits.shape) < keep_prob

    # For bits we don't keep, flip them.
    noisy_bits = np.where(keep_mask, bits, 1 - bits)

    # `1 - bits` promotes boolean input to an integer dtype; cast back so the
    # output dtype does not depend on whether noise was applied.
    return noisy_bits.astype(bits.dtype, copy=False)
|
src/binary_shield/py.typed
ADDED
|
File without changes
|
src/binary_shield/quantization.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
# Alias for a NumPy array annotated with a one-int shape tuple and uint8 dtype.
# NOTE(review): the name suggests bit-packed storage (np.packbits-style), but
# binary_quantize in this module currently returns the unpacked comparison
# result — confirm which representation consumers of this alias expect.
type BinaryPackedEmbedding = np.ndarray[tuple[int], np.dtype[np.uint8]]
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def binary_quantize(embedding: np.ndarray) -> np.ndarray:
    """Binarize an embedding by sign.

    Args:
        embedding: Numeric embedding array.

    Returns:
        The element-wise result of ``embedding > 0``: True where a component
        is strictly positive, False where it is zero or negative. The result
        is left unpacked (one element per component, no ``np.packbits``).
    """
    # TODO: [1] mentions that quantization can also be done by the model
    # during encoding. Need to test whether that is faster.
    # [1]: https://www.sbert.net/examples/sentence_transformer/applications/embedding-quantization/README.html#binary-quantization-in-sentence-transformers
    return embedding > 0
|
src/binary_shield/shield.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
|
| 4 |
+
from binary_shield.comparison import compute_similarity, hamming_distance
|
| 5 |
+
from binary_shield.embedding import extract_embedding
|
| 6 |
+
from binary_shield.privacy import apply_randomized_response
|
| 7 |
+
from binary_shield.quantization import BinaryPackedEmbedding, binary_quantize
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class BinaryFingerprint:
    """A binary fingerprint together with the privacy setting used to make it."""

    # The fingerprint bits, as produced by BinaryShield.generate_fingerprint.
    fingerprint: BinaryPackedEmbedding

    # Epsilon passed to the randomized-response step, or None when that step
    # was skipped.
    epsilon: float | None
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
class ComparisonResult:
    """Outcome of comparing two fingerprints with ``BinaryShield.compare``."""

    # Value returned by the hamming_distance helper for the two fingerprints.
    hamming_distance: int

    # Value returned by the compute_similarity helper for the two fingerprints.
    similarity: float

    # True when ``similarity`` is greater than or equal to the threshold given
    # to ``compare``.
    is_match: bool
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class BinaryShield:
    """Generate binary fingerprints from text and compare them.

    A fingerprint is built in up to three steps: embed the text with a
    sentence-transformers model, quantize the embedding with
    ``binary_quantize``, and, when ``epsilon`` is set, perturb the result
    with ``apply_randomized_response``.
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        epsilon: float | None = None,
    ) -> None:
        """Construct the embedding model and remember the privacy setting.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
            epsilon: Value forwarded to ``apply_randomized_response``;
                ``None`` disables that step.
        """
        self.model = SentenceTransformer(model_name)
        self.epsilon = epsilon

    def generate_fingerprint(self, text: str) -> BinaryFingerprint:
        """Build the fingerprint for ``text``.

        Args:
            text: The input string to fingerprint.

        Returns:
            A ``BinaryFingerprint`` carrying the resulting bits and the
            ``epsilon`` this instance was configured with.
        """
        quantized = binary_quantize(extract_embedding(text, self.model))

        # Without an epsilon the quantized bits are used as-is.
        if self.epsilon is None:
            return BinaryFingerprint(fingerprint=quantized, epsilon=self.epsilon)

        noisy = apply_randomized_response(quantized, self.epsilon)
        return BinaryFingerprint(fingerprint=noisy, epsilon=self.epsilon)

    @staticmethod
    def compare(
        fp1: BinaryFingerprint,
        fp2: BinaryFingerprint,
        threshold: float = 0.8,
    ) -> ComparisonResult:
        """Compare two fingerprints.

        Args:
            fp1: First fingerprint.
            fp2: Second fingerprint.
            threshold: Minimum similarity (inclusive) for the pair to be
                reported as a match.

        Returns:
            A ``ComparisonResult`` holding the Hamming distance, the value
            returned by ``compute_similarity``, and the match decision.
        """
        bits_a, bits_b = fp1.fingerprint, fp2.fingerprint
        distance = hamming_distance(bits_a, bits_b)
        score = compute_similarity(bits_a, bits_b)
        matched = score >= threshold
        return ComparisonResult(
            hamming_distance=distance,
            similarity=score,
            is_match=matched,
        )
|