Gokul Soumya commited on
Commit
8972ad7
·
0 Parent(s):

feat: Implement binary shield as a library

Browse files
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Implementation of the paper "Cross-Service Threat Intelligence in LLM Services using Privacy-Preserving Fingerprints".
2
+
3
+ https://arxiv.org/abs/2509.05608v1
pyproject.toml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "binary-shield"
3
+ version = "0.1.0"
4
+ description = "Privacy-preserving binary fingerprints for cross-service LLM threat intelligence"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Gokul Soumya", email = "gokul@cyvia.ai" }
8
+ ]
9
+ requires-python = ">=3.13"
10
+ dependencies = [
11
+ "numpy>=2.4.0",
12
+ "sentence-transformers>=5.2.0",
13
+ "torch>=2.9.1",
14
+ ]
15
+
16
+ [build-system]
17
+ requires = ["uv_build>=0.9.12,<0.10.0"]
18
+ build-backend = "uv_build"
src/binary_shield/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from binary_shield.shield import BinaryShield, BinaryFingerprint, ComparisonResult
2
+
3
+ __all__ = ["BinaryShield", "BinaryFingerprint", "ComparisonResult"]
src/binary_shield/comparison.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
def hamming_distance(bits1: np.ndarray, bits2: np.ndarray) -> int:
    """Return the number of positions at which the two bit arrays differ."""
    mismatches = np.not_equal(bits1, bits2)
    return int(mismatches.sum())
6
+
7
+
8
def compute_similarity(bits1: np.ndarray, bits2: np.ndarray) -> float:
    """Return the similarity of two binary fingerprints as a value in [0, 1].

    Similarity is the fraction of bit positions at which the two arrays
    agree, i.e. 1 - (hamming distance / total bits). This matches the
    0..1 ``threshold`` used by ``BinaryShield.compare``.

    Raises:
        ValueError: if the two arrays have different shapes.
    """
    if bits1.shape != bits2.shape:
        raise ValueError("fingerprints must have the same shape")
    # Two empty fingerprints are trivially identical; avoid 0/0 -> nan.
    if bits1.size == 0:
        return 1.0
    return float(np.count_nonzero(bits1 == bits2) / bits1.size)
src/binary_shield/embedding.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sentence_transformers import SentenceTransformer
3
+
4
+
5
def extract_embedding(text: str, model: SentenceTransformer) -> np.ndarray:
    """Encode ``text`` into a dense embedding vector with the given model."""
    embedding = model.encode(text, convert_to_numpy=True)
    return embedding
src/binary_shield/privacy.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
+ def apply_randomized_response(bits: np.ndarray, epsilon: float) -> np.ndarray:
5
+ keep_prob = np.exp(epsilon) / (1 + np.exp(epsilon)) # sigmoid fn
6
+
7
+ # Generate random decisions for each bit
8
+ keep_mask = np.random.random(bits.shape) < keep_prob
9
+
10
+ # For bits we don't keep, flip them
11
+ noisy_bits = np.where(keep_mask, bits, 1 - bits)
12
+
13
+ return noisy_bits
src/binary_shield/py.typed ADDED
File without changes
src/binary_shield/quantization.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ type BinaryPackedEmbedding = np.ndarray[tuple[int], np.dtype[np.uint8]]
4
+
5
+
6
def binary_quantize(embedding: np.ndarray) -> np.ndarray:
    """Quantize a float embedding to a boolean array by thresholding at zero."""
    # TODO: [1] mentions that quantization can also be done by the model
    # during encoding. Need to test whether that is faster.
    # [1]: https://www.sbert.net/examples/sentence_transformer/applications/embedding-quantization/README.html#binary-quantization-in-sentence-transformers
    # NOTE: a bit-packed representation (np.packbits) is a possible
    # future space optimization.
    return np.greater(embedding, 0)
src/binary_shield/shield.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from dataclasses import dataclass
3
+
4
+ from binary_shield.comparison import compute_similarity, hamming_distance
5
+ from binary_shield.embedding import extract_embedding
6
+ from binary_shield.privacy import apply_randomized_response
7
+ from binary_shield.quantization import BinaryPackedEmbedding, binary_quantize
8
+
9
+
10
@dataclass
class BinaryFingerprint:
    """A privacy-preserving binary fingerprint of a piece of text."""

    # Quantized embedding bits produced by binary_quantize (optionally
    # noised by randomized response when epsilon is set).
    fingerprint: BinaryPackedEmbedding
    # Privacy budget used when generating this fingerprint; None means
    # no randomized-response noise was applied.
    epsilon: float | None
14
+
15
+
16
@dataclass
class ComparisonResult:
    """Outcome of comparing two binary fingerprints."""

    # Number of bit positions at which the two fingerprints differ.
    hamming_distance: int
    # Similarity score as computed by comparison.compute_similarity.
    similarity: float
    # True when similarity meets or exceeds the comparison threshold.
    is_match: bool
21
+
22
+
23
class BinaryShield:
    """Generates and compares privacy-preserving binary text fingerprints."""

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        epsilon: float | None = None,
    ) -> None:
        """Load the sentence-transformer model and store the privacy budget.

        epsilon controls randomized-response noise on fingerprints;
        None disables the noise entirely.
        """
        self.model = SentenceTransformer(model_name)
        self.epsilon = epsilon

    def generate_fingerprint(self, text: str) -> BinaryFingerprint:
        """Embed ``text``, binarize it, and optionally apply privacy noise."""
        fingerprint_bits = binary_quantize(extract_embedding(text, self.model))
        if self.epsilon is not None:
            fingerprint_bits = apply_randomized_response(
                fingerprint_bits, self.epsilon
            )
        return BinaryFingerprint(fingerprint=fingerprint_bits, epsilon=self.epsilon)

    @staticmethod
    def compare(
        fp1: BinaryFingerprint,
        fp2: BinaryFingerprint,
        threshold: float = 0.8,
    ) -> ComparisonResult:
        """Compare two fingerprints; a match is similarity >= threshold."""
        similarity = compute_similarity(fp1.fingerprint, fp2.fingerprint)
        return ComparisonResult(
            hamming_distance=hamming_distance(fp1.fingerprint, fp2.fingerprint),
            similarity=similarity,
            is_match=similarity >= threshold,
        )