Spaces:

broadfield-dev
/

Equivariant-Encryption-Client

Paused

App Files Files Community

broadfield-dev committed on Feb 25

Commit

07ee289

verified ·

1 Parent(s): aa316bb

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -26

app.py CHANGED Viewed

@@ -8,27 +8,32 @@ from huggingface_hub import hf_hub_download
 app = Flask(__name__)
-# Cache tokenizer + embed layer so repeated requests don't reload
 _cache = {}
 def get_sigma(hidden_size: int, seed: int):
-    """Derive encryption permutation from secret seed."""
     rng = np.random.default_rng(seed)
     return rng.permutation(hidden_size)
 def load_client_components(ee_model_name: str):
     """
-    Load (and cache) only what the client needs:
-      - tokenizer from the EE model
-      - embedding layer from the ORIGINAL model (just embed_tokens, not the full LLM)
-      - hidden_size from ee_config
     """
     if ee_model_name in _cache:
         return _cache[ee_model_name]
-    # 1. Read EE config to get hidden_size + original model name
     config_path = hf_hub_download(ee_model_name, "ee_config.json")
     with open(config_path) as f:
         ee_config = json.load(f)
@@ -36,21 +41,18 @@ def load_client_components(ee_model_name: str):
     hidden_size = ee_config["hidden_size"]
     original_model_name = ee_config["original_model"]
-    # 2. Tokenizer from the EE model
     tokenizer = AutoTokenizer.from_pretrained(ee_model_name, trust_remote_code=True)
-    # 3. Load ONLY the original model's embed_tokens layer — we don't need the full LLM,
-    #    but HF doesn't support partial loading so we load it fully then discard the rest.
-    #    float32 on CPU is fine — we're only doing one embedding lookup, no generation.
     original_model = AutoModelForCausalLM.from_pretrained(
         original_model_name,
-        torch_dtype=torch.float32,
         device_map="cpu",
         trust_remote_code=True,
     )
     embed_layer = original_model.model.embed_tokens
     embed_layer.eval()
-    del original_model  # free memory — we only need the embed layer
     _cache[ee_model_name] = (tokenizer, embed_layer, hidden_size)
     return tokenizer, embed_layer, hidden_size
@@ -73,23 +75,25 @@ def index():
         try:
             tokenizer, embed_layer, hidden_size = load_client_components(ee_model_name)
-            # Derive encryption permutation from secret seed
-            sigma = get_sigma(hidden_size, ee_seed)
-            # Tokenize
             inputs = tokenizer(prompt, return_tensors="pt")
-            # Compute plain embeddings from original model's embed layer
             with torch.no_grad():
-                normal_embeds = embed_layer(inputs.input_ids)  # (1, seq_len, hidden)
-            # Encrypt: permute hidden dimension with secret key
-            # Server sees only scrambled vectors — can't recover original prompt
-            encrypted_embeds = normal_embeds[..., sigma]  # (1, seq_len, hidden)
-            # Cast to float16 to match server model dtype
             encrypted_embeds = encrypted_embeds.to(torch.float16)
             payload = {
                 "encrypted_embeds": encrypted_embeds.tolist(),
                 "attention_mask": inputs.attention_mask.tolist(),
@@ -102,13 +106,20 @@ def index():
                 timeout=300,
             )
-            # Surface the server's error body if it returns non-2xx
             if not resp.ok:
                 raise RuntimeError(
-                    f"Server returned {resp.status_code}: {resp.text[:500]}"
                 )
-            gen_ids = resp.json()["generated_ids"]
             result = tokenizer.decode(gen_ids, skip_special_tokens=True)
         except RuntimeError as e:

 app = Flask(__name__)
+# Cache per EE model name so repeated requests don't re-download
 _cache = {}
 def get_sigma(hidden_size: int, seed: int):
+    """Derive the hidden-dimension permutation from the secret seed."""
     rng = np.random.default_rng(seed)
     return rng.permutation(hidden_size)
 def load_client_components(ee_model_name: str):
     """
+    Load and cache everything the client needs:
+      - ee_config  → hidden_size + original model name
+      - tokenizer  → from the EE model
+      - embed_layer → from the ORIGINAL (unmodified) model
+    Why we need the original embed layer:
+      The EE model's weights were permuted with sigma, but its embedding table was
+      NOT permuted (it maps token IDs → plain vectors). The client must embed with
+      the original model and then apply sigma to produce the scrambled vectors the
+      EE model expects.
     """
     if ee_model_name in _cache:
         return _cache[ee_model_name]
     config_path = hf_hub_download(ee_model_name, "ee_config.json")
     with open(config_path) as f:
         ee_config = json.load(f)
     hidden_size = ee_config["hidden_size"]
     original_model_name = ee_config["original_model"]
     tokenizer = AutoTokenizer.from_pretrained(ee_model_name, trust_remote_code=True)
+    # We only need embed_tokens — load the full model then discard everything else
     original_model = AutoModelForCausalLM.from_pretrained(
         original_model_name,
+        torch_dtype=torch.float32,  # float32 for precision on CPU
         device_map="cpu",
         trust_remote_code=True,
     )
     embed_layer = original_model.model.embed_tokens
     embed_layer.eval()
+    del original_model  # free RAM — we only keep the embed layer
     _cache[ee_model_name] = (tokenizer, embed_layer, hidden_size)
     return tokenizer, embed_layer, hidden_size
         try:
             tokenizer, embed_layer, hidden_size = load_client_components(ee_model_name)
+            # --- Step 1: tokenize ---
             inputs = tokenizer(prompt, return_tensors="pt")
+            input_ids = inputs.input_ids  # (1, seq_len)
+            # --- Step 2: embed with ORIGINAL model's embed layer ---
             with torch.no_grad():
+                plain_embeds = embed_layer(input_ids)  # (1, seq_len, hidden)
+            # --- Step 3: ENCRYPT — permute hidden dim with secret sigma ---
+            # The EE model's weight matrices were pre-permuted with sigma,
+            # so feeding sigma-permuted embeddings is equivalent to feeding
+            # plain embeddings to the original model.
+            sigma = get_sigma(hidden_size, ee_seed)
+            encrypted_embeds = plain_embeds[..., sigma]  # (1, seq_len, hidden)
+            # Match server model dtype (float16)
             encrypted_embeds = encrypted_embeds.to(torch.float16)
+            # --- Step 4: send to server ---
             payload = {
                 "encrypted_embeds": encrypted_embeds.tolist(),
                 "attention_mask": inputs.attention_mask.tolist(),
                 timeout=300,
             )
             if not resp.ok:
                 raise RuntimeError(
+                    f"Server {resp.status_code}: {resp.text[:600]}"
                 )
+            body = resp.json()
+            if "error" in body:
+                raise RuntimeError(f"Server error: {body['error']}\n{body.get('traceback','')}")
+            # --- Step 5: decode ---
+            # No decryption needed on the output — the EE model's lm_head was
+            # also permuted so output logits correctly map to the real vocabulary.
+            # Special tokens are skipped during decoding; a prompt echo, if present, is NOT stripped by this decode call.
+            gen_ids = body["generated_ids"]
             result = tokenizer.decode(gen_ids, skip_special_tokens=True)
         except RuntimeError as e: