Spaces:

AbstractPhil
/

bert-beatrix-2048-testing

Sleeping

App Files Files Community

AbstractPhil committed on Jun 2, 2025

Commit

8a2e372

verified ·

1 Parent(s): 323ce30

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -40

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py
 from bert_handler import create_handler_from_checkpoint
 import torch
@@ -6,40 +6,48 @@ import gradio as gr
 import re
 from pathlib import Path
 import spaces
-@spaces.GPU
-def mask_and_predict(text: str, selected_roles: list[str]):
-    MASK_TOKEN = tokenizer.mask_token or "[MASK]"
-    results = []
-    masked_text = text
-    token_ids = tokenizer.encode(text, return_tensors="pt").cuda()
-    for role in selected_roles:
-        role_pattern = re.escape(role)
-        masked_text = re.sub(role_pattern, MASK_TOKEN, masked_text)
-    masked_ids = tokenizer.encode(masked_text, return_tensors="pt").cuda()
     with torch.no_grad():
-        outputs = model(input_ids=masked_ids)
-        logits = outputs.logits[0]
-        predictions = torch.argmax(logits, dim=-1)
-    original_ids = tokenizer.convert_ids_to_tokens(token_ids[0])
-    predicted_ids = tokenizer.convert_ids_to_tokens(predictions)
-    masked_ids_tokens = tokenizer.convert_ids_to_tokens(masked_ids[0])
-    for i, token in enumerate(masked_ids_tokens):
-        if token == MASK_TOKEN:
-            results.append({
-                "Position": i,
-                "Masked Token": MASK_TOKEN,
-                "Predicted": predicted_ids[i],
-                "Original": original_ids[i] if i < len(original_ids) else "",
-                "Match": "✅" if predicted_ids[i] == original_ids[i] else "❌"
-            })
-    accuracy = sum(1 for r in results if r["Match"] == "✅") / max(len(results), 1)
-    return results, f"Accuracy: {accuracy:.1%}"
 symbolic_roles = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
@@ -50,27 +58,23 @@ symbolic_roles = [
     "<fabric>", "<jewelry>"
 ]
-# Load from official hosted checkpoint
-checkpoint_path = "./bert-beatrix-2048"
-handler, model, tokenizer = create_handler_from_checkpoint(checkpoint_path)
-model = model.eval().cuda()
 def build_interface():
     with gr.Blocks() as demo:
-        gr.Markdown("## 🔎 Symbolic BERT Inference Test")
         with gr.Row():
             with gr.Column():
-                input_text = gr.Textbox(label="Symbolic Input Caption", lines=3)
                 selected_roles = gr.CheckboxGroup(
                     choices=symbolic_roles,
-                    label="Mask these symbolic roles"
                 )
-                run_btn = gr.Button("Run Mask Inference")
             with gr.Column():
-                output_table = gr.Dataframe(headers=["Position", "Masked Token", "Predicted", "Original", "Match"], interactive=False)
-                accuracy_score = gr.Textbox(label="Mask Accuracy")
-        run_btn.click(fn=mask_and_predict, inputs=[input_text, selected_roles], outputs=[output_table, accuracy_score])
     return demo

+# Updating the app to use only the encoder from the model, ensuring symbolic support
 from bert_handler import create_handler_from_checkpoint
 import torch
 import re
 from pathlib import Path
 import spaces
+from huggingface_hub import snapshot_download
+# Load checkpoint using BERTHandler (loads tokenizer and full model)
+checkpoint_path = snapshot_download(
+    repo_id="AbstractPhil/bert-beatrix-2048",
+    revision="main",
+    local_dir="bert-beatrix-2048",
+    local_dir_use_symlinks=False
+)
+handler, model, tokenizer = create_handler_from_checkpoint(checkpoint_path)
+model = model.eval().cuda()
+# Extract encoder only (NomicBertModel -> encoder)
+encoder = model.bert.encoder
+embeddings = model.bert.embeddings
+emb_ln = model.bert.emb_ln
+emb_drop = model.bert.emb_drop
+@spaces.GPU
+def encode_and_predict(text: str, selected_roles: list[str]):
     with torch.no_grad():
+        inputs = tokenizer(text, return_tensors="pt").to("cuda")
+        input_ids = inputs.input_ids
+        attention_mask = inputs.attention_mask
+        # Run embedding + encoder pipeline
+        x = embeddings(input_ids)
+        x = emb_ln(x)
+        x = emb_drop(x)
+        encoded = encoder(x, attention_mask=attention_mask.bool())
+        symbolic_ids = [tokenizer.convert_tokens_to_ids(tok) for tok in selected_roles]
+        symbolic_mask = torch.isin(input_ids, torch.tensor(symbolic_ids, device=input_ids.device))
+        masked_tokens = [tokenizer.convert_ids_to_tokens([tid])[0] for tid in input_ids[0] if tid in symbolic_ids]
+        role_reprs = encoded[symbolic_mask].mean(dim=0) if symbolic_mask.any() else torch.zeros_like(encoded[0, 0])
+        return {
+            "Symbolic Tokens": masked_tokens,
+            "Embedding Norm": f"{role_reprs.norm().item():.4f}",
+            "Symbolic Token Count": symbolic_mask.sum().item(),
+        }
 symbolic_roles = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
     "<fabric>", "<jewelry>"
 ]
 def build_interface():
     with gr.Blocks() as demo:
+        gr.Markdown("## 🧠 Symbolic Encoder Inspector")
         with gr.Row():
             with gr.Column():
+                input_text = gr.Textbox(label="Input with Symbolic Tokens", lines=3)
                 selected_roles = gr.CheckboxGroup(
                     choices=symbolic_roles,
+                    label="Which symbolic tokens should be traced?"
                 )
+                run_btn = gr.Button("Encode & Trace")
             with gr.Column():
+                symbolic_tokens = gr.Textbox(label="Symbolic Tokens Found")
+                embedding_norm = gr.Textbox(label="Mean Norm of Symbolic Embeddings")
+                token_count = gr.Textbox(label="Count of Symbolic Tokens")
+        run_btn.click(fn=encode_and_predict, inputs=[input_text, selected_roles], outputs=[symbolic_tokens, embedding_norm, token_count])
     return demo