Spaces:

ajayinsac
/

testspace

Sleeping

App Files Files Community

ajayinsac committed on Sep 7, 2025

Commit

3cfa391

verified ·

1 Parent(s): 9711432

Create app.py

Browse files

Files changed (1) hide show

app.py +107 -0

app.py ADDED Viewed

	@@ -0,0 +1,107 @@

+#!/usr/bin/env python3
+"""
+Gradio app: Clean text (remove non-ASCII, lowercase), tokenize with BERT,
+compute embeddings, and display tokens + per-token vectors.
+Run locally:
+  pip install -r requirements.txt
+  python app.py
+"""
+import re
+import numpy as np
+import pandas as pd
+import torch
+import gradio as gr
+from transformers import BertTokenizer, BertModel
+# ---- Preprocessing helpers ---------------------------------------------------
+# Matches a run of one or more characters outside the 7-bit ASCII range
+# (code points above 0x7F); compiled once at import time.
+_ascii_re = re.compile(r"[^\x00-\x7F]+")
+def clean_text(s: str) -> str:
+    """Remove non-ASCII chars and lowercase."""
+    if s is None:
+        return ""
+    s = _ascii_re.sub("", s)         # drop non-ASCII
+    s = s.lower()
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+# ---- Load model/tokenizer once ----------------------------------------------
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+TOKENIZER = BertTokenizer.from_pretrained("bert-base-uncased")
+MODEL = BertModel.from_pretrained("bert-base-uncased")
+MODEL.to(DEVICE)
+MODEL.eval()
+# ---- Core function -----------------------------------------------------------
+def bert_embed(text: str, max_tokens: int = 48):
+    """
+    Return:
+      - cleaned text
+      - list of wordpiece tokens
+      - DataFrame of embeddings (one row per token, 768-d columns)
+    """
+    cleaned = clean_text(text)
+    if not cleaned:
+        return "", [], pd.DataFrame()
+    # Tokenize (truncate to keep UI snappy)
+    enc = TOKENIZER(
+        cleaned,
+        return_tensors="pt",
+        truncation=True,
+        max_length=max_tokens,
+        add_special_tokens=True,
+    )
+    input_ids = enc["input_ids"].to(DEVICE)
+    attention_mask = enc["attention_mask"].to(DEVICE)
+    with torch.no_grad():
+        outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask)
+        # last_hidden_state shape: [batch=1, seq_len, hidden=768]
+        last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu().numpy()
+    tokens = TOKENIZER.convert_ids_to_tokens(input_ids.squeeze(0).tolist())
+    # Build a DataFrame: rows = tokens, columns = dim_0..dim_767
+    cols = [f"dim_{i}" for i in range(last_hidden_state.shape[1])]
+    df = pd.DataFrame(last_hidden_state, index=tokens, columns=cols)
+    return cleaned, tokens, df
+# ---- Gradio UI ---------------------------------------------------------------
+with gr.Blocks(title="BERT Tokenizer & Embeddings") as demo:
+    gr.Markdown(
+        """
+        # BERT Tokenizer & Embeddings
+        Paste text below. The app will **remove non-ASCII characters**, **lowercase** the text, then use
+        **BERT (bert-base-uncased)** to produce tokens and embeddings (last hidden state).
+        """
+    )
+    with gr.Row():
+        inp = gr.Textbox(label="Input text", lines=6, placeholder="Type or paste text...")
+        max_tok = gr.Slider(8, 256, value=48, step=1, label="Max tokens (truncate)")
+    with gr.Row():
+        cleaned_out = gr.Textbox(label="Cleaned text (ASCII-only, lowercased)")
+    tokens_out = gr.JSON(label="WordPiece tokens")
+    df_out = gr.Dataframe(label="Per-token embeddings (last_hidden_state)", wrap=True)
+    run_btn = gr.Button("Transform with BERT", variant="primary")
+    run_btn.click(bert_embed, inputs=[inp, max_tok], outputs=[cleaned_out, tokens_out, df_out])
+    gr.Markdown(
+        """
+        **Notes**
+        - Embeddings are 768-dim vectors from the last hidden state (one row per token).
+        - Special tokens like `[CLS]` and `[SEP]` are included.
+        - Truncation keeps the UI responsive; increase *Max tokens* if needed.
+        """
+    )
+if __name__ == "__main__":
+    # Do not force share=True (some hosts disallow it)
+    demo.launch(server_name="0.0.0.0", server_port=7860)