hemalbusa committed on
Commit
16b2a71
·
verified ·
1 Parent(s): 869817f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -0
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import BertTokenizer, BertModel
4
+ import re
5
+
6
# Name of the pretrained checkpoint shared by the tokenizer and the model.
MODEL_NAME = "bert-base-uncased"

# Load the tokenizer and the model once, at import time; the functions
# below use these module-level objects.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)
9
+
10
def preprocess_text(text):
    """Strip every ASCII character from ``text`` and lowercase the rest.

    Args:
        text: Input string.

    Returns:
        The non-ASCII characters of ``text`` in their original order,
        lowercased. An all-ASCII (or empty) input yields ``""``.
    """
    # Match the ASCII range (U+0000-U+007F) directly instead of negating a
    # range that stops at U+FFFF, so characters above the Basic Multilingual
    # Plane (e.g. emoji) are kept rather than silently removed.
    cleaned = re.sub(r'[\x00-\x7F]+', '', text)
    return cleaned.lower()
14
+
15
def get_bert_embeddings(text):
    """Return the BERT embedding of every token in ``text``.

    The text is cleaned with ``preprocess_text``, tokenized with the
    module-level ``tokenizer`` and run through the module-level ``model``
    with gradient tracking disabled.

    Args:
        text: Raw input string.

    Returns:
        A dict, in token order, mapping a token label to that token's
        last-hidden-state vector as a list of floats. The first occurrence
        of a token is keyed by the token itself; later occurrences are keyed
        as ``"token (2)"``, ``"token (3)"``, ... so that repeated tokens do
        not overwrite each other.
    """
    cleaned_text = preprocess_text(text)
    # truncation=True cuts over-long input down to the tokenizer's maximum
    # model length instead of letting the model fail on it.
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.squeeze(0)  # shape: [seq_len, hidden_size]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze(0))

    # Convert embeddings to lists of floats for display. A plain
    # {token: embedding} mapping would keep only the last embedding of a
    # token that appears more than once, so repeats get a numbered key.
    token_embeddings = {}
    occurrences = {}
    for token, embedding in zip(tokens, embeddings):
        count = occurrences.get(token, 0) + 1
        occurrences[token] = count
        key = token if count == 1 else f"{token} ({count})"
        token_embeddings[key] = embedding.tolist()
    return token_embeddings
26
+
27
def format_output(token_embeddings):
    """Render a token-to-embedding mapping as display text.

    Args:
        token_embeddings: Mapping from a token string to its embedding,
            given as a sequence of numbers.

    Returns:
        One entry per token, in mapping order, showing the token, the first
        five embedding values and the total number of dimensions. Entries
        are each followed by a blank line. An empty mapping yields ``""``.
    """
    # Build all entries and join once rather than growing a string with +=.
    return "".join(
        f"Token: {token}\nEmbedding: {emb[:5]}... ({len(emb)} dims)\n\n"
        for token, emb in token_embeddings.items()
    )
32
+
33
# Gradio UI: a multi-line text box whose content is embedded with BERT and
# shown back as formatted text.
demo = gr.Interface(
    title="BERT Token Embeddings Viewer",
    description="Removes ASCII characters, lowercases input, and shows BERT tokens with embeddings.",
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs="text",
    # Chain the two steps: compute the embeddings, then format them.
    fn=lambda text: format_output(get_bert_embeddings(text)),
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()