# Hugging Face Spaces page residue ("Spaces: Sleeping" status banner) —
# not part of the application code.
import re

import gradio as gr
import torch
from transformers import BertTokenizer, BertModel
# Pretrained BERT (uncased). Loaded once at import time so every request
# reuses the same tokenizer and encoder weights.
_BERT_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(_BERT_NAME)
model = BertModel.from_pretrained(_BERT_NAME)
# One or more characters OUTSIDE U+0080..U+FFFF — i.e. all ASCII, plus any
# astral-plane characters such as emoji. Compiled once at module level.
_OUTSIDE_BMP_NONASCII = re.compile(r'[^\x80-\uFFFF]+')


def preprocess_text(text):
    """Strip ASCII (and astral-plane) characters from *text* and lowercase it.

    NOTE(review): this deliberately KEEPS only U+0080..U+FFFF, matching the
    app description ("Removes ASCII characters") — pure-English input
    therefore comes back as the empty string.
    """
    return _OUTSIDE_BMP_NONASCII.sub('', text).lower()
def get_bert_embeddings(text):
    """Tokenize the preprocessed *text* with BERT and map tokens to embeddings.

    Returns a dict of {token string: embedding as a list of floats}, where
    each embedding row comes from the model's last hidden state.

    NOTE(review): keying the dict by token string means repeated tokens
    overwrite one another — only the last occurrence's embedding survives.
    A list of (token, embedding) pairs would avoid the loss, but the dict
    shape is what format_output() consumes, so it is kept as-is.
    """
    cleaned = preprocess_text(text)
    encoded = tokenizer(cleaned, return_tensors="pt")
    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    per_token = hidden.squeeze(0)  # [seq_len, hidden_size]
    token_strings = tokenizer.convert_ids_to_tokens(encoded["input_ids"].squeeze(0))
    result = {}
    for tok, vec in zip(token_strings, per_token):
        result[tok] = vec.tolist()  # plain floats for display
    return result
def format_output(token_embeddings):
    """Render a {token: embedding} mapping as a human-readable report.

    Each entry shows the token, the first five embedding components, and the
    full dimensionality; every entry (including the last) is followed by a
    blank line, matching the per-entry template.

    Args:
        token_embeddings: mapping of token string -> list of floats.

    Returns:
        The formatted multi-line string; empty string for an empty mapping.
    """
    # "".join over a generator instead of `+=` in a loop: one pass, no
    # quadratic re-allocation of the accumulator, identical output.
    return "".join(
        f"Token: {token}\nEmbedding: {emb[:5]}... ({len(emb)} dims)\n\n"
        for token, emb in token_embeddings.items()
    )
def _embed_and_render(text):
    # Gradio callback: raw text -> token embeddings -> display string.
    return format_output(get_bert_embeddings(text))


# Wire the callback into a simple text-in / text-out Gradio UI.
demo = gr.Interface(
    fn=_embed_and_render,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs="text",
    title="BERT Token Embeddings Viewer",
    description="Removes ASCII characters, lowercases input, and shows BERT tokens with embeddings.",
)

if __name__ == "__main__":
    demo.launch()