chaos4455 committed on
Commit
be87f58
·
verified ·
1 Parent(s): e9d41e0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
### app.py
"""Streamlit app that tokenizes text with BERT and displays token embeddings.

The user types a word or sentence; the app shows each WordPiece token, its
vocabulary ID, the first five dimensions of its last-hidden-state embedding,
a few summary statistics, and offers the token table as a CSV download.
"""
import streamlit as st
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd

MODEL_NAME = 'bert-base-uncased'


@st.cache_resource
def load_bert(model_name=MODEL_NAME):
    """Load the BERT tokenizer and model, cached across Streamlit reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, the pretrained weights would be reloaded each time.

    Args:
        model_name: Name or path passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(model_name)
    bert_model = BertModel.from_pretrained(model_name)
    return bert_tokenizer, bert_model


# Load BERT tokenizer and model (once per server process, thanks to the cache)
tokenizer, model = load_bert()

# Streamlit app setup
st.title("✨ BERT Token Analyzer 🧠")
st.write("🔍 This application uses **BERT** to tokenize and encode input text, providing embeddings and token details.")
st.markdown("---")

# Input field
user_input = st.text_input("📝 Enter a word or sentence:", "")

if user_input:
    # Tokenize input
    st.write("⏳ Tokenizing and encoding input... 🛠️")

    # The model cannot embed more positions than it was configured with, so
    # cap the sequence length instead of letting the forward pass fail.
    max_tokens = min(tokenizer.model_max_length, model.config.max_position_embeddings)
    inputs = tokenizer(
        user_input,
        return_tensors="pt",
        add_special_tokens=True,
        truncation=True,
        max_length=max_tokens,
    )
    token_ids = inputs['input_ids'][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    if len(token_ids) >= max_tokens:
        st.warning(
            f"⚠️ Input reached the model limit of {max_tokens} tokens; "
            "any text beyond that limit was truncated."
        )

    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    # Drop the batch dimension: one row per token.
    embeddings = outputs.last_hidden_state.squeeze(0)

    # First five embedding dimensions per token, converted to plain lists once
    # and reused by both the table and the Markdown summary below.
    embedding_previews = embeddings[:, :5].tolist()

    # Prepare DataFrame for display
    token_data = [
        {
            "Token": token,
            "Token ID": token_id,
            "Embedding (first 5 dims)": preview,
        }
        for token, token_id, preview in zip(tokens, token_ids, embedding_previews)
    ]
    df = pd.DataFrame(token_data)

    # Display token data
    st.write("### 🧾 Token Details 📜")
    st.dataframe(df)

    # Option to download the DataFrame as CSV
    csv = df.to_csv(index=False)
    st.download_button(
        label="⬇️ Download Token Data as CSV",
        data=csv,
        file_name="token_data.csv",
        mime="text/csv"
    )

    # Additional statistics and details
    longest_token = max(tokens, key=len)
    shortest_token = min(tokens, key=len)
    st.write("### 📊 Token Statistics 📈")
    st.markdown(f"- **Number of Tokens:** {len(tokens)}")
    st.markdown(f"- **Unique Tokens:** {len(set(tokens))}")
    st.markdown(f"- **Longest Token:** `{longest_token}` ({len(longest_token)} characters)")
    st.markdown(f"- **Shortest Token:** `{shortest_token}` ({len(shortest_token)} characters)")

    st.write("### 🔍 Embedding Analysis 🌌")
    # L2 norm of each token's embedding vector.
    embedding_magnitudes = embeddings.norm(dim=1).tolist()
    st.markdown(f"- **Average Embedding Magnitude:** {sum(embedding_magnitudes)/len(embedding_magnitudes):.4f}")
    st.markdown(f"- **Max Embedding Magnitude:** {max(embedding_magnitudes):.4f}")
    st.markdown(f"- **Min Embedding Magnitude:** {min(embedding_magnitudes):.4f}")

    st.write("### 🛠 Embedding Tensor Details")
    st.write("**Shape:**", embeddings.shape)
    st.write(embeddings)

    # Display tokens and embeddings in Markdown format.
    # Everything is emitted as ONE Markdown document: sub-bullets only nest
    # under their parent when they belong to the same Markdown block.
    st.write("### 📝 Token and Embedding Summary")
    summary_lines = []
    for position, (token, token_id, preview) in enumerate(
        zip(tokens, token_ids, embedding_previews), start=1
    ):
        summary_lines.append(f"- **Token {position}:** `{token}`")
        summary_lines.append(f"    - **Token ID:** {token_id}")
        summary_lines.append(f"    - **Embedding (first 5 dims):** {preview}")
    st.markdown("\n".join(summary_lines))

st.markdown("---")
st.write("👨‍💻 **Replika AI Solutions** - Powered by **Gemini** 🪐")
st.write("📍 Developed by *Elias Andrade* - Maringá, Paraná 🇧🇷")