Spaces:

rianders
/

live_view_embeddings

Build error

App Files Community

rianders committed on Jul 8, 2024

Commit

91c7a65

verified ·

1 Parent(s): 78f2519

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -37

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import streamlit as st
 from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer
 from sklearn.decomposition import PCA
 import plotly.graph_objs as go
@@ -31,38 +32,17 @@ def plot_interactive_embeddings(embeddings, words):
         if len(words) == 2:
             fig = go.Figure(data=[
-                go.Scatter(
-                    x=[emb[0]],
-                    y=[emb[1]],
-                    mode='markers+text',
-                    text=[word],
-                    name=word
-                ) for emb, word in zip(reduced_embeddings, words)
             ])
-            fig.update_layout(
-                title='2D Scatter Plot of Embeddings',
-                xaxis_title='PCA Component 1',
-                yaxis_title='PCA Component 2'
-            )
         else:
             fig = go.Figure(data=[
-                go.Scatter3d(
-                    x=[emb[0]],
-                    y=[emb[1]],
-                    z=[emb[2]],
-                    mode='markers+text',
-                    text=[word],
-                    name=word
-                ) for emb, word in zip(reduced_embeddings, words)
             ])
-            fig.update_layout(
-                title='3D Scatter Plot of Embeddings',
-                scene=dict(
-                    xaxis_title='PCA Component 1',
-                    yaxis_title='PCA Component 2',
-                    zaxis_title='PCA Component 3'
-                )
-            )
         fig.update_layout(autosize=False, width=800, height=600)
         st.plotly_chart(fig, use_container_width=True)
@@ -72,33 +52,50 @@ def plot_interactive_embeddings(embeddings, words):
 def main():
     st.title("Language Model Embeddings Visualization")
     model_choice = st.selectbox("Choose a model:", ["BERT", "RoBERTa"])
     tokenizer, model = load_model(model_choice)
     default_word = "example"
-    if "words" not in st.session_state or "model" not in st.session_state:
         st.session_state.words = [default_word]
-        st.session_state.model = model_choice
         init_db()
         embedding = get_embeddings([default_word], tokenizer, model)[0]
         save_embeddings_to_db(default_word, embedding)
-    elif st.session_state.model != model_choice:
-        st.session_state.words = [default_word]
-        st.session_state.model = model_choice
-        clear_all_entries()
-        embedding = get_embeddings([default_word], tokenizer, model)[0]
-        save_embeddings_to_db(default_word, embedding)
     st.write(f"Current words ({model_choice}):", ", ".join(st.session_state.words))
     new_word = st.text_input("Enter a new word or phrase:", "")
     if st.button("Add Word/Phrase"):
-        if new_word:
             embedding = get_embeddings([new_word], tokenizer, model)[0]
             save_embeddings_to_db(new_word, embedding)
             st.session_state.words.append(new_word)
             st.experimental_rerun()
     if st.button("Clear All Entries"):
         clear_all_entries()
         st.session_state.words = [default_word]

 import streamlit as st
+import pandas as pd
 from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer
 from sklearn.decomposition import PCA
 import plotly.graph_objs as go
         if len(words) == 2:
             fig = go.Figure(data=[
+                go.Scatter(x=[emb[0]], y=[emb[1]], mode='markers+text', text=[word], name=word)
+                for emb, word in zip(reduced_embeddings, words)
             ])
+            fig.update_layout(title='2D Scatter Plot of Embeddings', xaxis_title='PCA Component 1', yaxis_title='PCA Component 2')
         else:
             fig = go.Figure(data=[
+                go.Scatter3d(x=[emb[0]], y=[emb[1]], z=[emb[2]], mode='markers+text', text=[word], name=word)
+                for emb, word in zip(reduced_embeddings, words)
             ])
+            fig.update_layout(title='3D Scatter Plot of Embeddings',
+                              scene=dict(xaxis_title='PCA Component 1', yaxis_title='PCA Component 2', zaxis_title='PCA Component 3'))
         fig.update_layout(autosize=False, width=800, height=600)
         st.plotly_chart(fig, use_container_width=True)
 def main():
     st.title("Language Model Embeddings Visualization")
+    st.markdown("""
+    This application visualizes word embeddings from BERT or RoBERTa language models.
+    Here's how to use it:
+    1. Choose a model (BERT or RoBERTa) from the dropdown menu.
+    2. Enter words or phrases one at a time, or upload a CSV file with a 'word' column.
+    3. View the 2D or 3D plot of the embeddings.
+    4. Download the current database as a CSV file for later use.
+    Explore how different words relate to each other in the embedding space!
+    """)
     model_choice = st.selectbox("Choose a model:", ["BERT", "RoBERTa"])
     tokenizer, model = load_model(model_choice)
     default_word = "example"
+    if "words" not in st.session_state:
         st.session_state.words = [default_word]
         init_db()
         embedding = get_embeddings([default_word], tokenizer, model)[0]
         save_embeddings_to_db(default_word, embedding)
     st.write(f"Current words ({model_choice}):", ", ".join(st.session_state.words))
     new_word = st.text_input("Enter a new word or phrase:", "")
     if st.button("Add Word/Phrase"):
+        if new_word and new_word not in st.session_state.words:
             embedding = get_embeddings([new_word], tokenizer, model)[0]
             save_embeddings_to_db(new_word, embedding)
             st.session_state.words.append(new_word)
             st.experimental_rerun()
+    uploaded_file = st.file_uploader("Upload CSV file", type="csv")
+    if uploaded_file is not None:
+        df = pd.read_csv(uploaded_file)
+        if 'word' in df.columns:
+            new_words = df['word'].tolist()
+            for word in new_words:
+                if word not in st.session_state.words:
+                    embedding = get_embeddings([word], tokenizer, model)[0]
+                    save_embeddings_to_db(word, embedding)
+                    st.session_state.words.append(word)
+            st.experimental_rerun()
+        else:
+            st.error("The CSV file must contain a 'word' column.")
     if st.button("Clear All Entries"):
         clear_all_entries()
         st.session_state.words = [default_word]