Create demo.py
Browse files
demo.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import tensorflow as tf
|
| 3 |
+
import sentencepiece as spm
|
| 4 |
+
import numpy as np
|
| 5 |
+
from scipy.spatial.distance import cosine
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from openTSNE import TSNE
|
| 8 |
+
import plotly.express as px
|
| 9 |
+
|
| 10 |
+
# Use the full browser width for the page.
st.set_page_config(layout="wide")

# Every input is encoded to exactly this many token ids.
required_input_length = 64  # Fixed length of 64 tokens

# Relative paths of the TFLite model and the SentencePiece model.
tflite_model_path = "model.tflite"
spm_model_path = "sentencepiece.model"

# Tokenizer used to turn sentences into token ids.
sp = spm.SentencePieceProcessor()
sp.load(spm_model_path)

# TFLite interpreter; tensors must be allocated before any set/get/invoke.
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

# Tensor metadata, used later to look up the input/output tensor indices.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
|
| 26 |
+
|
| 27 |
+
# Function to preprocess text input
|
| 28 |
+
def preprocess_text(text, sp, required_length, *, pad_id=0):
    """Tokenize *text* and fit it to a fixed-length int32 batch of one.

    Args:
        text: Sentence to encode.
        sp: Tokenizer exposing ``encode(text, out_type=int)`` that returns a
            sequence of integer token ids.
        required_length: Number of token ids the result must contain. Longer
            encodings are truncated; shorter ones are right-padded.
        pad_id: Token id used for right-padding. Defaults to ``0``, the value
            this function previously hard-coded.

    Returns:
        ``numpy.ndarray`` of dtype ``int32`` with shape
        ``(1, required_length)``.

    Raises:
        ValueError: If ``required_length`` is negative (a negative slice bound
            would otherwise silently yield a row of the wrong length).
    """
    if required_length < 0:
        raise ValueError(
            f"required_length must be non-negative, got {required_length}"
        )

    token_ids = list(sp.encode(text, out_type=int))[:required_length]
    # Pad after truncating so the row is always exactly required_length long.
    token_ids += [pad_id] * (required_length - len(token_ids))
    return np.array(token_ids, dtype=np.int32).reshape(1, -1)
|
| 32 |
+
|
| 33 |
+
# Function to generate embeddings
|
| 34 |
+
def generate_embeddings(text):
    """Return the model output for *text* as a flattened array.

    The text is encoded to ``required_input_length`` token ids with the
    module-level tokenizer ``sp``, written to the first input tensor of the
    module-level ``interpreter``, and the first output tensor is read back
    after inference and flattened.
    """
    input_slot = input_details[0]['index']
    output_slot = output_details[0]['index']

    token_batch = preprocess_text(text, sp, required_input_length)
    interpreter.set_tensor(input_slot, token_batch)
    interpreter.invoke()

    return interpreter.get_tensor(output_slot).flatten()
|
| 40 |
+
|
| 41 |
+
# Default sentences shown in the "Set A" input on first load.
preset_sentences_a = [
    "Dan Petrovic predicted conversational search in 2013.",
    "Understanding user intent is key to effective SEO.",
    "Dejan SEO has been a leader in data-driven SEO.",
    "Machine learning is transforming search engines.",
    "The future of search is AI-driven and personalized.",
    "Search algorithms are evolving to better match user intent.",
    "AI technologies enhance digital marketing strategies.",
]

# Default sentences shown in the "Set B" input on first load.
preset_sentences_b = [
    "Advances in machine learning reshape how search engines operate.",
    "Personalized content is becoming more prevalent with AI.",
    "Customer behavior insights are crucial for marketing strategies.",
    "Dan Petrovic anticipated the rise of chat-based search interactions.",
    "Dejan SEO is recognized for innovative SEO research and analysis.",
    "Quantum computing is advancing rapidly in the tech world.",
    "Studying user behavior can improve the effectiveness of online ads.",
]
|
| 61 |
+
|
| 62 |
+
# Seed the inputs with the preset sentences the first time a session runs.
# These session-state keys are also used as the text areas' widget keys, so
# whatever is stored here is what the text areas display.
if "input_text_a" not in st.session_state:
    st.session_state["input_text_a"] = "\n".join(preset_sentences_a)
if "input_text_b" not in st.session_state:
    st.session_state["input_text_b"] = "\n".join(preset_sentences_b)

# Clear button to reset text areas. Keep it above the text areas: a widget's
# session-state entry may only be assigned before that widget is created in
# the current run.
if st.button("Clear Fields"):
    st.session_state["input_text_a"] = ""
    st.session_state["input_text_b"] = ""

# Side-by-side layout for Set A and Set B inputs
col1, col2 = st.columns(2)

with col1:
    st.subheader("Set A Sentences")
    # Bound via key= instead of value=: with value= alone, a second "Clear"
    # left the default unchanged (still ""), so the widget kept the text the
    # user had typed. With a key, the session-state assignment above always
    # drives what is displayed.
    input_text_a = st.text_area("Set A", key="input_text_a", height=200)

with col2:
    st.subheader("Set B Sentences")
    input_text_b = st.text_area("Set B", key="input_text_b", height=200)
|
| 83 |
+
|
| 84 |
+
# Slider to control t-SNE iteration steps
iterations = st.slider(
    "Number of t-SNE Iterations (Higher values = more refined clusters)",
    250,
    1000,
    step=250,
)

# Dimensionality of the t-SNE projection drawn in the 3D scatter plot.
tsne_dimensions = 3


def _split_sentences(raw_text):
    """Return the non-blank lines of *raw_text*, each stripped of whitespace."""
    return [line.strip() for line in raw_text.split("\n") if line.strip()]


def _show_embeddings(set_name, sentences, embeddings):
    """Render a subheader and one expander per sentence with its embedding.

    Args:
        set_name: Short set label used in the headings (``"A"`` or ``"B"``).
        sentences: Sentences of the set, in display order.
        embeddings: Embedding vectors aligned with ``sentences``.
    """
    st.subheader(f"Embeddings for each sentence in Set {set_name}")
    for number, (sentence, embedding) in enumerate(zip(sentences, embeddings), start=1):
        with st.expander(f"Embedding for Sentence {set_name}{number}: {sentence}"):
            st.write(", ".join(f"{x:.4f}" for x in embedding))  # Comma-separated values


# Submit button
if st.button("Calculate Similarity"):
    sentences_a = _split_sentences(input_text_a)
    sentences_b = _split_sentences(input_text_b)
    all_sentences = sentences_a + sentences_b

    if not sentences_a or not sentences_b:
        st.warning("Please enter sentences in both Set A and Set B.")
    elif len(all_sentences) < tsne_dimensions:
        # PCA initialization cannot produce more components than there are
        # samples, so a 3D projection of only two sentences would raise
        # instead of plotting. Tell the user what is needed.
        st.warning(
            f"Please enter at least {tsne_dimensions} sentences in total "
            "to build the 3D t-SNE plot."
        )
    else:
        # Generate embeddings for both sets
        embeddings_a = [generate_embeddings(sentence) for sentence in sentences_a]
        embeddings_b = [generate_embeddings(sentence) for sentence in sentences_b]

        # Stack both sets into one (n_sentences, embedding_size) array; the
        # labels list is aligned with its rows.
        all_embeddings = np.array(embeddings_a + embeddings_b)
        labels = ["Set A"] * len(sentences_a) + ["Set B"] * len(sentences_b)

        # Set perplexity dynamically based on number of samples
        perplexity_value = min(5, len(all_sentences) - 1)

        # Perform 3D t-SNE with OpenTSNE, limiting the number of iterations
        tsne = TSNE(
            n_components=tsne_dimensions,
            perplexity=perplexity_value,
            n_iter=iterations,
            initialization="pca",
            random_state=42,
        )
        tsne_results = tsne.fit(all_embeddings)

        # Prepare DataFrame for Plotly
        df_tsne = pd.DataFrame({
            "Sentence": all_sentences,
            "Set": labels,
            "X": tsne_results[:, 0],
            "Y": tsne_results[:, 1],
            "Z": tsne_results[:, 2],
        })

        # Plot 3D t-SNE results with Plotly
        fig = px.scatter_3d(
            df_tsne,
            x="X",
            y="Y",
            z="Z",
            color="Set",
            hover_data={"Sentence": True},
            title="Incremental 3D t-SNE Visualization of Sentence Similarity",
            labels={"X": "t-SNE Dimension 1", "Y": "t-SNE Dimension 2", "Z": "t-SNE Dimension 3"},
            width=1200,
            height=800,
        )
        fig.update_traces(marker=dict(size=5, opacity=0.8))

        # Display interactive Plotly plot
        st.plotly_chart(fig)

        # Display expandable embeddings
        _show_embeddings("A", sentences_a, embeddings_a)
        _show_embeddings("B", sentences_b, embeddings_b)
|