| import streamlit as st |
| import tensorflow as tf |
| import sentencepiece as spm |
| import numpy as np |
| from scipy.spatial.distance import cosine |
| import pandas as pd |
| from openTSNE import TSNE |
| import plotly.express as px |
| import plotly.graph_objects as go |
|
|
| |
# Switch the app to the wide page layout.
st.set_page_config(layout="wide")

# CSS override that sets the padding of the main block container.
_block_container_css = """
<style>
.block-container {
padding-top: 1rem;
padding-bottom: 0rem;
padding-left: 1rem;
padding-right: 1rem;
}
</style>
"""
# unsafe_allow_html is needed so the <style> tag is emitted as HTML.
st.markdown(_block_container_css, unsafe_allow_html=True)
|
|
| |
# Relative paths of the model artifacts loaded below.
tflite_model_path = "model.tflite"
spm_model_path = "sentencepiece.model"

# Number of token ids passed to the model per sentence; preprocess_text
# truncates or zero-pads every input to this length.
required_input_length = 64

# SentencePiece tokenizer used to turn text into token ids.
sp = spm.SentencePieceProcessor()
sp.load(spm_model_path)

# TFLite interpreter for the embedding model; tensors are allocated once here.
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

# Tensor metadata; generate_embeddings reads the 'index' of the first entry
# of each list to address the model's input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
|
|
| |
def preprocess_text(text, sp, required_length):
    """Tokenize *text* and return a fixed-length batch of token ids.

    Args:
        text: The sentence to encode.
        sp: Tokenizer exposing ``encode(text, out_type=int)``.
        required_length: Number of ids in the returned row.

    Returns:
        An ``int32`` array of shape ``(1, required_length)``. Longer inputs
        are truncated; shorter ones are right-padded with 0.
    """
    token_ids = sp.encode(text, out_type=int)
    # A negative pad count yields an empty list, so over-long inputs get no padding.
    padding = [0] * (required_length - len(token_ids))
    fixed_length_ids = token_ids[:required_length]
    fixed_length_ids.extend(padding)
    batch = np.array(fixed_length_ids, dtype=np.int32)
    return batch.reshape(1, -1)
|
|
| |
def generate_embeddings(text):
    """Run the TFLite model on *text* and return its output as a 1-D array.

    The text is tokenized and padded by ``preprocess_text`` using the
    module-level tokenizer and ``required_input_length``, written to the
    interpreter's first input tensor, and the first output tensor is read
    back and flattened.
    """
    token_batch = preprocess_text(text, sp, required_input_length)
    input_index = input_details[0]['index']
    output_index = output_details[0]['index']

    interpreter.set_tensor(input_index, token_batch)
    interpreter.invoke()
    return interpreter.get_tensor(output_index).flatten()
|
|
| |
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors.

    Args:
        embedding1: First 1-D vector (array-like).
        embedding2: Second 1-D vector (array-like) of the same length.

    Returns:
        ``1 - cosine_distance``. When either vector is all zeros the
        similarity is undefined (division by a zero norm); 0.0 is returned
        in that case so callers never have to sort or compare NaN.
    """
    vec1 = np.asarray(embedding1)
    vec2 = np.asarray(embedding2)
    # A zero-norm vector has no direction; treat it as "not similar".
    if not vec1.any() or not vec2.any():
        return 0.0
    return 1 - cosine(vec1, vec2)
|
|
| |
# Default sentences for Set A; joined with newlines to pre-fill the Set A input.
preset_sentences_a = [
    "Dan Petrovic predicted conversational search in 2013.",
    "Understanding user intent is key to effective SEO.",
    "Dejan SEO has been a leader in data-driven SEO.",
    "Machine learning is transforming search engines.",
    "The future of search is AI-driven and personalized.",
    "Search algorithms are evolving to better match user intent.",
    "AI technologies enhance digital marketing strategies."
]


# Default sentences for Set B; joined with newlines to pre-fill the Set B input.
preset_sentences_b = [
    "Advances in machine learning reshape how search engines operate.",
    "Personalized content is becoming more prevalent with AI.",
    "Customer behavior insights are crucial for marketing strategies.",
    "Dan Petrovic anticipated the rise of chat-based search interactions.",
    "Dejan SEO is recognized for innovative SEO research and analysis.",
    "Quantum computing is advancing rapidly in the tech world.",
    "Studying user behavior can improve the effectiveness of online ads."
]
|
|
| |
# Seed the two text areas with the preset sentences the first time the script
# runs in a session; on later reruns the stored values (including user edits)
# are kept.
if "input_text_a" not in st.session_state:
    st.session_state["input_text_a"] = "\n".join(preset_sentences_a)
if "input_text_b" not in st.session_state:
    st.session_state["input_text_b"] = "\n".join(preset_sentences_b)


def _clear_input_fields():
    """Empty both sentence inputs (callback of the "Clear Fields" button)."""
    st.session_state["input_text_a"] = ""
    st.session_state["input_text_b"] = ""


# Clearing through a callback updates session state before the text areas are
# rebuilt on the rerun triggered by the click.
st.button("Clear Fields", on_click=_clear_input_fields)

col1, col2 = st.columns(2)

# The text areas are bound to session state with `key=` rather than `value=`.
# With `value=` alone, user edits never reach session state, so once the
# stored value is already "" a later "Clear Fields" click re-renders the same
# widget and the typed text stays in place.
with col1:
    st.subheader("Set A Sentences")
    input_text_a = st.text_area("Set A", key="input_text_a", height=200)

with col2:
    st.subheader("Set B Sentences")
    input_text_b = st.text_area("Set B", key="input_text_b", height=200)
|
|
| |
# Iteration count passed to t-SNE as n_iter, selectable in steps of 250.
iterations = st.slider(
    "Number of t-SNE Iterations (Higher values = more refined clusters)",
    min_value=250,
    max_value=1000,
    step=250,
)

# Minimum cosine similarity for a pair to be matched and connected in the plot.
similarity_threshold = st.slider(
    "Similarity Threshold",
    min_value=0.0,
    max_value=1.0,
    value=0.5,
    step=0.05,
)
|
|
| |
def _greedy_one_to_one_matches(similarity_matrix, threshold):
    """Pair rows with columns greedily, highest similarity first.

    Every (row, column) cell is ranked by similarity in descending order and
    accepted when its value is at least *threshold* and neither its row nor
    its column has been paired already, so each index is used at most once.

    Returns:
        A list of ``(row_index, column_index, similarity)`` tuples in the
        order they were accepted.
    """
    n_rows, n_cols = similarity_matrix.shape
    ranked_pairs = sorted(
        (
            (i, j, similarity_matrix[i, j])
            for i in range(n_rows)
            for j in range(n_cols)
        ),
        key=lambda pair: pair[2],
        reverse=True,
    )

    used_rows = set()
    used_cols = set()
    matches = []
    for i, j, sim in ranked_pairs:
        if sim >= threshold and i not in used_rows and j not in used_cols:
            matches.append((i, j, sim))
            used_rows.add(i)
            used_cols.add(j)
    return matches


if st.button("Calculate Similarity"):
    # One sentence per non-blank line of each text area.
    sentences_a = [line.strip() for line in input_text_a.split("\n") if line.strip()]
    sentences_b = [line.strip() for line in input_text_b.split("\n") if line.strip()]

    if sentences_a and sentences_b:
        embeddings_a = [generate_embeddings(sentence) for sentence in sentences_a]
        embeddings_b = [generate_embeddings(sentence) for sentence in sentences_b]

        # Combined view (Set A first, then Set B) used for the t-SNE projection.
        all_sentences = sentences_a + sentences_b
        all_embeddings = np.array(embeddings_a + embeddings_b)
        labels = ["Set A"] * len(sentences_a) + ["Set B"] * len(sentences_b)

        # similarity_matrix[i, j] is the similarity of Set A sentence i and
        # Set B sentence j.
        similarity_matrix = np.zeros((len(sentences_a), len(sentences_b)))
        for i, emb_a in enumerate(embeddings_a):
            for j, emb_b in enumerate(embeddings_b):
                similarity_matrix[i, j] = calculate_similarity(emb_a, emb_b)

        # ------------------------------------------
        # Matched sentence pairs
        # ------------------------------------------
        matches = _greedy_one_to_one_matches(similarity_matrix, similarity_threshold)

        if not matches:
            st.warning("No sentence pairs exceeded the similarity threshold.")
        else:
            # Orders are shown 1-based for readability.
            df_matches = pd.DataFrame(
                [
                    (i + 1, sentences_a[i], j + 1, sentences_b[j], round(sim, 3))
                    for (i, j, sim) in matches
                ],
                columns=["Set A Order", "Set A Sentence", "Set B Order", "Set B Sentence", "Similarity"]
            )
            st.subheader("Matched Sentences (Above Threshold)")
            st.dataframe(df_matches, use_container_width=True)

        # ------------------------------------------
        # 3D visualization with t-SNE
        # ------------------------------------------
        tsne_dims = 3
        if len(all_sentences) < tsne_dims:
            # The PCA initialization needs at least as many points as output
            # dimensions; with one sentence per set the projection would raise
            # instead of rendering, so skip it and still show the heatmap.
            st.info("Enter at least 3 sentences in total to display the 3D visualization.")
        else:
            # Keep the perplexity below the number of points for small inputs.
            perplexity_value = min(5, len(all_sentences) - 1)

            tsne = TSNE(
                n_components=tsne_dims,
                perplexity=perplexity_value,
                n_iter=iterations,
                initialization="pca",
                random_state=42
            )
            tsne_results = tsne.fit(all_embeddings)

            df_tsne = pd.DataFrame({
                "Sentence": all_sentences,
                "Set": labels,
                "X": tsne_results[:, 0],
                "Y": tsne_results[:, 1],
                "Z": tsne_results[:, 2]
            })

            fig = go.Figure()

            # One marker trace per set, so each set gets its own legend entry.
            for set_name, marker_color in (("Set A", "blue"), ("Set B", "red")):
                subset = df_tsne[df_tsne["Set"] == set_name]
                fig.add_trace(go.Scatter3d(
                    x=subset["X"],
                    y=subset["Y"],
                    z=subset["Z"],
                    text=subset["Sentence"],
                    mode='markers',
                    name=set_name,
                    marker=dict(size=5, color=marker_color)
                ))

            # Connect every A/B pair at or above the threshold (not only the
            # one-to-one matches). Set B rows follow the Set A rows in
            # tsne_results, hence the offset.
            offset_b = len(sentences_a)
            for i, j in np.argwhere(similarity_matrix >= similarity_threshold):
                sim = similarity_matrix[i, j]
                pos_a = tsne_results[i]
                pos_b = tsne_results[j + offset_b]
                # Clamp and use fixed-point formatting so the alpha channel is
                # always a plain decimal in [0, 1] (never scientific notation
                # or a value just above 1 from rounding error).
                alpha = min(max(float(sim), 0.0), 1.0)
                fig.add_trace(go.Scatter3d(
                    x=[pos_a[0], pos_b[0]],
                    y=[pos_a[1], pos_b[1]],
                    z=[pos_a[2], pos_b[2]],
                    mode='lines',
                    line=dict(color=f'rgba(150,150,150,{alpha:.3f})', width=2),
                    name=f'Similarity: {sim:.2f}',
                    showlegend=False
                ))

            fig.update_layout(
                title="3D Visualization of Sentence Similarity with Connections",
                width=1200,
                height=800,
                scene=dict(
                    xaxis_title="t-SNE Dimension 1",
                    yaxis_title="t-SNE Dimension 2",
                    zaxis_title="t-SNE Dimension 3"
                )
            )
            st.plotly_chart(fig)

        # ------------------------------------------
        # Similarity heatmap
        # ------------------------------------------
        fig_heatmap = go.Figure(data=go.Heatmap(
            z=similarity_matrix,
            x=[f"B{i+1}" for i in range(len(sentences_b))],
            y=[f"A{i+1}" for i in range(len(sentences_a))],
            colorscale="Viridis",
            text=np.round(similarity_matrix, 2),
            texttemplate="%{text}",
            textfont={"size": 10},
            hoverongaps=False
        ))

        fig_heatmap.update_layout(
            title="Similarity Heatmap between Set A and Set B",
            width=None,
            height=400,
            margin=dict(l=20, r=20, t=40, b=20),
            xaxis_title="Set B Sentences",
            yaxis_title="Set A Sentences"
        )

        st.plotly_chart(fig_heatmap)

    else:
        st.warning("Please enter sentences in both Set A and Set B.")
|
|