# qsim / src /streamlit_app.py
# dejanseo's picture
# Update src/streamlit_app.py
# eb280ef verified
import os

# Route all cache and config writes to Docker-writable dirs.
#
# These assignments must run BEFORE sentence_transformers (and, through it,
# huggingface_hub / transformers) is imported: those libraries resolve their
# cache directories from the environment at import time, so values set after
# the import are not picked up and downloads go to the default (possibly
# read-only) location.
os.environ["HF_HOME"] = "/app/hf_cache"
# NOTE(review): TRANSFORMERS_CACHE is reported as deprecated in favour of
# HF_HOME by recent transformers releases; kept for older versions.
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.environ["XDG_CACHE_HOME"] = "/app/hf_cache"
os.environ["HOME"] = "/app"

# Imports are intentionally placed after the environment setup above.
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
import plotly.express as px  # noqa: E402
import streamlit as st  # noqa: E402
from sentence_transformers import SentenceTransformer  # noqa: E402
from sklearn.decomposition import PCA  # noqa: E402
from sklearn.metrics.pairwise import cosine_similarity  # noqa: E402
# Page heading
st.title("Keyword Cosine Similarity Tool")

# One-paragraph statement of what the tool is for
purpose_md = (
    "\n"
    "**Purpose:**\n"
    "Elevate the most semantically relevant queries from keyword research.\n"
)
st.markdown(purpose_md)

# Input widgets
st.header("Input Parameters")
primary_keyword = st.text_input(
    "Primary Keyword",
    placeholder="Enter your primary keyword",
)
# The widget is registered under the session-state key "keywords"; its return
# value is the same text that st.session_state["keywords"] holds.
keywords = st.text_area(
    "Keywords to Compare",
    placeholder="Enter keywords separated by new lines or commas",
    help="You can input keywords on separate lines or separated by commas. Any keywords with commas will be treated as separate queries.",
    key="keywords",
)

# Collapsible usage instructions
instructions_md = (
    "\n"
    "**How to use this tool:**\n"
    "1. Enter your **Primary Keyword**.\n"
    "2. Provide a list of **Keywords to Compare**.\n"
    "3. Click **Calculate Similarities** to compute and rank your keywords by relevance.\n"
    "**Output:**\n"
    "- Sorted table of keywords by cosine similarity.\n"
    "- Interactive 3D PCA plot of keyword embeddings.\n"
    "- Option to download results.\n"
)
with st.expander("ℹ️ Instructions (click for details)"):
    st.markdown(instructions_md)
# Embedding model and the number of leading dimensions kept from each vector
# (Matryoshka Representation Learning truncation).
MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
MRL_DIMS = 256
# PCA(n_components=3) needs at least this many samples (primary + keywords).
MIN_POINTS_FOR_PCA = 3


@st.cache_resource(show_spinner=False)
def _load_model(name: str) -> SentenceTransformer:
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer(name)


# Process Button
if st.button("Calculate Similarities"):
    # Normalise inputs first so that whitespace-only or separator-only input
    # (e.g. ", ,") is treated as missing instead of crashing later on an
    # empty embedding matrix.
    primary_text = (primary_keyword or "").strip()
    keyword_list = [
        kw.strip()
        for kw in (keywords or "").replace(",", "\n").split("\n")
        if kw.strip()
    ]

    if not primary_text or not keyword_list:
        st.error("Please provide both the primary keyword and keywords to compare.")
    else:
        # Load (or fetch from cache) the embedding model.
        try:
            st.info(f"Loading model: {MODEL_NAME}")
            model = _load_model(MODEL_NAME)
        except Exception as e:
            st.error(f"Failed to load model: {e}")
            st.stop()

        # Embed the primary keyword together with the comparison keywords so
        # that row 0 is the primary and rows 1.. line up with keyword_list.
        try:
            st.info("Generating embeddings...")
            all_texts = [primary_text] + keyword_list
            embeddings = model.encode(all_texts, normalize_embeddings=True)
            # Apply Matryoshka Representation Learning: slice to 256 dims
            mrl_embeddings = embeddings[:, :MRL_DIMS]
            primary_embedding = mrl_embeddings[0]
            keyword_embeddings = mrl_embeddings[1:]
        except Exception as e:
            st.error(f"Embedding failed: {e}")
            st.stop()

        # Cosine similarities of every keyword against the primary keyword,
        # ranked from most to least similar.
        st.info("Calculating cosine similarities...")
        similarities = cosine_similarity([primary_embedding], keyword_embeddings)[0]
        results = [
            {"Keyword": kw, "Cosine Similarity": sim}
            for kw, sim in zip(keyword_list, similarities)
        ]
        sorted_results = sorted(
            results, key=lambda row: row["Cosine Similarity"], reverse=True
        )
        df_results = pd.DataFrame(sorted_results)

        # Output
        st.header("Results")
        # NOTE(review): interacting with the download button reruns the
        # script, and on that rerun st.button(...) above is presumably False
        # again, so the results would disappear. Consider persisting them in
        # st.session_state if that matters.
        st.download_button(
            label="📥 Download Results as CSV",
            data=df_results.to_csv(index=False),
            file_name="cosine_similarity_results.csv",
            mime="text/csv",
        )
        st.dataframe(df_results)

        # 3D PCA Plot
        st.subheader("3D PCA of Embeddings")
        if len(all_texts) >= MIN_POINTS_FOR_PCA:
            pca = PCA(n_components=3)
            pca_result = pca.fit_transform(mrl_embeddings)
            pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2", "PC3"])
            pca_df["Label"] = ["Primary"] + keyword_list
            fig = px.scatter_3d(
                pca_df, x="PC1", y="PC2", z="PC3", color="Label", text="Label"
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            # A 3-component PCA cannot be fitted on fewer than 3 points; skip
            # the plot instead of raising ValueError.
            st.info("Enter at least two keywords to compare to see the 3D PCA plot.")

        with st.expander("🔧 Technical Details (click to expand)"):
            st.write("Primary Embedding:", primary_embedding.tolist())
            st.write("Keyword Embeddings:", keyword_embeddings.tolist())
# Footer: a horizontal rule followed by the credits / contact line
footer_parts = (
    "---",
    "Created by [Ryland Bacorn](https://huggingface.co/ReithBjarkan). Report a [bug or make a suggestion](mailto:rybacorn@gmail.com)",
)
for footer_md in footer_parts:
    st.markdown(footer_md)