"""Streamlit app: rank keywords by cosine similarity to a primary keyword.

The script embeds the primary keyword and a user-supplied keyword list with a
SentenceTransformer model, truncates the embeddings to their first
``MRL_DIMS`` dimensions, ranks the keywords by cosine similarity to the
primary keyword, and shows a results table, a CSV download button and a 3D PCA
scatter plot of the embeddings.
"""
import os

# Route all cache and config writes to Docker-writable dirs.
# NOTE: these assignments must run BEFORE streamlit / sentence_transformers are
# imported. The Hugging Face libraries resolve their cache locations when they
# are first imported, so setting the variables afterwards has no effect.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.environ["XDG_CACHE_HOME"] = "/app/hf_cache"
os.environ["HOME"] = "/app"

import numpy as np  # noqa: E402,F401  (kept: imported by the original script)
import pandas as pd  # noqa: E402
import plotly.express as px  # noqa: E402
import streamlit as st  # noqa: E402
from sentence_transformers import SentenceTransformer  # noqa: E402
from sklearn.decomposition import PCA  # noqa: E402
from sklearn.metrics.pairwise import cosine_similarity  # noqa: E402

# Embedding model used for every request.
MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
# Number of leading embedding dimensions kept (Matryoshka-style truncation).
MRL_DIMS = 256
# Number of principal components shown in the scatter plot.
PCA_COMPONENTS = 3


@st.cache_resource(show_spinner=False)
def _load_model(model_name):
    """Return a SentenceTransformer for *model_name*, cached across reruns.

    Streamlit re-executes the whole script on every interaction; caching the
    model avoids re-instantiating it on each button click.
    """
    return SentenceTransformer(model_name)


def _parse_keywords(raw):
    """Split *raw* on commas and newlines and return the non-blank entries.

    Each entry is stripped of surrounding whitespace; empty entries are
    dropped, so the result may be an empty list.
    """
    return [kw.strip() for kw in raw.replace(",", "\n").split("\n") if kw.strip()]


# App title
st.title("Keyword Cosine Similarity Tool")

# Overview
st.markdown(
    """
**Purpose:** Elevate the most semantically relevant queries from keyword research.
"""
)

# Inputs
st.header("Input Parameters")
primary_keyword = st.text_input(
    "Primary Keyword", placeholder="Enter your primary keyword"
)
st.text_area(
    "Keywords to Compare",
    placeholder="Enter keywords separated by new lines or commas",
    help="You can input keywords on separate lines or separated by commas. Any keywords with commas will be treated as separate queries.",
    key="keywords",
)
# The text area stores its value in session state under the "keywords" key.
keywords = st.session_state.get("keywords", "")

# Instructions tooltip
with st.expander("ℹ️ Instructions (click for details)"):
    st.markdown(
        """
**How to use this tool:**

1. Enter your **Primary Keyword**.
2. Provide a list of **Keywords to Compare**.
3. Click **Calculate Similarities** to compute and rank your keywords by relevance.

**Output:**

- Sorted table of keywords by cosine similarity.
- Interactive 3D PCA plot of keyword embeddings.
- Option to download results.
"""
    )

# Process Button
if st.button("Calculate Similarities"):
    primary_text = (primary_keyword or "").strip()
    keyword_list = _parse_keywords(keywords or "")

    # Validate the *parsed* inputs: text made only of whitespace or separators
    # would otherwise reach the similarity step with zero keywords and crash.
    if not primary_text or not keyword_list:
        st.error("Please provide both the primary keyword and keywords to compare.")
    else:
        try:
            st.info(f"Loading model: {MODEL_NAME}")
            model = _load_model(MODEL_NAME)
        except Exception as e:  # UI boundary: report the failure and halt this run
            st.error(f"Failed to load model: {e}")
            st.stop()

        try:
            st.info("Generating embeddings...")
            all_texts = [primary_text] + keyword_list
            embeddings = model.encode(all_texts, normalize_embeddings=True)
            # Apply Matryoshka Representation Learning: slice to 256 dims
            mrl_embeddings = embeddings[:, :MRL_DIMS]
            # Row 0 is the primary keyword; the remaining rows follow
            # keyword_list order.
            primary_embedding = mrl_embeddings[0]
            keyword_embeddings = mrl_embeddings[1:]
        except Exception as e:  # UI boundary: report the failure and halt this run
            st.error(f"Embedding failed: {e}")
            st.stop()

        # Cosine similarities
        st.info("Calculating cosine similarities...")
        similarities = cosine_similarity([primary_embedding], keyword_embeddings)[0]
        results = [
            {"Keyword": kw, "Cosine Similarity": sim}
            for kw, sim in zip(keyword_list, similarities)
        ]
        sorted_results = sorted(
            results, key=lambda row: row["Cosine Similarity"], reverse=True
        )
        df_results = pd.DataFrame(sorted_results)

        # Output
        st.header("Results")
        st.download_button(
            label="📥 Download Results as CSV",
            data=df_results.to_csv(index=False),
            file_name="cosine_similarity_results.csv",
            mime="text/csv",
        )
        st.dataframe(df_results)

        # 3D PCA Plot
        st.subheader("3D PCA of Embeddings")
        if len(all_texts) < PCA_COMPONENTS:
            # PCA cannot extract more components than there are samples; with
            # the primary keyword plus a single keyword it would raise.
            st.info(
                "Add at least two keywords to compare to display the 3D PCA plot."
            )
        else:
            pca = PCA(n_components=PCA_COMPONENTS)
            pca_result = pca.fit_transform(mrl_embeddings)
            pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2", "PC3"])
            pca_df["Label"] = ["Primary"] + keyword_list
            fig = px.scatter_3d(
                pca_df, x="PC1", y="PC2", z="PC3", color="Label", text="Label"
            )
            st.plotly_chart(fig, use_container_width=True)

        with st.expander("🔧 Technical Details (click to expand)"):
            st.write("Primary Embedding:", primary_embedding.tolist())
            st.write("Keyword Embeddings:", keyword_embeddings.tolist())

# Footer
st.markdown("---")
st.markdown("Created by [Ryland Bacorn](https://huggingface.co/ReithBjarkan). Report a [bug or make a suggestion](mailto:rybacorn@gmail.com)")