# Hugging Face Space: dejanseo/qsim (status: Sleeping)
# File size: 4,367 Bytes

import os

# Route all cache and config writes to Docker-writable dirs.
# These assignments must run BEFORE sentence_transformers (and, through it,
# huggingface_hub / transformers) is imported: those libraries resolve their
# cache locations from the environment at import time, so setting the
# variables afterwards leaves the default (possibly read-only) paths in use.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.environ["XDG_CACHE_HOME"] = "/app/hf_cache"
os.environ["HOME"] = "/app"

# Third-party imports intentionally follow the environment setup above.
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
import plotly.express as px  # noqa: E402
import streamlit as st  # noqa: E402
from sentence_transformers import SentenceTransformer  # noqa: E402
from sklearn.decomposition import PCA  # noqa: E402
from sklearn.metrics.pairwise import cosine_similarity  # noqa: E402

# Page heading shown at the top of the app
_page_title = "Keyword Cosine Similarity Tool"
st.title(_page_title)

# One-paragraph statement of what the tool is for
_overview_md = """
    **Purpose:**  
    Elevate the most semantically relevant queries from keyword research.
    """
st.markdown(_overview_md)

# Inputs: one primary keyword plus a free-form list of keywords to rank
st.header("Input Parameters")
primary_keyword = st.text_input(
    label="Primary Keyword",
    placeholder="Enter your primary keyword",
)
# The widget is registered under the "keywords" session-state key; its
# return value is the same text that is stored there.
keywords = st.text_area(
    label="Keywords to Compare",
    placeholder="Enter keywords separated by new lines or commas",
    help="You can input keywords on separate lines or separated by commas. Any keywords with commas will be treated as separate queries.",
    key="keywords",
)

# Collapsible usage instructions
_instructions_md = """
        **How to use this tool:**
        1. Enter your **Primary Keyword**.
        2. Provide a list of **Keywords to Compare**.
        3. Click **Calculate Similarities** to compute and rank your keywords by relevance.

        **Output:**
        - Sorted table of keywords by cosine similarity.
        - Interactive 3D PCA plot of keyword embeddings.
        - Option to download results.
        """
_instructions_panel = st.expander("ℹ️ Instructions (click for details)")
_instructions_panel.markdown(_instructions_md)

# Number of leading embedding dimensions kept (Matryoshka-style truncation).
MRL_DIMS = 256


@st.cache_resource(show_spinner=False)
def _load_model(model_name):
    """Return a SentenceTransformer for *model_name*, cached across reruns.

    Streamlit re-executes the whole script on every interaction; caching the
    model as a resource avoids re-instantiating it on each button click.
    """
    return SentenceTransformer(model_name)


def _parse_keywords(raw_text):
    """Split *raw_text* on commas and newlines into stripped, non-empty keywords."""
    return [kw.strip() for kw in raw_text.replace(",", "\n").split("\n") if kw.strip()]


# Process Button
# NOTE(review): clicking the download button below triggers a script rerun in
# which st.button() returns False, so the results section disappears after a
# download. Consider persisting the results in st.session_state if that matters.
if st.button("Calculate Similarities"):
    primary_text = (primary_keyword or "").strip()
    keyword_list = _parse_keywords(keywords or "")

    # Validate the *parsed* inputs: text such as "," or "   " is truthy but
    # yields no keywords, which would otherwise crash cosine_similarity below.
    if not primary_text or not keyword_list:
        st.error("Please provide both the primary keyword and keywords to compare.")
    else:
        model_name = "mixedbread-ai/mxbai-embed-large-v1"

        st.info(f"Loading model: {model_name}")
        try:
            model = _load_model(model_name)
        except Exception as e:
            st.error(f"Failed to load model: {e}")
            st.stop()

        # Row 0 is the primary keyword; rows 1.. are the keywords to compare.
        all_texts = [primary_text, *keyword_list]

        st.info("Generating embeddings...")
        try:
            embeddings = model.encode(all_texts, normalize_embeddings=True)
            # Apply Matryoshka Representation Learning: keep the first MRL_DIMS dims
            mrl_embeddings = embeddings[:, :MRL_DIMS]
        except Exception as e:
            st.error(f"Embedding failed: {e}")
            st.stop()

        primary_embedding = mrl_embeddings[0]
        keyword_embeddings = mrl_embeddings[1:]

        # Cosine similarities of every keyword against the primary keyword
        st.info("Calculating cosine similarities...")
        similarities = cosine_similarity([primary_embedding], keyword_embeddings)[0]
        results = [
            {"Keyword": kw, "Cosine Similarity": sim}
            for kw, sim in zip(keyword_list, similarities)
        ]
        # Most similar first; sorted() is stable, so ties keep input order.
        sorted_results = sorted(results, key=lambda x: x["Cosine Similarity"], reverse=True)

        df_results = pd.DataFrame(sorted_results)

        # Output
        st.header("Results")

        st.download_button(
            label="📥 Download Results as CSV",
            data=df_results.to_csv(index=False),
            file_name="cosine_similarity_results.csv",
            mime="text/csv"
        )

        st.dataframe(df_results)

        # 3D PCA Plot
        st.subheader("3D PCA of Embeddings")
        if len(all_texts) < 3:
            # PCA cannot extract 3 components from fewer than 3 samples
            # (primary keyword + at least two keywords are required).
            st.info("Enter at least two keywords to compare to see the 3D PCA plot.")
        else:
            pca = PCA(n_components=3)
            pca_result = pca.fit_transform(mrl_embeddings)

            pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2", "PC3"])
            pca_df["Label"] = ["Primary"] + keyword_list
            fig = px.scatter_3d(pca_df, x="PC1", y="PC2", z="PC3", color="Label", text="Label")
            st.plotly_chart(fig, use_container_width=True)

        with st.expander("🔧 Technical Details (click to expand)"):
            st.write("Primary Embedding:", primary_embedding.tolist())
            st.write("Keyword Embeddings:", keyword_embeddings.tolist())

# Footer: horizontal rule followed by author credit and contact link
_footer_credit_md = "Created by [Ryland Bacorn](https://huggingface.co/ReithBjarkan). Report a [bug or make a suggestion](mailto:&#114;&#121;&#98;&#97;&#99;&#111;&#114;&#110;&#64;&#103;&#109;&#97;&#105;&#108;&#46;&#99;&#111;&#109;)"
for _footer_chunk in ("---", _footer_credit_md):
    st.markdown(_footer_chunk)