File size: 4,367 Bytes
187da91
 
 
 
237e96c
 
187da91
7a39464
fbf3acc
 
 
 
eb280ef
187da91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
896fe0d
187da91
 
 
 
 
 
 
 
 
dc71e16
 
 
 
 
 
 
 
 
187da91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc71e16
187da91
 
 
 
 
 
 
dc71e16
 
237e96c
187da91
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os

# Route all cache and config writes to Docker-writable dirs.
#
# These assignments deliberately come BEFORE the third-party imports below:
# the Hugging Face libraries pulled in by sentence_transformers resolve their
# cache locations from the environment when they are first imported, so
# setting the variables afterwards is too late to redirect the model cache.
# TRANSFORMERS_CACHE is kept alongside HF_HOME for older transformers
# releases that still read it.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.environ["XDG_CACHE_HOME"] = "/app/hf_cache"
os.environ["HOME"] = "/app"

import streamlit as st  # noqa: E402  (must follow the env setup above)
from sentence_transformers import SentenceTransformer  # noqa: E402
from sklearn.metrics.pairwise import cosine_similarity  # noqa: E402
from sklearn.decomposition import PCA  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
import plotly.express as px  # noqa: E402

# ---- Page header -----------------------------------------------------------
st.title("Keyword Cosine Similarity Tool")

# One-line statement of what the tool is for. The whitespace is spelled out
# explicitly so the two trailing spaces after "**Purpose:**" (a Markdown hard
# line break) cannot be lost to an editor that trims trailing whitespace.
overview_markdown = (
    "\n"
    "    **Purpose:**  \n"
    "    Elevate the most semantically relevant queries from keyword research.\n"
    "    "
)
st.markdown(overview_markdown)

# ---- Input widgets ---------------------------------------------------------
st.header("Input Parameters")
primary_keyword = st.text_input(
    "Primary Keyword",
    placeholder="Enter your primary keyword",
)
# The widget keeps its "keywords" session-state key; the text it returns is
# the same value that key holds, so it is used directly.
keywords = st.text_area(
    "Keywords to Compare",
    placeholder="Enter keywords separated by new lines or commas",
    help="You can input keywords on separate lines or separated by commas. Any keywords with commas will be treated as separate queries.",
    key="keywords",
)

# ---- Collapsible usage instructions ----------------------------------------
instructions_markdown = (
    "\n"
    "        **How to use this tool:**\n"
    "        1. Enter your **Primary Keyword**.\n"
    "        2. Provide a list of **Keywords to Compare**.\n"
    "        3. Click **Calculate Similarities** to compute and rank your keywords by relevance.\n"
    "\n"
    "        **Output:**\n"
    "        - Sorted table of keywords by cosine similarity.\n"
    "        - Interactive 3D PCA plot of keyword embeddings.\n"
    "        - Option to download results.\n"
    "        "
)
with st.expander("ℹ️ Instructions (click for details)"):
    st.markdown(instructions_markdown)

# ---- Similarity calculation --------------------------------------------------
# Sentence-embedding model used for every comparison.
EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
# Matryoshka Representation Learning: only the leading dimensions are kept.
MRL_DIMENSIONS = 256
# The scatter plot is 3D, so PCA must produce three components. PCA cannot
# extract more components than there are rows (or columns) in its input.
PCA_COMPONENTS = 3


@st.cache_resource(show_spinner=False)
def _load_model(model_name: str) -> SentenceTransformer:
    """Instantiate the embedding model, reusing it across reruns.

    Streamlit re-executes the whole script on every interaction; caching the
    model as a resource avoids re-loading the weights on each button click.
    Exceptions raised while loading propagate to the caller.
    """
    return SentenceTransformer(model_name)


def _parse_keywords(raw_text: str) -> list:
    """Split user input on commas and newlines, dropping blank entries.

    Each entry is stripped of surrounding whitespace. Input made up only of
    separators and whitespace (for example ``","``) yields an empty list.
    """
    candidates = raw_text.replace(",", "\n").split("\n")
    return [kw.strip() for kw in candidates if kw.strip()]


if st.button("Calculate Similarities"):
    # Validate the *parsed* input: whitespace-only text or a lone separator is
    # truthy as a raw string but contains nothing to embed.
    primary_text = (primary_keyword or "").strip()
    keyword_list = _parse_keywords(keywords or "")

    if not primary_text or not keyword_list:
        st.error("Please provide both the primary keyword and keywords to compare.")
    else:
        try:
            st.info(f"Loading model: {EMBEDDING_MODEL_NAME}")
            model = _load_model(EMBEDDING_MODEL_NAME)
        except Exception as e:
            st.error(f"Failed to load model: {e}")
            st.stop()

        try:
            st.info("Generating embeddings...")
            # Row 0 is the primary keyword; rows 1.. follow keyword_list order.
            all_texts = [primary_text] + keyword_list
            embeddings = model.encode(all_texts, normalize_embeddings=True)

            # Apply Matryoshka Representation Learning: keep the leading dims.
            # The truncated vectors are no longer unit length, which is fine
            # because cosine_similarity normalises its inputs itself.
            mrl_embeddings = embeddings[:, :MRL_DIMENSIONS]

            primary_embedding = mrl_embeddings[0]
            keyword_embeddings = mrl_embeddings[1:]
        except Exception as e:
            st.error(f"Embedding failed: {e}")
            st.stop()

        # Cosine similarities of every keyword against the primary keyword.
        st.info("Calculating cosine similarities...")
        similarities = cosine_similarity([primary_embedding], keyword_embeddings)[0]
        results = [
            {"Keyword": kw, "Cosine Similarity": sim}
            for kw, sim in zip(keyword_list, similarities)
        ]
        # Most similar first; ties keep their input order (stable sort).
        sorted_results = sorted(
            results, key=lambda row: row["Cosine Similarity"], reverse=True
        )

        df_results = pd.DataFrame(sorted_results)

        # Output
        st.header("Results")

        # NOTE(review): clicking a download button normally triggers a script
        # rerun, during which st.button above returns False, so the results
        # likely vanish after downloading. Persisting them in
        # st.session_state would avoid that -- confirm and address separately.
        st.download_button(
            label="📥 Download Results as CSV",
            data=df_results.to_csv(index=False),
            file_name="cosine_similarity_results.csv",
            mime="text/csv"
        )

        st.dataframe(df_results)

        # 3D PCA Plot
        st.subheader("3D PCA of Embeddings")
        if min(mrl_embeddings.shape) < PCA_COMPONENTS:
            # With the primary keyword plus a single comparison keyword there
            # are only two rows, and PCA(n_components=3) would raise.
            st.info("Enter at least two keywords to compare to see the 3D PCA plot.")
        else:
            pca = PCA(n_components=PCA_COMPONENTS)
            pca_result = pca.fit_transform(mrl_embeddings)

            pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2", "PC3"])
            # Same row order as all_texts: primary first, then the keywords.
            pca_df["Label"] = ["Primary"] + keyword_list
            fig = px.scatter_3d(pca_df, x="PC1", y="PC2", z="PC3", color="Label", text="Label")
            st.plotly_chart(fig, use_container_width=True)

        with st.expander("🔧 Technical Details (click to expand)"):
            st.write("Primary Embedding:", primary_embedding.tolist())
            st.write("Keyword Embeddings:", keyword_embeddings.tolist())

# Footer: a horizontal rule, then the author credit with a contact link.
for footer_markdown in (
    "---",
    "Created by [Ryland Bacorn](https://huggingface.co/ReithBjarkan). Report a [bug or make a suggestion](mailto:rybacorn@gmail.com)",
):
    st.markdown(footer_markdown)