Spaces:

dejanseo
/

qsim

Sleeping

App Files Files Community

qsim / src /streamlit_app.py

dejanseo

Update src/streamlit_app.py

eb280ef verified 7 months ago

raw

history blame contribute delete

4.37 kB

	import os

	# Route all cache and config writes to Docker-writable dirs.
	# These assignments must run BEFORE the Hugging Face stack is imported:
	# huggingface_hub / transformers (pulled in by sentence_transformers) read
	# their cache locations from the environment at import time, so setting the
	# variables afterwards would leave the default (possibly read-only) paths
	# in effect.
	os.environ["HF_HOME"] = "/app/hf_cache"
	os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
	os.environ["XDG_CACHE_HOME"] = "/app/hf_cache"
	os.environ["HOME"] = "/app"

	import numpy as np  # noqa: E402  (imports intentionally follow env setup)
	import pandas as pd  # noqa: E402
	import plotly.express as px  # noqa: E402
	import streamlit as st  # noqa: E402
	from sentence_transformers import SentenceTransformer  # noqa: E402
	from sklearn.decomposition import PCA  # noqa: E402
	from sklearn.metrics.pairwise import cosine_similarity  # noqa: E402

	# Page heading
	st.title("Keyword Cosine Similarity Tool")

	# One-paragraph statement of what the tool is for
	overview_md = """
	Purpose:
	Elevate the most semantically relevant queries from keyword research.
	"""
	st.markdown(overview_md)

	# Input widgets: one primary keyword plus a free-form list to compare against
	st.header("Input Parameters")
	primary_keyword = st.text_input(
	    "Primary Keyword",
	    placeholder="Enter your primary keyword",
	)
	keywords_help = "You can input keywords on separate lines or separated by commas. Any keywords with commas will be treated as separate queries."
	# The widget stays registered under the "keywords" session-state key; the
	# value it returns is the same text stored under that key.
	keywords = st.text_area(
	    "Keywords to Compare",
	    placeholder="Enter keywords separated by new lines or commas",
	    help=keywords_help,
	    key="keywords",
	)

	# Collapsible usage instructions
	instructions_md = """
	How to use this tool:
	1. Enter your Primary Keyword.
	2. Provide a list of Keywords to Compare.
	3. Click Calculate Similarities to compute and rank your keywords by relevance.

	Output:
	- Sorted table of keywords by cosine similarity.
	- Interactive 3D PCA plot of keyword embeddings.
	- Option to download results.
	"""
	with st.expander("ℹ️ Instructions (click for details)"):
	    st.markdown(instructions_md)

	# Process Button
	# On click: parse the keyword list, embed the primary keyword together with
	# the keywords, rank the keywords by cosine similarity to the primary one,
	# then show a results table, a CSV download, a 3D PCA plot and the raw vectors.
	if st.button("Calculate Similarities"):
	    # Commas and newlines both act as separators; blank entries are dropped.
	    keyword_list = [
	        kw.strip()
	        for kw in (keywords or "").replace(",", "\n").split("\n")
	        if kw.strip()
	    ]

	    # Validate the PARSED inputs: text such as "," or whitespace is truthy but
	    # yields no keywords, which would otherwise crash the similarity step.
	    if not (primary_keyword or "").strip() or not keyword_list:
	        st.error("Please provide both the primary keyword and keywords to compare.")
	    else:
	        model_name = "mixedbread-ai/mxbai-embed-large-v1"

	        try:
	            st.info(f"Loading model: {model_name}")
	            model = SentenceTransformer(model_name)
	        except Exception as e:
	            st.error(f"Failed to load model: {e}")
	            st.stop()

	        try:
	            st.info("Generating embeddings...")
	            # Row 0 is the primary keyword; rows 1.. follow keyword_list order.
	            all_texts = [primary_keyword] + keyword_list
	            embeddings = model.encode(all_texts, normalize_embeddings=True)

	            # Apply Matryoshka Representation Learning: slice to 256 dims
	            mrl_embeddings = embeddings[:, :256]

	            primary_embedding = mrl_embeddings[0]
	            keyword_embeddings = mrl_embeddings[1:]
	        except Exception as e:
	            st.error(f"Embedding failed: {e}")
	            st.stop()

	        # Cosine similarities
	        st.info("Calculating cosine similarities...")
	        similarities = cosine_similarity([primary_embedding], keyword_embeddings)[0]
	        results = [
	            {"Keyword": kw, "Cosine Similarity": sim}
	            for kw, sim in zip(keyword_list, similarities)
	        ]
	        # Most similar keyword first.
	        sorted_results = sorted(results, key=lambda x: x["Cosine Similarity"], reverse=True)

	        df_results = pd.DataFrame(sorted_results)

	        # Output
	        st.header("Results")

	        st.download_button(
	            label="📥 Download Results as CSV",
	            data=df_results.to_csv(index=False),
	            file_name="cosine_similarity_results.csv",
	            mime="text/csv"
	        )

	        st.dataframe(df_results)

	        # 3D PCA Plot
	        st.subheader("3D PCA of Embeddings")
	        n_pca_components = 3
	        # PCA cannot extract more components than there are samples, so a 3D
	        # projection needs at least three embedded points (primary + 2 keywords).
	        if mrl_embeddings.shape[0] < n_pca_components:
	            st.info("Enter at least two keywords to compare to display the 3D PCA plot.")
	        else:
	            pca = PCA(n_components=n_pca_components)
	            pca_result = pca.fit_transform(mrl_embeddings)

	            pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2", "PC3"])
	            # Labels follow the embedding row order: primary first, then keywords.
	            pca_df["Label"] = ["Primary"] + keyword_list
	            fig = px.scatter_3d(pca_df, x="PC1", y="PC2", z="PC3", color="Label", text="Label")
	            st.plotly_chart(fig, use_container_width=True)

	        with st.expander("🔧 Technical Details (click to expand)"):
	            st.write("Primary Embedding:", primary_embedding.tolist())
	            st.write("Keyword Embeddings:", keyword_embeddings.tolist())

	# Footer: a horizontal rule followed by the attribution / contact line
	footer_sections = (
	    "---",
	    "Created by [Ryland Bacorn](https://huggingface.co/ReithBjarkan). Report a [bug or make a suggestion](mailto:rybacorn@gmail.com)",
	)
	for footer_md in footer_sections:
	    st.markdown(footer_md)