# Keyword Clusterizer — Streamlit app (Hugging Face Space).
# NOTE: "Spaces: Sleeping" banner removed — it was page-scrape residue, not code.
# Keyword Clusterizer: upload a CSV, embed selected text columns with a
# sentence-transformer, cluster rows with KMeans, and label each cluster
# with the member text closest to its centroid.
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time

st.set_page_config(page_title="Keyword Clusterizer", layout="wide")
st.title("π Keyword Clusterizer")
st.markdown("""by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)
""", unsafe_allow_html=True)

# === STEP 1: Upload File
st.markdown("### π Step 1: Upload a CSV file")
uploaded_file = st.file_uploader("Upload your CSV", type="csv")

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.success("β File uploaded successfully!")
    st.markdown("#### Preview of your file:")
    st.dataframe(df.head())

    # === STEP 2: Column selection (first!)
    st.markdown("### π§ Step 2: Select Columns for Clustering")
    text_cols = st.multiselect(
        "Choose one or more columns to combine for semantic clustering:",
        df.columns.tolist(),
        help="These columns will be combined into one text for each row.",
    )

    if text_cols:
        # === STEP 3: Clustering Settings
        st.markdown("### βοΈ Step 3: Configure Clustering")
        clustering_mode = st.radio("Choose clustering mode:", ["Auto (Silhouette Score)", "Manual"])
        manual_k = None
        if clustering_mode == "Manual":
            manual_k = st.number_input("Enter the number of clusters:", min_value=2, step=1, value=10)

        with st.form("clustering_form"):
            submitted = st.form_submit_button("π Run Clustering")

        if submitted:
            status_msg = st.empty()
            final_scroll = st.empty()

            # Combine selected columns into one text per row.
            # BUGFIX: fill NaN *before* the str cast — casting first turned
            # missing cells into the literal string "nan".
            df_text = df[text_cols].fillna("").astype(str)
            df_text["combined"] = df_text.apply(lambda row: " ".join(row), axis=1)
            texts = df_text["combined"].tolist()

            with st.spinner("Generating semantic embeddings..."):
                model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
                embeddings = model.encode(texts, show_progress_bar=True)

            # Auto or manual number of clusters
            if clustering_mode == "Auto (Silhouette Score)":
                # silhouette_score requires 2 <= k <= n_samples - 1, so cap
                # the search range for small uploads instead of crashing.
                max_k = min(20, len(texts) - 1)
                if max_k < 2:
                    status_msg.error("Not enough rows to cluster automatically (need at least 3).")
                    st.stop()
                status_msg.info("π Testing cluster sizes from 2 to 20 to find the best one...")
                scores = []
                for k in range(2, max_k + 1):
                    km = KMeans(n_clusters=k, random_state=42, n_init=10)
                    labels = km.fit_predict(embeddings)
                    scores.append((k, silhouette_score(embeddings, labels)))
                # Pick the k with the highest silhouette score.
                n_clusters = max(scores, key=lambda x: x[1])[0]
                status_msg.success(f"β Best number of clusters: {n_clusters}")
            else:
                n_clusters = int(manual_k)
                status_msg.info(f"π Using manually selected number of clusters: {n_clusters}")

            # Run clustering
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            df['cluster'] = kmeans.fit_predict(embeddings)

            # Label each cluster with the member text most similar to its centroid.
            centroids = kmeans.cluster_centers_
            similarities = cosine_similarity(embeddings, centroids)
            cluster_labels = []
            for i in range(n_clusters):
                cluster_indices = np.where(df['cluster'] == i)[0]
                cluster_sims = similarities[cluster_indices, i]
                best_index = cluster_indices[np.argmax(cluster_sims)]
                cluster_labels.append(df_text.iloc[best_index]["combined"])
            label_map = {i: label for i, label in enumerate(cluster_labels)}
            df['cluster_label'] = df['cluster'].map(label_map)

            status_msg.empty()
            time.sleep(0.3)  # brief pause so the status swap is visible
            final_scroll.success(f"β Clustering complete! {n_clusters} clusters found.")

            # === Show results
            st.markdown("### π Clustered Results")
            st.dataframe(df[[*text_cols, 'cluster_label']].head(20))

            # === Download option
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button("π₯ Download Clustered CSV", csv, "semantic_clusters.csv", "text/csv")
    else:
        st.info("π Select at least one column to continue.")