"""Keyword Clusterizer — Streamlit app.

Uploads a CSV, embeds user-selected text columns with a SentenceTransformer,
clusters the embeddings with KMeans (auto-tuned via silhouette score or a
manual k), labels each cluster with its most-central member text, and offers
the clustered CSV for download.
"""

import time

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

st.set_page_config(page_title="Keyword Clusterizer", layout="wide")
st.title("🔍 Keyword Clusterizer")
st.markdown(
    """by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)
""",
    unsafe_allow_html=True,
)

# === STEP 1: Upload File
st.markdown("### 📁 Step 1: Upload a CSV file")
uploaded_file = st.file_uploader("Upload your CSV", type="csv")

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.success("✅ File uploaded successfully!")
    st.markdown("#### Preview of your file:")
    st.dataframe(df.head())

    # === STEP 2: Column selection (first!)
    st.markdown("### 🧠 Step 2: Select Columns for Clustering")
    text_cols = st.multiselect(
        "Choose one or more columns to combine for semantic clustering:",
        df.columns.tolist(),
        help="These columns will be combined into one text for each row.",
    )

    if text_cols:
        # === STEP 3: Clustering Settings
        st.markdown("### ⚙️ Step 3: Configure Clustering")
        clustering_mode = st.radio(
            "Choose clustering mode:", ["Auto (Silhouette Score)", "Manual"]
        )
        manual_k = None
        if clustering_mode == "Manual":
            manual_k = st.number_input(
                "Enter the number of clusters:", min_value=2, step=1, value=10
            )

        with st.form("clustering_form"):
            submitted = st.form_submit_button("🚀 Run Clustering")

        if submitted:
            status_msg = st.empty()
            final_scroll = st.empty()

            # Combine the selected columns into one text per row.
            # BUG FIX: fillna must run BEFORE astype(str) — otherwise NaN is
            # stringified to the literal "nan" and fillna("") is a no-op,
            # polluting the text fed to the embedding model.
            df_text = df[text_cols].fillna("").astype(str)
            df_text["combined"] = df_text.apply(" ".join, axis=1)
            texts = df_text["combined"].tolist()

            with st.spinner("Generating semantic embeddings..."):
                model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
                embeddings = model.encode(texts, show_progress_bar=True)

            # Auto or manual number of clusters.
            # ROBUSTNESS: KMeans and silhouette_score require
            # 2 <= k <= n_samples - 1, so cap k against the row count in
            # both modes to avoid a crash on small files.
            max_k = max(2, len(texts) - 1)
            if clustering_mode == "Auto (Silhouette Score)":
                status_msg.info("🔎 Testing cluster sizes from 2 to 20 to find the best one...")
                scores = []
                for k in range(2, min(21, max_k + 1)):
                    km = KMeans(n_clusters=k, random_state=42, n_init=10)
                    labels = km.fit_predict(embeddings)
                    score = silhouette_score(embeddings, labels)
                    scores.append((k, score))
                # Keep the k with the highest silhouette score.
                n_clusters = max(scores, key=lambda x: x[1])[0]
                status_msg.success(f"✅ Best number of clusters: {n_clusters}")
            else:
                n_clusters = min(int(manual_k), max_k)
                status_msg.info(f"📌 Using manually selected number of clusters: {n_clusters}")

            # Run clustering
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            df["cluster"] = kmeans.fit_predict(embeddings)

            # Label each cluster with the member text most similar (cosine)
            # to the cluster centroid.
            centroids = kmeans.cluster_centers_
            similarities = cosine_similarity(embeddings, centroids)
            cluster_labels = []
            for i in range(n_clusters):
                cluster_indices = np.where(df["cluster"] == i)[0]
                cluster_sims = similarities[cluster_indices, i]
                best_index = cluster_indices[np.argmax(cluster_sims)]
                cluster_labels.append(df_text.iloc[best_index]["combined"])

            label_map = {i: label for i, label in enumerate(cluster_labels)}
            df["cluster_label"] = df["cluster"].map(label_map)

            status_msg.empty()
            time.sleep(0.3)
            final_scroll.success(f"✅ Clustering complete! {n_clusters} clusters found.")

            # === Show results
            st.markdown("### 📊 Clustered Results")
            st.dataframe(df[[*text_cols, "cluster_label"]].head(20))

            # === Download option
            csv = df.to_csv(index=False).encode("utf-8")
            st.download_button("📥 Download Clustered CSV", csv, "semantic_clusters.csv", "text/csv")
    else:
        st.info("👉 Select at least one column to continue.")