# Keyword Clusterizer — Streamlit app (Hugging Face Space).
# NOTE: "Spaces: Sleeping" banner removed — it was page-scrape residue, not code.
# Keyword Clusterizer: upload a CSV, embed selected text columns with a
# sentence-transformer, cluster rows with KMeans, and label each cluster
# with the member text closest to its centroid.
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time

st.set_page_config(page_title="Keyword Clusterizer", layout="wide")
st.title("π Keyword Clusterizer")
st.markdown("""by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)
""", unsafe_allow_html=True)

# === STEP 1: Upload File
st.markdown("### π Step 1: Upload a CSV file")
uploaded_file = st.file_uploader("Upload your CSV", type="csv")

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.success("β File uploaded successfully!")
    st.markdown("#### Preview of your file:")
    st.dataframe(df.head())

    # === STEP 2: Column selection (first!)
    st.markdown("### π§ Step 2: Select Columns for Clustering")
    text_cols = st.multiselect(
        "Choose one or more columns to combine for semantic clustering:",
        df.columns.tolist(),
        help="These columns will be combined into one text for each row.",
    )

    if text_cols:
        # === STEP 3: Clustering Settings
        st.markdown("### βοΈ Step 3: Configure Clustering")
        clustering_mode = st.radio("Choose clustering mode:", ["Auto (Silhouette Score)", "Manual"])
        manual_k = None
        if clustering_mode == "Manual":
            manual_k = st.number_input("Enter the number of clusters:", min_value=2, step=1, value=10)

        with st.form("clustering_form"):
            submitted = st.form_submit_button("π Run Clustering")

        if submitted:
            status_msg = st.empty()
            final_scroll = st.empty()

            # Combine selected columns into one text per row.
            # BUGFIX: fill NaN *before* the str cast — casting first turned
            # missing cells into the literal string "nan".
            df_text = df[text_cols].fillna("").astype(str)
            df_text["combined"] = df_text.apply(lambda row: " ".join(row), axis=1)
            texts = df_text["combined"].tolist()

            with st.spinner("Generating semantic embeddings..."):
                model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
                embeddings = model.encode(texts, show_progress_bar=True)

            # Auto or manual number of clusters
            if clustering_mode == "Auto (Silhouette Score)":
                # silhouette_score requires 2 <= k <= n_samples - 1, so cap
                # the search range for small uploads instead of crashing.
                max_k = min(20, len(texts) - 1)
                if max_k < 2:
                    status_msg.error("Not enough rows to cluster automatically (need at least 3).")
                    st.stop()
                status_msg.info("π Testing cluster sizes from 2 to 20 to find the best one...")
                scores = []
                for k in range(2, max_k + 1):
                    km = KMeans(n_clusters=k, random_state=42, n_init=10)
                    labels = km.fit_predict(embeddings)
                    scores.append((k, silhouette_score(embeddings, labels)))
                # Pick the k with the highest silhouette score.
                n_clusters = max(scores, key=lambda x: x[1])[0]
                status_msg.success(f"β Best number of clusters: {n_clusters}")
            else:
                n_clusters = int(manual_k)
                status_msg.info(f"π Using manually selected number of clusters: {n_clusters}")

            # Run clustering
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            df['cluster'] = kmeans.fit_predict(embeddings)

            # Label each cluster with the member text most similar to its centroid.
            centroids = kmeans.cluster_centers_
            similarities = cosine_similarity(embeddings, centroids)
            cluster_labels = []
            for i in range(n_clusters):
                cluster_indices = np.where(df['cluster'] == i)[0]
                cluster_sims = similarities[cluster_indices, i]
                best_index = cluster_indices[np.argmax(cluster_sims)]
                cluster_labels.append(df_text.iloc[best_index]["combined"])
            label_map = {i: label for i, label in enumerate(cluster_labels)}
            df['cluster_label'] = df['cluster'].map(label_map)

            status_msg.empty()
            time.sleep(0.3)  # brief pause so the status swap is visible
            final_scroll.success(f"β Clustering complete! {n_clusters} clusters found.")

            # === Show results
            st.markdown("### π Clustered Results")
            st.dataframe(df[[*text_cols, 'cluster_label']].head(20))

            # === Download option
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button("π₯ Download Clustered CSV", csv, "semantic_clusters.csv", "text/csv")
    else:
        st.info("π Select at least one column to continue.")