# app.py — Keyword Clusterizer (Hugging Face Space by Flopot2, commit a9ec35e)
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time
st.set_page_config(page_title="Keyword Clusterizer", layout="wide")
st.title("🔍 Keyword Clusterizer")
st.markdown("""by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)
""", unsafe_allow_html=True)

# === STEP 1: Upload File
st.markdown("### 📁 Step 1: Upload a CSV file")
uploaded_file = st.file_uploader("Upload your CSV", type="csv")

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.success("✅ File uploaded successfully!")
    st.markdown("#### Preview of your file:")
    st.dataframe(df.head())

    # === STEP 2: Column selection (first!)
    st.markdown("### 🧠 Step 2: Select Columns for Clustering")
    text_cols = st.multiselect("Choose one or more columns to combine for semantic clustering:",
                               df.columns.tolist(),
                               help="These columns will be combined into one text for each row.")

    if text_cols:
        # === STEP 3: Clustering Settings
        st.markdown("### ⚙️ Step 3: Configure Clustering")
        clustering_mode = st.radio("Choose clustering mode:", ["Auto (Silhouette Score)", "Manual"])
        manual_k = None
        if clustering_mode == "Manual":
            manual_k = st.number_input("Enter the number of clusters:", min_value=2, step=1, value=10)

        # The form batches widget interaction so the expensive work below only
        # runs when the user explicitly submits.
        with st.form("clustering_form"):
            submitted = st.form_submit_button("🚀 Run Clustering")

        if submitted:
            status_msg = st.empty()
            final_scroll = st.empty()

            # Combine selected columns into one text per row.
            # NOTE: fillna must run BEFORE astype(str) — astype(str) turns NaN
            # into the literal string "nan", which fillna would then miss.
            df_text = df[text_cols].fillna("").astype(str)
            df_text["combined"] = df_text.agg(" ".join, axis=1)
            texts = df_text["combined"].tolist()

            with st.spinner("Generating semantic embeddings..."):
                model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
                embeddings = model.encode(texts, show_progress_bar=True)

            # Auto or manual number of clusters
            if clustering_mode == "Auto (Silhouette Score)":
                # silhouette_score requires 2 <= k <= n_samples - 1,
                # so clamp the search range for small files.
                max_k = min(20, len(texts) - 1)
                if max_k < 2:
                    st.error("Not enough rows to cluster — need at least 3.")
                    st.stop()
                status_msg.info("🔎 Testing cluster sizes from 2 to 20 to find the best one...")
                scores = []
                for k in range(2, max_k + 1):
                    km = KMeans(n_clusters=k, random_state=42, n_init=10)
                    labels = km.fit_predict(embeddings)
                    scores.append((k, silhouette_score(embeddings, labels)))
                # Pick the k with the highest silhouette score.
                n_clusters = max(scores, key=lambda x: x[1])[0]
                status_msg.success(f"✅ Best number of clusters: {n_clusters}")
            else:
                n_clusters = manual_k
                status_msg.info(f"📌 Using manually selected number of clusters: {n_clusters}")

            # Run clustering
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            df['cluster'] = kmeans.fit_predict(embeddings)

            # Label each cluster with its most central row: the member whose
            # embedding has the highest cosine similarity to the centroid.
            centroids = kmeans.cluster_centers_
            similarities = cosine_similarity(embeddings, centroids)
            cluster_labels = []
            for i in range(n_clusters):
                cluster_indices = np.where(df['cluster'] == i)[0]
                cluster_sims = similarities[cluster_indices, i]
                best_index = cluster_indices[np.argmax(cluster_sims)]
                cluster_labels.append(df_text.iloc[best_index]["combined"])
            label_map = {i: label for i, label in enumerate(cluster_labels)}
            df['cluster_label'] = df['cluster'].map(label_map)

            status_msg.empty()
            time.sleep(0.3)  # brief pause so the success banner is noticed
            final_scroll.success(f"✅ Clustering complete! {n_clusters} clusters found.")

            # === Show results
            st.markdown("### 📊 Clustered Results")
            st.dataframe(df[[*text_cols, 'cluster_label']].head(20))

            # === Download option
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button("📥 Download Clustered CSV", csv, "semantic_clusters.csv", "text/csv")
    else:
        st.info("👉 Select at least one column to continue.")