Spaces:
Sleeping
Sleeping
File size: 4,445 Bytes
"""Keyword Clusterizer — a Streamlit app that semantically clusters rows of a CSV.

Flow: upload a CSV, pick one or more text columns, then cluster the combined
text with sentence-transformer embeddings + KMeans. The number of clusters is
either chosen automatically (best silhouette score over k=2..20) or entered
manually. Each cluster is labeled with the row whose embedding is closest to
the cluster centroid, and the labeled CSV can be downloaded.
"""
import time

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

st.set_page_config(page_title="Keyword Clusterizer", layout="wide")
st.title("Keyword Clusterizer")
st.markdown(
    "by [Florian Potier](https://twitter.com/FloPots) - "
    "[Intrepid Digital](https://www.intrepidonline.com/)",
    unsafe_allow_html=True,
)

# === STEP 1: Upload File
st.markdown("### Step 1: Upload a CSV file")
uploaded_file = st.file_uploader("Upload your CSV", type="csv")

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.success("File uploaded successfully!")
    st.markdown("#### Preview of your file:")
    st.dataframe(df.head())

    # === STEP 2: Column selection (must happen before clustering settings)
    st.markdown("### Step 2: Select Columns for Clustering")
    text_cols = st.multiselect(
        "Choose one or more columns to combine for semantic clustering:",
        df.columns.tolist(),
        help="These columns will be combined into one text for each row.",
    )

    if text_cols:
        # === STEP 3: Clustering settings
        st.markdown("### Step 3: Configure Clustering")
        clustering_mode = st.radio(
            "Choose clustering mode:", ["Auto (Silhouette Score)", "Manual"]
        )
        manual_k = None
        if clustering_mode == "Manual":
            manual_k = st.number_input(
                "Enter the number of clusters:", min_value=2, step=1, value=10
            )

        # The form batches the button press so the page only reruns on submit.
        with st.form("clustering_form"):
            submitted = st.form_submit_button("Run Clustering")

        if submitted:
            status_msg = st.empty()    # transient progress messages
            final_scroll = st.empty()  # final completion banner

            # Combine the selected columns into one text string per row.
            df_text = df[text_cols].astype(str).fillna("")
            df_text["combined"] = df_text.apply(lambda row: " ".join(row), axis=1)
            texts = df_text["combined"].tolist()

            with st.spinner("Generating semantic embeddings..."):
                model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
                embeddings = model.encode(texts, show_progress_bar=True)

            # Choose k automatically (best silhouette score) or use the manual value.
            if clustering_mode == "Auto (Silhouette Score)":
                # KMeans/silhouette need n_samples > n_clusters, so cap the
                # sweep at len(texts) - 1 to avoid crashing on tiny files.
                max_k = min(20, len(texts) - 1)
                status_msg.info(
                    f"Testing cluster sizes from 2 to {max_k} to find the best one..."
                )
                scores = []
                for k in range(2, max_k + 1):
                    km = KMeans(n_clusters=k, random_state=42, n_init=10)
                    labels = km.fit_predict(embeddings)
                    score = silhouette_score(embeddings, labels)
                    scores.append((k, score))
                n_clusters = max(scores, key=lambda x: x[1])[0]
                status_msg.success(f"Best number of clusters: {n_clusters}")
            else:
                n_clusters = manual_k
                status_msg.info(
                    f"Using manually selected number of clusters: {n_clusters}"
                )

            # Final clustering run with the chosen k.
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            df["cluster"] = kmeans.fit_predict(embeddings)

            # Label each cluster with the row whose embedding is most similar
            # (cosine) to that cluster's centroid.
            centroids = kmeans.cluster_centers_
            similarities = cosine_similarity(embeddings, centroids)
            cluster_labels = []
            for i in range(n_clusters):
                cluster_indices = np.where(df["cluster"] == i)[0]
                cluster_sims = similarities[cluster_indices, i]
                best_index = cluster_indices[np.argmax(cluster_sims)]
                cluster_labels.append(df_text.iloc[best_index]["combined"])
            label_map = {i: label for i, label in enumerate(cluster_labels)}
            df["cluster_label"] = df["cluster"].map(label_map)

            status_msg.empty()
            time.sleep(0.3)  # brief pause so the status message visibly clears
            final_scroll.success(f"Clustering complete! {n_clusters} clusters found.")

            # === Show results
            st.markdown("### Clustered Results")
            st.dataframe(df[[*text_cols, "cluster_label"]].head(20))

            # === Download option
            csv = df.to_csv(index=False).encode("utf-8")
            st.download_button(
                "Download Clustered CSV", csv, "semantic_clusters.csv", "text/csv"
            )
    else:
        st.info("Select at least one column to continue.")