File size: 4,445 Bytes
31a12b3
 
 
 
 
 
 
c811848
31a12b3
 
c6c9fd9
3436afb
 
31a12b3
 
c6c9fd9
31a12b3
 
 
 
c6c9fd9
 
31a12b3
 
ccc1015
c6c9fd9
 
 
 
ccc1015
 
 
c6c9fd9
ccc1015
c6c9fd9
 
31a12b3
c6c9fd9
31a12b3
ccc1015
c6c9fd9
31a12b3
ccc1015
c6c9fd9
 
c811848
 
31a12b3
 
 
 
c6c9fd9
 
 
31a12b3
c811848
31a12b3
c6c9fd9
31a12b3
c811848
31a12b3
 
 
 
 
c6c9fd9
31a12b3
 
c6c9fd9
31a12b3
c6c9fd9
31a12b3
 
 
c6c9fd9
31a12b3
 
 
 
 
 
 
 
 
 
 
c811848
 
c6c9fd9
c811848
31a12b3
c6c9fd9
31a12b3
 
c811848
31a12b3
c6c9fd9
ccc1015
c6c9fd9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time

# --- Streamlit app: semantic keyword clustering ---
# Flow: upload CSV -> pick text columns -> choose k (auto via silhouette
# score, or manual) -> KMeans over sentence embeddings -> label each cluster
# with the row closest to its centroid -> preview + CSV download.
st.set_page_config(page_title="Keyword Clusterizer", layout="wide")
st.title("πŸ” Keyword Clusterizer")
st.markdown("""by [Florian Potier](https://twitter.com/FloPots) - [Intrepid Digital](https://www.intrepidonline.com/)
""", unsafe_allow_html=True)

# === STEP 1: Upload File
st.markdown("### πŸ“ Step 1: Upload a CSV file")
uploaded_file = st.file_uploader("Upload your CSV", type="csv")

if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.success("βœ… File uploaded successfully!")
    st.markdown("#### Preview of your file:")
    st.dataframe(df.head())

    # === STEP 2: Column selection (first!)
    st.markdown("### 🧠 Step 2: Select Columns for Clustering")
    text_cols = st.multiselect("Choose one or more columns to combine for semantic clustering:",
                               df.columns.tolist(),
                               help="These columns will be combined into one text for each row.")

    if text_cols:
        # === STEP 3: Clustering Settings
        st.markdown("### βš™οΈ Step 3: Configure Clustering")

        clustering_mode = st.radio("Choose clustering mode:", ["Auto (Silhouette Score)", "Manual"])
        manual_k = None
        if clustering_mode == "Manual":
            manual_k = st.number_input("Enter the number of clusters:", min_value=2, step=1, value=10)

        with st.form("clustering_form"):
            submitted = st.form_submit_button("πŸš€ Run Clustering")

        if submitted:
            status_msg = st.empty()
            final_scroll = st.empty()

            # Combine the selected columns into a single text per row.
            # BUGFIX: fillna() must run BEFORE astype(str); in the reverse
            # order NaN is stringified to the literal "nan" and fillna("")
            # never fires, polluting the embeddings with junk tokens.
            df_text = df[text_cols].fillna("").astype(str)
            df_text["combined"] = df_text[text_cols].agg(" ".join, axis=1)
            texts = df_text["combined"].tolist()

            with st.spinner("Generating semantic embeddings..."):
                model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
                embeddings = model.encode(texts, show_progress_bar=True)

            # Auto or manual number of clusters
            if clustering_mode == "Auto (Silhouette Score)":
                # silhouette_score requires 2 <= k <= n_samples - 1, so cap
                # the search range for small files instead of crashing.
                max_k = min(20, len(texts) - 1)
                if max_k < 2:
                    status_msg.error("❌ Need at least 3 rows to select the number of clusters automatically.")
                    st.stop()
                status_msg.info(f"πŸ”Ž Testing cluster sizes from 2 to {max_k} to find the best one...")
                scores = []
                for k in range(2, max_k + 1):
                    km = KMeans(n_clusters=k, random_state=42, n_init=10)
                    labels = km.fit_predict(embeddings)
                    score = silhouette_score(embeddings, labels)
                    scores.append((k, score))
                # Keep the k with the highest silhouette score.
                n_clusters = max(scores, key=lambda x: x[1])[0]
                status_msg.success(f"βœ… Best number of clusters: {n_clusters}")
            else:
                n_clusters = int(manual_k)
                # KMeans raises if asked for more clusters than samples;
                # fail with a readable message instead.
                if n_clusters > len(texts):
                    status_msg.error(f"❌ Number of clusters ({n_clusters}) exceeds number of rows ({len(texts)}).")
                    st.stop()
                status_msg.info(f"πŸ“Œ Using manually selected number of clusters: {n_clusters}")

            # Run clustering
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            df['cluster'] = kmeans.fit_predict(embeddings)

            # Label each cluster with the text of its most central row
            # (highest cosine similarity to the cluster centroid).
            centroids = kmeans.cluster_centers_
            similarities = cosine_similarity(embeddings, centroids)
            cluster_labels = []
            for i in range(n_clusters):
                cluster_indices = np.where(df['cluster'] == i)[0]
                cluster_sims = similarities[cluster_indices, i]
                best_index = cluster_indices[np.argmax(cluster_sims)]
                cluster_labels.append(df_text.iloc[best_index]["combined"])
            label_map = {i: label for i, label in enumerate(cluster_labels)}
            df['cluster_label'] = df['cluster'].map(label_map)

            status_msg.empty()
            time.sleep(0.3)
            final_scroll.success(f"βœ… Clustering complete! {n_clusters} clusters found.")

            # === Show results
            st.markdown("### πŸ“Š Clustered Results")
            st.dataframe(df[[*text_cols, 'cluster_label']].head(20))

            # === Download option
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button("πŸ“₯ Download Clustered CSV", csv, "semantic_clusters.csv", "text/csv")
    else:
        st.info("πŸ‘‰ Select at least one column to continue.")