Spaces:

samsonleegh
/

clustering

Runtime error

App Files Files Community

samsonleegh committed on Jun 23, 2025

Commit

cb76974

verified ·

1 Parent(s): 87cc493

Update app.py

Browse files

Files changed (1) hide show

app.py +192 -0

app.py CHANGED Viewed

	@@ -0,0 +1,192 @@

+import gradio as gr
+import pandas as pd
+import hdbscan
+import numpy as np
+import requests
+import os
+import uuid
+import ollama
+from sklearn.cluster import KMeans
+from sentence_transformers import SentenceTransformer, util
+from huggingface_hub import login
+from torch.quantization import quantize_dynamic
+from umap import UMAP
+from sklearn.metrics import silhouette_score
+login("HF_API_KEY")
+model_st = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
+TMP_DIR = "./tmp_images"
+os.makedirs(TMP_DIR, exist_ok=True)
+def parse_with_ollama(text, llm_selector):
+    response = ollama.chat(
+        model=llm_selector, #'qwen2.5:3b', 'llama3.2:latest',
+        messages=[
+            {"role": "system", "content": "You are an image caption analyser for the trust and safety department. Based on the following image captions, provide an overall summary of these captions in less than 10 words."},
+            {"role": "user", "content": text}
+        ]
+    )
+    return response['message']['content']
+def download_image(url, cluster_id, idx):
+    try:
+        response = requests.get(url, timeout=5)
+        if response.status_code == 200 and response.headers['Content-Type'].startswith('image'):
+            ext = response.headers['Content-Type'].split('/')[-1]
+            filename = f"cluster_{cluster_id}_{idx}_{uuid.uuid4().hex[:8]}.{ext}"
+            filepath = os.path.join(TMP_DIR, filename)
+            with open(filepath, 'wb') as f:
+                f.write(response.content)
+            return filepath
+    except Exception as e:
+        print(f"Failed to fetch image from {url}: {e}")
+    return None
+def cluster_data(file, algorithm, umap_dims, llm_selector):
+    logs = []  # collect logs here
+    def log(msg):
+        logs.append(msg)
+        return "\n".join(logs)
+    try:
+        # Load CSV
+        df = pd.read_csv(file.name)
+        if 'top_tags' not in df.columns or 'img_url' not in df.columns:
+            return "Required columns ('top_tags', 'img_url') not found.", None
+        # Clean top_tags
+        text_ls = df['top_tags'].str.replace(r"[\[\]']", '', regex=True).to_list()
+        # Encode + UMAP
+        yield None, None, None, None, log("✅ Converting top_tags to embeddings...")
+        embeddings = model_st.encode(text_ls, batch_size=64, show_progress_bar=True)
+        yield None, None, None, None, log("✅ Reducing dimensions with UMAP " + str(umap_dims) + " dimensions...")
+        umap_model = UMAP(n_components=int(umap_dims), metric='cosine', random_state=42)
+        umap_embeddings = umap_model.fit_transform(embeddings)
+        # Cluster
+        yield None, None, None, None, log(f"✅ Clustering with {algorithm}...")
+        if algorithm == "KMeans":
+            N_CLUSTERS = max(2, round(np.sqrt(len(df))))
+            model = KMeans(n_clusters=N_CLUSTERS, random_state=0)
+            labels = model.fit_predict(umap_embeddings)
+        elif algorithm == "HDBSCAN":
+            # model = hdbscan.HDBSCAN(min_cluster_size=10)
+            # labels = model.fit_predict(umap_embeddings)
+            # Run HDBSCAN on the reduced space
+            hdb = hdbscan.HDBSCAN(
+                # min_cluster_size=30,
+                # min_samples=3,
+                # metric='euclidean',  # Use Euclidean after UMAP
+                # cluster_selection_method='leaf'
+            )
+            hdb_labels = hdb.fit_predict(umap_embeddings)
+            labels = hdb.labels_
+            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
+            n_noise = list(labels).count(-1)
+            print(f"Clusters found: {n_clusters}")
+            print(f"Noise samples: {n_noise} / {len(labels)} ({n_noise/len(labels)*100:.2f}%)")
+            noise_mask = hdb.labels_ == -1
+            noise_embeddings = umap_embeddings[noise_mask]
+            hdb_noise = hdbscan.HDBSCAN(
+                # metric='euclidean',
+                # min_cluster_size=10,
+                # min_samples=2,
+                # cluster_selection_method='leaf'
+            )
+            noise_labels = hdb_noise.fit_predict(noise_embeddings)
+            # Initialize full label array with original
+            labels = hdb.labels_.copy()
+            # Offset noise cluster labels to avoid collision with original ones
+            new_cluster_start = labels.max() + 1
+            relabelled_noise = np.where(noise_labels != -1, noise_labels + new_cluster_start, -1)
+            # Insert reclustered labels back into noise positions
+            labels[noise_mask] = relabelled_noise
+        else:
+            return "Unknown algorithm", None
+        cluster_silhouette_score = silhouette_score(umap_embeddings, labels, metric='euclidean') # use euclidean after UMAP reduction, else cosine better for text embeddings
+        silhouette_text = (
+            f"Silhouette Score: {cluster_silhouette_score:.3f}"
+            # "Explanation:\n"
+            # "Scores close to +1 indicate well-separated, compact clusters.\n"
+            # "Scores near 0 indicate overlapping clusters.\n"
+            # "Negative scores suggest possible misclassification."
+        )
+        # Label the df
+        df["cluster"] = labels
+        # Sample 5 images per cluster
+        # img_clusters = []
+        # for cluster_id in sorted(df['cluster'].unique()):
+        #     sample_urls = df[df['cluster'] == cluster_id]['img_url'].dropna().unique()[:5]
+        #     for url in sample_urls:
+        #         img_clusters.append((f"Cluster {cluster_id}", url))
+        df = df[df["cluster"]!=-1]
+        img_clusters = []
+        yield None, None, None, None, log("✅ Downloading images...")
+        for cluster_id in sorted(df['cluster'].unique()):
+            urls = df[df['cluster'] == cluster_id]['img_url'].dropna().unique()[:5]
+            for idx, url in enumerate(urls):
+                img_path = download_image(url, cluster_id, idx)
+                if img_path:
+                    img_clusters.append((os.path.abspath(img_path), f"Cluster {cluster_id}"))
+                    prev_img_path = img_path
+                    prev_cluster_id = cluster_id
+                else:
+                    img_clusters.append((os.path.abspath(prev_img_path), f"Cluster {prev_cluster_id}"))
+        file_path = "cluster_output.csv"
+        df[['img_url','top_tags','cluster']].to_csv(file_path, index=False)
+        agg_df = df.groupby('cluster').agg(
+            top_tags_joined=('top_tags', lambda x: ', '.join(x)),
+            num_samples=('top_tags', 'count')
+        ).reset_index()
+        yield None, None, None, None, log("✅ Summarising cluster image tags with LLM...")
+        agg_df['tag_summary'] = agg_df['top_tags_joined'].apply(lambda x : parse_with_ollama(x, llm_selector))
+        agg_df = agg_df[['cluster','num_samples','tag_summary','top_tags_joined']]
+        yield agg_df, img_clusters, silhouette_text, file_path, log("✅ All done!")
+    except Exception as e:
+        return f"Error: {str(e)}", None, None, None, log(f"❌ Error: {str(e)}")
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column():
+            start_button = gr.Button("Start Clustering")
+            file_input = gr.File(file_types=[".csv"], label="Upload CSV")
+        with gr.Column():
+            algo_selector = gr.Dropdown(choices=["KMeans", "HDBSCAN"], label="Clustering Algorithm")
+            umap_dims = gr.Slider(minimum=2, maximum=100, value=20, step=1, label="UMAP Dimensions")
+            llm_selector = gr.Dropdown(choices=["qwen2.5:3b", "llama3.2:latest"], value="qwen2.5:3b", label="LLM Model")
+        download_filepath = gr.File(label="Download Clustered Output", type="filepath")
+    with gr.Row():
+        silhouette_text = gr.Textbox(label="Silhouette Score compares the average distance to points in the same cluster vs. points in the nearest other cluster. +1 indicate well-separated, compact clusters; 0 indicate overlapping clusters.", lines=1, interactive=False)
+    with gr.Row():
+        output_df = gr.Dataframe(label="Clustered Output", interactive=False)
+    with gr.Row():
+        gallery = gr.Gallery(label="Clustered Images (5 per cluster)", columns=5, height="auto")
+    with gr.Row():
+        log_box = gr.Textbox(label="Processing Logs", lines=10, interactive=False)
+    # Button triggers clustering
+    start_button.click(fn=cluster_data, inputs=[file_input, algo_selector, umap_dims, llm_selector], outputs=[output_df, gallery, silhouette_text, download_filepath, log_box])
+demo.launch()