Spaces:

samsonleegh
/

clustering

Sleeping

File size: 8,624 Bytes

import gradio as gr
import pandas as pd
import hdbscan
import numpy as np
import requests
import os
import uuid
import ollama

from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import login
from torch.quantization import quantize_dynamic
from umap import UMAP
from sklearn.metrics import silhouette_score

# Authenticate with the Hugging Face Hub only when a token is configured.
# login(None) falls back to an interactive prompt, which cannot be answered
# when the app runs headless (e.g. as a hosted Space).
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    login(hf_token)

# Sentence embedding model used to vectorise the tag strings before clustering.
model_st = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Directory where downloaded gallery images are stored.
TMP_DIR = "./tmp_images"
os.makedirs(TMP_DIR, exist_ok=True)

def parse_with_ollama(text, llm_selector):
    """Ask an Ollama chat model for a short summary of image captions.

    Args:
        text: Caption text sent as the user message.
        llm_selector: Name of the Ollama model to query
            (e.g. 'qwen2.5:3b' or 'llama3.2:latest').

    Returns:
        The ``content`` field of the message in the model's reply.
    """
    system_prompt = "You are an image caption analyser for the trust and safety department. Based on the following image captions, provide an overall summary of these captions in less than 10 words."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    reply = ollama.chat(model=llm_selector, messages=conversation)
    return reply['message']['content']
    
def download_image(url, cluster_id, idx):
    """Download one image into TMP_DIR and return the saved file's path.

    The download is best-effort: any failure is printed and reported to the
    caller as ``None`` instead of raising.

    Args:
        url: Address of the image to fetch.
        cluster_id: Cluster label, used only to build the file name.
        idx: Position of the image within its cluster, used only to build
            the file name.

    Returns:
        Path of the written file, or ``None`` when the request fails, the
        status is not 200, or the response is not an image.
    """
    try:
        response = requests.get(url, timeout=5)

        # A missing header is treated like a non-image response. Parameters
        # such as "; charset=binary" are dropped so they never reach the
        # file name.
        content_type = response.headers.get('Content-Type', '')
        mime_type = content_type.split(';', 1)[0].strip().lower()

        if response.status_code == 200 and mime_type.startswith('image/'):
            # "image/svg+xml" -> "svg"; keep only the last path segment.
            subtype = mime_type.rsplit('/', 1)[-1].split('+', 1)[0]
            # The header comes from a remote server: keep only characters
            # that are safe inside a file name.
            ext = ''.join(ch for ch in subtype if ch.isalnum() or ch in '.-') or 'img'

            filename = f"cluster_{cluster_id}_{idx}_{uuid.uuid4().hex[:8]}.{ext}"
            filepath = os.path.join(TMP_DIR, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            return filepath
    except Exception as e:
        # Deliberate best-effort boundary: one bad URL must not abort the run.
        print(f"Failed to fetch image from {url}: {e}")
    return None
    
def cluster_data(file, algorithm, umap_dims, llm_selector):
    """Cluster image tags from an uploaded CSV and stream progress to the UI.

    This is a generator. Every item it yields is a 5-tuple in the order of
    the outputs wired to the start button:
    ``(summary dataframe, gallery items, silhouette text, output CSV path,
    log text)``. Progress and error updates carry ``None`` in the first four
    slots and the accumulated log in the last one. Errors are yielded rather
    than returned, because a generator's return value never reaches the UI.

    Args:
        file: Uploaded CSV, either a path string or an object exposing the
            path as ``.name``. Must contain 'top_tags' and 'img_url' columns.
        algorithm: "KMeans" or "HDBSCAN".
        umap_dims: Number of UMAP output dimensions (converted with ``int``).
        llm_selector: Ollama model name used to summarise each cluster.

    Yields:
        5-tuples as described above; the final one holds the full results.
    """
    logs = []  # accumulated log lines shown in the log box

    def log(msg):
        logs.append(msg)
        return "\n".join(logs)

    def status(msg):
        # Progress/error update: only the log box changes.
        return None, None, None, None, log(msg)

    try:
        # Validate the cheap things first, before any heavy computation.
        if file is None:
            yield status("❌ Please upload a CSV file first.")
            return
        if algorithm not in ("KMeans", "HDBSCAN"):
            yield status(f"❌ Unknown algorithm: {algorithm!r}")
            return

        # Accept both a plain path and a file-like wrapper with a .name path.
        csv_path = getattr(file, "name", file)
        df = pd.read_csv(csv_path)

        if 'top_tags' not in df.columns or 'img_url' not in df.columns:
            yield status("❌ Required columns ('top_tags', 'img_url') not found.")
            return

        # Strip list punctuation ([, ], ') from the tag strings.
        text_ls = df['top_tags'].str.replace(r"[\[\]']", '', regex=True).to_list()

        # Encode + UMAP
        yield status("✅ Converting top_tags to embeddings...")
        embeddings = model_st.encode(text_ls, batch_size=64, show_progress_bar=True)
        yield status("✅ Reducing dimensions with UMAP " + str(umap_dims) + " dimensions...")
        umap_model = UMAP(n_components=int(umap_dims), metric='cosine', random_state=42)
        umap_embeddings = umap_model.fit_transform(embeddings)

        # Cluster
        yield status(f"✅ Clustering with {algorithm}...")
        if algorithm == "KMeans":
            # Heuristic cluster count: sqrt of the sample count, at least 2.
            n_clusters = max(2, round(np.sqrt(len(df))))
            labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(umap_embeddings)
        else:  # "HDBSCAN" (validated above)
            hdb = hdbscan.HDBSCAN()
            labels = hdb.fit_predict(umap_embeddings).copy()

            noise_mask = labels == -1
            n_noise = int(noise_mask.sum())
            n_found = len(set(labels)) - (1 if n_noise else 0)
            print(f"Clusters found: {n_found}")
            print(f"Noise samples: {n_noise} / {len(labels)} ({n_noise/len(labels)*100:.2f}%)")

            # Second pass: try to find structure inside the noise points.
            # Skipped when there are too few of them to form even one
            # cluster, which also avoids fitting on an empty array.
            hdb_noise = hdbscan.HDBSCAN()
            if n_noise >= hdb_noise.min_cluster_size:
                noise_labels = hdb_noise.fit_predict(umap_embeddings[noise_mask])
                # Offset the new labels so they do not collide with the
                # clusters found in the first pass.
                new_cluster_start = labels.max() + 1
                labels[noise_mask] = np.where(
                    noise_labels != -1, noise_labels + new_cluster_start, -1
                )

        # silhouette_score needs between 2 and n_samples - 1 distinct labels;
        # outside that range it raises, so report "n/a" instead.
        n_labels = np.unique(labels).size
        if 2 <= n_labels <= len(labels) - 1:
            # Euclidean distance is used on the UMAP-reduced space.
            cluster_silhouette_score = silhouette_score(umap_embeddings, labels, metric='euclidean')
            silhouette_text = f"Silhouette Score: {cluster_silhouette_score:.3f}"
        else:
            silhouette_text = (
                "Silhouette Score: n/a "
                "(requires at least 2 clusters and fewer clusters than samples)"
            )

        # Label the rows and drop whatever is still noise.
        df["cluster"] = labels
        df = df[df["cluster"] != -1]

        # Up to 5 images per cluster for the gallery.
        img_clusters = []
        # Most recent successful download; reused as a placeholder when a
        # later download fails. None until the first success.
        last_good = None
        yield status("✅ Downloading images...")
        for cluster_id in sorted(df['cluster'].unique()):
            urls = df[df['cluster'] == cluster_id]['img_url'].dropna().unique()[:5]
            for idx, url in enumerate(urls):
                img_path = download_image(url, cluster_id, idx)
                if img_path:
                    last_good = (os.path.abspath(img_path), f"Cluster {cluster_id}")
                    img_clusters.append(last_good)
                elif last_good is not None:
                    img_clusters.append(last_good)

        file_path = "cluster_output.csv"
        df[['img_url', 'top_tags', 'cluster']].to_csv(file_path, index=False)
        agg_df = df.groupby('cluster').agg(
            top_tags_joined=('top_tags', lambda x: ', '.join(x)),
            num_samples=('top_tags', 'count')
        ).reset_index()
        yield status("✅ Summarising cluster image tags with LLM...")
        agg_df['tag_summary'] = agg_df['top_tags_joined'].apply(
            lambda x: parse_with_ollama(x, llm_selector)
        )
        agg_df = agg_df[['cluster', 'num_samples', 'tag_summary', 'top_tags_joined']]
        yield agg_df, img_clusters, silhouette_text, file_path, log("✅ All done!")

    except Exception as e:
        # Top-level boundary of the UI callback: surface the failure in the
        # log box instead of losing it.
        yield status(f"❌ Error: {str(e)}")
    
# Gradio UI: inputs on the first row, then the silhouette score, the
# per-cluster summary table, the image gallery and the processing log.
with gr.Blocks() as demo:
    with gr.Row():

        with gr.Column():
            start_button = gr.Button("Start Clustering")
            file_input = gr.File(file_types=[".csv"], label="Upload CSV")

        with gr.Column():
            # Preselect an algorithm so the handler never receives None when
            # the user clicks Start without touching the dropdown.
            algo_selector = gr.Dropdown(
                choices=["KMeans", "HDBSCAN"],
                value="KMeans",
                label="Clustering Algorithm",
            )
            umap_dims = gr.Slider(
                minimum=2,
                maximum=100,
                value=20,
                step=1,
                label="UMAP Dimensions",
            )
            llm_selector = gr.Dropdown(
                choices=["qwen2.5:3b", "llama3.2:latest"],
                value="qwen2.5:3b",
                label="LLM Model",
            )

        download_filepath = gr.File(label="Download Clustered Output", type="filepath")

    with gr.Row():
        silhouette_text = gr.Textbox(
            label="Silhouette Score compares the average distance to points in the same cluster vs. points in the nearest other cluster. +1 indicate well-separated, compact clusters; 0 indicate overlapping clusters.",
            lines=1,
            interactive=False,
        )

    with gr.Row():
        output_df = gr.Dataframe(label="Clustered Output", interactive=False)

    with gr.Row():
        gallery = gr.Gallery(label="Clustered Images (5 per cluster)", columns=5, height="auto")

    with gr.Row():
        log_box = gr.Textbox(label="Processing Logs", lines=10, interactive=False)

    # Button triggers clustering; the output order matches the 5-tuples
    # produced by cluster_data.
    start_button.click(
        fn=cluster_data,
        inputs=[file_input, algo_selector, umap_dims, llm_selector],
        outputs=[output_df, gallery, silhouette_text, download_filepath, log_box],
    )

demo.launch()