"""Gradio app: embed image-caption tags, cluster them (KMeans or HDBSCAN on a
UMAP-reduced space), preview sample images per cluster, and summarise each
cluster's tags with a local Ollama LLM."""

import gradio as gr
import pandas as pd
import hdbscan
import numpy as np
import requests
import os
import uuid
import ollama
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import login
from torch.quantization import quantize_dynamic
from umap import UMAP
from sklearn.metrics import silhouette_score

login(os.getenv('HF_TOKEN'))
model_st = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Scratch directory for downloaded preview images.
TMP_DIR = "./tmp_images"
os.makedirs(TMP_DIR, exist_ok=True)


def parse_with_ollama(text, llm_selector):
    """Summarise a blob of caption tags in under 10 words via a local Ollama model.

    Args:
        text: Concatenated caption tags for one cluster.
        llm_selector: Ollama model name, e.g. 'qwen2.5:3b' or 'llama3.2:latest'.

    Returns:
        The model's short summary string.
    """
    response = ollama.chat(
        model=llm_selector,
        messages=[
            {"role": "system",
             "content": "You are an image caption analyser for the trust and safety department. Based on the following image captions, provide an overall summary of these captions in less than 10 words."},
            {"role": "user", "content": text},
        ],
    )
    return response['message']['content']


def download_image(url, cluster_id, idx):
    """Download one image URL into TMP_DIR.

    Returns the local file path on success, or None on any failure
    (non-200 status, non-image content type, network error).
    """
    try:
        response = requests.get(url, timeout=5)
        # Fix: .get() avoids a KeyError when the Content-Type header is missing.
        content_type = response.headers.get('Content-Type', '')
        if response.status_code == 200 and content_type.startswith('image'):
            ext = content_type.split('/')[-1]
            filename = f"cluster_{cluster_id}_{idx}_{uuid.uuid4().hex[:8]}.{ext}"
            filepath = os.path.join(TMP_DIR, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            return filepath
    except Exception as e:
        print(f"Failed to fetch image from {url}: {e}")
    return None


def cluster_data(file, algorithm, umap_dims, llm_selector):
    """Generator driving the Gradio pipeline.

    Yields 5-tuples (summary_df, gallery_items, silhouette_text, csv_path, logs)
    so intermediate progress lines stream into the UI while work proceeds.

    Args:
        file: Uploaded CSV (must contain 'top_tags' and 'img_url' columns).
        algorithm: "KMeans" or "HDBSCAN".
        umap_dims: Target dimensionality for UMAP reduction.
        llm_selector: Ollama model name used for per-cluster summaries.
    """
    logs = []  # accumulated progress messages

    def log(msg):
        logs.append(msg)
        return "\n".join(logs)

    try:
        df = pd.read_csv(file.name)
        if 'top_tags' not in df.columns or 'img_url' not in df.columns:
            # Fix: inside a generator, `return value` silently ends iteration and
            # the message never reaches the UI — yield the full 5-tuple instead.
            yield None, None, None, None, log("Required columns ('top_tags', 'img_url') not found.")
            return

        # Strip list punctuation ([, ], ') from the raw tag strings.
        text_ls = df['top_tags'].str.replace(r"[\[\]']", '', regex=True).to_list()

        yield None, None, None, None, log("✅ Converting top_tags to embeddings...")
        embeddings = model_st.encode(text_ls, batch_size=64, show_progress_bar=True)

        yield None, None, None, None, log("✅ Reducing dimensions with UMAP " + str(umap_dims) + " dimensions...")
        umap_model = UMAP(n_components=int(umap_dims), metric='cosine', random_state=42)
        umap_embeddings = umap_model.fit_transform(embeddings)

        yield None, None, None, None, log(f"✅ Clustering with {algorithm}...")
        if algorithm == "KMeans":
            # Rule-of-thumb cluster count: sqrt(n), at least 2.
            N_CLUSTERS = max(2, round(np.sqrt(len(df))))
            model = KMeans(n_clusters=N_CLUSTERS, random_state=0)
            labels = model.fit_predict(umap_embeddings)
        elif algorithm == "HDBSCAN":
            # First pass on the UMAP-reduced space (euclidean is fine post-UMAP).
            hdb = hdbscan.HDBSCAN()
            hdb.fit_predict(umap_embeddings)
            labels = hdb.labels_.copy()

            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_noise = list(labels).count(-1)
            print(f"Clusters found: {n_clusters}")
            print(f"Noise samples: {n_noise} / {len(labels)} ({n_noise/len(labels)*100:.2f}%)")

            # Second pass: recluster the noise points so fewer rows are dropped.
            noise_mask = labels == -1
            # Fix: the original called fit_predict unconditionally, crashing when
            # there were no (or too few) noise points; HDBSCAN's default
            # min_cluster_size is 5, so skip the pass below that.
            if noise_mask.sum() >= 5:
                noise_embeddings = umap_embeddings[noise_mask]
                hdb_noise = hdbscan.HDBSCAN()
                noise_labels = hdb_noise.fit_predict(noise_embeddings)
                # Offset noise-cluster labels to avoid colliding with pass 1.
                new_cluster_start = labels.max() + 1
                relabelled_noise = np.where(noise_labels != -1, noise_labels + new_cluster_start, -1)
                labels[noise_mask] = relabelled_noise
        else:
            # Fix: yield (not return) so the message reaches the UI.
            yield None, None, None, None, log("Unknown algorithm")
            return

        # Silhouette is undefined for a single cluster label — guard the crash.
        if len(set(labels)) > 1:
            cluster_silhouette_score = silhouette_score(umap_embeddings, labels, metric='euclidean')
            silhouette_text = f"Silhouette Score: {cluster_silhouette_score:.3f}"
        else:
            silhouette_text = "Silhouette Score: n/a (single cluster)"

        df["cluster"] = labels
        df = df[df["cluster"] != -1]  # drop residual noise rows (HDBSCAN only)

        img_clusters = []
        yield None, None, None, None, log("✅ Downloading images...")
        # Fix: initialise the fallbacks — the original raised UnboundLocalError
        # whenever the very first download failed.
        prev_img_path = None
        prev_cluster_id = None
        for cluster_id in sorted(df['cluster'].unique()):
            urls = df[df['cluster'] == cluster_id]['img_url'].dropna().unique()[:5]
            for idx, url in enumerate(urls):
                img_path = download_image(url, cluster_id, idx)
                if img_path:
                    img_clusters.append((os.path.abspath(img_path), f"Cluster {cluster_id}"))
                    prev_img_path = img_path
                    prev_cluster_id = cluster_id
                elif prev_img_path is not None:
                    # Reuse the last good image as a placeholder for the gallery.
                    img_clusters.append((os.path.abspath(prev_img_path), f"Cluster {prev_cluster_id}"))

        file_path = "cluster_output.csv"
        df[['img_url', 'top_tags', 'cluster']].to_csv(file_path, index=False)

        # One row per cluster: joined tags + sample count.
        agg_df = df.groupby('cluster').agg(
            top_tags_joined=('top_tags', lambda x: ', '.join(x)),
            num_samples=('top_tags', 'count'),
        ).reset_index()

        yield None, None, None, None, log("✅ Summarising cluster image tags with LLM...")
        agg_df['tag_summary'] = agg_df['top_tags_joined'].apply(lambda x: parse_with_ollama(x, llm_selector))
        agg_df = agg_df[['cluster', 'num_samples', 'tag_summary', 'top_tags_joined']]

        yield agg_df, img_clusters, silhouette_text, file_path, log("✅ All done!")
    except Exception as e:
        # Fix: yield the full 5-tuple (the original returned a short tuple from a
        # generator, so the error message never rendered in the UI).
        yield None, None, None, None, log(f"❌ Error: {str(e)}")


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            start_button = gr.Button("Start Clustering")
            file_input = gr.File(file_types=[".csv"], label="Upload CSV")
        with gr.Column():
            algo_selector = gr.Dropdown(choices=["KMeans", "HDBSCAN"], label="Clustering Algorithm")
            umap_dims = gr.Slider(minimum=2, maximum=100, value=20, step=1, label="UMAP Dimensions")
            llm_selector = gr.Dropdown(choices=["qwen2.5:3b", "llama3.2:latest"], value="qwen2.5:3b", label="LLM Model")
            download_filepath = gr.File(label="Download Clustered Output", type="filepath")
    with gr.Row():
        silhouette_text = gr.Textbox(label="Silhouette Score compares the average distance to points in the same cluster vs. points in the nearest other cluster. +1 indicate well-separated, compact clusters; 0 indicate overlapping clusters.", lines=1, interactive=False)
    with gr.Row():
        output_df = gr.Dataframe(label="Clustered Output", interactive=False)
    with gr.Row():
        gallery = gr.Gallery(label="Clustered Images (5 per cluster)", columns=5, height="auto")
    with gr.Row():
        log_box = gr.Textbox(label="Processing Logs", lines=10, interactive=False)

    # Button triggers clustering; cluster_data is a generator, so Gradio streams
    # each yielded 5-tuple into the five outputs below.
    start_button.click(
        fn=cluster_data,
        inputs=[file_input, algo_selector, umap_dims, llm_selector],
        outputs=[output_df, gallery, silhouette_text, download_filepath, log_box],
    )

demo.launch()