Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import hdbscan | |
| import numpy as np | |
| import requests | |
| import os | |
| import uuid | |
| import ollama | |
| from sklearn.cluster import KMeans | |
| from sentence_transformers import SentenceTransformer, util | |
| from huggingface_hub import login | |
| from torch.quantization import quantize_dynamic | |
| from umap import UMAP | |
| from sklearn.metrics import silhouette_score | |
# Authenticate with the Hugging Face Hub only when a token is configured.
# Passing None to login() makes it fall back to an interactive prompt, which
# cannot be answered when the app runs headless.
_hf_token = os.getenv('HF_TOKEN')
if _hf_token:
    login(_hf_token)

# Sentence-embedding model used to turn the tag strings into vectors.
model_st = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Scratch directory that receives the images downloaded for the gallery.
TMP_DIR = "./tmp_images"
os.makedirs(TMP_DIR, exist_ok=True)
def parse_with_ollama(text, llm_selector):
    """Summarise a blob of image captions with an Ollama chat model.

    Args:
        text: The captions to summarise, sent as the user message.
        llm_selector: Name of the Ollama model to query
            (e.g. 'qwen2.5:3b' or 'llama3.2:latest').

    Returns:
        The content of the message returned by the model.
    """
    system_message = {
        "role": "system",
        "content": "You are an image caption analyser for the trust and safety department. Based on the following image captions, provide an overall summary of these captions in less than 10 words.",
    }
    user_message = {"role": "user", "content": text}

    reply = ollama.chat(model=llm_selector, messages=[system_message, user_message])
    return reply['message']['content']
def download_image(url, cluster_id, idx):
    """Download one image into TMP_DIR and return the path of the saved file.

    Args:
        url: Address of the image to fetch.
        cluster_id: Cluster label; becomes part of the file name.
        idx: Position of the image within its cluster; becomes part of the
            file name.

    Returns:
        The path of the written file, or None when the request fails, the
        status code is not 200, the response is not an image, or the file
        cannot be written.
    """
    try:
        response = requests.get(url, timeout=5)

        # A missing header simply means "not an image" instead of a KeyError.
        content_type = response.headers.get('Content-Type', '')
        # Strip parameters such as "; charset=binary" so they can never end up
        # in the file name.
        mime_type = content_type.split(';', 1)[0].strip().lower()
        if response.status_code != 200 or not mime_type.startswith('image/'):
            return None

        # Use the bare subtype as the extension: "image/svg+xml" -> "svg".
        ext = mime_type.split('/', 1)[1].split('+', 1)[0] or 'img'
        filename = f"cluster_{cluster_id}_{idx}_{uuid.uuid4().hex[:8]}.{ext}"
        filepath = os.path.join(TMP_DIR, filename)
        with open(filepath, 'wb') as f:
            f.write(response.content)
        return filepath
    except Exception as e:
        # Deliberately broad: a single bad URL must not abort the whole run.
        print(f"Failed to fetch image from {url}: {e}")
    return None
def _hdbscan_with_noise_recluster(points):
    """Cluster points with HDBSCAN, then give the noise points a second pass.

    Points labelled -1 by the first pass are clustered again on their own.
    Clusters found in that second pass are offset past the highest label of
    the first pass so the two sets of labels never collide.

    Args:
        points: 2-D array of reduced embeddings, one row per sample.

    Returns:
        Array with one label per row; -1 marks samples that are still noise.
    """
    # Both passes use the library defaults.
    labels = hdbscan.HDBSCAN().fit_predict(points)

    noise_mask = labels == -1
    n_noise = int(noise_mask.sum())
    n_clusters = len(set(labels)) - (1 if n_noise else 0)
    print(f"Clusters found: {n_clusters}")
    print(f"Noise samples: {n_noise} / {len(labels)} ({n_noise/len(labels)*100:.2f}%)")

    hdb_noise = hdbscan.HDBSCAN()
    # Fewer noise points than one minimum-size cluster cannot form a cluster,
    # and fitting on an empty array raises, so skip the second pass.
    if n_noise < hdb_noise.min_cluster_size:
        return labels

    noise_labels = hdb_noise.fit_predict(points[noise_mask])

    labels = labels.copy()
    new_cluster_start = labels.max() + 1
    labels[noise_mask] = np.where(noise_labels != -1, noise_labels + new_cluster_start, -1)
    return labels


def _collect_cluster_images(df, per_cluster=5):
    """Download up to per_cluster images for every cluster in df.

    When a download fails, the most recent successfully downloaded image (and
    its caption) is appended again as a stand-in. If nothing has been
    downloaded yet there is nothing to fall back on and the URL is skipped.

    Returns:
        List of (absolute image path, "Cluster <id>") tuples for the gallery.
    """
    gallery_items = []
    last_good_item = None
    for cluster_id in sorted(df['cluster'].unique()):
        urls = df[df['cluster'] == cluster_id]['img_url'].dropna().unique()[:per_cluster]
        for idx, url in enumerate(urls):
            img_path = download_image(url, cluster_id, idx)
            if img_path:
                last_good_item = (os.path.abspath(img_path), f"Cluster {cluster_id}")
            if last_good_item is not None:
                gallery_items.append(last_good_item)
    return gallery_items


def cluster_data(file, algorithm, umap_dims, llm_selector):
    """Cluster the rows of an uploaded CSV by their tags and summarise each cluster.

    This is a generator used as a Gradio event handler. Every yielded value is
    a 5-tuple matching the handler's outputs:
    (summary dataframe, gallery items, silhouette text, output CSV path, log text).
    Progress updates yield None for the first four slots; only the final
    yield carries the results. Failures are reported through the log text.

    Args:
        file: Uploaded CSV, either a path or an object whose `.name` is the
            path. Must contain the columns 'top_tags' and 'img_url'.
        algorithm: "KMeans" or "HDBSCAN".
        umap_dims: Number of UMAP output dimensions.
        llm_selector: Name of the Ollama model used to summarise each cluster.

    Yields:
        5-tuples as described above.
    """
    logs = []  # accumulated log lines shown in the UI

    def log(msg):
        logs.append(msg)
        return "\n".join(logs)

    try:
        # A generator's `return value` is discarded by the caller, so every
        # message meant for the user has to be yielded.
        if file is None:
            yield None, None, None, None, log("❌ Please upload a CSV file first.")
            return

        # Load CSV
        csv_path = file if isinstance(file, (str, os.PathLike)) else file.name
        df = pd.read_csv(csv_path)
        if 'top_tags' not in df.columns or 'img_url' not in df.columns:
            yield None, None, None, None, log("❌ Required columns ('top_tags', 'img_url') not found.")
            return

        # Validate before the expensive embedding / UMAP steps.
        if algorithm not in ("KMeans", "HDBSCAN"):
            yield None, None, None, None, log(f"❌ Unknown algorithm: {algorithm}")
            return

        # Clean top_tags: strip list brackets and quotes from the stored text.
        text_ls = df['top_tags'].str.replace(r"[\[\]']", '', regex=True).to_list()

        # Encode + UMAP
        yield None, None, None, None, log("✅ Converting top_tags to embeddings...")
        embeddings = model_st.encode(text_ls, batch_size=64, show_progress_bar=True)

        yield None, None, None, None, log("✅ Reducing dimensions with UMAP " + str(umap_dims) + " dimensions...")
        umap_model = UMAP(n_components=int(umap_dims), metric='cosine', random_state=42)
        umap_embeddings = umap_model.fit_transform(embeddings)

        # Cluster
        yield None, None, None, None, log(f"✅ Clustering with {algorithm}...")
        if algorithm == "KMeans":
            # Square-root heuristic for the number of clusters, at least 2.
            n_clusters = max(2, int(round(np.sqrt(len(df)))))
            labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(umap_embeddings)
        else:
            labels = _hdbscan_with_noise_recluster(umap_embeddings)

        # silhouette_score needs between 2 and n_samples - 1 distinct labels
        # and raises otherwise. Euclidean distance is used because the score
        # is computed in the UMAP-reduced space.
        # NOTE(review): HDBSCAN noise (-1) is passed in as if it were a cluster.
        n_labels = len(set(labels))
        if 2 <= n_labels < len(labels):
            cluster_silhouette_score = silhouette_score(umap_embeddings, labels, metric='euclidean')
            silhouette_text = f"Silhouette Score: {cluster_silhouette_score:.3f}"
        else:
            silhouette_text = "Silhouette Score: n/a (needs at least 2 clusters)"

        # Label the df and drop the samples that stayed noise.
        df["cluster"] = labels
        df = df[df["cluster"] != -1]

        yield None, None, None, None, log("✅ Downloading images...")
        img_clusters = _collect_cluster_images(df)

        file_path = "cluster_output.csv"
        df[['img_url', 'top_tags', 'cluster']].to_csv(file_path, index=False)

        agg_df = df.groupby('cluster').agg(
            top_tags_joined=('top_tags', lambda x: ', '.join(x)),
            num_samples=('top_tags', 'count')
        ).reset_index()

        yield None, None, None, None, log("✅ Summarising cluster image tags with LLM...")
        agg_df['tag_summary'] = agg_df['top_tags_joined'].apply(lambda x: parse_with_ollama(x, llm_selector))
        agg_df = agg_df[['cluster', 'num_samples', 'tag_summary', 'top_tags_joined']]

        yield agg_df, img_clusters, silhouette_text, file_path, log("✅ All done!")
    except Exception as e:
        # Top-level boundary of the handler: surface the failure in the log box.
        yield None, None, None, None, log(f"❌ Error: {str(e)}")
# Gradio UI: inputs on top, then the silhouette score, the per-cluster summary
# table, the image gallery and the processing log.
# NOTE(review): the original indentation was lost in extraction; the nesting
# below (in particular where download_filepath sits) is a best guess - confirm.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            start_button = gr.Button("Start Clustering")
            file_input = gr.File(file_types=[".csv"], label="Upload CSV")
        with gr.Column():
            # Default to KMeans so a click without an explicit choice does not
            # send None as the algorithm.
            algo_selector = gr.Dropdown(choices=["KMeans", "HDBSCAN"], value="KMeans", label="Clustering Algorithm")
            umap_dims = gr.Slider(minimum=2, maximum=100, value=20, step=1, label="UMAP Dimensions")
            llm_selector = gr.Dropdown(choices=["qwen2.5:3b", "llama3.2:latest"], value="qwen2.5:3b", label="LLM Model")
        download_filepath = gr.File(label="Download Clustered Output", type="filepath")
    with gr.Row():
        silhouette_text = gr.Textbox(label="Silhouette Score compares the average distance to points in the same cluster vs. points in the nearest other cluster. +1 indicate well-separated, compact clusters; 0 indicate overlapping clusters.", lines=1, interactive=False)
    with gr.Row():
        output_df = gr.Dataframe(label="Clustered Output", interactive=False)
    with gr.Row():
        gallery = gr.Gallery(label="Clustered Images (5 per cluster)", columns=5, height="auto")
    with gr.Row():
        log_box = gr.Textbox(label="Processing Logs", lines=10, interactive=False)

    # Button triggers clustering; the five outputs match the 5-tuples that
    # cluster_data yields.
    start_button.click(
        fn=cluster_data,
        inputs=[file_input, algo_selector, umap_dims, llm_selector],
        outputs=[output_df, gallery, silhouette_text, download_filepath, log_box],
    )

# cluster_data is a generator; enabling the queue lets its intermediate yields
# stream to the browser on Gradio versions where the queue is not on by default.
demo.queue().launch()