Upload evaluation/tsne_images.py with huggingface_hub
evaluation/tsne_images.py +569 -0
evaluation/tsne_images.py
ADDED
@@ -0,0 +1,569 @@
#!/usr/bin/env python3
"""
Outputs several t-SNE visualizations with color and hierarchy overlays to
verify that the main model separates colors well inside each hierarchy group.
"""
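# Usage note: this file is meant to be run as a standalone script from the
# repository root (so that the `config` module and
# `data/data_with_local_paths.csv` resolve), e.g. `python evaluation/tsne_images.py`.
# The `__main__` block at the bottom writes all t-SNE figures and similarity
# heatmaps as PNG files into the current working directory.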

import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from matplotlib.colors import is_color_like  # used in run_tsne to validate color names
from matplotlib.patches import Polygon
from PIL import Image
from sklearn.manifold import TSNE
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
)
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from tqdm import tqdm
from transformers import CLIPModel as CLIPModel_transformers, CLIPProcessor

try:
    from scipy.spatial import ConvexHull
except ImportError:
    ConvexHull = None

from config import (
    color_column,
    color_emb_dim,
    column_local_image_path,
    device,
    hierarchy_column,
    hierarchy_emb_dim,
    images_dir,
    local_dataset_path,
    main_model_path,
)

class ImageDataset(Dataset):
    """Lightweight dataset to load local images along with colors and hierarchies."""

    def __init__(self, dataframe: pd.DataFrame, root_dir: str):
        self.df = dataframe.reset_index(drop=True)
        self.root_dir = root_dir
        self.transform = transforms.Compose(
            [
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                ),
            ]
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = row[column_local_image_path]
        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)
        color = row[color_column]
        hierarchy = row[hierarchy_column]
        return image, color, hierarchy


def load_main_model():
    """Load the main model with the trained weights."""
    checkpoint = torch.load(main_model_path, map_location=device)
    state_dict = checkpoint.get("model_state_dict", checkpoint)
    model = CLIPModel_transformers.from_pretrained(
        "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
    )
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    # Load processor for text tokenization
    processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
    return model, processor


def load_clip_baseline():
    """Load the CLIP baseline model from transformers."""
    print("🤗 Loading CLIP baseline model from transformers...")
    clip_model = CLIPModel_transformers.from_pretrained("openai/clip-vit-base-patch32").to(device)
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    clip_model.eval()
    print("✅ CLIP baseline model loaded successfully")
    return clip_model, clip_processor


def enforce_min_hierarchy_samples(df, min_per_hierarchy):
    """Filter out hierarchy groups with fewer than min_per_hierarchy rows."""
    if not min_per_hierarchy or min_per_hierarchy <= 0:
        return df
    counts = df[hierarchy_column].value_counts()
    keep_values = counts[counts >= min_per_hierarchy].index
    filtered = df[df[hierarchy_column].isin(keep_values)].reset_index(drop=True)
    return filtered


def prepare_dataframe(df, sample_size, per_color_limit, min_per_hierarchy=None):
    """Subsample the dataframe to speed up the t-SNE."""
    if per_color_limit and per_color_limit > 0:
        df_limited = (
            df.groupby(color_column)
            .apply(lambda g: g.sample(min(len(g), per_color_limit), random_state=42))
            .reset_index(drop=True)
        )
    else:
        df_limited = df

    if sample_size and 0 < sample_size < len(df_limited):
        df_limited = df_limited.sample(sample_size, random_state=42).reset_index(
            drop=True
        )
    df_limited = enforce_min_hierarchy_samples(df_limited, min_per_hierarchy)
    return df_limited


def compute_embeddings(model, dataloader):
    """Extract the color and hierarchy sub-embeddings from the main model's image embeddings."""
    color_embeddings = []
    hierarchy_embeddings = []
    color_labels = []
    hierarchy_labels = []
    with torch.no_grad():
        for images, colors, hierarchies in tqdm(
            dataloader, desc="Extracting embeddings"
        ):
            images = images.to(device)
            if images.shape[1] == 1:  # safety in case a grayscale image slips through
                images = images.expand(-1, 3, -1, -1)
            image_embeds = model.get_image_features(pixel_values=images)
            color_part = image_embeds[:, :color_emb_dim]
            hierarchy_part = image_embeds[
                :, color_emb_dim : color_emb_dim + hierarchy_emb_dim
            ]
            color_embeddings.append(color_part.cpu().numpy())
            hierarchy_embeddings.append(hierarchy_part.cpu().numpy())
            color_labels.extend(colors)
            hierarchy_labels.extend(hierarchies)
    return (
        np.concatenate(color_embeddings, axis=0),
        np.concatenate(hierarchy_embeddings, axis=0),
        color_labels,
        hierarchy_labels,
    )


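# Layout of the fused image embedding assumed in compute_embeddings above: the
# first `color_emb_dim` dimensions are the color head and the next
# `hierarchy_emb_dim` dimensions are the hierarchy head; any remaining
# dimensions of the CLIP projection are not used by this evaluation.

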
def compute_clip_embeddings(clip_model, clip_processor, dataloader):
    """Extract CLIP baseline embeddings (full image embeddings, not separated)."""
    all_embeddings = []
    color_labels = []
    hierarchy_labels = []

    with torch.no_grad():
        for images, colors, hierarchies in tqdm(
            dataloader, desc="Extracting CLIP embeddings"
        ):
            batch_embeddings = []
            for i in range(images.shape[0]):
                # Get single image from batch
                image_tensor = images[i]  # Shape: (3, 224, 224)

                # Denormalize on CPU (safer for PIL conversion)
                mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
                std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
                image_tensor = image_tensor * std + mean
                image_tensor = torch.clamp(image_tensor, 0, 1)

                # Convert to PIL Image (must be on CPU)
                image_pil = transforms.ToPILImage()(image_tensor.cpu())

                # Process with CLIP (using empty text since we only need image embeddings)
                inputs = clip_processor(
                    text="",
                    images=image_pil,
                    return_tensors="pt",
                    padding=True
                ).to(device)

                outputs = clip_model(**inputs)
                # Get normalized image embeddings
                image_emb = outputs.image_embeds / outputs.image_embeds.norm(p=2, dim=-1, keepdim=True)
                batch_embeddings.append(image_emb.cpu().numpy())

            all_embeddings.append(np.vstack(batch_embeddings))
            color_labels.extend(colors)
            hierarchy_labels.extend(hierarchies)

    # For CLIP, we use the full embeddings for all visualizations
    # (no separation into color/hierarchy dimensions)
    full_embeddings = np.concatenate(all_embeddings, axis=0)
    return (
        full_embeddings,  # color_embeddings (using full CLIP embeddings)
        full_embeddings,  # hierarchy_embeddings (using full CLIP embeddings)
        full_embeddings,  # color_hier_embeddings (using full CLIP embeddings)
        color_labels,
        hierarchy_labels,
    )


def compute_dunn_index(embeddings, labels):
    """
    Compute the Dunn Index for clustering evaluation.

    The Dunn Index is the ratio of the minimum inter-cluster distance
    to the maximum intra-cluster distance. Higher values indicate better clustering.

    Args:
        embeddings: Array of embeddings [N, embed_dim]
        labels: Array of cluster labels [N]

    Returns:
        Dunn Index value (float) or None if calculation fails
    """
    try:
        from scipy.spatial.distance import cdist, pdist

        unique_labels = np.unique(labels)
        if len(unique_labels) < 2:
            return None

        # Calculate intra-cluster distances (maximum within each cluster)
        max_intra_cluster_dist = 0
        for label in unique_labels:
            cluster_points = embeddings[labels == label]
            if len(cluster_points) > 1:
                # Calculate pairwise distances within cluster
                intra_dists = pdist(cluster_points, metric='euclidean')
                if len(intra_dists) > 0:
                    max_intra = np.max(intra_dists)
                    max_intra_cluster_dist = max(max_intra_cluster_dist, max_intra)

        if max_intra_cluster_dist == 0:
            return None

        # Calculate inter-cluster distances (minimum between clusters)
        min_inter_cluster_dist = float('inf')
        for i, label1 in enumerate(unique_labels):
            for label2 in unique_labels[i+1:]:
                cluster1_points = embeddings[labels == label1]
                cluster2_points = embeddings[labels == label2]

                # Calculate distances between clusters
                inter_dists = cdist(cluster1_points, cluster2_points, metric='euclidean')
                min_inter = np.min(inter_dists)
                min_inter_cluster_dist = min(min_inter_cluster_dist, min_inter)

        if min_inter_cluster_dist == float('inf'):
            return None

        # Dunn Index = minimum inter-cluster distance / maximum intra-cluster distance
        dunn_index = min_inter_cluster_dist / max_intra_cluster_dist
        return float(dunn_index)
    except Exception as e:
        print(f"⚠️ Error computing Dunn Index: {e}")
        return None


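# Worked example for the Dunn Index above (illustrative numbers, not from the data):
# two clusters {(0, 0), (0, 1)} and {(5, 0), (5, 1)} have a maximum intra-cluster
# distance of 1 and a minimum inter-cluster distance of 5, so Dunn = 5 / 1 = 5;
# compact, well-separated clusters push the index up, overlapping ones toward 0.

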
def build_color_map(labels, prefer_true_colors=False):
    """Build a color mapping for labels.

    `prefer_true_colors` is accepted for call-site symmetry but is currently
    unused: the mapping is always drawn from seaborn's "husl" palette.
    """
    unique_labels = sorted(set(labels))
    palette = sns.color_palette("husl", len(unique_labels))
    return {label: palette[idx] for idx, label in enumerate(unique_labels)}


def compute_color_similarity_matrix(
    embeddings,
    colors,
    title="Color similarity (image embeddings)",
    output_path="color_similarity_image_embeddings.png",
):
    """Compute and visualize the similarity matrix between color centroids.

    `output_path` lets successive calls (e.g. main model vs. CLIP baseline)
    save to different files instead of overwriting each other.
    """
    # Use only the colors from the reference heatmap
    reference_colors = ['red', 'pink', 'blue', 'green', 'aqua', 'lime', 'yellow', 'orange',
                        'purple', 'brown', 'gray', 'black', 'white']
    # Map 'yelloworange' to 'yellow' and normalize the grey/gray spelling
    color_mapping = {
        'yelloworange': 'yellow',
        'grey': 'gray'  # Handle grey/gray variation
    }

    # Filter to only include colors that are in the reference list
    filtered_colors = []
    filtered_embeddings = []
    for i, color in enumerate(colors):
        # Normalize color name
        normalized_color = color_mapping.get(color.lower(), color.lower())
        if normalized_color in reference_colors:
            filtered_colors.append(normalized_color)
            filtered_embeddings.append(embeddings[i])

    if len(filtered_colors) == 0:
        print("⚠️ No matching colors found in reference list")
        return None

    # Use only unique colors from reference that exist in data
    unique_colors = sorted([c for c in reference_colors if c in filtered_colors])

    # Convert to numpy arrays
    filtered_embeddings = np.array(filtered_embeddings)
    filtered_colors = np.array(filtered_colors)

    # Compute centroids for each color
    centroids = {}
    for color in unique_colors:
        color_mask = np.array([c == color for c in filtered_colors])
        if color_mask.sum() > 0:
            centroids[color] = np.mean(filtered_embeddings[color_mask], axis=0)

    # Compute similarity matrix
    similarity_matrix = np.zeros((len(unique_colors), len(unique_colors)))
    for i, color1 in enumerate(unique_colors):
        for j, color2 in enumerate(unique_colors):
            if i == j:
                similarity_matrix[i, j] = 1.0
            else:
                if color1 in centroids and color2 in centroids:
                    similarity = cosine_similarity(
                        [centroids[color1]],
                        [centroids[color2]]
                    )[0][0]
                    similarity_matrix[i, j] = similarity

    # Create heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        similarity_matrix,
        annot=True,
        fmt='.2f',
        cmap='RdYlBu_r',
        xticklabels=unique_colors,
        yticklabels=unique_colors,
        square=True,
        cbar_kws={'label': 'Cosine Similarity'},
        linewidths=0.5,
        vmin=-0.6,
        vmax=1.0
    )

    plt.title(title, fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Colors', fontsize=14, fontweight='bold')
    plt.ylabel('Colors', fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✅ Color similarity heatmap saved: {output_path}")

    return similarity_matrix


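# Sanity check for the centroid similarities above (hypothetical 2-D numbers):
# with centroids red = [1.0, 0.0] and pink = [0.8, 0.6] (both unit length),
# cosine similarity = 1.0 * 0.8 + 0.0 * 0.6 = 0.8, so perceptually related colors
# should land near 1.0 in the heatmap and unrelated ones closer to 0 or below.

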
def run_tsne(
    embeddings,
    legend_labels,
    output_path,
    perplexity,
    title,
    scatter_color_labels=None,
    prefer_true_colors=False,
):
    """Calculate and plot a t-SNE projection."""
    tsne = TSNE(
        n_components=2,
        perplexity=perplexity,
        init="pca",
        learning_rate="auto",
        random_state=42,
    )
    reduced = tsne.fit_transform(embeddings)

    label_array = np.array(legend_labels)
    color_labels = (
        np.array(scatter_color_labels) if scatter_color_labels is not None else label_array
    )

    # Calculate clustering metrics
    unique_labels_list = sorted(set(label_array))
    if len(unique_labels_list) > 1 and len(label_array) > 1:
        # Convert labels to numeric indices for silhouette_score
        label_to_idx = {label: idx for idx, label in enumerate(unique_labels_list)}
        numeric_labels = np.array([label_to_idx[label] for label in label_array])

        # Calculate in the original embedding space (ground truth - measures real separation)
        silhouette = silhouette_score(embeddings, numeric_labels, metric='euclidean')
        davies_bouldin = davies_bouldin_score(embeddings, numeric_labels)
        calinski_harabasz = calinski_harabasz_score(embeddings, numeric_labels)
        dunn = compute_dunn_index(embeddings, numeric_labels)

    else:
        silhouette = None
        davies_bouldin = None
        calinski_harabasz = None
        dunn = None

    # Helpful reference for the reported clustering indices:
    # • Silhouette Score ∈ [-1, 1] — closer to 1 means points fit their cluster well, 0 means overlap, < 0 suggests misassignment.
    # • Davies–Bouldin Index ∈ [0, +∞) — lower is better; quantifies average similarity between clusters relative to their size.
    # • Calinski–Harabasz Index ∈ [0, +∞) — higher is better; ratio of between-cluster dispersion to within-cluster dispersion.
    # • Dunn Index ∈ [0, +∞) — higher is better; compares the tightest cluster diameter to the closest distance between clusters.

    # Build color map for visualization
    color_map = build_color_map(color_labels, prefer_true_colors=prefer_true_colors)
    color_series = np.array([color_map[label] for label in color_labels])

    plt.figure(figsize=(10, 8))
    unique_labels = sorted(set(label_array))
    for label in unique_labels:
        mask = label_array == label
        if 'color' in title and is_color_like(label):
            # Color-space plots: draw each group in its own color name (e.g. "red"),
            # falling back to the palette for names matplotlib does not recognize.
            c = label
        else:
            c = color_series[mask]
        plt.scatter(
            reduced[mask, 0],
            reduced[mask, 1],
            c=c,
            s=15,
            alpha=0.8,
            label=label,
        )

    # Add the clustering metrics to the title (computed in the embedding space, not the 2-D projection)
    if silhouette is not None:
        dunn_str = f"{dunn:.3f}" if dunn is not None else "n/a"
        title_with_score = (
            f"{title}\n(Silhouette: {silhouette:.3f} | Davies-Bouldin: {davies_bouldin:.3f} "
            f"| Calinski-Harabasz: {calinski_harabasz:.3f} | Dunn: {dunn_str})"
        )
    else:
        title_with_score = title

    plt.title(title_with_score)
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    plt.legend(
        bbox_to_anchor=(1.05, 1), loc="upper left", fontsize="small", frameon=False
    )
    plt.tight_layout()
    plt.savefig(output_path, dpi=300)
    plt.close()
    print(f"✅ Figure saved in {output_path}")
    if silhouette is not None:
        print(
            f"   📊 Embedding space: Silhouette {silhouette:.3f} | Davies-Bouldin: {davies_bouldin:.3f} "
            f"| Calinski-Harabasz: {calinski_harabasz:.3f} | Dunn: {dunn_str}"
        )


def filter_valid_rows(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows with valid local image paths and colors."""
    dataframe = dataframe[dataframe['color'] != 'unknown'].copy()
    df = dataframe.dropna(
        subset=[column_local_image_path, color_column, hierarchy_column]
    ).copy()
    mask = df[column_local_image_path].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)
    return df[mask].reset_index(drop=True)


if __name__ == "__main__":
    sample_size = None
    per_color_limit = 500
    min_per_hierarchy = 200
    batch_size = 32
    perplexity = 30
    output_color = "tsne_color_space.png"
    output_hierarchy = "tsne_hierarchy_space.png"

    # 1) Loading the dataset
    print("📥 Loading the dataset...")
    df = pd.read_csv("data/data_with_local_paths.csv")
    df = filter_valid_rows(df)
    print(f"Total length of the dataset: {len(df)}")
    df = prepare_dataframe(df, sample_size, per_color_limit, min_per_hierarchy)
    print(f"✅ {len(df)} samples will be used for the t-SNE")
    print(f"Number of colors in the dataset: {len(df['color'].unique())}")
    print(f"Colors in the dataset: {df['color'].unique()}")
    dataset = ImageDataset(df, images_dir)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    # 2) Loading the models
    print("⚙️ Loading the main model...")
    model, processor = load_main_model()

    print("⚙️ Loading CLIP baseline model...")
    clip_model, clip_processor = load_clip_baseline()

    # 3) Extracting the embeddings
    print("🎯 Extracting the embeddings...")

    (
        color_embeddings,
        hierarchy_embeddings,
        colors,
        hierarchies,
    ) = compute_embeddings(model, dataloader)

    # 4) Calculating the t-SNE
    print("🌀 Calculating the color t-SNE...")
    run_tsne(
        color_embeddings,
        colors,
        output_color,
        perplexity,
        "t-SNE of the color embeddings of the main model",
        scatter_color_labels=colors,
        prefer_true_colors=True,
    )

    print("🎨 Computing color similarity matrix from image embeddings...")
    compute_color_similarity_matrix(
        color_embeddings,
        colors,
        title="Color similarity (image embeddings - main model)"
    )

    print("🌀 Calculating the hierarchy t-SNE...")
    run_tsne(
        hierarchy_embeddings,
        hierarchies,
        output_hierarchy,
        perplexity,
        "t-SNE of the hierarchy embeddings of the main model",
        scatter_color_labels=hierarchies,
    )

    # ========== CLIP BASELINE EVALUATION ==========
    print("\n" + "="*60)
    print("🔄 Starting CLIP Baseline Evaluation")
    print("="*60)

    print("🎯 Extracting CLIP embeddings...")
    (
        clip_color_embeddings,
        clip_hierarchy_embeddings,
        clip_color_hier_embeddings,
        clip_colors,
        clip_hierarchies,
    ) = compute_clip_embeddings(clip_model, clip_processor, dataloader)

    # Output paths for CLIP baseline
    clip_output_color = "clip_baseline_tsne_color_space.png"
    clip_output_hierarchy = "clip_baseline_tsne_hierarchy_space.png"

    print("🌀 Calculating CLIP baseline color t-SNE...")
    run_tsne(
        clip_color_embeddings,
        clip_colors,
        clip_output_color,
        perplexity,
        "t-SNE of the color embeddings (CLIP Baseline)",
        scatter_color_labels=clip_colors,
        prefer_true_colors=True,
    )

    print("🎨 Computing color similarity matrix from image embeddings...")
    compute_color_similarity_matrix(
        clip_color_embeddings,
        clip_colors,
        title="Color similarity (image embeddings - CLIP Baseline)",
        # Save under a separate name so the main-model heatmap is not overwritten
        output_path="clip_baseline_color_similarity.png",
    )

    print("🌀 Calculating CLIP baseline hierarchy t-SNE...")
    run_tsne(
        clip_hierarchy_embeddings,
        clip_hierarchies,
        clip_output_hierarchy,
        perplexity,
        "t-SNE of the hierarchy embeddings (CLIP Baseline)",
        scatter_color_labels=clip_hierarchies,
    )

    print("\n✅ All t-SNE visualizations completed!")
    print(" - Main model: tsne_*.png")
    print(" - CLIP baseline: clip_baseline_tsne_*.png")