Leacb4
/

gap-clip

@@ -1,3 +1,11 @@
 import config
 import os
 import json
@@ -21,11 +29,21 @@ logger = logging.getLogger(__name__)
 # Dataset Classes
 # -------------------------------
 class ColorDataset(Dataset):
     def __init__(self, dataframe, tokenizer, transform=None):
         """
-        dataframe : pd.DataFrame with columns image and text columns
-        tokenizer : function that converts text -> list of integers (tokens)
-        transform : transformations on the image
         """
         self.df = dataframe.reset_index(drop=True)
         self.tokenizer = tokenizer
@@ -37,9 +55,19 @@ class ColorDataset(Dataset):
         ])
     def __len__(self):
         return len(self.df)
     def __getitem__(self, idx):
         row = self.df.iloc[idx]
         img = Image.open(config.column_local_image_path).convert("RGB")
         img = self.transform(img)
@@ -50,13 +78,34 @@ class ColorDataset(Dataset):
 # Tokenizer
 # -------------------------------
 class Tokenizer:
     def __init__(self):
         self.word2idx = defaultdict(lambda: 0)  # 0 = pad/unknown
         self.idx2word = {}
         self.counter = 1
     def preprocess_text(self, text):
-        """Extract color-related keywords from text"""
         # Color-related keywords to keep
         color_keywords = ['red', 'blue', 'green', 'yellow', 'purple', 'pink', 'orange',
                          'brown', 'black', 'white', 'gray', 'navy', 'beige', 'aqua', 'lime',
@@ -76,6 +125,12 @@ class Tokenizer:
         return ' '.join(filtered_words) if filtered_words else text.lower()
     def fit(self, texts):
         for text in texts:
             processed_text = self.preprocess_text(text)
             for word in processed_text.split():
@@ -85,10 +140,25 @@ class Tokenizer:
                     self.counter += 1
     def __call__(self, text):
         processed_text = self.preprocess_text(text)
         return [self.word2idx[word] for word in processed_text.split()]
     def load_vocab(self, word2idx_dict):
         self.word2idx = defaultdict(lambda: 0, {k: int(v) for k, v in word2idx_dict.items()})
         self.idx2word = {int(v): k for k, v in word2idx_dict.items() if int(v) > 0}
         self.counter = max(self.word2idx.values(), default=0) + 1
@@ -97,7 +167,20 @@ class Tokenizer:
 # Model Components
 # -------------------------------
 class ImageEncoder(nn.Module):
     def __init__(self, embedding_dim=config.color_emb_dim):
         super().__init__()
         self.backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
         self.backbone.fc = nn.Sequential(
@@ -106,17 +189,50 @@ class ImageEncoder(nn.Module):
         )
     def forward(self, x):
         """
         Forward pass through the image encoder.
         Args:
             x: Input batch handed to self.backbone
         Returns:
             Backbone output normalized along its last dimension
         """
         features = self.backbone(x)
         normalized = F.normalize(features, dim=-1)
         return normalized
 class TextEncoder(nn.Module):
     def __init__(self, vocab_size, embedding_dim=config.color_emb_dim):
         """
         Initialize the text encoder.
         Args:
             vocab_size: Number of rows in the token embedding table
             embedding_dim: Dimension of the output embedding (default: config.color_emb_dim)
         """
         super().__init__()
         token_dim = 32  # width of each per-token embedding vector
         # Layers are created in the same order as before so parameter
         # initialization order is unchanged.
         self.embedding = nn.Embedding(
             num_embeddings=vocab_size,
             embedding_dim=token_dim,
             padding_idx=0,
         )
         self.dropout = nn.Dropout(p=0.1)  # light regularization
         self.fc = nn.Linear(in_features=token_dim, out_features=embedding_dim)
     def forward(self, x, lengths=None):
         emb = self.embedding(x)  # [B, T, 32]
         emb = self.dropout(emb)  # Apply dropout
         if lengths is not None:

+"""
+ColorCLIP model for learning color-aligned embeddings.
+This file contains the ColorCLIP model that learns to encode images and texts
+in an embedding space specialized for color representation. It includes
+a ResNet-based image encoder, a text encoder with custom tokenizer,
+and contrastive loss functions for training.
+"""
 import config
 import os
 import json
 # Dataset Classes
 # -------------------------------
 class ColorDataset(Dataset):
+    """
+    Dataset class for color embedding training.
+    Handles loading images from local paths and tokenizing text descriptions
+    for training the ColorCLIP model.
+    """
     def __init__(self, dataframe, tokenizer, transform=None):
         """
+        Initialize the color dataset.
+        Args:
+            dataframe: DataFrame with columns for image paths and text descriptions
+            tokenizer: Tokenizer instance that converts text to list of integers (tokens)
+            transform: Optional image transformations (default: standard ImageNet normalization)
         """
         self.df = dataframe.reset_index(drop=True)
         self.tokenizer = tokenizer
         ])
     def __len__(self):
+        """Return the number of samples in the dataset."""
         return len(self.df)
     def __getitem__(self, idx):
+        """
+        Get a sample from the dataset.
+        Args:
+            idx: Index of the sample
+        Returns:
+            Tuple of (image_tensor, token_tensor)
+        """
         row = self.df.iloc[idx]
         img = Image.open(config.column_local_image_path).convert("RGB")
         img = self.transform(img)
 # Tokenizer
 # -------------------------------
 class Tokenizer:
+    """
+    Tokenizer for extracting color-related keywords from text.
+    This tokenizer filters text to keep only color-related words and basic
+    descriptive words, then maps them to integer indices for embedding.
+    """
     def __init__(self):
+        """
+        Initialize the tokenizer.
+        Creates empty word-to-index and index-to-word mappings.
+        Index 0 is reserved for padding/unknown tokens.
+        """
         self.word2idx = defaultdict(lambda: 0)  # 0 = pad/unknown
         self.idx2word = {}
         self.counter = 1
     def preprocess_text(self, text):
+        """
+        Extract color-related keywords from text.
+        Args:
+            text: Input text string
+        Returns:
+            Preprocessed text containing only color and descriptive keywords
+        """
         # Color-related keywords to keep
         color_keywords = ['red', 'blue', 'green', 'yellow', 'purple', 'pink', 'orange',
                          'brown', 'black', 'white', 'gray', 'navy', 'beige', 'aqua', 'lime',
         return ' '.join(filtered_words) if filtered_words else text.lower()
     def fit(self, texts):
+        """
+        Build vocabulary from a list of texts.
+        Args:
+            texts: List of text strings to build vocabulary from
+        """
         for text in texts:
             processed_text = self.preprocess_text(text)
             for word in processed_text.split():
                     self.counter += 1
     def __call__(self, text):
+        """
+        Tokenize a text string into a list of integer indices.
+        Args:
+            text: Input text string
+        Returns:
+            List of integer token indices
+        """
         processed_text = self.preprocess_text(text)
         return [self.word2idx[word] for word in processed_text.split()]
     def load_vocab(self, word2idx_dict):
+        """
+        Load vocabulary from a word-to-index dictionary.
+        Args:
+            word2idx_dict: Dictionary mapping words to indices
+        """
         self.word2idx = defaultdict(lambda: 0, {k: int(v) for k, v in word2idx_dict.items()})
         self.idx2word = {int(v): k for k, v in word2idx_dict.items() if int(v) > 0}
         self.counter = max(self.word2idx.values(), default=0) + 1
 # Model Components
 # -------------------------------
 class ImageEncoder(nn.Module):
+    """
+    Image encoder based on ResNet18 for extracting image embeddings.
+    Uses a pretrained ResNet18 backbone and replaces the final layer
+    to output embeddings of the specified dimension.
+    """
     def __init__(self, embedding_dim=config.color_emb_dim):
+        """
+        Initialize the image encoder.
+        Args:
+            embedding_dim: Dimension of the output embedding (default: color_emb_dim)
+        """
         super().__init__()
         self.backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
         self.backbone.fc = nn.Sequential(
         )
     def forward(self, x):
         """
         Forward pass through the image encoder.
         Args:
             x: Input batch handed to self.backbone
         Returns:
             Backbone output normalized along its last dimension
         """
         features = self.backbone(x)
         normalized = F.normalize(features, dim=-1)
         return normalized
 class TextEncoder(nn.Module):
+    """
+    Text encoder for extracting text embeddings from token sequences.
+    Uses an embedding layer followed by mean pooling (with optional length normalization)
+    and a linear projection to the output embedding dimension.
+    """
     def __init__(self, vocab_size, embedding_dim=config.color_emb_dim):
         """
         Initialize the text encoder.
         Args:
             vocab_size: Number of rows in the token embedding table
             embedding_dim: Dimension of the output embedding (default: config.color_emb_dim)
         """
         super().__init__()
         token_dim = 32  # width of each per-token embedding vector
         # Layers are created in the same order as before so parameter
         # initialization order is unchanged.
         self.embedding = nn.Embedding(
             num_embeddings=vocab_size,
             embedding_dim=token_dim,
             padding_idx=0,
         )
         self.dropout = nn.Dropout(p=0.1)  # light regularization
         self.fc = nn.Linear(in_features=token_dim, out_features=embedding_dim)
     def forward(self, x, lengths=None):
+        """
+        Forward pass through the text encoder.
+        Args:
+            x: Token tensor [batch_size, sequence_length]
+            lengths: Optional sequence lengths tensor [batch_size] for proper mean pooling
+        Returns:
+            Normalized text embeddings [batch_size, embedding_dim]
+        """
         emb = self.embedding(x)  # [B, T, 32]
         emb = self.dropout(emb)  # Apply dropout
         if lengths is not None: