Leacb4
/

gap-clip

@@ -1,3 +1,11 @@
 import pandas as pd
 import torch
 import torch.nn as nn
@@ -17,7 +25,22 @@ import config
 # -------------------------
 class HierarchyDataset(Dataset):
     def __init__(self, dataframe, use_local_images=True, image_size=224):
         self.dataframe = dataframe
         self.use_local_images = use_local_images
         self.image_size = image_size
@@ -51,13 +74,28 @@ class HierarchyDataset(Dataset):
     def set_training_mode(self, training=True):
-        """Switch between training and validation transforms"""
         self.training_mode = training
     def __len__(self):
         return len(self.dataframe)
     def __getitem__(self, idx):
         row = self.dataframe.iloc[idx]
         # Try to load local image first
@@ -83,7 +121,15 @@ class HierarchyDataset(Dataset):
         return image, description, hierarchy
     def _download_image(self, img_url):
-        """Download an image from a URL with timeout"""
         response = requests.get(img_url, timeout=10)
         response.raise_for_status()
         image = Image.open(BytesIO(response.content)).convert("RGB")
@@ -94,9 +140,21 @@ class HierarchyDataset(Dataset):
 # -------------------------
 class HierarchyExtractor:
-    """Extract hierarchy directly from text using matching"""
     def __init__(self, hierarchy_classes, verbose=False):
         self.hierarchy_classes = sorted(hierarchy_classes)
         self.class_to_idx = {cls: idx for idx, cls in enumerate(self.hierarchy_classes)}
         self.idx_to_class = {idx: cls for idx, cls in enumerate(self.hierarchy_classes)}
@@ -109,7 +167,15 @@ class HierarchyExtractor:
             print(f"📋 Classes: {self.hierarchy_classes}")
     def _create_patterns(self):
-        """Create regex patterns for each hierarchy"""
         patterns = {}
         for hierarchy in self.hierarchy_classes:
@@ -162,7 +228,15 @@ class HierarchyExtractor:
         return patterns
     def extract_hierarchy(self, text):
-        """Extract hierarchy from text using pattern matching"""
         text_lower = text.lower()
         # Try exact match first
@@ -179,14 +253,31 @@ class HierarchyExtractor:
         return None
     def extract_hierarchy_idx(self, text):
-        """Extract hierarchy index from text"""
         hierarchy = self.extract_hierarchy(text)
         if hierarchy:
             return self.class_to_idx[hierarchy]
         return None
     def get_hierarchy_embedding(self, text, embed_dim=config.hierarchy_emb_dim):
-        """Create embedding from hierarchy index"""
         hierarchy_idx = self.extract_hierarchy_idx(text)
         if hierarchy_idx is not None:
             # Create one-hot encoding
@@ -206,7 +297,21 @@ class HierarchyExtractor:
 # -------------------------
 class PretrainedImageEncoder(nn.Module):
     def __init__(self, embed_dim, dropout=0.3):
         super().__init__()
         self.backbone = models.resnet18(pretrained=True)
@@ -230,7 +335,12 @@ class PretrainedImageEncoder(nn.Module):
         self._freeze_backbone_layers()
     def _freeze_backbone_layers(self):
-        """Freeze early layers to prevent overfitting"""
         if hasattr(self.backbone, 'children'):
             layers = list(self.backbone.children())
             freeze_until = int(len(layers) * 0.7)
@@ -240,13 +350,35 @@ class PretrainedImageEncoder(nn.Module):
                         param.requires_grad = False
     def forward(self, x):
         features = self.backbone(x)
         return self.projection(features)
 class HierarchyEncoder(nn.Module):
-    """Encoder that takes hierarchy index directly"""
     def __init__(self, num_hierarchies, embed_dim, dropout=0.3):
         super().__init__()
         self.num_hierarchies = num_hierarchies
         self.embed_dim = embed_dim
@@ -267,7 +399,9 @@ class HierarchyEncoder(nn.Module):
         self._init_weights()
     def _init_weights(self):
-        """Initialize weights properly"""
         nn.init.xavier_uniform_(self.embedding.weight)
         for module in self.projection.modules():
             if isinstance(module, nn.Linear):
@@ -276,6 +410,19 @@ class HierarchyEncoder(nn.Module):
                     nn.init.zeros_(module.bias)
     def forward(self, hierarchy_indices):
         # hierarchy_indices: (B,) - batch of hierarchy indices
         # Workaround for MPS: embedding layers don't work well with MPS, so do lookup on CPU
         device = next(self.parameters()).device
@@ -293,7 +440,23 @@ class HierarchyEncoder(nn.Module):
         return self.projection(emb)
 class HierarchyClassifierHead(nn.Module):
     def __init__(self, in_dim, num_classes, hidden_dim=None, dropout=0.3):
         super().__init__()
         if hidden_dim is None:
             hidden_dim = max(in_dim // 2, num_classes * 2)
@@ -309,10 +472,34 @@ class HierarchyClassifierHead(nn.Module):
         )
     def forward(self, x):
         return self.classifier(x)
 class Model(nn.Module):
     def __init__(self, num_hierarchy_classes, embed_dim, dropout=0.3):
         super().__init__()
         self.img_enc = PretrainedImageEncoder(embed_dim, dropout)
         self.hierarchy_enc = HierarchyEncoder(num_hierarchy_classes, embed_dim, dropout)
@@ -321,6 +508,20 @@ class Model(nn.Module):
         self.num_hierarchy_classes = num_hierarchy_classes
     def forward(self, image=None, hierarchy_indices=None):
         out = {}
         if image is not None:
             z_img = self.img_enc(image)
@@ -339,11 +540,27 @@ class Model(nn.Module):
         return out
     def set_hierarchy_extractor(self, hierarchy_extractor):
-        """Set the hierarchy extractor for text processing"""
         self.hierarchy_extractor = hierarchy_extractor
     def get_text_embeddings(self, text):
-        """Get text embeddings for a given text string or list of strings"""
         with torch.no_grad():
             # Get the device of the model
@@ -387,7 +604,18 @@ class Model(nn.Module):
                 raise ValueError(f"Expected string or list/tuple of strings, got {type(text)}: {text}")
     def get_image_embeddings(self, image):
-        """Get image embeddings for a given image tensor"""
         with torch.no_grad():
             if not isinstance(image, torch.Tensor):
                 raise ValueError("Image must be a torch.Tensor")
@@ -410,9 +638,27 @@ class Model(nn.Module):
 # -------------------------
 class Loss(nn.Module):
     def __init__(self, hierarchy_classes, classification_weight=1.0,
                  consistency_weight=0.3, contrastive_weight=0.2,
                  temperature=0.07, label_smoothing=0.1):
         super().__init__()
         self.classification_weight = classification_weight
         self.consistency_weight = consistency_weight
@@ -428,7 +674,16 @@ class Loss(nn.Module):
         self.mse = nn.MSELoss()
     def contrastive_loss(self, img_emb, txt_emb):
-        """InfoNCE contrastive loss"""
         sim_matrix = torch.matmul(img_emb, txt_emb.T) / self.temperature
         labels = torch.arange(img_emb.size(0), device=img_emb.device)
@@ -438,6 +693,19 @@ class Loss(nn.Module):
         return (loss_i2t + loss_t2i) / 2
     def forward(self, img_logits, txt_logits, img_embeddings, txt_embeddings, target_hierarchies):
         device = img_embeddings.device
         # Convert hierarchy names to indices
@@ -467,6 +735,19 @@ class Loss(nn.Module):
 # -------------------------
 def collate_fn(batch, hierarchy_extractor):
     images = torch.stack([b[0] for b in batch], dim=0)
     texts = [b[1] for b in batch]
     hierarchies = [b[2] for b in batch]
@@ -492,6 +773,17 @@ def collate_fn(batch, hierarchy_extractor):
     }
 def calculate_accuracy(logits, target_hierarchies, hierarchy_classes):
     batch_size = logits.size(0)
     correct = 0
     pred_indices = torch.argmax(logits, dim=1).cpu().numpy()
@@ -505,6 +797,23 @@ def calculate_accuracy(logits, target_hierarchies, hierarchy_classes):
     return correct / batch_size
 def train_one_epoch(model, dataloader, optimizer, device, hierarchy_classes, scheduler=None):
     model.train()
     total_loss = 0.0
     total_acc_img = 0.0
@@ -570,6 +879,21 @@ def train_one_epoch(model, dataloader, optimizer, device, hierarchy_classes, sch
     }
 def validate(model, dataloader, device, hierarchy_classes):
     model.eval()
     total_loss = 0.0
     total_acc_img = 0.0

+"""
+Hierarchy model for learning clothing category-aligned embeddings.
+This file contains the hierarchy model that learns to encode images and texts
+in an embedding space specialized for representing clothing categories (dress, shirt, etc.).
+It includes a regex pattern-based hierarchy extractor, a ResNet image encoder,
+a hierarchy embedding encoder, and loss functions for training.
+"""
 import pandas as pd
 import torch
 import torch.nn as nn
 # -------------------------
 class HierarchyDataset(Dataset):
+    """
+    Dataset class for hierarchy embedding training.
+    Handles loading images from local paths or URLs, extracting hierarchy information
+    from text descriptions, and applying appropriate transformations for training.
+    """
     def __init__(self, dataframe, use_local_images=True, image_size=224):
+        """
+        Initialize the hierarchy dataset.
+        Args:
+            dataframe: DataFrame with columns for image paths/URLs, text descriptions, and hierarchy labels
+            use_local_images: Whether to prefer local images over URLs (default: True)
+            image_size: Size of images after resizing (default: 224)
+        """
         self.dataframe = dataframe
         self.use_local_images = use_local_images
         self.image_size = image_size
     def set_training_mode(self, training=True):
+        """
+        Switch between training and validation transforms.
+        Args:
+            training: If True, use training transforms with augmentation; if False, use validation transforms
+        """
         self.training_mode = training
     def __len__(self):
+        """Return the number of samples in the dataset."""
         return len(self.dataframe)
     def __getitem__(self, idx):
+        """
+        Get a sample from the dataset.
+        Args:
+            idx: Index of the sample
+        Returns:
+            Tuple of (image_tensor, description_text, hierarchy_label)
+        """
         row = self.dataframe.iloc[idx]
         # Try to load local image first
         return image, description, hierarchy
     def _download_image(self, img_url):
+        """
+        Download an image from a URL with timeout.
+        Args:
+            img_url: URL of the image to download
+        Returns:
+            PIL Image object
+        """
         response = requests.get(img_url, timeout=10)
         response.raise_for_status()
         image = Image.open(BytesIO(response.content)).convert("RGB")
 # -------------------------
 class HierarchyExtractor:
+    """
+    Extract hierarchy categories directly from text using pattern matching.
+    This class uses regex patterns to identify clothing categories (e.g., shirt, dress)
+    from text descriptions, handling variations, plurals, and common fashion terms.
+    """
     def __init__(self, hierarchy_classes, verbose=False):
+        """
+        Initialize the hierarchy extractor.
+        Args:
+            hierarchy_classes: List of hierarchy class names
+            verbose: Whether to print initialization information (default: False)
+        """
         self.hierarchy_classes = sorted(hierarchy_classes)
         self.class_to_idx = {cls: idx for idx, cls in enumerate(self.hierarchy_classes)}
         self.idx_to_class = {idx: cls for idx, cls in enumerate(self.hierarchy_classes)}
             print(f"📋 Classes: {self.hierarchy_classes}")
     def _create_patterns(self):
+        """
+        Create regex patterns for each hierarchy class.
+        Creates patterns that match variations, plurals, and common fashion terms
+        for each hierarchy class.
+        Returns:
+            Dictionary mapping hierarchy classes to regex patterns
+        """
         patterns = {}
         for hierarchy in self.hierarchy_classes:
         return patterns
     def extract_hierarchy(self, text):
+        """
+        Extract hierarchy category from text using pattern matching.
+        Args:
+            text: Input text string
+        Returns:
+            Hierarchy class name if found, None otherwise
+        """
         text_lower = text.lower()
         # Try exact match first
         return None
     def extract_hierarchy_idx(self, text):
+        """
+        Extract hierarchy index from text.
+        Args:
+            text: Input text string
+        Returns:
+            Hierarchy index if found, None otherwise
+        """
         hierarchy = self.extract_hierarchy(text)
         if hierarchy:
             return self.class_to_idx[hierarchy]
         return None
     def get_hierarchy_embedding(self, text, embed_dim=config.hierarchy_emb_dim):
+        """
+        Create embedding from hierarchy index extracted from text.
+        Args:
+            text: Input text string
+            embed_dim: Dimension of the embedding (default: hierarchy_emb_dim)
+        Returns:
+            Embedding tensor of shape (embed_dim,)
+        """
         hierarchy_idx = self.extract_hierarchy_idx(text)
         if hierarchy_idx is not None:
             # Create one-hot encoding
 # -------------------------
 class PretrainedImageEncoder(nn.Module):
+    """
+    Image encoder based on pretrained ResNet18 for extracting image embeddings.
+    Uses a pretrained ResNet18 backbone and freezes early layers to prevent overfitting.
+    Adds a custom projection head to output embeddings of the specified dimension.
+    """
     def __init__(self, embed_dim, dropout=0.3):
+        """
+        Initialize the pretrained image encoder.
+        Args:
+            embed_dim: Dimension of the output embedding
+            dropout: Dropout rate for regularization (default: 0.3)
+        """
         super().__init__()
         self.backbone = models.resnet18(pretrained=True)
         self._freeze_backbone_layers()
     def _freeze_backbone_layers(self):
+        """
+        Freeze early layers to prevent overfitting.
+        Freezes the parameters in the first 70% of the backbone's top-level child
+        modules, so that only the remaining later modules are fine-tuned during training.
+        """
         if hasattr(self.backbone, 'children'):
             layers = list(self.backbone.children())
             freeze_until = int(len(layers) * 0.7)
                         param.requires_grad = False
     def forward(self, x):
+        """
+        Forward pass through the image encoder.
+        Args:
+            x: Image tensor [batch_size, channels, height, width]
+        Returns:
+            Image embeddings [batch_size, embed_dim]
+        """
         features = self.backbone(x)
         return self.projection(features)
 class HierarchyEncoder(nn.Module):
+    """
+    Encoder that takes hierarchy indices directly.
+    Uses an embedding layer to convert hierarchy indices to embeddings,
+    followed by a projection head to output embeddings of the specified dimension.
+    """
     def __init__(self, num_hierarchies, embed_dim, dropout=0.3):
+        """
+        Initialize the hierarchy encoder.
+        Args:
+            num_hierarchies: Number of hierarchy classes
+            embed_dim: Dimension of the output embedding
+            dropout: Dropout rate for regularization (default: 0.3)
+        """
         super().__init__()
         self.num_hierarchies = num_hierarchies
         self.embed_dim = embed_dim
         self._init_weights()
     def _init_weights(self):
+        """
+        Initialize the embedding weights with Xavier uniform initialization and zero the biases of the projection's Linear layers.
+        """
         nn.init.xavier_uniform_(self.embedding.weight)
         for module in self.projection.modules():
             if isinstance(module, nn.Linear):
                     nn.init.zeros_(module.bias)
     def forward(self, hierarchy_indices):
+        """
+        Forward pass through the hierarchy encoder.
+        Args:
+            hierarchy_indices: Tensor of hierarchy indices [batch_size]
+        Returns:
+            Hierarchy embeddings [batch_size, embed_dim]
+        Note:
+            Includes workaround for MPS device: embedding layers don't work well with MPS,
+            so embedding lookup is done on CPU and results are moved back to device.
+        """
         # hierarchy_indices: (B,) - batch of hierarchy indices
         # Workaround for MPS: embedding layers don't work well with MPS, so do lookup on CPU
         device = next(self.parameters()).device
         return self.projection(emb)
 class HierarchyClassifierHead(nn.Module):
+    """
+    Classifier head for hierarchy classification.
+    Multi-layer perceptron that takes embeddings as input and outputs
+    classification logits for hierarchy classes.
+    """
     def __init__(self, in_dim, num_classes, hidden_dim=None, dropout=0.3):
+        """
+        Initialize the hierarchy classifier head.
+        Args:
+            in_dim: Input embedding dimension
+            num_classes: Number of hierarchy classes
+            hidden_dim: Hidden layer dimension (default: max(in_dim // 2, num_classes * 2))
+            dropout: Dropout rate for regularization (default: 0.3)
+        """
         super().__init__()
         if hidden_dim is None:
             hidden_dim = max(in_dim // 2, num_classes * 2)
         )
     def forward(self, x):
+        """
+        Forward pass through the classifier head.
+        Args:
+            x: Input embeddings [batch_size, in_dim]
+        Returns:
+            Classification logits [batch_size, num_classes]
+        """
         return self.classifier(x)
 class Model(nn.Module):
+    """
+    Main hierarchy model for learning clothing category-aligned embeddings.
+    Combines image encoder, hierarchy encoder, and classifier heads to learn
+    aligned embeddings for images and text descriptions based on clothing categories.
+    """
     def __init__(self, num_hierarchy_classes, embed_dim, dropout=0.3):
+        """
+        Initialize the hierarchy model.
+        Args:
+            num_hierarchy_classes: Number of hierarchy classes
+            embed_dim: Dimension of the embedding space
+            dropout: Dropout rate for regularization (default: 0.3)
+        """
         super().__init__()
         self.img_enc = PretrainedImageEncoder(embed_dim, dropout)
         self.hierarchy_enc = HierarchyEncoder(num_hierarchy_classes, embed_dim, dropout)
         self.num_hierarchy_classes = num_hierarchy_classes
     def forward(self, image=None, hierarchy_indices=None):
+        """
+        Forward pass through the model.
+        Args:
+            image: Optional image tensor [batch_size, channels, height, width]
+            hierarchy_indices: Optional hierarchy indices tensor [batch_size]
+        Returns:
+            Dictionary containing:
+            - 'z_img': Image embeddings [batch_size, embed_dim] (if image provided)
+            - 'z_txt': Text embeddings [batch_size, embed_dim] (if hierarchy_indices provided)
+            - 'hierarchy_logits_img': Image classification logits [batch_size, num_classes] (if image provided)
+            - 'hierarchy_logits_txt': Text classification logits [batch_size, num_classes] (if hierarchy_indices provided)
+        """
         out = {}
         if image is not None:
             z_img = self.img_enc(image)
         return out
     def set_hierarchy_extractor(self, hierarchy_extractor):
+        """
+        Set the hierarchy extractor for text processing.
+        Args:
+            hierarchy_extractor: HierarchyExtractor instance
+        """
         self.hierarchy_extractor = hierarchy_extractor
     def get_text_embeddings(self, text):
+        """
+        Get text embeddings for a given text string or list of strings.
+        Args:
+            text: Text string or list of text strings
+        Returns:
+            Text embeddings tensor [batch_size, embed_dim]
+        Raises:
+            ValueError: If hierarchy cannot be extracted from text, or if text is not a string or a list/tuple of strings
+        """
         with torch.no_grad():
             # Get the device of the model
                 raise ValueError(f"Expected string or list/tuple of strings, got {type(text)}: {text}")
     def get_image_embeddings(self, image):
+        """
+        Get image embeddings for a given image tensor.
+        Args:
+            image: Image tensor [channels, height, width] or [batch_size, channels, height, width]
+        Returns:
+            Image embeddings tensor [batch_size, embed_dim]
+        Raises:
+            ValueError: If image is not a torch.Tensor
+        """
         with torch.no_grad():
             if not isinstance(image, torch.Tensor):
                 raise ValueError("Image must be a torch.Tensor")
 # -------------------------
 class Loss(nn.Module):
+    """
+    Combined loss function for hierarchy model training.
+    Combines classification loss, contrastive loss, and consistency loss
+    to learn aligned embeddings while maintaining classification accuracy.
+    """
     def __init__(self, hierarchy_classes, classification_weight=1.0,
                  consistency_weight=0.3, contrastive_weight=0.2,
                  temperature=0.07, label_smoothing=0.1):
+        """
+        Initialize the loss function.
+        Args:
+            hierarchy_classes: List of hierarchy class names
+            classification_weight: Weight for classification loss (default: 1.0)
+            consistency_weight: Weight for consistency loss (default: 0.3)
+            contrastive_weight: Weight for contrastive loss (default: 0.2)
+            temperature: Temperature scaling for contrastive loss (default: 0.07)
+            label_smoothing: Label smoothing parameter (default: 0.1)
+        """
         super().__init__()
         self.classification_weight = classification_weight
         self.consistency_weight = consistency_weight
         self.mse = nn.MSELoss()
     def contrastive_loss(self, img_emb, txt_emb):
+        """
+        InfoNCE contrastive loss for aligning image and text embeddings.
+        Args:
+            img_emb: Image embeddings [batch_size, embed_dim]
+            txt_emb: Text embeddings [batch_size, embed_dim]
+        Returns:
+            Contrastive loss value
+        """
         sim_matrix = torch.matmul(img_emb, txt_emb.T) / self.temperature
         labels = torch.arange(img_emb.size(0), device=img_emb.device)
         return (loss_i2t + loss_t2i) / 2
     def forward(self, img_logits, txt_logits, img_embeddings, txt_embeddings, target_hierarchies):
+        """
+        Forward pass through the loss function.
+        Args:
+            img_logits: Image classification logits [batch_size, num_classes]
+            txt_logits: Text classification logits [batch_size, num_classes]
+            img_embeddings: Image embeddings [batch_size, embed_dim]
+            txt_embeddings: Text embeddings [batch_size, embed_dim]
+            target_hierarchies: List of target hierarchy class names [batch_size]
+        Returns:
+            Combined loss value
+        """
         device = img_embeddings.device
         # Convert hierarchy names to indices
 # -------------------------
 def collate_fn(batch, hierarchy_extractor):
+    """
+    Collate function for DataLoader that processes batches and extracts hierarchy indices.
+    Args:
+        batch: List of (image, description, hierarchy) tuples
+        hierarchy_extractor: HierarchyExtractor instance
+    Returns:
+        Dictionary containing:
+        - 'image': Stacked image tensors [batch_size, channels, height, width]
+        - 'hierarchy_indices': Hierarchy indices tensor [batch_size]
+        - hierarchy_column: List of hierarchy class names [batch_size]
+    """
     images = torch.stack([b[0] for b in batch], dim=0)
     texts = [b[1] for b in batch]
     hierarchies = [b[2] for b in batch]
     }
 def calculate_accuracy(logits, target_hierarchies, hierarchy_classes):
+    """
+    Calculate classification accuracy.
+    Args:
+        logits: Classification logits [batch_size, num_classes]
+        target_hierarchies: List of target hierarchy class names [batch_size]
+        hierarchy_classes: List of hierarchy class names
+    Returns:
+        Accuracy score (float between 0 and 1)
+    """
     batch_size = logits.size(0)
     correct = 0
     pred_indices = torch.argmax(logits, dim=1).cpu().numpy()
     return correct / batch_size
 def train_one_epoch(model, dataloader, optimizer, device, hierarchy_classes, scheduler=None):
+    """
+    Train the model for one epoch.
+    Args:
+        model: Model instance to train
+        dataloader: DataLoader for training data
+        optimizer: Optimizer instance
+        device: Device to train on
+        hierarchy_classes: List of hierarchy class names
+        scheduler: Optional learning rate scheduler
+    Returns:
+        Dictionary containing training metrics:
+        - 'loss': Average training loss
+        - 'acc_img': Average image classification accuracy
+        - 'acc_txt': Average text classification accuracy
+    """
     model.train()
     total_loss = 0.0
     total_acc_img = 0.0
     }
 def validate(model, dataloader, device, hierarchy_classes):
+    """
+    Validate the model on validation data.
+    Args:
+        model: Model instance to validate
+        dataloader: DataLoader for validation data
+        device: Device to validate on
+        hierarchy_classes: List of hierarchy class names
+    Returns:
+        Dictionary containing validation metrics:
+        - 'loss': Average validation loss
+        - 'acc_img': Average image classification accuracy
+        - 'acc_txt': Average text classification accuracy
+    """
     model.eval()
     total_loss = 0.0
     total_acc_img = 0.0