File size: 3,287 Bytes
"""
HSL Feature Extractor

Replaces PaletteFeatureExtractor (which uses nn.Embedding for token IDs)
for the HSL color pipeline.

Input:  (B, H, W, 3) FloatTensor — HSL palette with channels [h, s, l] in [0, 1]
Output: (B, H, W, D) FloatTensor — spatial features

Architecture:
    1. Circular hue encoding: h -> (sin(2*pi*h), cos(2*pi*h))
    2. Stack: [sin_h, cos_h, s, l] -> 4-channel tensor (B, H, W, 4)
    3. Linear projection: nn.Linear(4, hidden_dim) -> (B, H, W, D), D = hidden_dim
    4. VisionTransformer: reuse existing VisionTransformer from models.vit
"""
import math
import torch
import torch.nn as nn
from .vit import VisionTransformer, trunc_normal_init_
class HSLFeatureExtractor(nn.Module):
    """
    Spatial feature extractor for HSL colour palettes.

    Hue is periodic (a hue of 0 and a hue of 1 denote the same colour), so it
    is mapped onto the unit circle as (sin(2*pi*h), cos(2*pi*h)) before being
    combined with saturation and lightness. The resulting 4-channel encoding
    goes through a linear projection and is then handed to a
    VisionTransformer.

    Args:
        hidden_dim: Output width of the linear projection; also forwarded to
            the VisionTransformer as ``hidden_dim`` (default: 768)
        num_layers: Forwarded to the VisionTransformer as ``num_layers``
            (default: 6)
        num_heads: Forwarded to the VisionTransformer as ``num_heads``
            (default: 8)
        patch_size: Forwarded to the VisionTransformer as ``patch_size``
            (default: 4)
        dropout: Forwarded to the VisionTransformer as ``dropout``
            (default: 0.1)
    """

    def __init__(
        self,
        hidden_dim: int = 768,
        num_layers: int = 6,
        num_heads: int = 8,
        patch_size: int = 4,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Maps the 4 encoded channels (sin_h, cos_h, s, l) to hidden_dim.
        # Built before the transformer so that the order in which random
        # numbers are drawn during construction stays the same.
        self.hsl_proj = nn.Linear(4, hidden_dim, bias=True)

        # Transformer applied to the projected features.
        vit_kwargs = {
            "hidden_dim": hidden_dim,
            "num_layers": num_layers,
            "num_heads": num_heads,
            "patch_size": patch_size,
            "dropout": dropout,
        }
        self.vit = VisionTransformer(**vit_kwargs)

        # Overwrite nn.Linear's default initialisation of the projection.
        self._init_weights()

    def _init_weights(self):
        """Re-initialise ``hsl_proj``: truncated-normal weight, zero bias."""
        proj = self.hsl_proj

        # Standard deviation scales with 1/sqrt(fan_in); fan_in is 4 here.
        fan_in = proj.in_features
        trunc_normal_init_(proj.weight, std=1.0 / math.sqrt(fan_in))

        if proj.bias is None:
            return
        proj.bias.data.zero_()

    def forward(self, palette_hsl: torch.Tensor) -> torch.Tensor:
        """
        Compute spatial features for an HSL palette.

        Args:
            palette_hsl: (B, H, W, 3) FloatTensor whose last axis holds
                [h, s, l], each expected in [0, 1]

        Returns:
            The VisionTransformer output for the projected encoding —
            expected to be a (B, H, W, D) FloatTensor with D = hidden_dim
        """
        # Channels live on the last axis: 0 = hue, 1 = saturation, 2 = lightness.
        hue = palette_hsl[..., 0]
        saturation = palette_hsl[..., 1]
        lightness = palette_hsl[..., 2]

        # Place hue on the unit circle so that values near 0 and near 1
        # end up close together.
        two_pi = 2 * math.pi
        hue_sin = torch.sin(two_pi * hue)
        hue_cos = torch.cos(two_pi * hue)

        # (B, H, W) x 4 -> (B, H, W, 4)
        channels = [hue_sin, hue_cos, saturation, lightness]
        encoded = torch.stack(channels, dim=-1)

        # (B, H, W, 4) -> (B, H, W, D), then the transformer.
        return self.vit(self.hsl_proj(encoded))
|