Upload sdxl/sdxl_adapter.py with huggingface_hub
sdxl/sdxl_adapter.py
ADDED
@@ -0,0 +1,112 @@
"""
SDXL Adapter - Maps Qwen3-4B activations to SDXL prompt embedding space.

Input: [B, 7680] - Qwen3-4B hidden states from layers [9, 18, 27]
Output: [B, 77, 2048] prompt_embeds + [B, 1280] pooled_prompt_embeds
"""
import torch
import torch.nn as nn
import torch.nn.functional as F


class LayerWeightedInput(nn.Module):
    def __init__(self, n_layers=3, layer_dim=2560):
        super().__init__()
        self.n_layers = n_layers
        self.layer_dim = layer_dim
        self.layer_logits = nn.Parameter(torch.zeros(n_layers))

    def forward(self, x):
        # x: [B, n_layers * layer_dim] -> [B, layer_dim]
        B = x.shape[0]
        chunks = x.reshape(B, self.n_layers, self.layer_dim)
        weights = F.softmax(self.layer_logits, dim=0)
        return (chunks * weights[None, :, None]).sum(dim=1)


class SDXLCrossAttentionAdapter(nn.Module):
    """Cross-attention adapter mapping LLM activations to SDXL embedding space."""

    def __init__(self, in_dim=2560, rank=256, n_input_tokens=8,
                 n_heads=8, n_layers=3, n_output_tokens=77,
                 main_dim=2048, pooled_dim=1280):
        super().__init__()
        self.in_dim = in_dim
        self.rank = rank
        self.n_input_tokens = n_input_tokens
        self.n_output_tokens = n_output_tokens
        self.main_dim = main_dim
        self.pooled_dim = pooled_dim

        # Encode input activation into multiple tokens
        self.input_encoder = nn.Sequential(
            nn.Linear(in_dim, rank), nn.GELU(),
            nn.Linear(rank, n_input_tokens * rank),
        )

        # Learnable queries for 77 output tokens
        self.queries = nn.Parameter(torch.randn(n_output_tokens, rank) * 0.02)

        # Transformer decoder: queries attend to encoded input
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=rank, nhead=n_heads,
            dim_feedforward=rank * 4, activation='gelu',
            batch_first=True, norm_first=True,
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_layers)

        # Project to SDXL main embedding space
        self.main_project = nn.Sequential(
            nn.LayerNorm(rank),
            nn.Linear(rank, main_dim),
        )

        # Pooled embedding head: aggregate decoded tokens -> single vector
        self.pooled_head = nn.Sequential(
            nn.Linear(rank, rank), nn.GELU(),
            nn.Linear(rank, pooled_dim),
        )

    def forward(self, x):
        """x: [B, in_dim] -> (main: [B, 77, 2048], pooled: [B, 1280])"""
        if x.dim() == 1:
            x = x.unsqueeze(0)
        B = x.shape[0]

        # Encode input into memory tokens
        memory = self.input_encoder(x).reshape(B, self.n_input_tokens, self.rank)

        # Cross-attention: queries attend to memory
        queries = self.queries.unsqueeze(0).expand(B, -1, -1)
        decoded = self.decoder(queries, memory)  # [B, 77, rank]

        # Main embeddings
        main_embeds = self.main_project(decoded)  # [B, 77, 2048]

        # Pooled embeddings from mean of decoded
        pooled = self.pooled_head(decoded.mean(dim=1))  # [B, 1280]

        return main_embeds, pooled


def count_params(model):
    return sum(p.numel() for p in model.parameters())


if __name__ == "__main__":
    # Quick test
    layer_weight = LayerWeightedInput(n_layers=3, layer_dim=2560)
    adapter = SDXLCrossAttentionAdapter(
        in_dim=2560, rank=256, n_input_tokens=8,
        n_heads=8, n_layers=3,
    )
    x = torch.randn(2, 7680)  # batch of 2, concat of 3 layers
    x_weighted = layer_weight(x)  # [2, 2560]
    main, pooled = adapter(x_weighted)
    print(f"LayerWeightedInput params: {count_params(layer_weight):,}")
    print(f"SDXLAdapter params: {count_params(adapter):,}")
    print(f"Total params: {count_params(layer_weight) + count_params(adapter):,}")
    print(f"Input: {x.shape}")
    print(f"Weighted: {x_weighted.shape}")
    print(f"Main embeds: {main.shape}")
    print(f"Pooled embeds: {pooled.shape}")
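
Usage note (not part of the uploaded file): a minimal end-to-end sketch of how these pieces could be wired together. It assumes the Qwen3-4B activations are the last-token hidden states of layers 9, 18 and 27 obtained with output_hidden_states=True, that the repo root is importable as the sdxl package, and that the adapter outputs stand in for the CLIP text encoders via the standard prompt_embeds / pooled_prompt_embeds arguments of diffusers' StableDiffusionXLPipeline. The model ids, the extraction scheme, and the prompt are assumptions, and a freshly initialized adapter will only be useful for shape checks until trained.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from diffusers import StableDiffusionXLPipeline
from sdxl.sdxl_adapter import LayerWeightedInput, SDXLCrossAttentionAdapter

# Assumption: Qwen/Qwen3-4B (hidden size 2560) is the activation source.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
llm = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B")

prompt = "a photo of a corgi riding a skateboard"
inputs = tok(prompt, return_tensors="pt")
with torch.no_grad():
    out = llm(**inputs, output_hidden_states=True)

# hidden_states[0] is the embedding layer, so layer k is hidden_states[k].
# Assumption: take the final-token state of layers 9, 18, 27 and concatenate -> [1, 7680].
acts = torch.cat([out.hidden_states[k][:, -1, :] for k in (9, 18, 27)], dim=-1)

layer_weight = LayerWeightedInput(n_layers=3, layer_dim=2560)
adapter = SDXLCrossAttentionAdapter(in_dim=2560)
# Load trained adapter weights here; random init only verifies shapes.
prompt_embeds, pooled = adapter(layer_weight(acts.float()))  # [1, 77, 2048], [1, 1280]

# prompt_embeds / pooled_prompt_embeds are standard StableDiffusionXLPipeline
# call arguments, so the adapter outputs replace the usual text-encoder pass.
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
image = pipe(
    prompt_embeds=prompt_embeds.to("cuda", torch.float16),
    pooled_prompt_embeds=pooled.to("cuda", torch.float16),
).images[0]
image.save("corgi.png")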