primerz committed on
Commit
6971978
·
verified ·
1 Parent(s): 570d128

Update models.py

Browse files
Files changed (1) hide show
  1. models.py +116 -1
models.py CHANGED
@@ -260,6 +260,121 @@ def load_image_encoder():
260
  return None
261
 
262
 
263
- __all__ = ['draw_kps', 'fuse_lora_with_scale', 'load_image_encoder']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
  print("[OK] models.py ready - NO MultiControlNetModel, following examplewithface.py")
 
260
  return None
261
 
262
 
263
def setup_ip_adapter(pipe, image_encoder):
    """
    Setup IP-Adapter for InstantID face embeddings - PROPER IMPLEMENTATION.
    Based on the reference InstantID pipeline.

    Parameters
    ----------
    pipe : diffusion pipeline
        Pipeline whose ``unet`` receives the IP-Adapter attention
        processors. Side effect: ``pipe.image_encoder`` is set.
    image_encoder :
        Image encoder to attach to the pipeline, or ``None`` to skip setup.

    Returns
    -------
    tuple
        ``(image_proj_model, True)`` on success, ``(None, False)`` when
        ``image_encoder`` is ``None`` or any step fails. Errors are printed
        (with traceback), never raised.
    """
    if image_encoder is None:
        return None, False

    print("Setting up IP-Adapter for InstantID face embeddings (proper implementation)...")
    try:
        # Download InstantID weights
        ip_adapter_path = download_model_with_retry(
            "InstantX/InstantID",
            "ip-adapter.bin",
        )

        # Load full state dict.
        # NOTE(security): torch.load is pickle-based and can execute arbitrary
        # code from a malicious checkpoint — only load from trusted sources
        # (on torch >= 1.13 consider weights_only=True to harden this).
        state_dict = torch.load(ip_adapter_path, map_location="cpu")

        # Extract image_proj and ip_adapter weights (the checkpoint bundles
        # both sub-modules under key prefixes).
        image_proj_state_dict = {}
        ip_adapter_state_dict = {}
        for key, value in state_dict.items():
            if key.startswith("image_proj."):
                image_proj_state_dict[key.replace("image_proj.", "")] = value
            elif key.startswith("ip_adapter."):
                ip_adapter_state_dict[key.replace("ip_adapter.", "")] = value

        # Create Resampler (image projection model) with CORRECT parameters from reference
        print("Creating Resampler (Perceiver architecture)...")
        image_proj_model = Resampler(
            dim=1280,           # Hidden dimension
            depth=4,            # IMPORTANT: 4 layers (not 8!)
            dim_head=64,        # Dimension per head
            heads=20,           # Number of heads
            num_queries=16,     # Number of output tokens
            embedding_dim=512,  # InsightFace embedding dim
            output_dim=pipe.unet.config.cross_attention_dim,  # SDXL cross-attention dim (2048)
            ff_mult=4,          # Feedforward multiplier
        )

        image_proj_model.eval()
        image_proj_model = image_proj_model.to(device, dtype=dtype)

        # Load image_proj weights. load_state_dict copies into the already
        # moved/cast parameters, so dtype/device conversion happens in place.
        if image_proj_state_dict:
            try:
                image_proj_model.load_state_dict(image_proj_state_dict, strict=True)
                print(" [OK] Resampler loaded with pretrained weights")
            except Exception as e:
                print(f" [WARNING] Could not load Resampler weights: {e}")
                print(" Using randomly initialized Resampler")
        else:
            print(" [WARNING] No image_proj weights found, using random initialization")

        # Setup IP-Adapter attention processors
        print("Setting up IP-Adapter attention processors...")
        attn_procs = {}
        num_tokens = 16  # Match Resampler num_queries

        for name in pipe.unet.attn_processors.keys():
            # attn1 layers are self-attention and carry no cross-attention dim.
            cross_attention_dim = None if name.endswith("attn1.processor") else pipe.unet.config.cross_attention_dim

            # Derive the hidden size of the UNet block this processor lives in.
            if name.startswith("mid_block"):
                hidden_size = pipe.unet.config.block_out_channels[-1]
            elif name.startswith("up_blocks"):
                block_id = int(name[len("up_blocks.")])
                hidden_size = list(reversed(pipe.unet.config.block_out_channels))[block_id]
            elif name.startswith("down_blocks"):
                block_id = int(name[len("down_blocks.")])
                hidden_size = pipe.unet.config.block_out_channels[block_id]
            else:
                hidden_size = pipe.unet.config.block_out_channels[-1]

            if cross_attention_dim is None:
                # Self-attention: stock processor, no extra tokens.
                attn_procs[name] = AttnProcessor2_0()
            else:
                # Cross-attention: decoupled IP-Adapter processor.
                attn_procs[name] = IPAttnProcessor2_0(
                    hidden_size=hidden_size,
                    cross_attention_dim=cross_attention_dim,
                    scale=1.0,
                    num_tokens=num_tokens,
                ).to(device, dtype=dtype)

        # Set attention processors
        pipe.unet.set_attn_processor(attn_procs)

        # Load IP-Adapter weights into attention processors. strict=False
        # because the list also contains parameter-free AttnProcessor2_0.
        if ip_adapter_state_dict:
            try:
                ip_layers = torch.nn.ModuleList(pipe.unet.attn_processors.values())
                ip_layers.load_state_dict(ip_adapter_state_dict, strict=False)
                print(" [OK] IP-Adapter attention weights loaded")
            except Exception as e:
                print(f" [WARNING] Could not load IP-Adapter weights: {e}")
        else:
            print(" [WARNING] No ip_adapter weights found")

        # Store image encoder on the pipeline for later feature extraction.
        pipe.image_encoder = image_encoder

        print(" [OK] IP-Adapter fully loaded with InstantID architecture")
        # Plain strings here: the originals were f-strings with no placeholders.
        print(" - Resampler: 4 layers, 20 heads, 16 output tokens")
        print(" - Face embeddings: 512D → 16x2048D")

        return image_proj_model, True

    except Exception as e:
        print(f" [ERROR] Could not setup IP-Adapter: {e}")
        import traceback
        traceback.print_exc()
        return None, False
376
# Public API of this module.
__all__ = [
    'draw_kps',
    'fuse_lora_with_scale',
    'load_image_encoder',
    'setup_ip_adapter',
]


print("[OK] models.py ready - NO MultiControlNetModel, following examplewithface.py")