primerz committed
Commit 22858c3 · verified · 1 Parent(s): fc3355b

Update models.py

Files changed (1):
  1. models.py +62 -179

models.py CHANGED
@@ -1,40 +1,33 @@
 """
 Model loading and initialization for Pixagram AI Pixel Art Generator
-FIXED VERSION with proper IP-Adapter and BLIP-2 support
+FIXED VERSION - Uses correct InstantID pipeline and Compel encoder
 """
 import torch
 import time
 import os
-import shutil
 from diffusers import (
-    StableDiffusionXLControlNetImg2ImgPipeline,
     ControlNetModel,
     AutoencoderKL,
     LCMScheduler
 )
-from diffusers.models.attention_processor import AttnProcessor2_0
-from transformers import (
-    CLIPVisionModelWithProjection, CLIPTokenizer,
-    CLIPTextModel, CLIPTextModelWithProjection
-)
+from transformers import CLIPVisionModelWithProjection, CLIPTokenizer, CLIPTextModel, CLIPTextModelWithProjection  # text classes are still used in load_sdxl_pipeline
 from insightface.app import FaceAnalysis
 from controlnet_aux import ZoeDetector, OpenposeDetector, LeresDetector, MidasDetector, MediapipeFaceDetector
 from huggingface_hub import hf_hub_download, snapshot_download
 
-# --- START FIX: Import our new Cappella module ---
-from cappella import Cappella
+# --- START FIX: Import correct pipeline and Compel ---
+from pipeline_stable_diffusion_xl_instantid_img2img import StableDiffusionXLInstantIDImg2ImgPipeline
+from compel import Compel, ReturnedEmbeddingsType
 # --- END FIX ---
 
-# Use reference implementation's attention processor
-from attention_processor import IPAttnProcessor2_0, AttnProcessor
-from resampler import Resampler
-
 from config import (
     device, dtype, MODEL_REPO, MODEL_FILES, HUGGINGFACE_TOKEN,
     FACE_DETECTION_CONFIG, CLIP_SKIP, DOWNLOAD_CONFIG
 )
 
-
+# (We keep download_model_with_retry, load_face_analysis, load_depth_detector,
+# load_openpose_detector, and load_mediapipe_face_detector as they were)
+# ... (Keep all original functions from line 25 down to line 180) ...
 def download_model_with_retry(repo_id, filename, max_retries=None, **kwargs):
     """Download model with retry logic and proper token handling."""
     if max_retries is None:
  if max_retries is None:
@@ -200,93 +193,67 @@ def load_controlnets():
     # Return models, indicating InstantID failure
     return controlnet_depth, None, controlnet_openpose, False
 
-
-def load_image_encoder():
-    """Load CLIP Image Encoder for IP-Adapter."""
-    print("Loading CLIP Image Encoder for IP-Adapter...")
-    try:
-        # --- FIX: Load core models on GPU ---
-        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-            "h94/IP-Adapter",
-            subfolder="models/image_encoder",
-            torch_dtype=dtype
-        ).to(device)
-        print(" [OK] CLIP Image Encoder loaded successfully (on GPU)")
-        return image_encoder
-    except Exception as e:
-        print(f" [ERROR] Could not load image encoder: {e}")
-        return None
-
+# --- START: REMOVED load_image_encoder ---
+# (The new pipeline handles this internally)
+# --- END: REMOVED load_image_encoder ---
 
 def load_sdxl_pipeline(controlnets):
     """Load SDXL checkpoint from HuggingFace Hub."""
     print("Loading SDXL checkpoint (horizon) with bundled VAE from HuggingFace Hub...")
-
-    # --- START FIX ---
-    # Load tokenizers and text encoders from the base model first
-    # This guarantees they exist, even if the single file doesn't have them
+
+    # --- START FIX: Load base text models for Compel (from previous fix) ---
     print(" Loading base tokenizers and text encoders...")
     BASE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
-
-    try:
-        tokenizer = CLIPTokenizer.from_pretrained(BASE_MODEL, subfolder="tokenizer")
-        tokenizer_2 = CLIPTokenizer.from_pretrained(BASE_MODEL, subfolder="tokenizer_2")
-
-        text_encoder = CLIPTextModel.from_pretrained(
-            BASE_MODEL, subfolder="text_encoder", torch_dtype=dtype
-        ).to(device)
-
-        text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
-            BASE_MODEL, subfolder="text_encoder_2", torch_dtype=dtype
-        ).to(device)
-        print(" [OK] Base text/token models loaded")
-
-    except Exception as e:
-        print(f" [ERROR] Could not load base text models: {e}")
-        print(" Pipeline will likely fail. Check HF connection/model access.")
-        # Allow it to continue, but it will likely fail below
-        tokenizer = None
-        tokenizer_2 = None
-        text_encoder = None
-        text_encoder_2 = None
+    tokenizer = CLIPTokenizer.from_pretrained(BASE_MODEL, subfolder="tokenizer")
+    tokenizer_2 = CLIPTokenizer.from_pretrained(BASE_MODEL, subfolder="tokenizer_2")
+    text_encoder = CLIPTextModel.from_pretrained(
+        BASE_MODEL, subfolder="text_encoder", torch_dtype=dtype
+    ).to(device)
+    text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
+        BASE_MODEL, subfolder="text_encoder_2", torch_dtype=dtype
+    ).to(device)
+    print(" [OK] Base text/token models loaded")
     # --- END FIX ---
 
     try:
         model_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['checkpoint'], repo_type="model")
 
-        # --- START FIX ---
-        # Pass the pre-loaded models to from_single_file
-        pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_single_file(
+        # --- START FIX: Load the CORRECT pipeline ---
+        pipe = StableDiffusionXLInstantIDImg2ImgPipeline.from_single_file(
             model_path,
             controlnet=controlnets,
             torch_dtype=dtype,
             use_safetensors=True,
-
-            # Explicitly provide the models
+            # Pass components
             tokenizer=tokenizer,
             tokenizer_2=tokenizer_2,
             text_encoder=text_encoder,
             text_encoder_2=text_encoder_2,
-
-        ).to(device)  # This main pipe MUST be on device
+        ).to(device)
         # --- END FIX ---
-
+
         print(" [OK] Custom checkpoint loaded successfully (VAE bundled)")
         return pipe, True
-
     except Exception as e:
         print(f" [WARNING] Could not load custom checkpoint: {e}")
         print(" Using default SDXL base model")
 
-        # The fallback logic is already correct
-        pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
+        # --- START FIX: Fallback to the CORRECT pipeline ---
+        pipe = StableDiffusionXLInstantIDImg2ImgPipeline.from_pretrained(
             "stabilityai/stable-diffusion-xl-base-1.0",
             controlnet=controlnets,
             torch_dtype=dtype,
-            use_safetensors=True
-        ).to(device)  # This main pipe MUST be on device
+            use_safetensors=True,
+            # Pass components
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+        ).to(device)
+        # --- END FIX ---
         return pipe, False
 
+
 def load_loras(pipe):
     """Load all LORAs from HuggingFace Hub."""
     print("Loading all LORAs from HuggingFace Hub...")
@@ -320,14 +287,11 @@ def load_loras(pipe):
     return loaded_loras, success
 
 
-def setup_ip_adapter(pipe, image_encoder):
+# --- START FIX: Replace setup_ip_adapter ---
+def setup_ip_adapter(pipe):
     """
-    Setup IP-Adapter for InstantID face embeddings.
-    This is CRITICAL for face preservation.
+    Setup IP-Adapter for InstantID face embeddings using the pipeline's method.
     """
-    if image_encoder is None:
-        return None, False
-
     print("Setting up IP-Adapter for InstantID face embeddings...")
     try:
         # Download InstantID weights
@@ -337,110 +301,35 @@ def setup_ip_adapter(pipe, image_encoder):
             repo_type="model"
         )
 
-        # Load full state dict
-        state_dict = torch.load(ip_adapter_path, map_location="cpu")
+        # Use the pipeline's built-in loader
+        pipe.load_ip_adapter_instantid(ip_adapter_path)
 
-        # Extract image_proj and ip_adapter weights
-        image_proj_state_dict = {}
-        ip_adapter_state_dict = {}
-
-        for key, value in state_dict.items():
-            if key.startswith("image_proj."):
-                image_proj_state_dict[key.replace("image_proj.", "")] = value
-            elif key.startswith("ip_adapter."):
-                ip_adapter_state_dict[key.replace("ip_adapter.", "")] = value
-
-        # Create Resampler with CORRECT parameters
-        print("Creating Resampler (Perceiver architecture)...")
-        image_proj_model = Resampler(
-            dim=1280,
-            depth=4,
-            dim_head=64,
-            heads=20,
-            num_queries=16,
-            embedding_dim=512,  # CRITICAL: Must match InsightFace embedding size
-            output_dim=pipe.unet.config.cross_attention_dim,
-            ff_mult=4
-        )
-
-        image_proj_model.eval()
-        image_proj_model = image_proj_model.to(device, dtype=dtype)
-
-        # Load image_proj weights
-        if image_proj_state_dict:
-            try:
-                image_proj_model.load_state_dict(image_proj_state_dict, strict=True)
-                print(" [OK] Resampler loaded with pretrained weights")
-            except Exception as e:
-                print(f" [WARNING] Could not load Resampler weights: {e}")
-
-        # Setup IP-Adapter attention processors
-        print("Setting up IP-Adapter attention processors...")
-        attn_procs = {}
-        num_tokens = 16
-
-        for name in pipe.unet.attn_processors.keys():
-            cross_attention_dim = None if name.endswith("attn1.processor") else pipe.unet.config.cross_attention_dim
-
-            if name.startswith("mid_block"):
-                hidden_size = pipe.unet.config.block_out_channels[-1]
-            elif name.startswith("up_blocks"):
-                block_id = int(name[len("up_blocks.")])
-                hidden_size = list(reversed(pipe.unet.config.block_out_channels))[block_id]
-            elif name.startswith("down_blocks"):
-                block_id = int(name[len("down_blocks.")])
-                hidden_size = pipe.unet.config.block_out_channels[block_id]
-            else:
-                hidden_size = pipe.unet.config.block_out_channels[-1]
-
-            if cross_attention_dim is None:
-                attn_procs[name] = AttnProcessor2_0()
-            else:
-                attn_procs[name] = IPAttnProcessor2_0(
-                    hidden_size=hidden_size,
-                    cross_attention_dim=cross_attention_dim,
-                    scale=1.0,
-                    num_tokens=num_tokens
-                ).to(device, dtype=dtype)
-
-        # Set attention processors
-        pipe.unet.set_attn_processor(attn_procs)
-
-        # Load IP-Adapter weights
-        if ip_adapter_state_dict:
-            try:
-                ip_layers = torch.nn.ModuleList(pipe.unet.attn_processors.values())
-                ip_layers.load_state_dict(ip_adapter_state_dict, strict=False)
-                print(" [OK] IP-Adapter attention weights loaded")
-            except Exception as e:
-                print(f" [WARNING] Could not load IP-Adapter weights: {e}")
-
-        # Store image encoder
-        pipe.image_encoder = image_encoder
-
-        print(" [OK] IP-Adapter fully loaded with InstantID architecture")
-        print(" - Resampler: 4 layers, 20 heads, 16 output tokens")
-        print(f" - Face embeddings: 512D -> 16x{pipe.unet.config.cross_attention_dim}D")
-
-        return image_proj_model, True
+        print(" [OK] IP-Adapter fully loaded via pipeline")
+        return None, True  # We don't need to return a model
 
     except Exception as e:
         print(f" [ERROR] Could not setup IP-Adapter: {e}")
         import traceback
         traceback.print_exc()
         return None, False
+# --- END FIX ---
 
 
-# --- START FIX: Use our new Cappella module ---
-def setup_cappella(pipe):
-    """Setup Cappella for our custom prompt encoding."""
-    print("Setting up Cappella (custom prompt encoder)...")
+# --- START FIX: Replace setup_cappella with setup_compel ---
+def setup_compel(pipe):
+    """Setup Compel for robust prompt encoding."""
+    print("Setting up Compel (prompt encoder)...")
     try:
-        cappella = Cappella(pipe, device)
-        print(" [OK] Cappella loaded successfully.")
-        return cappella, True
+        compel = Compel(
+            tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
+            text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
+            returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+            requires_pooled=[False, True]
+        )
+        print(" [OK] Compel loaded successfully.")
+        return compel, True
     except Exception as e:
-        print(f" [WARNING] Cappella not available: {e}")
+        print(f" [WARNING] Compel not available: {e}")
         return None, False
 # --- END FIX ---
 
@@ -454,10 +343,6 @@ def setup_scheduler(pipe):
 
 def optimize_pipeline(pipe):
     """Apply optimizations to pipeline."""
-
-    # --- FIX: Removed enable_model_cpu_offload() ---
-
-    # Try to enable xformers
     if device == "cuda":
         try:
             pipe.enable_xformers_memory_efficient_attention()
@@ -479,11 +364,10 @@ def load_caption_model():
 
         print(" Attempting GIT-Large (recommended)...")
         caption_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
-        # --- FIX: Load on CPU ---
         caption_model = AutoModelForCausalLM.from_pretrained(
             "microsoft/git-large-coco",
             torch_dtype=dtype
-        )  # .to(device) removed
+        )
         print(" [OK] GIT-Large model loaded (produces detailed captions, on CPU)")
         return caption_processor, caption_model, True, 'git'
     except Exception as e1:
@@ -495,11 +379,10 @@ def load_caption_model():
 
         print(" Attempting BLIP base (fallback)...")
         caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-        # --- FIX: Load on CPU ---
         caption_model = BlipForConditionalGeneration.from_pretrained(
             "Salesforce/blip-image-captioning-base",
             torch_dtype=dtype
-        )  # .to(device) removed
+        )
         print(" [OK] BLIP base model loaded (standard captions, on CPU)")
         return caption_processor, caption_model, True, 'blip'
     except Exception as e2:
@@ -514,4 +397,4 @@ def set_clip_skip(pipe):
     print(f" [OK] CLIP skip set to {CLIP_SKIP}")
 
 
-print("[OK] Model loading functions ready")
+print("[OK] Model loading functions ready")
 