Create sd15_inference_mechanism.py
sd15_inference_mechanism.py (ADDED, +242 -0)
# ============================================================================
# SD 1.5 × MEMORY-CLIP-SEQ: Full Sequence Output
#
# The seq77 model produces (B, 77, 768), the same shape as CLIP's native
# last_hidden_state, so it is a drop-in replacement for SD's text encoder
# output.
#
# Comparisons:
#   A) Standard CLIP: truncated to 77 tokens
#   B) Seq77 model: full 576-token context → reconstructed 77-position sequence
#   C) Seq77 pooled + EOS inject: the v3 approach, but with the better pooled model
# ============================================================================

import torch
import torch.nn.functional as F
from diffusers import StableDiffusionPipeline, DDIMScheduler
from transformers import AutoModel, CLIPTextModel, CLIPTokenizer
from PIL import Image
import os
import numpy as np

SEQ_REPO = "AbstractPhil/geolip-clip-vit-large-patch14-ctx576-seq77"
SD15_REPO = "stable-diffusion-v1-5/stable-diffusion-v1-5"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16
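
# Optional guard (assumption: fp16 inference is impractical on CPU, so fall
# back to fp32 when CUDA is unavailable).
if DEVICE == "cpu":
    DTYPE = torch.float32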


# ──────────────────────────────────────────────────────────────────
# LOAD
# ──────────────────────────────────────────────────────────────────

print("Loading SD 1.5...")
pipe = StableDiffusionPipeline.from_pretrained(
    SD15_REPO, torch_dtype=DTYPE, safety_checker=None)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(DEVICE)

print("Loading Memory-CLIP-Seq...")
seq_model = AutoModel.from_pretrained(SEQ_REPO, trust_remote_code=True)
seq_model = seq_model.to(DEVICE).eval()

tokenizer = pipe.tokenizer
text_encoder = pipe.text_encoder
unet = pipe.unet
vae = pipe.vae
scheduler = pipe.scheduler
print("Ready.")


# ──────────────────────────────────────────────────────────────────
# SEGMENTATION
# ──────────────────────────────────────────────────────────────────

def segment_text(text, clip_tokenizer, max_content=18, overlap=4, max_segments=32):
    """Split a long caption into overlapping 77-token CLIP windows."""
    full_tokens = clip_tokenizer.encode(text, add_special_tokens=False)
    segments, stride, pos = [], max_content - overlap, 0
    while pos < len(full_tokens) and len(segments) < max_segments:
        end = min(pos + max_content, len(full_tokens))
        chunk = full_tokens[pos:end]
        sos = clip_tokenizer.bos_token_id or 49406
        eos = clip_tokenizer.eos_token_id or 49407
        ids = [sos] + chunk + [eos]
        n_pad = 77 - len(ids)
        ids = (ids + [0] * max(n_pad, 0))[:77]
        mask = ([1] * min(len(chunk) + 2, 77) + [0] * max(n_pad, 0))[:77]
        segments.append({
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
        })
        if end >= len(full_tokens):
            break
        pos += stride
    return segments
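
# Illustrative probe (an addition; not used by the pipeline below). With
# max_content=18 and overlap=4 the stride is 14 content tokens per window,
# so consecutive windows share 4 tokens and each is padded to 77 slots.
_demo_segments = segment_text("a " * 60, tokenizer)
print(f"segment_text probe: {len(_demo_segments)} windows, "
      f"first window shape {tuple(_demo_segments[0]['input_ids'].shape)}")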


# ──────────────────────────────────────────────────────────────────
# ENCODING METHODS
# ──────────────────────────────────────────────────────────────────

@torch.no_grad()
def encode_standard_clip(prompt):
    """Standard SD 1.5: truncate → (1, 77, 768)."""
    inputs = tokenizer(prompt, max_length=77, padding="max_length",
                       truncation=True, return_tensors="pt").to(DEVICE)
    return text_encoder(input_ids=inputs.input_ids).last_hidden_state


@torch.no_grad()
def encode_seq77(prompt):
    """
    Seq77 model: full caption → segmented → memory → reconstruct → (1, 77, 768).
    Direct drop-in replacement for CLIP's last_hidden_state.
    """
    out = seq_model(texts=[prompt], output_sequence=True)
    return out.last_hidden_state.to(DTYPE)  # (1, 77, 768)


@torch.no_grad()
def encode_seq77_pooled_eos_inject(prompt, alpha=1.0):
    """
    Hybrid: standard CLIP sequence + seq77 pooled embedding injected at EOS-1.
    Uses the seq77 model's improved pooled output (m_acc=0.957).
    """
    clip_embeds = encode_standard_clip(prompt).clone()

    # Get the pooled embedding from the seq model
    pooled = seq_model.encode(prompt)  # (768,)
    pooled = pooled.unsqueeze(0)       # (1, 768)

    # Find the EOS position in the standard tokenization
    inputs = tokenizer(prompt, max_length=77, padding="max_length",
                       truncation=True, return_tensors="pt")
    eos_positions = (inputs.input_ids == 49407).nonzero(as_tuple=True)[1]
    eos_pos = eos_positions[0].item() if len(eos_positions) > 0 else 76
    inject_pos = max(eos_pos - 1, 1)

    # Lerp the EOS-1 slot toward the pooled vector
    orig = clip_embeds[:, inject_pos, :]
    clip_embeds[:, inject_pos, :] = (orig + alpha * (pooled - orig)).to(DTYPE)
    return clip_embeds
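
# Optional endpoint check (an addition; illustrative): at alpha=0.0 the
# injection lerp is a no-op, so the hybrid must equal the plain CLIP sequence.
_p = "a test prompt"
assert torch.allclose(encode_standard_clip(_p),
                      encode_seq77_pooled_eos_inject(_p, alpha=0.0))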


@torch.no_grad()
def encode_seq77_blended(prompt, alpha=0.5):
    """
    Blend: alpha × seq77_sequence + (1 - alpha) × standard_clip_sequence.
    """
    clip_embeds = encode_standard_clip(prompt)
    seq_embeds = encode_seq77(prompt)
    blended = clip_embeds + alpha * (seq_embeds - clip_embeds)
    return blended
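
# Shape smoke test (an addition; illustrative): every encoder should return
# the (1, 77, 768) conditioning tensor that SD 1.5's UNet expects.
_p = "a red apple on a wooden table"
for _fn in (encode_standard_clip, encode_seq77,
            encode_seq77_pooled_eos_inject, encode_seq77_blended):
    _shape = tuple(_fn(_p).shape)
    assert _shape == (1, 77, 768), (_fn.__name__, _shape)
print("Encoder smoke test passed: all outputs are (1, 77, 768).")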


# ──────────────────────────────────────────────────────────────────
# GENERATION
# ──────────────────────────────────────────────────────────────────

@torch.no_grad()
def generate(prompt_embeds, negative_embeds=None,
             steps=30, cfg=7.5, seed=42, h=512, w=512):
    """Plain DDIM sampling loop with classifier-free guidance."""
    gen = torch.Generator(device=DEVICE).manual_seed(seed)
    if negative_embeds is None:
        negative_embeds = torch.zeros_like(prompt_embeds)
    text_emb = torch.cat([negative_embeds, prompt_embeds])
    latents = torch.randn(
        (1, unet.config.in_channels, h // 8, w // 8),
        generator=gen, device=DEVICE, dtype=DTYPE)
    latents = latents * scheduler.init_noise_sigma
    scheduler.set_timesteps(steps)
    for t in scheduler.timesteps:
        lat_in = scheduler.scale_model_input(torch.cat([latents] * 2), t)
        pred = unet(lat_in, t, encoder_hidden_states=text_emb).sample
        pu, pt = pred.chunk(2)
        pred = pu + cfg * (pt - pu)  # classifier-free guidance
        latents = scheduler.step(pred, t, latents).prev_sample
    latents = latents / vae.config.scaling_factor
    img = vae.decode(latents).sample
    img = (img / 2 + 0.5).clamp(0, 1)
    img = img.cpu().permute(0, 2, 3, 1).float().numpy()
    return Image.fromarray((img[0] * 255).astype("uint8"))
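
# Minimal usage sketch (an addition; commented out because the benchmark loop
# below is the real driver). `generate` accepts any (1, 77, 768) tensor:
#   img = generate(encode_seq77("a watercolor painting of a fox"), steps=30)
#   img.save("outputs/quick_seq77_test.png")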
+
|
| 160 |
+
|
| 161 |
+
# ============================================================================
|
| 162 |
+
# PURE BLEND: Standard CLIP β Seq77 at 0.25, 0.50, 0.75
|
| 163 |
+
#
|
| 164 |
+
# output = (1 - Ξ±) Γ CLIP_sequence + Ξ± Γ Seq77_sequence
|
| 165 |
+
#
|
| 166 |
+
# No EOS injection. No img2img. Just the raw blend.
|
| 167 |
+
# Run after sd15_seq77_test.py (models loaded)
|
| 168 |
+
# ============================================================================
|
| 169 |
+
|
| 170 |
+
import os
|
| 171 |
+
from PIL import Image
|
| 172 |
+
|
| 173 |
+
os.makedirs("outputs", exist_ok=True)
|
| 174 |
+
|
| 175 |
+
neg = encode_standard_clip("")
|
| 176 |
+
|
| 177 |
+
prompts = {
|
| 178 |
+
"castle": (
|
| 179 |
+
"A vast sweeping landscape of rolling green hills under dramatic "
|
| 180 |
+
"storm clouds with a lone oak tree in the foreground its branches "
|
| 181 |
+
"bent by wind casting long shadows across a field of wildflowers "
|
| 182 |
+
"in purple yellow and white while in the distance a medieval stone "
|
| 183 |
+
"castle sits atop a cliff overlooking a turbulent sea with waves "
|
| 184 |
+
"crashing against ancient rocks and seabirds wheeling overhead "
|
| 185 |
+
"against a sky painted in shades of grey and gold as the sun "
|
| 186 |
+
"breaks through the clouds illuminating the castle towers"
|
| 187 |
+
),
|
| 188 |
+
|
| 189 |
+
"still_life": (
|
| 190 |
+
"A meticulously arranged still life painting in the Dutch Golden Age "
|
| 191 |
+
"style featuring a silver goblet overflowing with deep red wine next "
|
| 192 |
+
"to a half peeled lemon with its rind spiraling downward and a cracked "
|
| 193 |
+
"walnut revealing its inner flesh beside a porcelain plate holding "
|
| 194 |
+
"slices of rare roast beef garnished with fresh rosemary sprigs and "
|
| 195 |
+
"a small bouquet of wilting tulips in shades of pink and white all set "
|
| 196 |
+
"against a dark moody background with dramatic chiaroscuro lighting "
|
| 197 |
+
"that highlights the reflective surfaces and textures of each object "
|
| 198 |
+
"while casting deep shadows that add depth and mystery to the composition "
|
| 199 |
+
"with a single fly resting on the edge of the goblet and droplets of "
|
| 200 |
+
"condensation catching the light on the silver surface"
|
| 201 |
+
),
|
| 202 |
+
|
| 203 |
+
"short": "A medieval castle on a cliff overlooking the sea at sunset",
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
alphas = [0.0, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0]
|
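
# Note on the sweep (illustrative): blended = clip + α * (seq77 - clip), so
# α=0.0 reproduces plain CLIP, α=1.0 reproduces Seq77, and α>1.0 extrapolates
# past Seq77 along the CLIP → Seq77 direction.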

for name, prompt in prompts.items():
    n_tokens = len(seq_model.clip_tokenizer.encode(prompt))
    print(f"\n{'='*60}")
    print(f"{name} ({n_tokens} tokens)")
    print(f"{'='*60}")

    clip_seq = encode_standard_clip(prompt)
    mem_seq = encode_seq77(prompt)

    # Log the overall cosine similarity between the two sequences
    cos = F.cosine_similarity(
        clip_seq.float().mean(1), mem_seq.float().mean(1)).item()
    print(f"  CLIP ↔ Seq77 mean cosine: {cos:.4f}")

    images = []
    for alpha in alphas:
        label = f"α={alpha:.2f}"
        print(f"  {label}...", end=" ", flush=True)

        blended = clip_seq.float() + alpha * (mem_seq.float() - clip_seq.float())
        blended = blended.to(DTYPE)

        img = generate(blended, neg, steps=50, seed=42)
        img.save(f"outputs/blend_{name}_a{alpha:.2f}.png")
        images.append((label, img))
        print("done")

    combined = Image.new("RGB", (512 * len(images), 512))
    for i, (label, img) in enumerate(images):
        combined.paste(img, (512 * i, 0))
    combined.save(f"outputs/blend_{name}_combined.png")
    print(f"  Saved: outputs/blend_{name}_combined.png")
    print(f"  {' | '.join(l for l, _ in images)}")

print("\nDONE")