AbstractPhil committed
Commit 93038cf · verified · 1 Parent(s): 0c67338

Update app.py

Files changed (1): app.py +124 −46
app.py CHANGED
@@ -259,9 +259,17 @@ class SDXLFlowMatchingPipeline:
         prompt: str,
         negative_prompt: str = "",
         clip_skip: int = 1,
-        t5_summary: str = ""
+        t5_summary: str = "",
+        lyra_strength: float = 0.3
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Encode prompts using Lyra VAE v2 fusion (CLIP + T5)."""
+        """Encode prompts using Lyra VAE v2 fusion (CLIP + T5).
+
+        Uses cross-modal translation: encode T5 → decode to CLIP space,
+        then blend with original CLIP embeddings.
+
+        Args:
+            lyra_strength: Blend factor (0.0 = pure CLIP, 1.0 = pure Lyra reconstruction)
+        """
         if self.lyra_model is None or self.t5_encoder is None:
             raise ValueError("Lyra VAE components not initialized")

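The `lyra_strength` parameter added here is a plain linear interpolation between the original CLIP embeddings and Lyra's reconstruction of them. A minimal sketch of that blend, assuming SDXL-shaped sequence embeddings (the `blend_embeddings` helper is illustrative, not part of app.py):

```python
import torch

def blend_embeddings(clip: torch.Tensor, lyra: torch.Tensor, strength: float) -> torch.Tensor:
    """strength=0.0 returns CLIP unchanged; strength=1.0 returns the Lyra reconstruction."""
    return (1.0 - strength) * clip + strength * lyra

clip_l = torch.randn(1, 77, 768)  # original CLIP-L sequence embeddings
lyra_l = torch.randn(1, 77, 768)  # Lyra's reconstruction of the same sequence
fused = blend_embeddings(clip_l, lyra_l, strength=0.3)  # 0.3 is the new default
assert fused.shape == clip_l.shape
```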
@@ -271,18 +279,16 @@ class SDXLFlowMatchingPipeline:
         )

         # Format T5 input with pilcrow separator (¶)
-        # Training format was: "tags ¶ summary"
         SUMMARY_SEPARATOR = "¶"
         if t5_summary.strip():
             t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {t5_summary}"
         else:
-            # Fallback: duplicate prompt if no summary provided
             t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {prompt}"

         # Get T5 embeddings
         t5_inputs = self.t5_tokenizer(
             t5_prompt,
-            max_length=512,  # T5-XL uses 512
+            max_length=512,
             padding='max_length',
             truncation=True,
             return_tensors='pt'
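The pilcrow convention is pure string formatting: the T5 branch always receives `tags ¶ summary`, duplicating the prompt when no summary is supplied. The same logic as a free-standing sketch (`format_t5_prompt` is our name for it; app.py inlines this):

```python
SUMMARY_SEPARATOR = "¶"  # pilcrow, matching the training format "tags ¶ summary"

def format_t5_prompt(prompt: str, t5_summary: str = "") -> str:
    """Build the T5 input string; fall back to duplicating the prompt."""
    summary = t5_summary.strip() or prompt
    return f"{prompt} {SUMMARY_SEPARATOR} {summary}"

print(format_t5_prompt("1girl, blue hair", "A girl with blue hair."))
# -> "1girl, blue hair ¶ A girl with blue hair."
```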
@@ -291,40 +297,88 @@ class SDXLFlowMatchingPipeline:
         with torch.no_grad():
             t5_embeds = self.t5_encoder(**t5_inputs).last_hidden_state

-        # For SDXL, split the concatenated CLIP-L + CLIP-G embeddings
         clip_l_dim = 768
         clip_g_dim = 1280

         clip_l_embeds = prompt_embeds[..., :clip_l_dim]
         clip_g_embeds = prompt_embeds[..., clip_l_dim:]

-        # Lyra v2 expects these exact keys from training config:
-        # clip_l, clip_g, t5_xl_l, t5_xl_g
-        # Upcast inputs to float32 for Lyra (model is fp32 for stability)
-        modality_inputs = {
-            'clip_l': clip_l_embeds.float(),
-            'clip_g': clip_g_embeds.float(),
-            't5_xl_l': t5_embeds.float(),
-            't5_xl_g': t5_embeds.float()  # Same T5 embedding for both bindings
-        }
+        # Debug: print input stats
+        print(f"[Lyra Debug] CLIP-L input: shape={clip_l_embeds.shape}, mean={clip_l_embeds.mean():.4f}, std={clip_l_embeds.std():.4f}")
+        print(f"[Lyra Debug] CLIP-G input: shape={clip_g_embeds.shape}, mean={clip_g_embeds.mean():.4f}, std={clip_g_embeds.std():.4f}")
+        print(f"[Lyra Debug] T5 input: shape={t5_embeds.shape}, mean={t5_embeds.mean():.4f}, std={t5_embeds.std():.4f}")

         with torch.no_grad():
-            reconstructions, mu, logvar, _ = self.lyra_model(
-                modality_inputs,
-                target_modalities=['clip_l', 'clip_g']
-            )
-            # Cast outputs back to original dtype (float16)
-            fused_clip_l = reconstructions['clip_l'].to(prompt_embeds.dtype)
-            fused_clip_g = reconstructions['clip_g'].to(prompt_embeds.dtype)
+            # Try approach 1: Cross-modal - encode T5 only, decode to CLIP
+            # This uses T5's semantic understanding to generate CLIP-compatible embeddings
+            t5_only_inputs = {
+                't5_xl_l': t5_embeds.float(),
+                't5_xl_g': t5_embeds.float()
+            }
+
+            # Check if model has separate encode/decode methods
+            if hasattr(self.lyra_model, 'encode') and hasattr(self.lyra_model, 'decode'):
+                print("[Lyra Debug] Using separate encode/decode path")
+                # Encode T5 to latent space
+                mu, logvar = self.lyra_model.encode(t5_only_inputs)
+                z = mu  # Use mean for deterministic output
+                print(f"[Lyra Debug] Latent z: shape={z.shape}, mean={z.mean():.4f}, std={z.std():.4f}")
+
+                # Decode to CLIP space
+                reconstructions = self.lyra_model.decode(z, target_modalities=['clip_l', 'clip_g'])
+            else:
+                print("[Lyra Debug] Using forward pass with all modalities")
+                # Fall back to full forward pass with all modalities
+                modality_inputs = {
+                    'clip_l': clip_l_embeds.float(),
+                    'clip_g': clip_g_embeds.float(),
+                    't5_xl_l': t5_embeds.float(),
+                    't5_xl_g': t5_embeds.float()
+                }
+                reconstructions, mu, logvar, _ = self.lyra_model(
+                    modality_inputs,
+                    target_modalities=['clip_l', 'clip_g']
+                )
+                print(f"[Lyra Debug] Latent mu: shape={mu.shape}, mean={mu.mean():.4f}, std={mu.std():.4f}")
+
+            lyra_clip_l = reconstructions['clip_l'].to(prompt_embeds.dtype)
+            lyra_clip_g = reconstructions['clip_g'].to(prompt_embeds.dtype)
+
+            print(f"[Lyra Debug] Lyra CLIP-L output: mean={lyra_clip_l.mean():.4f}, std={lyra_clip_l.std():.4f}")
+            print(f"[Lyra Debug] Lyra CLIP-G output: mean={lyra_clip_g.mean():.4f}, std={lyra_clip_g.std():.4f}")
+
+            # Check if reconstruction stats are wildly different from input
+            # If so, we may need to normalize
+            clip_l_std_ratio = lyra_clip_l.std() / (clip_l_embeds.std() + 1e-8)
+            clip_g_std_ratio = lyra_clip_g.std() / (clip_g_embeds.std() + 1e-8)
+            print(f"[Lyra Debug] Std ratio CLIP-L: {clip_l_std_ratio:.4f}, CLIP-G: {clip_g_std_ratio:.4f}")
+
+            # Normalize reconstructions to match input statistics if needed
+            if clip_l_std_ratio > 2.0 or clip_l_std_ratio < 0.5:
+                print("[Lyra Debug] Normalizing CLIP-L reconstruction to match input stats")
+                lyra_clip_l = (lyra_clip_l - lyra_clip_l.mean()) / (lyra_clip_l.std() + 1e-8)
+                lyra_clip_l = lyra_clip_l * clip_l_embeds.std() + clip_l_embeds.mean()
+
+            if clip_g_std_ratio > 2.0 or clip_g_std_ratio < 0.5:
+                print("[Lyra Debug] Normalizing CLIP-G reconstruction to match input stats")
+                lyra_clip_g = (lyra_clip_g - lyra_clip_g.mean()) / (lyra_clip_g.std() + 1e-8)
+                lyra_clip_g = lyra_clip_g * clip_g_embeds.std() + clip_g_embeds.mean()
+
+            # Blend original CLIP with Lyra reconstruction
+            fused_clip_l = (1 - lyra_strength) * clip_l_embeds + lyra_strength * lyra_clip_l
+            fused_clip_g = (1 - lyra_strength) * clip_g_embeds + lyra_strength * lyra_clip_g
+
+            print(f"[Lyra Debug] Final fused CLIP-L: mean={fused_clip_l.mean():.4f}, std={fused_clip_l.std():.4f}")
+            print(f"[Lyra Debug] lyra_strength={lyra_strength}")

-        # Recombine fused CLIP-L and CLIP-G
         prompt_embeds_fused = torch.cat([fused_clip_l, fused_clip_g], dim=-1)

-        # Process negative prompt similarly if present
+        # Process negative prompt (simpler - just use original CLIP for negative)
         if negative_prompt:
-            # For negative, just use the negative prompt without summary
-            t5_neg_prompt = f"{negative_prompt} {SUMMARY_SEPARATOR} {negative_prompt}"
+            # For negative, blend less aggressively
+            neg_strength = lyra_strength * 0.5

+            t5_neg_prompt = f"{negative_prompt} {SUMMARY_SEPARATOR} {negative_prompt}"
             t5_inputs_neg = self.t5_tokenizer(
                 t5_neg_prompt,
                 max_length=512,
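The normalization step in this hunk is a mean/std re-standardization: when the reconstruction's standard deviation drifts outside a 0.5×–2.0× band of the input's, it is rescaled to the input's statistics before blending. Pulled out as a standalone helper for clarity (a sketch under our own names; app.py inlines this logic per modality):

```python
import torch

def match_stats(recon: torch.Tensor, ref: torch.Tensor,
                lo: float = 0.5, hi: float = 2.0, eps: float = 1e-8) -> torch.Tensor:
    """Rescale `recon` to ref's mean/std when their std ratio leaves [lo, hi]."""
    ratio = float(recon.std() / (ref.std() + eps))
    if lo <= ratio <= hi:
        return recon  # statistics already close; leave untouched
    standardized = (recon - recon.mean()) / (recon.std() + eps)
    return standardized * ref.std() + ref.mean()

ref = torch.randn(1, 77, 768)
recon = torch.randn(1, 77, 768) * 5.0  # std roughly 5x the reference
print(f"{recon.std():.2f} -> {match_stats(recon, ref).std():.2f}")  # pulled back toward ref's ~1.0
```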
@@ -339,22 +393,34 @@ class SDXLFlowMatchingPipeline:
             neg_clip_l = negative_prompt_embeds[..., :clip_l_dim]
             neg_clip_g = negative_prompt_embeds[..., clip_l_dim:]

-            modality_inputs_neg = {
-                'clip_l': neg_clip_l.float(),
-                'clip_g': neg_clip_g.float(),
-                't5_xl_l': t5_embeds_neg.float(),
-                't5_xl_g': t5_embeds_neg.float()
-            }
+            if hasattr(self.lyra_model, 'encode') and hasattr(self.lyra_model, 'decode'):
+                t5_neg_inputs = {'t5_xl_l': t5_embeds_neg.float(), 't5_xl_g': t5_embeds_neg.float()}
+                mu_neg, _ = self.lyra_model.encode(t5_neg_inputs)
+                recon_neg = self.lyra_model.decode(mu_neg, target_modalities=['clip_l', 'clip_g'])
+            else:
+                modality_inputs_neg = {
+                    'clip_l': neg_clip_l.float(),
+                    'clip_g': neg_clip_g.float(),
+                    't5_xl_l': t5_embeds_neg.float(),
+                    't5_xl_g': t5_embeds_neg.float()
+                }
+                recon_neg, _, _, _ = self.lyra_model(modality_inputs_neg, target_modalities=['clip_l', 'clip_g'])

-            with torch.no_grad():
-                reconstructions_neg, _, _, _ = self.lyra_model(
-                    modality_inputs_neg,
-                    target_modalities=['clip_l', 'clip_g']
-                )
-                fused_neg_clip_l = reconstructions_neg['clip_l'].to(negative_prompt_embeds.dtype)
-                fused_neg_clip_g = reconstructions_neg['clip_g'].to(negative_prompt_embeds.dtype)
+            lyra_neg_l = recon_neg['clip_l'].to(negative_prompt_embeds.dtype)
+            lyra_neg_g = recon_neg['clip_g'].to(negative_prompt_embeds.dtype)
+
+            # Normalize if needed
+            if lyra_neg_l.std() / (neg_clip_l.std() + 1e-8) > 2.0:
+                lyra_neg_l = (lyra_neg_l - lyra_neg_l.mean()) / (lyra_neg_l.std() + 1e-8)
+                lyra_neg_l = lyra_neg_l * neg_clip_l.std() + neg_clip_l.mean()
+            if lyra_neg_g.std() / (neg_clip_g.std() + 1e-8) > 2.0:
+                lyra_neg_g = (lyra_neg_g - lyra_neg_g.mean()) / (lyra_neg_g.std() + 1e-8)
+                lyra_neg_g = lyra_neg_g * neg_clip_g.std() + neg_clip_g.mean()

-            negative_prompt_embeds_fused = torch.cat([fused_neg_clip_l, fused_neg_clip_g], dim=-1)
+            fused_neg_l = (1 - neg_strength) * neg_clip_l + neg_strength * lyra_neg_l
+            fused_neg_g = (1 - neg_strength) * neg_clip_g + neg_strength * lyra_neg_g
+
+            negative_prompt_embeds_fused = torch.cat([fused_neg_l, fused_neg_g], dim=-1)
         else:
             negative_prompt_embeds_fused = torch.zeros_like(prompt_embeds_fused)

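Both the positive and negative branches probe the Lyra model with `hasattr` before choosing a path: a dedicated `encode`/`decode` pair enables pure cross-modal translation (T5 in, CLIP out, taking the latent mean for determinism), while checkpoints without those methods fall back to the full four-modality forward pass. Condensed into one function, assuming the Lyra interface shown in this diff (the helper itself is hypothetical):

```python
import torch

def lyra_reconstruct(model, t5_embeds, clip_l, clip_g):
    """Prefer cross-modal encode->decode; otherwise run the full forward pass."""
    t5_inputs = {'t5_xl_l': t5_embeds.float(), 't5_xl_g': t5_embeds.float()}
    with torch.no_grad():
        if hasattr(model, 'encode') and hasattr(model, 'decode'):
            mu, _logvar = model.encode(t5_inputs)  # latent mean -> deterministic output
            return model.decode(mu, target_modalities=['clip_l', 'clip_g'])
        # Fallback: feed every modality, as the else branches above do
        all_inputs = {**t5_inputs, 'clip_l': clip_l.float(), 'clip_g': clip_g.float()}
        recon, _mu, _logvar, _ = model(all_inputs, target_modalities=['clip_l', 'clip_g'])
        return recon
```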
@@ -388,6 +454,7 @@ class SDXLFlowMatchingPipeline:
         use_lyra: bool = False,
         clip_skip: int = 1,
         t5_summary: str = "",
+        lyra_strength: float = 0.3,
         progress_callback=None
     ):
         """Generate image using SDXL architecture."""
@@ -401,7 +468,7 @@ class SDXLFlowMatchingPipeline:
         # Encode prompts
         if use_lyra and self.lyra_model is not None:
             prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt_lyra(
-                prompt, negative_prompt, clip_skip, t5_summary
+                prompt, negative_prompt, clip_skip, t5_summary, lyra_strength
             )
         else:
             prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt(
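With the default of 0.3 the new argument is backward compatible: existing callers that omit it keep working. A hypothetical direct call against the updated signature (`pipe` stands in for an initialized `SDXLFlowMatchingPipeline` with `lyra_model` and `t5_encoder` loaded):

```python
embeds, neg_embeds, pooled, neg_pooled = pipe.encode_prompt_lyra(
    prompt="cyberpunk city at night, neon lights, rain",
    negative_prompt="low quality, blurry",
    clip_skip=1,
    t5_summary="A neon-lit metropolis at night in the rain",
    lyra_strength=0.3,
)
```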
@@ -1234,6 +1301,7 @@ def generate_image(
     shift: float,
     use_flow_matching: bool,
     use_lyra: bool,
+    lyra_strength: float,
     seed: int,
     randomize_seed: bool,
     progress=gr.Progress()
@@ -1313,6 +1381,7 @@ def generate_image(
             use_lyra=True,
             clip_skip=clip_skip,
             t5_summary=t5_summary,
+            lyra_strength=lyra_strength,
             progress_callback=lambda s, t, d: progress(0.5 + (s/t) * 0.45, desc=d)
         )

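The `progress_callback` lambda maps denoising step `s` of `t` onto the back portion of the Gradio progress bar, reserving the first half for the earlier loading and encoding stages and [0.5, 0.95] for sampling. The same mapping in isolation:

```python
def scaled_progress(step: int, total: int, start: float = 0.5, span: float = 0.45) -> float:
    """Map step/total onto the [start, start + span] slice of the overall bar."""
    return start + (step / total) * span

assert scaled_progress(0, 20) == 0.5                # first denoising step
assert abs(scaled_progress(20, 20) - 0.95) < 1e-9   # last step stops at 0.95
```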
@@ -1400,6 +1469,15 @@ def create_demo():
                 info="Compare standard vs geometric fusion"
             )

+            lyra_strength = gr.Slider(
+                label="Lyra Blend Strength",
+                minimum=0.0,
+                maximum=1.0,
+                value=0.3,
+                step=0.05,
+                info="0.0 = pure CLIP, 1.0 = pure Lyra reconstruction"
+            )
+
             with gr.Accordion("Generation Settings", open=True):
                 num_steps = gr.Slider(
                     label="Steps",
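The new slider follows the usual Gradio pattern: its value is forwarded positionally, so it must also be appended to every `inputs=` list that feeds `generate_image` (the hunks below do exactly that). A self-contained sketch of the same component wiring (`demo` and `echo` are illustrative, not from app.py):

```python
import gradio as gr

def echo(strength: float) -> str:
    return f"lyra_strength={strength}"

with gr.Blocks() as demo:
    lyra_strength = gr.Slider(
        label="Lyra Blend Strength",
        minimum=0.0, maximum=1.0, value=0.3, step=0.05,
        info="0.0 = pure CLIP, 1.0 = pure Lyra reconstruction",
    )
    out = gr.Textbox()
    lyra_strength.change(echo, inputs=[lyra_strength], outputs=[out])
```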
@@ -1500,27 +1578,27 @@ def create_demo():
                 "A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms against a bright sky",
                 "lowres, bad anatomy, worst quality, low quality",
                 "Illustrious XL",
-                2, 25, 7.0, 1024, 1024, 0.0, False, True, 42, False
+                2, 25, 7.0, 1024, 1024, 0.0, False, True, 0.3, 42, False
             ],
             [
                 "A majestic mountain landscape at golden hour, crystal clear lake, photorealistic, 8k",
                 "A breathtaking mountain vista bathed in warm golden light at sunset, with a perfectly still crystal clear lake reflecting the peaks",
                 "blurry, low quality",
                 "SDXL Base",
-                1, 30, 7.5, 1024, 1024, 0.0, False, True, 123, False
+                1, 30, 7.5, 1024, 1024, 0.0, False, True, 0.3, 123, False
             ],
             [
                 "cyberpunk city at night, neon lights, rain, highly detailed",
                 "A futuristic cyberpunk metropolis at night with vibrant neon lights reflecting off rain-slicked streets",
                 "low quality, blurry",
                 "Flow-Lune (SD1.5)",
-                1, 20, 7.5, 512, 512, 2.5, True, True, 456, False
+                1, 20, 7.5, 512, 512, 2.5, True, True, 0.3, 456, False
             ],
         ],
         inputs=[
             prompt, t5_summary, negative_prompt, model_choice, clip_skip,
             num_steps, cfg_scale, width, height, shift,
-            use_flow_matching, use_lyra, seed, randomize_seed
+            use_flow_matching, use_lyra, lyra_strength, seed, randomize_seed
         ],
         outputs=[output_image_standard, output_image_lyra, output_seed],
         fn=generate_image,
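Note that `gr.Examples` binds row values to `inputs=` strictly by position, which is why every example row above gains a `0.3` between the `use_lyra` flag and the seed: row length must track the inputs list exactly. The tail of the first row now lines up as:

```python
# shift, use_flow_matching, use_lyra, lyra_strength, seed, randomize_seed
tail = [0.0, False, True, 0.3, 42, False]  # values from the first example row
```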
@@ -1597,7 +1675,7 @@ def create_demo():
         inputs=[
             prompt, t5_summary, negative_prompt, model_choice, clip_skip,
             num_steps, cfg_scale, width, height, shift,
-            use_flow_matching, use_lyra, seed, randomize_seed
+            use_flow_matching, use_lyra, lyra_strength, seed, randomize_seed
         ],
         outputs=[output_image_standard, output_image_lyra, output_seed]
     )
 