Spaces: Running on Zero

Update app.py
app.py CHANGED
```diff
@@ -288,16 +288,14 @@ class SDXLFlowMatchingPipeline:
         clip_l_embeds = prompt_embeds[..., :clip_l_dim]
         clip_g_embeds = prompt_embeds[..., clip_l_dim:]
 
-        # Get Lyra model dtype and cast inputs to match
-        lyra_dtype = next(self.lyra_model.parameters()).dtype
-
         # Lyra v2 expects these exact keys from training config:
         # clip_l, clip_g, t5_xl_l, t5_xl_g
+        # Upcast inputs to float32 for Lyra (model is fp32 for stability)
         modality_inputs = {
-            'clip_l': clip_l_embeds.to(lyra_dtype),
-            'clip_g': clip_g_embeds.to(lyra_dtype),
-            't5_xl_l': t5_embeds.to(lyra_dtype),
-            't5_xl_g': t5_embeds.to(lyra_dtype),
+            'clip_l': clip_l_embeds.float(),
+            'clip_g': clip_g_embeds.float(),
+            't5_xl_l': t5_embeds.float(),
+            't5_xl_g': t5_embeds.float()  # Same T5 embedding for both bindings
         }
 
         with torch.no_grad():
@@ -305,6 +303,7 @@ class SDXLFlowMatchingPipeline:
             modality_inputs,
             target_modalities=['clip_l', 'clip_g']
         )
+        # Cast outputs back to original dtype (float16)
         fused_clip_l = reconstructions['clip_l'].to(prompt_embeds.dtype)
         fused_clip_g = reconstructions['clip_g'].to(prompt_embeds.dtype)
 
@@ -328,10 +327,10 @@ class SDXLFlowMatchingPipeline:
             neg_clip_g = negative_prompt_embeds[..., clip_l_dim:]
 
             modality_inputs_neg = {
-                'clip_l': neg_clip_l.to(lyra_dtype),
-                'clip_g': neg_clip_g.to(lyra_dtype),
-                't5_xl_l': t5_embeds_neg.to(lyra_dtype),
-                't5_xl_g': t5_embeds_neg.to(lyra_dtype),
+                'clip_l': neg_clip_l.float(),
+                'clip_g': neg_clip_g.float(),
+                't5_xl_l': t5_embeds_neg.float(),
+                't5_xl_g': t5_embeds_neg.float()
            }
 
             with torch.no_grad():
@@ -339,8 +338,8 @@ class SDXLFlowMatchingPipeline:
                 modality_inputs_neg,
                 target_modalities=['clip_l', 'clip_g']
             )
-            fused_neg_clip_l = reconstructions_neg['clip_l'].to(prompt_embeds.dtype)
-            fused_neg_clip_g = reconstructions_neg['clip_g'].to(prompt_embeds.dtype)
+            fused_neg_clip_l = reconstructions_neg['clip_l'].to(negative_prompt_embeds.dtype)
+            fused_neg_clip_g = reconstructions_neg['clip_g'].to(negative_prompt_embeds.dtype)
 
             negative_prompt_embeds_fused = torch.cat([fused_neg_clip_l, fused_neg_clip_g], dim=-1)
         else:
@@ -1029,8 +1028,8 @@ def load_lyra_vae_xl(
     else:
         lyra_model.load_state_dict(checkpoint)
 
-    #
-    lyra_model.to(device
+    # Keep Lyra in float32 for stability - inputs will be upcast
+    lyra_model.to(device)
    lyra_model.eval()
 
     print(f"✅ Lyra VAE v2 loaded")
```
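What the first four hunks change: the SDXL pipeline runs in float16, but the Lyra model is now kept in float32 for numerical stability, so the embeddings are upcast at the call boundary and the reconstructions are cast back to the pipeline dtype on the way out. Below is a minimal, self-contained sketch of that boundary pattern; the `TinyFusion` module, tensor shapes, and dict keys are illustrative stand-ins, not the real Lyra API.

```python
import torch
import torch.nn as nn

class TinyFusion(nn.Module):
    """Stand-in for the fp32 Lyra model: one shared projection."""
    def __init__(self, dim: int = 768):
        super().__init__()
        self.proj = nn.Linear(dim, dim)  # parameters default to float32

    def forward(self, inputs: dict) -> dict:
        return {name: self.proj(x) for name, x in inputs.items()}

model = TinyFusion().eval()

# fp16 tensor, as it would come out of a half-precision text encoder
clip_l = torch.randn(1, 77, 768, dtype=torch.float16)

# Upcast at the boundary, run the fp32 module under no_grad,
# then cast the result back so the fp16 pipeline is unaffected.
with torch.no_grad():
    out = model({'clip_l': clip_l.float()})
fused_clip_l = out['clip_l'].to(clip_l.dtype)

assert fused_clip_l.dtype == torch.float16
```

The per-call `.float()` copies cost a little memory traffic, but they avoid casting the whole model to half precision, where small activations are prone to underflow; only the call site changes, and everything downstream still sees float16.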
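The last hunk applies the same policy at load time: `lyra_model.to(device)` moves the model with no dtype argument, so its parameters stay float32 rather than being matched to the pipeline's half precision. A sketch of a loader in that style, assuming a hypothetical `load_fusion_model` helper and a checkpoint that may or may not be wrapped in a `state_dict` key:

```python
import torch
import torch.nn as nn

def load_fusion_model(model: nn.Module, ckpt_path: str,
                      device: str = 'cuda') -> nn.Module:
    checkpoint = torch.load(ckpt_path, map_location='cpu')
    # Accept either a bare state dict or a {'state_dict': ...} wrapper
    state_dict = checkpoint.get('state_dict', checkpoint)
    model.load_state_dict(state_dict)
    # Device move only (no dtype argument, no .half()), so the
    # parameters stay float32; callers upcast fp16 inputs instead.
    model.to(device)
    model.eval()
    return model
```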