Spaces:

AbstractPhil
/

lyra-xl-playground

Running on Zero

App Files Files Community

AbstractPhil committed 20 days ago

Commit

0c67338

verified ·

1 Parent(s): edab745

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -10

app.py CHANGED Viewed

@@ -258,7 +258,8 @@ class SDXLFlowMatchingPipeline:
         self,
         prompt: str,
         negative_prompt: str = "",
-        clip_skip: int = 1
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Encode prompts using Lyra VAE v2 fusion (CLIP + T5)."""
         if self.lyra_model is None or self.t5_encoder is None:
@@ -269,9 +270,18 @@ class SDXLFlowMatchingPipeline:
             prompt, negative_prompt, clip_skip
         )
         # Get T5 embeddings
         t5_inputs = self.t5_tokenizer(
-            prompt,
             max_length=512,  # T5-XL uses 512
             padding='max_length',
             truncation=True,
@@ -312,8 +322,11 @@ class SDXLFlowMatchingPipeline:
         # Process negative prompt similarly if present
         if negative_prompt:
             t5_inputs_neg = self.t5_tokenizer(
-                negative_prompt,
                 max_length=512,
                 padding='max_length',
                 truncation=True,
@@ -374,6 +387,7 @@ class SDXLFlowMatchingPipeline:
         seed: Optional[int] = None,
         use_lyra: bool = False,
         clip_skip: int = 1,
         progress_callback=None
     ):
         """Generate image using SDXL architecture."""
@@ -387,7 +401,7 @@ class SDXLFlowMatchingPipeline:
         # Encode prompts
         if use_lyra and self.lyra_model is not None:
             prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt_lyra(
-                prompt, negative_prompt, clip_skip
             )
         else:
             prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt(
@@ -1204,11 +1218,12 @@ def estimate_duration(num_steps: int, width: int, height: int, use_lyra: bool =
 @spaces.GPU(duration=lambda *args: estimate_duration(
-    args[4], args[6], args[7], args[10],
-    "SDXL" in args[2] or "Illustrious" in args[2]
 ))
 def generate_image(
     prompt: str,
     negative_prompt: str,
     model_choice: str,
     clip_skip: int,
@@ -1297,6 +1312,7 @@ def generate_image(
                 seed=seed,
                 use_lyra=True,
                 clip_skip=clip_skip,
                 progress_callback=lambda s, t, d: progress(0.5 + (s/t) * 0.45, desc=d)
             )
@@ -1330,17 +1346,28 @@ def create_demo():
         | **Flow-Lune** | SD1.5 | v1 (T5-base) | Fast flow matching (15-25 steps) |
         | **SD1.5 Base** | SD1.5 | v1 (T5-base) | Baseline comparison |
-        Enable **Lyra VAE** for CLIP+T5 fusion comparison!
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 prompt = gr.TextArea(
-                    label="Prompt",
                     value="masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
                     lines=3
                 )
                 negative_prompt = gr.TextArea(
                     label="Negative Prompt",
                     value="lowres, bad anatomy, bad hands, text, error, cropped, worst quality, low quality",
@@ -1470,25 +1497,28 @@ def create_demo():
             examples=[
                 [
                     "masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
                     "lowres, bad anatomy, worst quality, low quality",
                     "Illustrious XL",
                     2, 25, 7.0, 1024, 1024, 0.0, False, True, 42, False
                 ],
                 [
                     "A majestic mountain landscape at golden hour, crystal clear lake, photorealistic, 8k",
                     "blurry, low quality",
                     "SDXL Base",
                     1, 30, 7.5, 1024, 1024, 0.0, False, True, 123, False
                 ],
                 [
                     "cyberpunk city at night, neon lights, rain, highly detailed",
                     "low quality, blurry",
                     "Flow-Lune (SD1.5)",
                     1, 20, 7.5, 512, 512, 2.5, True, True, 456, False
                 ],
             ],
             inputs=[
-                prompt, negative_prompt, model_choice, clip_skip,
                 num_steps, cfg_scale, width, height, shift,
                 use_flow_matching, use_lyra, seed, randomize_seed
             ],
@@ -1565,7 +1595,7 @@ def create_demo():
         generate_btn.click(
             fn=generate_image,
             inputs=[
-                prompt, negative_prompt, model_choice, clip_skip,
                 num_steps, cfg_scale, width, height, shift,
                 use_flow_matching, use_lyra, seed, randomize_seed
             ],

         self,
         prompt: str,
         negative_prompt: str = "",
+        clip_skip: int = 1,
+        t5_summary: str = ""
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Encode prompts using Lyra VAE v2 fusion (CLIP + T5)."""
         if self.lyra_model is None or self.t5_encoder is None:
             prompt, negative_prompt, clip_skip
         )
+        # Format T5 input with pilcrow separator (¶)
+        # Training format was: "tags ¶ summary"
+        SUMMARY_SEPARATOR = "¶"
+        if t5_summary.strip():
+            t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {t5_summary}"
+        else:
+            # Fallback: duplicate prompt if no summary provided
+            t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {prompt}"
         # Get T5 embeddings
         t5_inputs = self.t5_tokenizer(
+            t5_prompt,
             max_length=512,  # T5-XL uses 512
             padding='max_length',
             truncation=True,
         # Process negative prompt similarly if present
         if negative_prompt:
+            # Negatives have no summary: repeat the negative prompt on both sides of the ¶ separator
+            t5_neg_prompt = f"{negative_prompt} {SUMMARY_SEPARATOR} {negative_prompt}"
             t5_inputs_neg = self.t5_tokenizer(
+                t5_neg_prompt,
                 max_length=512,
                 padding='max_length',
                 truncation=True,
         seed: Optional[int] = None,
         use_lyra: bool = False,
         clip_skip: int = 1,
+        t5_summary: str = "",
         progress_callback=None
     ):
         """Generate image using SDXL architecture."""
         # Encode prompts
         if use_lyra and self.lyra_model is not None:
             prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt_lyra(
+                prompt, negative_prompt, clip_skip, t5_summary
             )
         else:
             prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt(
 @spaces.GPU(duration=lambda *args: estimate_duration(
+    args[5], args[7], args[8], args[11],
+    "SDXL" in args[3] or "Illustrious" in args[3]
 ))
 def generate_image(
     prompt: str,
+    t5_summary: str,
     negative_prompt: str,
     model_choice: str,
     clip_skip: int,
                 seed=seed,
                 use_lyra=True,
                 clip_skip=clip_skip,
+                t5_summary=t5_summary,
                 progress_callback=lambda s, t, d: progress(0.5 + (s/t) * 0.45, desc=d)
             )
         | **Flow-Lune** | SD1.5 | v1 (T5-base) | Fast flow matching (15-25 steps) |
         | **SD1.5 Base** | SD1.5 | v1 (T5-base) | Baseline comparison |
+        **Lyra VAE** fuses CLIP + T5 embeddings using:
+        - **Prompt (Tags)**: Booru-style tags for CLIP encoding
+        - **T5 Summary**: Natural language description for T5 (format: `tags ¶ summary`)
+        Enable **Lyra VAE** for side-by-side comparison!
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 prompt = gr.TextArea(
+                    label="Prompt (Tags for CLIP)",
                     value="masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
                     lines=3
                 )
+                t5_summary = gr.TextArea(
+                    label="T5 Summary (Natural Language for Lyra)",
+                    value="A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms against a bright sky",
+                    lines=2,
+                    info="Used after ¶ separator for T5. Leave empty to use tags only."
+                )
                 negative_prompt = gr.TextArea(
                     label="Negative Prompt",
                     value="lowres, bad anatomy, bad hands, text, error, cropped, worst quality, low quality",
             examples=[
                 [
                     "masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
+                    "A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms against a bright sky",
                     "lowres, bad anatomy, worst quality, low quality",
                     "Illustrious XL",
                     2, 25, 7.0, 1024, 1024, 0.0, False, True, 42, False
                 ],
                 [
                     "A majestic mountain landscape at golden hour, crystal clear lake, photorealistic, 8k",
+                    "A breathtaking mountain vista bathed in warm golden light at sunset, with a perfectly still crystal clear lake reflecting the peaks",
                     "blurry, low quality",
                     "SDXL Base",
                     1, 30, 7.5, 1024, 1024, 0.0, False, True, 123, False
                 ],
                 [
                     "cyberpunk city at night, neon lights, rain, highly detailed",
+                    "A futuristic cyberpunk metropolis at night with vibrant neon lights reflecting off rain-slicked streets",
                     "low quality, blurry",
                     "Flow-Lune (SD1.5)",
                     1, 20, 7.5, 512, 512, 2.5, True, True, 456, False
                 ],
             ],
             inputs=[
+                prompt, t5_summary, negative_prompt, model_choice, clip_skip,
                 num_steps, cfg_scale, width, height, shift,
                 use_flow_matching, use_lyra, seed, randomize_seed
             ],
         generate_btn.click(
             fn=generate_image,
             inputs=[
+                prompt, t5_summary, negative_prompt, model_choice, clip_skip,
                 num_steps, cfg_scale, width, height, shift,
                 use_flow_matching, use_lyra, seed, randomize_seed
             ],