AbstractPhil committed on
Commit
0c67338
·
verified ·
1 Parent(s): edab745

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -10
app.py CHANGED
@@ -258,7 +258,8 @@ class SDXLFlowMatchingPipeline:
258
  self,
259
  prompt: str,
260
  negative_prompt: str = "",
261
- clip_skip: int = 1
 
262
  ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
263
  """Encode prompts using Lyra VAE v2 fusion (CLIP + T5)."""
264
  if self.lyra_model is None or self.t5_encoder is None:
@@ -269,9 +270,18 @@ class SDXLFlowMatchingPipeline:
269
  prompt, negative_prompt, clip_skip
270
  )
271
 
 
 
 
 
 
 
 
 
 
272
  # Get T5 embeddings
273
  t5_inputs = self.t5_tokenizer(
274
- prompt,
275
  max_length=512, # T5-XL uses 512
276
  padding='max_length',
277
  truncation=True,
@@ -312,8 +322,11 @@ class SDXLFlowMatchingPipeline:
312
 
313
  # Process negative prompt similarly if present
314
  if negative_prompt:
 
 
 
315
  t5_inputs_neg = self.t5_tokenizer(
316
- negative_prompt,
317
  max_length=512,
318
  padding='max_length',
319
  truncation=True,
@@ -374,6 +387,7 @@ class SDXLFlowMatchingPipeline:
374
  seed: Optional[int] = None,
375
  use_lyra: bool = False,
376
  clip_skip: int = 1,
 
377
  progress_callback=None
378
  ):
379
  """Generate image using SDXL architecture."""
@@ -387,7 +401,7 @@ class SDXLFlowMatchingPipeline:
387
  # Encode prompts
388
  if use_lyra and self.lyra_model is not None:
389
  prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt_lyra(
390
- prompt, negative_prompt, clip_skip
391
  )
392
  else:
393
  prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt(
@@ -1204,11 +1218,12 @@ def estimate_duration(num_steps: int, width: int, height: int, use_lyra: bool =
1204
 
1205
 
1206
  @spaces.GPU(duration=lambda *args: estimate_duration(
1207
- args[4], args[6], args[7], args[10],
1208
- "SDXL" in args[2] or "Illustrious" in args[2]
1209
  ))
1210
  def generate_image(
1211
  prompt: str,
 
1212
  negative_prompt: str,
1213
  model_choice: str,
1214
  clip_skip: int,
@@ -1297,6 +1312,7 @@ def generate_image(
1297
  seed=seed,
1298
  use_lyra=True,
1299
  clip_skip=clip_skip,
 
1300
  progress_callback=lambda s, t, d: progress(0.5 + (s/t) * 0.45, desc=d)
1301
  )
1302
 
@@ -1330,17 +1346,28 @@ def create_demo():
1330
  | **Flow-Lune** | SD1.5 | v1 (T5-base) | Fast flow matching (15-25 steps) |
1331
  | **SD1.5 Base** | SD1.5 | v1 (T5-base) | Baseline comparison |
1332
 
1333
- Enable **Lyra VAE** for CLIP+T5 fusion comparison!
 
 
 
 
1334
  """)
1335
 
1336
  with gr.Row():
1337
  with gr.Column(scale=1):
1338
  prompt = gr.TextArea(
1339
- label="Prompt",
1340
  value="masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
1341
  lines=3
1342
  )
1343
 
 
 
 
 
 
 
 
1344
  negative_prompt = gr.TextArea(
1345
  label="Negative Prompt",
1346
  value="lowres, bad anatomy, bad hands, text, error, cropped, worst quality, low quality",
@@ -1470,25 +1497,28 @@ def create_demo():
1470
  examples=[
1471
  [
1472
  "masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
 
1473
  "lowres, bad anatomy, worst quality, low quality",
1474
  "Illustrious XL",
1475
  2, 25, 7.0, 1024, 1024, 0.0, False, True, 42, False
1476
  ],
1477
  [
1478
  "A majestic mountain landscape at golden hour, crystal clear lake, photorealistic, 8k",
 
1479
  "blurry, low quality",
1480
  "SDXL Base",
1481
  1, 30, 7.5, 1024, 1024, 0.0, False, True, 123, False
1482
  ],
1483
  [
1484
  "cyberpunk city at night, neon lights, rain, highly detailed",
 
1485
  "low quality, blurry",
1486
  "Flow-Lune (SD1.5)",
1487
  1, 20, 7.5, 512, 512, 2.5, True, True, 456, False
1488
  ],
1489
  ],
1490
  inputs=[
1491
- prompt, negative_prompt, model_choice, clip_skip,
1492
  num_steps, cfg_scale, width, height, shift,
1493
  use_flow_matching, use_lyra, seed, randomize_seed
1494
  ],
@@ -1565,7 +1595,7 @@ def create_demo():
1565
  generate_btn.click(
1566
  fn=generate_image,
1567
  inputs=[
1568
- prompt, negative_prompt, model_choice, clip_skip,
1569
  num_steps, cfg_scale, width, height, shift,
1570
  use_flow_matching, use_lyra, seed, randomize_seed
1571
  ],
 
258
  self,
259
  prompt: str,
260
  negative_prompt: str = "",
261
+ clip_skip: int = 1,
262
+ t5_summary: str = ""
263
  ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
264
  """Encode prompts using Lyra VAE v2 fusion (CLIP + T5)."""
265
  if self.lyra_model is None or self.t5_encoder is None:
 
270
  prompt, negative_prompt, clip_skip
271
  )
272
 
273
+ # Format T5 input with pilcrow separator (¶)
274
+ # Training format was: "tags ¶ summary"
275
+ SUMMARY_SEPARATOR = "¶"
276
+ if t5_summary.strip():
277
+ t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {t5_summary}"
278
+ else:
279
+ # Fallback: duplicate prompt if no summary provided
280
+ t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {prompt}"
281
+
282
  # Get T5 embeddings
283
  t5_inputs = self.t5_tokenizer(
284
+ t5_prompt,
285
  max_length=512, # T5-XL uses 512
286
  padding='max_length',
287
  truncation=True,
 
322
 
323
  # Process negative prompt similarly if present
324
  if negative_prompt:
325
+ # The negative prompt has no separate summary: repeat it on both sides of the separator
326
+ t5_neg_prompt = f"{negative_prompt} {SUMMARY_SEPARATOR} {negative_prompt}"
327
+
328
  t5_inputs_neg = self.t5_tokenizer(
329
+ t5_neg_prompt,
330
  max_length=512,
331
  padding='max_length',
332
  truncation=True,
 
387
  seed: Optional[int] = None,
388
  use_lyra: bool = False,
389
  clip_skip: int = 1,
390
+ t5_summary: str = "",
391
  progress_callback=None
392
  ):
393
  """Generate image using SDXL architecture."""
 
401
  # Encode prompts
402
  if use_lyra and self.lyra_model is not None:
403
  prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt_lyra(
404
+ prompt, negative_prompt, clip_skip, t5_summary
405
  )
406
  else:
407
  prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt(
 
1218
 
1219
 
1220
  @spaces.GPU(duration=lambda *args: estimate_duration(
1221
+ args[5], args[7], args[8], args[11],
1222
+ "SDXL" in args[3] or "Illustrious" in args[3]
1223
  ))
1224
  def generate_image(
1225
  prompt: str,
1226
+ t5_summary: str,
1227
  negative_prompt: str,
1228
  model_choice: str,
1229
  clip_skip: int,
 
1312
  seed=seed,
1313
  use_lyra=True,
1314
  clip_skip=clip_skip,
1315
+ t5_summary=t5_summary,
1316
  progress_callback=lambda s, t, d: progress(0.5 + (s/t) * 0.45, desc=d)
1317
  )
1318
 
 
1346
  | **Flow-Lune** | SD1.5 | v1 (T5-base) | Fast flow matching (15-25 steps) |
1347
  | **SD1.5 Base** | SD1.5 | v1 (T5-base) | Baseline comparison |
1348
 
1349
+ **Lyra VAE** fuses CLIP + T5 embeddings using:
1350
+ - **Prompt (Tags)**: Booru-style tags for CLIP encoding
1351
+ - **T5 Summary**: Natural language description for T5 (format: `tags ¶ summary`)
1352
+
1353
+ Enable **Lyra VAE** for side-by-side comparison!
1354
  """)
1355
 
1356
  with gr.Row():
1357
  with gr.Column(scale=1):
1358
  prompt = gr.TextArea(
1359
+ label="Prompt (Tags for CLIP)",
1360
  value="masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
1361
  lines=3
1362
  )
1363
 
1364
+ t5_summary = gr.TextArea(
1365
+ label="T5 Summary (Natural Language for Lyra)",
1366
+ value="A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms against a bright sky",
1367
+ lines=2,
1368
+ info="Used after ¶ separator for T5. Leave empty to use tags only."
1369
+ )
1370
+
1371
  negative_prompt = gr.TextArea(
1372
  label="Negative Prompt",
1373
  value="lowres, bad anatomy, bad hands, text, error, cropped, worst quality, low quality",
 
1497
  examples=[
1498
  [
1499
  "masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
1500
+ "A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms against a bright sky",
1501
  "lowres, bad anatomy, worst quality, low quality",
1502
  "Illustrious XL",
1503
  2, 25, 7.0, 1024, 1024, 0.0, False, True, 42, False
1504
  ],
1505
  [
1506
  "A majestic mountain landscape at golden hour, crystal clear lake, photorealistic, 8k",
1507
+ "A breathtaking mountain vista bathed in warm golden light at sunset, with a perfectly still crystal clear lake reflecting the peaks",
1508
  "blurry, low quality",
1509
  "SDXL Base",
1510
  1, 30, 7.5, 1024, 1024, 0.0, False, True, 123, False
1511
  ],
1512
  [
1513
  "cyberpunk city at night, neon lights, rain, highly detailed",
1514
+ "A futuristic cyberpunk metropolis at night with vibrant neon lights reflecting off rain-slicked streets",
1515
  "low quality, blurry",
1516
  "Flow-Lune (SD1.5)",
1517
  1, 20, 7.5, 512, 512, 2.5, True, True, 456, False
1518
  ],
1519
  ],
1520
  inputs=[
1521
+ prompt, t5_summary, negative_prompt, model_choice, clip_skip,
1522
  num_steps, cfg_scale, width, height, shift,
1523
  use_flow_matching, use_lyra, seed, randomize_seed
1524
  ],
 
1595
  generate_btn.click(
1596
  fn=generate_image,
1597
  inputs=[
1598
+ prompt, t5_summary, negative_prompt, model_choice, clip_skip,
1599
  num_steps, cfg_scale, width, height, shift,
1600
  use_flow_matching, use_lyra, seed, randomize_seed
1601
  ],