Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -258,7 +258,8 @@ class SDXLFlowMatchingPipeline:
|
|
| 258 |
self,
|
| 259 |
prompt: str,
|
| 260 |
negative_prompt: str = "",
|
| 261 |
-
clip_skip: int = 1
|
|
|
|
| 262 |
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
| 263 |
"""Encode prompts using Lyra VAE v2 fusion (CLIP + T5)."""
|
| 264 |
if self.lyra_model is None or self.t5_encoder is None:
|
|
@@ -269,9 +270,18 @@ class SDXLFlowMatchingPipeline:
|
|
| 269 |
prompt, negative_prompt, clip_skip
|
| 270 |
)
|
| 271 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
# Get T5 embeddings
|
| 273 |
t5_inputs = self.t5_tokenizer(
|
| 274 |
-
|
| 275 |
max_length=512, # T5-XL uses 512
|
| 276 |
padding='max_length',
|
| 277 |
truncation=True,
|
|
@@ -312,8 +322,11 @@ class SDXLFlowMatchingPipeline:
|
|
| 312 |
|
| 313 |
# Process negative prompt similarly if present
|
| 314 |
if negative_prompt:
|
|
|
|
|
|
|
|
|
|
| 315 |
t5_inputs_neg = self.t5_tokenizer(
|
| 316 |
-
|
| 317 |
max_length=512,
|
| 318 |
padding='max_length',
|
| 319 |
truncation=True,
|
|
@@ -374,6 +387,7 @@ class SDXLFlowMatchingPipeline:
|
|
| 374 |
seed: Optional[int] = None,
|
| 375 |
use_lyra: bool = False,
|
| 376 |
clip_skip: int = 1,
|
|
|
|
| 377 |
progress_callback=None
|
| 378 |
):
|
| 379 |
"""Generate image using SDXL architecture."""
|
|
@@ -387,7 +401,7 @@ class SDXLFlowMatchingPipeline:
|
|
| 387 |
# Encode prompts
|
| 388 |
if use_lyra and self.lyra_model is not None:
|
| 389 |
prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt_lyra(
|
| 390 |
-
prompt, negative_prompt, clip_skip
|
| 391 |
)
|
| 392 |
else:
|
| 393 |
prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt(
|
|
@@ -1204,11 +1218,12 @@ def estimate_duration(num_steps: int, width: int, height: int, use_lyra: bool =
|
|
| 1204 |
|
| 1205 |
|
| 1206 |
@spaces.GPU(duration=lambda *args: estimate_duration(
|
| 1207 |
-
args[
|
| 1208 |
-
"SDXL" in args[
|
| 1209 |
))
|
| 1210 |
def generate_image(
|
| 1211 |
prompt: str,
|
|
|
|
| 1212 |
negative_prompt: str,
|
| 1213 |
model_choice: str,
|
| 1214 |
clip_skip: int,
|
|
@@ -1297,6 +1312,7 @@ def generate_image(
|
|
| 1297 |
seed=seed,
|
| 1298 |
use_lyra=True,
|
| 1299 |
clip_skip=clip_skip,
|
|
|
|
| 1300 |
progress_callback=lambda s, t, d: progress(0.5 + (s/t) * 0.45, desc=d)
|
| 1301 |
)
|
| 1302 |
|
|
@@ -1330,17 +1346,28 @@ def create_demo():
|
|
| 1330 |
| **Flow-Lune** | SD1.5 | v1 (T5-base) | Fast flow matching (15-25 steps) |
|
| 1331 |
| **SD1.5 Base** | SD1.5 | v1 (T5-base) | Baseline comparison |
|
| 1332 |
|
| 1333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1334 |
""")
|
| 1335 |
|
| 1336 |
with gr.Row():
|
| 1337 |
with gr.Column(scale=1):
|
| 1338 |
prompt = gr.TextArea(
|
| 1339 |
-
label="Prompt",
|
| 1340 |
value="masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
|
| 1341 |
lines=3
|
| 1342 |
)
|
| 1343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1344 |
negative_prompt = gr.TextArea(
|
| 1345 |
label="Negative Prompt",
|
| 1346 |
value="lowres, bad anatomy, bad hands, text, error, cropped, worst quality, low quality",
|
|
@@ -1470,25 +1497,28 @@ def create_demo():
|
|
| 1470 |
examples=[
|
| 1471 |
[
|
| 1472 |
"masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
|
|
|
|
| 1473 |
"lowres, bad anatomy, worst quality, low quality",
|
| 1474 |
"Illustrious XL",
|
| 1475 |
2, 25, 7.0, 1024, 1024, 0.0, False, True, 42, False
|
| 1476 |
],
|
| 1477 |
[
|
| 1478 |
"A majestic mountain landscape at golden hour, crystal clear lake, photorealistic, 8k",
|
|
|
|
| 1479 |
"blurry, low quality",
|
| 1480 |
"SDXL Base",
|
| 1481 |
1, 30, 7.5, 1024, 1024, 0.0, False, True, 123, False
|
| 1482 |
],
|
| 1483 |
[
|
| 1484 |
"cyberpunk city at night, neon lights, rain, highly detailed",
|
|
|
|
| 1485 |
"low quality, blurry",
|
| 1486 |
"Flow-Lune (SD1.5)",
|
| 1487 |
1, 20, 7.5, 512, 512, 2.5, True, True, 456, False
|
| 1488 |
],
|
| 1489 |
],
|
| 1490 |
inputs=[
|
| 1491 |
-
prompt, negative_prompt, model_choice, clip_skip,
|
| 1492 |
num_steps, cfg_scale, width, height, shift,
|
| 1493 |
use_flow_matching, use_lyra, seed, randomize_seed
|
| 1494 |
],
|
|
@@ -1565,7 +1595,7 @@ def create_demo():
|
|
| 1565 |
generate_btn.click(
|
| 1566 |
fn=generate_image,
|
| 1567 |
inputs=[
|
| 1568 |
-
prompt, negative_prompt, model_choice, clip_skip,
|
| 1569 |
num_steps, cfg_scale, width, height, shift,
|
| 1570 |
use_flow_matching, use_lyra, seed, randomize_seed
|
| 1571 |
],
|
|
|
|
| 258 |
self,
|
| 259 |
prompt: str,
|
| 260 |
negative_prompt: str = "",
|
| 261 |
+
clip_skip: int = 1,
|
| 262 |
+
t5_summary: str = ""
|
| 263 |
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
| 264 |
"""Encode prompts using Lyra VAE v2 fusion (CLIP + T5)."""
|
| 265 |
if self.lyra_model is None or self.t5_encoder is None:
|
|
|
|
| 270 |
prompt, negative_prompt, clip_skip
|
| 271 |
)
|
| 272 |
|
| 273 |
+
# Format T5 input with pilcrow separator (¶)
|
| 274 |
+
# Training format was: "tags ¶ summary"
|
| 275 |
+
SUMMARY_SEPARATOR = "¶"
|
| 276 |
+
if t5_summary.strip():
|
| 277 |
+
t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {t5_summary}"
|
| 278 |
+
else:
|
| 279 |
+
# Fallback: duplicate prompt if no summary provided
|
| 280 |
+
t5_prompt = f"{prompt} {SUMMARY_SEPARATOR} {prompt}"
|
| 281 |
+
|
| 282 |
# Get T5 embeddings
|
| 283 |
t5_inputs = self.t5_tokenizer(
|
| 284 |
+
t5_prompt,
|
| 285 |
max_length=512, # T5-XL uses 512
|
| 286 |
padding='max_length',
|
| 287 |
truncation=True,
|
|
|
|
| 322 |
|
| 323 |
# Process negative prompt similarly if present
|
| 324 |
if negative_prompt:
|
| 325 |
+
# Negative prompt has no separate summary: repeat it on both sides of the ¶ separator ("neg ¶ neg")
|
| 326 |
+
t5_neg_prompt = f"{negative_prompt} {SUMMARY_SEPARATOR} {negative_prompt}"
|
| 327 |
+
|
| 328 |
t5_inputs_neg = self.t5_tokenizer(
|
| 329 |
+
t5_neg_prompt,
|
| 330 |
max_length=512,
|
| 331 |
padding='max_length',
|
| 332 |
truncation=True,
|
|
|
|
| 387 |
seed: Optional[int] = None,
|
| 388 |
use_lyra: bool = False,
|
| 389 |
clip_skip: int = 1,
|
| 390 |
+
t5_summary: str = "",
|
| 391 |
progress_callback=None
|
| 392 |
):
|
| 393 |
"""Generate image using SDXL architecture."""
|
|
|
|
| 401 |
# Encode prompts
|
| 402 |
if use_lyra and self.lyra_model is not None:
|
| 403 |
prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt_lyra(
|
| 404 |
+
prompt, negative_prompt, clip_skip, t5_summary
|
| 405 |
)
|
| 406 |
else:
|
| 407 |
prompt_embeds, negative_prompt_embeds, pooled, negative_pooled = self.encode_prompt(
|
|
|
|
| 1218 |
|
| 1219 |
|
| 1220 |
@spaces.GPU(duration=lambda *args: estimate_duration(
|
| 1221 |
+
args[5], args[7], args[8], args[11],
|
| 1222 |
+
"SDXL" in args[3] or "Illustrious" in args[3]
|
| 1223 |
))
|
| 1224 |
def generate_image(
|
| 1225 |
prompt: str,
|
| 1226 |
+
t5_summary: str,
|
| 1227 |
negative_prompt: str,
|
| 1228 |
model_choice: str,
|
| 1229 |
clip_skip: int,
|
|
|
|
| 1312 |
seed=seed,
|
| 1313 |
use_lyra=True,
|
| 1314 |
clip_skip=clip_skip,
|
| 1315 |
+
t5_summary=t5_summary,
|
| 1316 |
progress_callback=lambda s, t, d: progress(0.5 + (s/t) * 0.45, desc=d)
|
| 1317 |
)
|
| 1318 |
|
|
|
|
| 1346 |
| **Flow-Lune** | SD1.5 | v1 (T5-base) | Fast flow matching (15-25 steps) |
|
| 1347 |
| **SD1.5 Base** | SD1.5 | v1 (T5-base) | Baseline comparison |
|
| 1348 |
|
| 1349 |
+
**Lyra VAE** fuses CLIP + T5 embeddings using:
|
| 1350 |
+
- **Prompt (Tags)**: Booru-style tags for CLIP encoding
|
| 1351 |
+
- **T5 Summary**: Natural language description for T5 (format: `tags ¶ summary`)
|
| 1352 |
+
|
| 1353 |
+
Enable **Lyra VAE** for side-by-side comparison!
|
| 1354 |
""")
|
| 1355 |
|
| 1356 |
with gr.Row():
|
| 1357 |
with gr.Column(scale=1):
|
| 1358 |
prompt = gr.TextArea(
|
| 1359 |
+
label="Prompt (Tags for CLIP)",
|
| 1360 |
value="masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
|
| 1361 |
lines=3
|
| 1362 |
)
|
| 1363 |
|
| 1364 |
+
t5_summary = gr.TextArea(
|
| 1365 |
+
label="T5 Summary (Natural Language for Lyra)",
|
| 1366 |
+
value="A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms against a bright sky",
|
| 1367 |
+
lines=2,
|
| 1368 |
+
info="Used after ¶ separator for T5. Leave empty to use tags only."
|
| 1369 |
+
)
|
| 1370 |
+
|
| 1371 |
negative_prompt = gr.TextArea(
|
| 1372 |
label="Negative Prompt",
|
| 1373 |
value="lowres, bad anatomy, bad hands, text, error, cropped, worst quality, low quality",
|
|
|
|
| 1497 |
examples=[
|
| 1498 |
[
|
| 1499 |
"masterpiece, best quality, 1girl, blue hair, school uniform, cherry blossoms, detailed background",
|
| 1500 |
+
"A beautiful anime girl with flowing blue hair wearing a school uniform, surrounded by delicate pink cherry blossoms against a bright sky",
|
| 1501 |
"lowres, bad anatomy, worst quality, low quality",
|
| 1502 |
"Illustrious XL",
|
| 1503 |
2, 25, 7.0, 1024, 1024, 0.0, False, True, 42, False
|
| 1504 |
],
|
| 1505 |
[
|
| 1506 |
"A majestic mountain landscape at golden hour, crystal clear lake, photorealistic, 8k",
|
| 1507 |
+
"A breathtaking mountain vista bathed in warm golden light at sunset, with a perfectly still crystal clear lake reflecting the peaks",
|
| 1508 |
"blurry, low quality",
|
| 1509 |
"SDXL Base",
|
| 1510 |
1, 30, 7.5, 1024, 1024, 0.0, False, True, 123, False
|
| 1511 |
],
|
| 1512 |
[
|
| 1513 |
"cyberpunk city at night, neon lights, rain, highly detailed",
|
| 1514 |
+
"A futuristic cyberpunk metropolis at night with vibrant neon lights reflecting off rain-slicked streets",
|
| 1515 |
"low quality, blurry",
|
| 1516 |
"Flow-Lune (SD1.5)",
|
| 1517 |
1, 20, 7.5, 512, 512, 2.5, True, True, 456, False
|
| 1518 |
],
|
| 1519 |
],
|
| 1520 |
inputs=[
|
| 1521 |
+
prompt, t5_summary, negative_prompt, model_choice, clip_skip,
|
| 1522 |
num_steps, cfg_scale, width, height, shift,
|
| 1523 |
use_flow_matching, use_lyra, seed, randomize_seed
|
| 1524 |
],
|
|
|
|
| 1595 |
generate_btn.click(
|
| 1596 |
fn=generate_image,
|
| 1597 |
inputs=[
|
| 1598 |
+
prompt, t5_summary, negative_prompt, model_choice, clip_skip,
|
| 1599 |
num_steps, cfg_scale, width, height, shift,
|
| 1600 |
use_flow_matching, use_lyra, seed, randomize_seed
|
| 1601 |
],
|