FireRed-Image-Edit-1.1-Fast

Sleeping

App Files Files Community

primerz committed on Mar 14

Commit

5dc2f53

verified ·

1 Parent(s): 34256b8

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -20

app.py CHANGED Viewed

@@ -97,6 +97,20 @@ class FireRedTheme(Soft):
 theme = FireRedTheme()
 # ═══════════════════════════════════════════════════════════════════════
 #  MODEL
 # ═══════════════════════════════════════════════════════════════════════
@@ -113,22 +127,75 @@ from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
 dtype = torch.bfloat16
 pipe = QwenImageEditPlusPipeline.from_pretrained(
     "FireRedTeam/FireRed-Image-Edit-1.1",
-    transformer=QwenImageTransformer2DModel.from_pretrained(
-        "prithivMLmods/Qwen-Image-Edit-Rapid-AIO-V23",
-        torch_dtype=dtype,
-        device_map="cuda",
-    ),
     torch_dtype=dtype,
 ).to(device)
 try:
     pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
     print("Flash Attention 3 Processor set successfully.")
 except Exception as e:
     print(f"Warning: Could not set FA3 processor: {e}")
 MAX_SEED = np.iinfo(np.int32).max
 DEFAULT_NEGATIVE_PROMPT = (
@@ -188,9 +255,7 @@ def infer(
     seed, randomize_seed, guidance_scale, steps,
     progress=gr.Progress(track_tqdm=True),
 ):
-    gc.collect()
-    torch.cuda.empty_cache()
     if not images:
         raise gr.Error("⚠️  Please upload at least one image.")
     if not prompt or not prompt.strip():
@@ -219,18 +284,24 @@ def infer(
     width, height = update_dimensions_on_upload(pil_images[0])
     try:
-        result = pipe(
-            image=pil_images,
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            height=height,
-            width=width,
-            num_inference_steps=steps,
-            generator=generator,
-            true_cfg_scale=guidance_scale,
-        ).images[0]
         return result, seed
     finally:
         gc.collect()
         torch.cuda.empty_cache()
@@ -589,7 +660,7 @@ with gr.Blocks(css=css, theme=theme, title="🔥 FireRed Image Edit") as demo:
         outputs=[images, prompt, output_image, info_box],
     )
-    # Generate
     run_button.click(
         fn=infer,
         inputs=[
@@ -597,6 +668,7 @@ with gr.Blocks(css=css, theme=theme, title="🔥 FireRed Image Edit") as demo:
             seed, randomize_seed, guidance_scale, steps,
         ],
         outputs=[output_image, seed],
     ).then(
         fn=format_info,
         inputs=[seed, images],
@@ -608,7 +680,12 @@ with gr.Blocks(css=css, theme=theme, title="🔥 FireRed Image Edit") as demo:
 # ═══════════════════════════════════════════════════════════════════════
 if __name__ == "__main__":
-    demo.queue(max_size=30).launch(
         mcp_server=True,
         ssr_mode=False,
         show_error=True,

 theme = FireRedTheme()
+# ═══════════════════════════════════════════════════════════════════════
+#  GLOBAL CUDA OPTIMIZATIONS
+# ═══════════════════════════════════════════════════════════════════════
+# Enable cuDNN autotuner — finds the fastest convolution algorithms for
+# the hardware and input sizes after a short warm-up.
+torch.backends.cudnn.benchmark = True
+# Allow TF32 on Ampere+ GPUs: faster float32 matmuls / convolutions at
+# slightly reduced precision. The pipeline itself runs in bf16, so this
+# only affects whatever ops still execute in float32.
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.set_float32_matmul_precision("high")
 # ═══════════════════════════════════════════════════════════════════════
 #  MODEL
 # ═══════════════════════════════════════════════════════════════════════
 dtype = torch.bfloat16
+# Load transformer separately so we can optimise it before plugging in
+transformer = QwenImageTransformer2DModel.from_pretrained(
+    "prithivMLmods/Qwen-Image-Edit-Rapid-AIO-V23",
+    torch_dtype=dtype,
+    device_map="cuda",
+)
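+# Attempt torch.compile for a fused-kernel speed-up on the denoising
+# backbone. torch.compile only wraps the module here; the actual
+# compilation is deferred to the first forward call, so this try/except
+# catches wrapper-setup errors only (e.g. an unsupported environment).
+# Compile-time failures (older driver / dynamic-shape issues) surface
+# later, during the warmup pass or the first request.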
+try:
+    transformer = torch.compile(transformer, mode="reduce-overhead")
+    print("torch.compile applied to transformer (reduce-overhead).")
+except Exception as e:
+    print(f"torch.compile skipped: {e}")
 pipe = QwenImageEditPlusPipeline.from_pretrained(
     "FireRedTeam/FireRed-Image-Edit-1.1",
+    transformer=transformer,
     torch_dtype=dtype,
 ).to(device)
+# Flash Attention 3 processor — fastest path when available
 try:
     pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
     print("Flash Attention 3 Processor set successfully.")
 except Exception as e:
     print(f"Warning: Could not set FA3 processor: {e}")
+# VAE optimisations — decode in tiles / slices to bound peak memory on
+# large images. Both switches are best-effort: a failure is reported in
+# the same style as the FA3 warning instead of being silently dropped,
+# and startup continues either way.
+try:
+    pipe.vae.enable_tiling()
+    print("VAE tiling enabled.")
+except Exception as e:
+    print(f"Warning: Could not enable VAE tiling: {e}")
+try:
+    pipe.vae.enable_slicing()
+    print("VAE slicing enabled.")
+except Exception as e:
+    print(f"Warning: Could not enable VAE slicing: {e}")
+# ── Warmup pass ─────────────────────────────────────────────────────
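+# The first inference is always slower (CUDA context init, cuDNN
+# autotuner, torch.compile tracing). Run a tiny dummy forward so that
+# most of that cost is paid at startup, not on the first user request.
+# NOTE(review): the warmup uses 64×64; compiled graphs and autotuner
+# choices can be shape-specific, so requests at other resolutions may
+# still trigger some recompilation — confirm with real input sizes.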
+print("Running warmup inference …")
+try:
+    _warmup_img = Image.new("RGB", (64, 64), color=(128, 128, 128))
+    _warmup_gen = torch.Generator(device=device).manual_seed(0)
+    with torch.inference_mode():
+        pipe(
+            image=[_warmup_img],
+            prompt="warmup",
+            negative_prompt="",
+            height=64,
+            width=64,
+            num_inference_steps=1,
+            generator=_warmup_gen,
+            true_cfg_scale=1.0,
+        )
+    del _warmup_img, _warmup_gen
+    gc.collect()
+    torch.cuda.empty_cache()
+    print("Warmup complete.")
+except Exception as e:
+    print(f"Warmup skipped: {e}")
 MAX_SEED = np.iinfo(np.int32).max
 DEFAULT_NEGATIVE_PROMPT = (
     seed, randomize_seed, guidance_scale, steps,
     progress=gr.Progress(track_tqdm=True),
 ):
+    # ── Input validation (cheap, do first) ──────────────────────────
     if not images:
         raise gr.Error("⚠️  Please upload at least one image.")
     if not prompt or not prompt.strip():
     width, height = update_dimensions_on_upload(pil_images[0])
     try:
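+        # torch.inference_mode disables autograd like torch.no_grad and
+        # additionally skips view-tracking and version-counter bumps, so
+        # it is usually at least as fast. Tensors created under it cannot
+        # be used in autograd afterwards.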
+        with torch.inference_mode():
+            result = pipe(
+                image=pil_images,
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                generator=generator,
+                true_cfg_scale=guidance_scale,
+            ).images[0]
         return result, seed
     finally:
+        # GC *after* inference to reclaim any temporaries the pipeline
+        # allocated. Avoid gc.collect() + empty_cache() *before*
+        # inference — that stalls the CUDA stream for nothing.
         gc.collect()
         torch.cuda.empty_cache()
         outputs=[images, prompt, output_image, info_box],
     )
+    # Generate — with a public api_name so the endpoint is discoverable
     run_button.click(
         fn=infer,
         inputs=[
             seed, randomize_seed, guidance_scale, steps,
         ],
         outputs=[output_image, seed],
+        api_name="edit",
     ).then(
         fn=format_info,
         inputs=[seed, images],
 # ═══════════════════════════════════════════════════════════════════════
 if __name__ == "__main__":
+    demo.queue(
+        max_size=30,
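+        # NOTE(review): both jobs would call the single module-level `pipe`
+        # at the same time — confirm the pipeline (and the compiled
+        # transformer) tolerates concurrent calls; otherwise use 1.
+        default_concurrency_limit=2,   # allow 2 concurrent GPU jobs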
+    ).launch(
+        share=True,          # ← request a public share link (not used when hosted on Hugging Face Spaces)
+        show_api=True,       # ← show the "Use via API" docs link in the app footer
         mcp_server=True,
         ssr_mode=False,
         show_error=True,