rahul7star commited on
Commit
569bc6c
·
verified ·
1 Parent(s): 07742f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -54
app.py CHANGED
@@ -1,77 +1,66 @@
1
  import torch
2
  from diffusers import DiffusionPipeline
3
  import spaces
4
- from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights, aoti_capture, aoti_compile, aoti_apply
5
  from time import perf_counter
6
- import argparse
7
 
8
  CKPT_ID = "black-forest-labs/Flux.1-Dev"
9
 
10
def get_pipe_kwargs():
    """Build the keyword arguments for one deterministic sample generation.

    The seeded generator makes repeated runs reproducible; the modest
    512x512 resolution keeps memory usage down.
    """
    kwargs = dict(
        prompt="A cat holding a sign that says hello world",
        height=512,  # reduce memory usage
        width=512,
        guidance_scale=3.5,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.manual_seed(0),
    )
    return kwargs
 
20
 
21
def load_pipeline():
    """Instantiate the Flux.1-Dev pipeline on CPU with progress bars off.

    Loads in float32 (full precision) for the CPU-only run and returns
    the ready-to-call pipeline object.
    """
    load_opts = {
        "torch_dtype": torch.float32,  # CPU-only
        "device_map": "cpu",
    }
    pipe = DiffusionPipeline.from_pretrained(CKPT_ID, **load_opts)
    pipe.set_progress_bar_config(disable=True)
    return pipe
29
 
 
 
 
30
@torch.no_grad()
def aot_compile_load(pipe, regional=False):
    """AOT-compile the pipeline's transformer, whole or block-by-block.

    With ``regional=True`` each transformer block is captured, exported,
    and compiled individually (lower peak memory); otherwise the full
    transformer is compiled in a single pass.  Returns the same ``pipe``
    with its transformer forward path swapped for the compiled artifacts.
    """
    capture_prompt = "example prompt"

    torch.compiler.reset()
    with torch._inductor.utils.fresh_inductor_cache():
        if regional:
            # Compile transformer blocks one at a time to save memory.
            block_lists = (
                pipe.transformer.transformer_blocks,
                pipe.transformer.single_transformer_blocks,
            )
            for blocks in block_lists:
                for block in blocks:
                    # Run the pipe once so aoti_capture can record this
                    # block's call arguments.
                    with aoti_capture(block) as captured:
                        pipe(prompt=capture_prompt)
                    ep = torch.export.export(block, args=captured.args, kwargs=captured.kwargs)
                    artifact = aoti_compile(ep)
                    zero_weights = ZeroGPUWeights(block.state_dict())
                    runner = ZeroGPUCompiledModel(artifact.archive_file, zero_weights)
                    block.forward = runner  # replace forward with compiled block
                    # Free intermediates before moving to the next block.
                    del ep, artifact, zero_weights, runner, captured
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
        else:
            # Single-shot compile of the whole transformer.
            with aoti_capture(pipe.transformer) as captured:
                pipe(prompt=capture_prompt)
            ep = torch.export.export(pipe.transformer, args=captured.args, kwargs=captured.kwargs)
            artifact = aoti_compile(ep)
            aoti_apply(artifact, pipe.transformer)
            del ep, artifact, captured

    return pipe
60
 
61
def measure_compile_time(pipe, regional=False):
    """Time the AOT compile step and run one sanity-check inference.

    Returns a tuple of (compile seconds, generated image).
    """
    t0 = perf_counter()
    pipe = aot_compile_load(pipe, regional=regional)
    t1 = perf_counter()
    # Generate one image to confirm the compiled pipe still works.
    image = pipe(**get_pipe_kwargs()).images[0]
    return t1 - t0, image
68
 
 
 
 
69
if __name__ == "__main__":
    # CLI: pass --regional to compile transformer blocks one at a time.
    cli = argparse.ArgumentParser()
    cli.add_argument("--regional", action="store_true")
    args = cli.parse_args()

    pipe = load_pipeline()
    latency, image = measure_compile_time(pipe, regional=args.regional)
    print(f"{args.regional=}, CPU compile + run latency: {latency:.2f} secs")
    image.save(f"regional@{args.regional}.png")
 
1
  import torch
2
  from diffusers import DiffusionPipeline
3
  import spaces
4
+ from spaces.zero.torch.aoti import aoti_capture, aoti_compile, aoti_apply
5
  from time import perf_counter
 
6
 
7
  CKPT_ID = "black-forest-labs/Flux.1-Dev"
8
 
9
# -----------------------------
# Pipeline arguments
# -----------------------------
# Deterministic, low-memory generation settings consumed by run_pipe().
PIPE_KWARGS = dict(
    prompt="A cat holding a sign that says hello world",
    height=256,  # very small to reduce memory
    width=256,
    guidance_scale=3.5,
    num_inference_steps=25,  # fewer steps
    generator=torch.manual_seed(0),
)
20
 
21
# -----------------------------
# Load pipeline
# -----------------------------
def load_pipe():
    """Load Flux.1-Dev on CPU in float32 with progress bars disabled."""
    opts = {
        "torch_dtype": torch.float32,  # full precision for the CPU-only run
        "device_map": "cpu",
    }
    pipe = DiffusionPipeline.from_pretrained(CKPT_ID, **opts)
    pipe.set_progress_bar_config(disable=True)
    return pipe
32
 
33
# -----------------------------
# Compile transformer using aoti (lightweight)
# -----------------------------
@torch.no_grad()
def compile_pipe(pipe):
    """AOT-compile the pipe's transformer in one shot and return the pipe.

    A throwaway prompt is run once so aoti_capture can record the
    transformer's call arguments; the captured call is then exported,
    compiled, and applied back onto the transformer in place.
    """
    with torch._inductor.utils.fresh_inductor_cache():
        # Record the transformer's inputs from a single dummy run.
        with aoti_capture(pipe.transformer) as captured:
            pipe(prompt="dummy")
        ep = torch.export.export(pipe.transformer, args=captured.args, kwargs=captured.kwargs)
        artifact = aoti_compile(ep)
        aoti_apply(artifact, pipe.transformer)
        # Release the export/compile intermediates promptly.
        del ep, artifact, captured
    return pipe
47
 
48
# -----------------------------
# Measure runtime
# -----------------------------
@torch.no_grad()
def run_pipe(pipe):
    """Run one generation with PIPE_KWARGS; return (seconds, image)."""
    t0 = perf_counter()
    image = pipe(**PIPE_KWARGS).images[0]
    t1 = perf_counter()
    return t1 - t0, image
 
 
57
 
58
# -----------------------------
# Main
# -----------------------------
def main():
    """Load, lightly AOT-compile, run once, and save the result."""
    pipe = load_pipe()
    pipe = compile_pipe(pipe)  # light aoti compile
    latency, image = run_pipe(pipe)
    print(f"Lightweight CPU + aoti latency: {latency:.2f}s")
    image.save("cpu_lightweight.png")


if __name__ == "__main__":
    main()