Spaces:

rahul7star
/

zerocompiledtest

Paused

App Files Files Community

rahul7star committed on Sep 22, 2025

Commit

07742f6

verified ·

1 Parent(s): c465c0d

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -28

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import torch
 from diffusers import DiffusionPipeline
 import spaces
-from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights
 from time import perf_counter
 import argparse
@@ -10,8 +10,8 @@ CKPT_ID = "black-forest-labs/Flux.1-Dev"
 def get_pipe_kwargs():
     return {
         "prompt": "A cat holding a sign that says hello world",
-        "height": 1024,
-        "width": 1024,
         "guidance_scale": 3.5,
         "num_inference_steps": 50,
         "max_sequence_length": 512,
@@ -21,7 +21,7 @@ def get_pipe_kwargs():
 def load_pipeline():
     pipe = DiffusionPipeline.from_pretrained(
         CKPT_ID,
-        torch_dtype=torch.float32,  # CPU only
         device_map="cpu"
     )
     pipe.set_progress_bar_config(disable=True)
@@ -30,36 +30,32 @@ def load_pipeline():
 @torch.no_grad()
 def aot_compile_load(pipe, regional=False):
     prompt = "example prompt"
     torch.compiler.reset()
     with torch._inductor.utils.fresh_inductor_cache():
         if regional:
-            # Compile individual transformer blocks
-            for block in pipe.transformer.transformer_blocks:
-                with spaces.aoti_capture(block) as call:
-                    pipe(prompt=prompt)
-                exported = torch.export.export(block, args=call.args, kwargs=call.kwargs)
-                compiled = spaces.aoti_compile(exported)
-                weights = ZeroGPUWeights(block.state_dict())
-                compiled_block = ZeroGPUCompiledModel(compiled.archive_file, weights)
-                block.forward = compiled_block
-            for block in pipe.transformer.single_transformer_blocks:
-                with spaces.aoti_capture(block) as call:
-                    pipe(prompt=prompt)
-                exported = torch.export.export(block, args=call.args, kwargs=call.kwargs)
-                compiled = spaces.aoti_compile(exported)
-                weights = ZeroGPUWeights(block.state_dict())
-                compiled_block = ZeroGPUCompiledModel(compiled.archive_file, weights)
-                block.forward = compiled_block
         else:
-            # Compile the whole transformer
-            with spaces.aoti_capture(pipe.transformer) as call:
                 pipe(prompt=prompt)
             exported = torch.export.export(pipe.transformer, args=call.args, kwargs=call.kwargs)
-            compiled = spaces.aoti_compile(exported)
-            spaces.aoti_apply(compiled, pipe.transformer)
     return pipe
 def measure_compile_time(pipe, regional=False):

 import torch
 from diffusers import DiffusionPipeline
 import spaces
+from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights, aoti_capture, aoti_compile, aoti_apply
 from time import perf_counter
 import argparse
 def get_pipe_kwargs():
     return {
         "prompt": "A cat holding a sign that says hello world",
+        "height": 512,  # reduce memory usage
+        "width": 512,
         "guidance_scale": 3.5,
         "num_inference_steps": 50,
         "max_sequence_length": 512,
 def load_pipeline():
     pipe = DiffusionPipeline.from_pretrained(
         CKPT_ID,
+        torch_dtype=torch.float32,  # CPU-only
         device_map="cpu"
     )
     pipe.set_progress_bar_config(disable=True)
 @torch.no_grad()
 def aot_compile_load(pipe, regional=False):
     prompt = "example prompt"
     torch.compiler.reset()
     with torch._inductor.utils.fresh_inductor_cache():
         if regional:
+            # Compile transformer blocks **one at a time** to save memory
+            for block_list in [pipe.transformer.transformer_blocks, pipe.transformer.single_transformer_blocks]:
+                for i, block in enumerate(block_list):
+                    with aoti_capture(block) as call:
+                        pipe(prompt=prompt)
+                    exported = torch.export.export(block, args=call.args, kwargs=call.kwargs)
+                    compiled = aoti_compile(exported)
+                    weights = ZeroGPUWeights(block.state_dict())
+                    compiled_block = ZeroGPUCompiledModel(compiled.archive_file, weights)
+                    block.forward = compiled_block  # replace forward with compiled block
+                    # Free memory
+                    del exported, compiled, weights, compiled_block, call
+                    torch.cuda.empty_cache() if torch.cuda.is_available() else None
         else:
+            # Compile the whole transformer at once
+            with aoti_capture(pipe.transformer) as call:
                 pipe(prompt=prompt)
             exported = torch.export.export(pipe.transformer, args=call.args, kwargs=call.kwargs)
+            compiled = aoti_compile(exported)
+            aoti_apply(compiled, pipe.transformer)
+            del exported, compiled, call
     return pipe
 def measure_compile_time(pipe, regional=False):