sayakpaul (HF Staff) committed · verified
Commit 4e7f94b · 1 Parent(s): d891944

Sync from GitHub

Files changed (4)
  1. README.md +2 -3
  2. app.py +19 -8
  3. prompts.py +51 -28
  4. utils/pipeline_utils.py +2 -5
README.md CHANGED
@@ -10,12 +10,11 @@ pinned: false
10
  short_description: 'Optimize Diffusers Code on your hardware.'
11
  ---
12
 
13
- Still a WIP. Use an LLM to generate reasonable code snippets in a hardware-aware manner for Diffusers.
14
 
15
  ### Motivation
16
 
17
- Within the Diffusers, we support a bunch of optimization techniques (refer [here](https://huggingface.co/docs/diffusers/main/en/optimization/memory), [here](https://huggingface.co/docs/diffusers/main/en/optimization/cache), and [here](https://huggingface.co/docs/diffusers/main/en/optimization/fp16)). However, it can be
18
- daunting for our users to determine when to use what. Hence, this repository tries to take a stab
19
  at using an LLM to generate reasonable code snippets for a given pipeline checkpoint that respect
20
  the user's hardware configuration.
21
 
 
10
  short_description: 'Optimize Diffusers Code on your hardware.'
11
  ---
12
 
13
+ Use an LLM to generate reasonable code snippets in a hardware-aware manner for Diffusers. Still experimental.
14
 
15
  ### Motivation
16
 
17
+ Within Diffusers, we support a number of optimization techniques (refer [here](https://huggingface.co/docs/diffusers/main/en/optimization/memory), [here](https://huggingface.co/docs/diffusers/main/en/optimization/cache), and [here](https://huggingface.co/docs/diffusers/main/en/optimization/fp16)). However, it can be daunting for our users to determine when to use what. Hence, this repository takes a stab
 
18
  at using an LLM to generate reasonable code snippets for a given pipeline checkpoint that respect
19
  the user's hardware configuration.
20
 
app.py CHANGED
@@ -11,11 +11,13 @@ def get_output_code(
11
  repo_id,
12
  gemini_model_to_use,
13
  disable_bf16,
14
- enable_lossy,
 
15
  system_ram,
16
  gpu_vram,
17
  torch_compile_friendly,
18
  fp8_friendly,
 
19
  ):
20
  loading_mem_out = determine_pipe_loading_memory(repo_id, None, disable_bf16)
21
  load_memory = loading_mem_out["total_loading_memory_gb"]
@@ -36,7 +38,8 @@ def get_output_code(
36
  pipeline_loading_memory=load_memory,
37
  available_system_ram=system_ram,
38
  available_gpu_vram=gpu_vram,
39
- enable_lossy_outputs=enable_lossy,
 
40
  is_fp8_supported=fp8_friendly,
41
  enable_torch_compile=torch_compile_friendly,
42
  )
@@ -79,16 +82,19 @@ with gr.Blocks() as demo:
79
  disable_bf16 = gr.Checkbox(
80
  label="Disable BF16 (Use FP32)",
81
  value=False,
82
- info="Calculate using 32-bit precision instead of 16-bit.",
83
  )
84
  enable_lossy = gr.Checkbox(
85
- label="Allow Lossy Quantization", value=False, info="Consider 8-bit/4-bit quantization."
86
  )
87
  torch_compile_friendly = gr.Checkbox(
88
- label="torch.compile() friendly", value=False, info="Model is compatible with torch.compile."
89
  )
90
  fp8_friendly = gr.Checkbox(
91
- label="fp8 friendly", value=False, info="Model and hardware support FP8 precision."
92
  )
93
 
94
  with gr.Column(scale=1):
@@ -99,6 +105,7 @@ with gr.Blocks() as demo:
99
  repo_id,
100
  gemini_model_to_use,
101
  disable_bf16,
 
102
  enable_lossy,
103
  system_ram,
104
  gpu_vram,
@@ -114,6 +121,7 @@ with gr.Blocks() as demo:
114
  "gemini-2.5-pro",
115
  False,
116
  False,
 
117
  64,
118
  24,
119
  True,
@@ -124,6 +132,7 @@ with gr.Blocks() as demo:
124
  "gemini-2.5-flash",
125
  False,
126
  True,
 
127
  16,
128
  8,
129
  False,
@@ -134,6 +143,7 @@ with gr.Blocks() as demo:
134
  "gemini-2.5-pro",
135
  False,
136
  False,
 
137
  32,
138
  16,
139
  True,
@@ -149,8 +159,9 @@ with gr.Blocks() as demo:
149
  gr.Markdown(
150
  """
151
  - Try changing the model from Flash to Pro if the results are bad.
152
- - Try to be as specific as possible about your local machine.
153
  - As a rule of thumb, GPUs from the RTX 4090 and later are generally good for using `torch.compile()`.
 
154
  - To leverage FP8, the GPU needs to have a compute capability of at least 8.9.
155
  - Check out the following docs for optimization in Diffusers:
156
  * [Memory](https://huggingface.co/docs/diffusers/main/en/optimization/memory)
@@ -165,7 +176,7 @@ with gr.Blocks() as demo:
165
 
166
  gr.Markdown("---")
167
 
168
- with gr.Accordion("Generated Code (expand)", open=False):
169
  code_output = gr.Code(interactive=True, language="python")
170
 
171
  gr.Markdown(
 
11
  repo_id,
12
  gemini_model_to_use,
13
  disable_bf16,
14
+ enable_caching,
15
+ enable_quantization,
16
  system_ram,
17
  gpu_vram,
18
  torch_compile_friendly,
19
  fp8_friendly,
20
+ progress=gr.Progress(track_tqdm=True)
21
  ):
22
  loading_mem_out = determine_pipe_loading_memory(repo_id, None, disable_bf16)
23
  load_memory = loading_mem_out["total_loading_memory_gb"]
 
38
  pipeline_loading_memory=load_memory,
39
  available_system_ram=system_ram,
40
  available_gpu_vram=gpu_vram,
41
+ enable_caching=enable_caching,
42
+ enable_quantization=enable_quantization,
43
  is_fp8_supported=fp8_friendly,
44
  enable_torch_compile=torch_compile_friendly,
45
  )
 
82
  disable_bf16 = gr.Checkbox(
83
  label="Disable BF16 (Use FP32)",
84
  value=False,
85
+ info="Compute in 32-bit precision (caution ⚠️)",
86
+ )
87
+ enable_caching = gr.Checkbox(
88
+ label="Enable lossy caching", value=False, info="Consider applying caching for speed"
89
  )
90
  enable_lossy = gr.Checkbox(
91
+ label="Allow Lossy Quantization", value=False, info="Consider 8-bit/4-bit quantization"
92
  )
93
  torch_compile_friendly = gr.Checkbox(
94
+ label="torch.compile() friendly", value=False, info="Model is compatible with torch.compile"
95
  )
96
  fp8_friendly = gr.Checkbox(
97
+ label="fp8 friendly", value=False, info="Model and hardware support FP8 precision"
98
  )
99
 
100
  with gr.Column(scale=1):
 
105
  repo_id,
106
  gemini_model_to_use,
107
  disable_bf16,
108
+ enable_caching,
109
  enable_lossy,
110
  system_ram,
111
  gpu_vram,
 
121
  "gemini-2.5-pro",
122
  False,
123
  False,
124
+ False,
125
  64,
126
  24,
127
  True,
 
132
  "gemini-2.5-flash",
133
  False,
134
  True,
135
+ False,
136
  16,
137
  8,
138
  False,
 
143
  "gemini-2.5-pro",
144
  False,
145
  False,
146
+ False,
147
  32,
148
  16,
149
  True,
 
159
  gr.Markdown(
160
  """
161
  - Try changing the model from Flash to Pro if the results are bad.
162
+ - Please provide accurate VRAM and RAM details, as the suggestions depend on them.
163
  - As a rule of thumb, GPUs from the RTX 4090 and later are generally good for using `torch.compile()`.
164
+ - When lossy quantization isn't preferred, try enabling caching. Note that caching can still be lossy.
165
  - To leverage FP8, the GPU needs to have a compute capability of at least 8.9.
166
  - Check out the following docs for optimization in Diffusers:
167
  * [Memory](https://huggingface.co/docs/diffusers/main/en/optimization/memory)
 
176
 
177
  gr.Markdown("---")
178
 
179
+ with gr.Accordion("Generated Code 💻", open=True):
180
  code_output = gr.Code(interactive=True, language="python")
181
 
182
  gr.Markdown(
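Side note on the `progress=gr.Progress(track_tqdm=True)` parameter added to the handler above: Gradio injects a `Progress` object into any event handler that declares one as a default-valued argument, and `track_tqdm=True` mirrors tqdm bars from downstream calls into the UI. Below is a minimal, hedged sketch of that pattern; the handler and component names are illustrative stand-ins, not the actual app.py code.

```py
import time
import gradio as gr

def generate_code_stub(repo_id, progress=gr.Progress(track_tqdm=True)):
    # Gradio passes a Progress instance automatically; tqdm bars raised by
    # downstream calls are surfaced in the UI because track_tqdm=True.
    for _ in progress.tqdm(range(3), desc="Generating"):
        time.sleep(0.1)
    return f"# generated code for {repo_id}"

with gr.Blocks() as demo:
    repo_id = gr.Textbox(label="repo_id")
    out = gr.Code(language="python")
    gr.Button("Run").click(generate_code_stub, inputs=[repo_id], outputs=[out])

if __name__ == "__main__":
    demo.launch()
```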
prompts.py CHANGED
@@ -15,15 +15,15 @@ image = pipe("photo of a dog sitting beside a river").images[0]
15
  ```
16
 
17
  Your task will be to output reasonable inference code in Python from user-supplied information about their
18
- needs. More specifically, you will be provided with the following information (in no particular order):
19
 
20
  * `ckpt_id` of the diffusion pipeline
21
  * Loading memory of a diffusion pipeline in GB
22
  * Available system RAM in GB
23
  * Available GPU VRAM in GB
24
- * If the user can afford to have lossy outputs (the likes of quantization)
25
- * If FP8 is supported
26
- * If the available GPU supports the latest `torch.compile()` knobs
27
 
28
  There are three categories of system RAM, broadly:
29
 
@@ -59,22 +59,12 @@ onload_device = torch.device("cuda")
59
  pipe = DiffusionPipeline.from_pretrained(CKPT_ID, torch_dtype=torch.bfloat16)
60
 
61
  offload_dir = "DIRECTORY" # change me
62
- for name, module in pipe.components.items():
63
- if hasattr(component, "_supports_group_offloading") and component._supports_group_offloading:
64
- module.enable_group_offload(
65
- onload_device=onload_device,
66
- offload_type="leaf_level",
67
- use_stream=True,
68
- offload_to_disk_path=f"{offload_dir}/{name}"
69
- )
70
- elif isinstance(component, (PreTrainedModel, torch.nn.Module)):
71
- apply_group_offloading(
72
- module,
73
- onload_device=onload_device,
74
- offload_type="leaf_level",
75
- use_stream=True,
76
- offload_to_disk_path=f"{offload_dir}/{name}"
77
- )
78
 
79
  # Inference goes here.
80
  ...
@@ -104,6 +94,32 @@ satisfied by the available VRAM:
104
  pipe = pipe.to("cuda")
105
  ```
106

107
  ## Guidance on using quantization
108
 
109
  If the user specifies to use quantization, then you should default to using bitsandbytes 4bit. The code here
@@ -143,28 +159,34 @@ pipe = DiffusionPipeline.from_pretrained(CKPT_ID, quantization_config=quant_conf
143
  ...
144
  ```
145

146
  ## Guidance on using `torch.compile()`
147
 
148
  If the user wants to additionally boost inference speed, then you should add the following just before
149
  inference:
150
 
151
- * Add the following when offloading was applied: `torch._dynamo.config.recompile_limit = 1000`.
152
  * ONLY, add the following when `bitsandbytes` was used for `quant_backend`: `torch._dynamo.config.capture_dynamic_output_shape_ops = True`.
153
- * Finally, add `pipe.transformer.compile()`.
154
  * Add `pipe.vae.decode = torch.compile(pipe.vae.decode)` as a comment.
155
 
156
  In case no offloading was applied, then the line should be:
157
 
158
  ```py
159
- pipe.transformer.compile(fullgraph=True)
160
  ```
161
 
162
  ## Other guidelines
163
 
164
- * When the available VRAM > pipeline loading memory, you should suggest using `pipe = pipe.to("cuda")`.
165
- * If the user prefers not to use quantization and further reduce memory, then suggest using:
166
- `pipe.transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)`. Note
167
- that this is different from using FP8. In FP8, we use quantization like shown above.
168
  * Do NOT add any extra imports or lines of code that will not be used.
169
  * Do NOT try to be too creative about combining the optimization techniques laid out above.
170
  * Do NOT add extra arguments to the `pipe` call other than the `prompt`.
@@ -184,7 +206,8 @@ ckpt_id: {ckpt_id}
184
  pipeline_loading_memory_GB: {pipeline_loading_memory}
185
  available_system_ram_GB: {available_system_ram}
186
  available_gpu_vram_GB: {available_gpu_vram}
187
- enable_lossy_outputs: {enable_lossy_outputs}
 
188
  is_fp8_supported: {is_fp8_supported}
189
  enable_torch_compile: {enable_torch_compile}
190
  """
 
15
  ```
16
 
17
  Your task will be to output reasonable inference code in Python from user-supplied information about their
18
+ needs. More specifically, you will be provided with the following user information (in no particular order):
19
 
20
  * `ckpt_id` of the diffusion pipeline
21
  * Loading memory of a diffusion pipeline in GB
22
  * Available system RAM in GB
23
  * Available GPU VRAM in GB
24
+ * If the user can afford to have lossy outputs (either quantization or caching)
25
+ * If FP8 precision is supported
26
+ * If the available GPU is compatible with `torch.compile`
27
 
28
  There are three categories of system RAM, broadly:
29
 
 
59
  pipe = DiffusionPipeline.from_pretrained(CKPT_ID, torch_dtype=torch.bfloat16)
60
 
61
  offload_dir = "DIRECTORY" # change me
62
+ pipe.enable_group_offload(
63
+ onload_device=onload_device,
64
+ offload_type="leaf_level",
65
+ use_stream=True,
66
+ offload_to_disk_path=offload_dir
67
+ )
68
 
69
  # Inference goes here.
70
  ...
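For reference, here is a self-contained sketch of the same group-offloading recipe written per component. This is a hedged illustration rather than the prompt's exact snippet: it assumes a recent diffusers release where `enable_group_offload` / `apply_group_offloading` accept `offload_to_disk_path`, and `black-forest-labs/FLUX.1-dev` is only a placeholder checkpoint.

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.hooks import apply_group_offloading

CKPT_ID = "black-forest-labs/FLUX.1-dev"  # placeholder; use the actual ckpt_id
onload_device = torch.device("cuda")

pipe = DiffusionPipeline.from_pretrained(CKPT_ID, torch_dtype=torch.bfloat16)

offload_dir = "offload_dir"  # change me
for name, module in pipe.components.items():
    if not isinstance(module, torch.nn.Module):
        continue  # skip tokenizers, schedulers, etc.
    if hasattr(module, "enable_group_offload"):
        # Diffusers models (ModelMixin) expose the method directly.
        module.enable_group_offload(
            onload_device=onload_device,
            offload_type="leaf_level",
            use_stream=True,
            offload_to_disk_path=f"{offload_dir}/{name}",
        )
    else:
        # Other torch modules (e.g. text encoders) go through the functional API.
        apply_group_offloading(
            module,
            onload_device=onload_device,
            offload_type="leaf_level",
            use_stream=True,
            offload_to_disk_path=f"{offload_dir}/{name}",
        )

image = pipe("photo of a dog sitting beside a river").images[0]
image.save("output.png")
```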
 
94
  pipe = pipe.to("cuda")
95
  ```
96
 
97
+ When the user can tolerate lossy outputs (through quantization or caching), prefer combining offloading
98
+ with quantization or caching, as detailed below.
99
+
100
+ ## Guidance on using caching
101
+
102
+ When caching is enabled, you can leverage `FasterCache` like so:
103
+
104
+ ```py
105
+ config = FasterCacheConfig(
106
+ spatial_attention_block_skip_range=2,
107
+ spatial_attention_timestep_skip_range=(-1, 681),
108
+ current_timestep_callback=lambda: pipe.current_timestep,
109
+ attention_weight_callback=lambda _: 0.3,
110
+ unconditional_batch_skip_range=5,
111
+ unconditional_batch_timestep_skip_range=(-1, 781),
112
+ tensor_format="BCHW", # for video models, this should be BFCHW
113
+ )
114
+ pipe.transformer.enable_cache(config)
115
+
116
+ # Rest of the code goes here.
117
+ ...
118
+ ```
119
+
120
+ Recommend that users also consult the documentation:
121
+ https://huggingface.co/docs/diffusers/main/en/optimization/cache
122
+
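To make the snippet above runnable end to end, here is a hedged, self-contained sketch. Assumptions: `FasterCacheConfig` is importable from `diffusers` in the installed release, and `black-forest-labs/FLUX.1-dev` stands in for the user's checkpoint.

```py
import torch
from diffusers import DiffusionPipeline, FasterCacheConfig

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16  # placeholder ckpt_id
).to("cuda")

config = FasterCacheConfig(
    spatial_attention_block_skip_range=2,
    spatial_attention_timestep_skip_range=(-1, 681),
    current_timestep_callback=lambda: pipe.current_timestep,
    attention_weight_callback=lambda _: 0.3,
    unconditional_batch_skip_range=5,
    unconditional_batch_timestep_skip_range=(-1, 781),
    tensor_format="BCHW",  # for video models this should be "BFCHW"
)
pipe.transformer.enable_cache(config)

image = pipe("photo of a dog sitting beside a river").images[0]
image.save("output.png")
```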
123
  ## Guidance on using quantization
124
 
125
  If the user specifies to use quantization, then you should default to using bitsandbytes 4bit. The code here
 
159
  ...
160
  ```
161
 
162
+ **Some additional notes**:
163
+
164
+ * Offloading can be combined with quantization. However, this is only supported with `bitsandbytes`.
165
+ * If the VRAM and RAM are very low, consider combining quantization with offloading, as sketched below.
166
+
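A hedged sketch of the combination mentioned in the notes above, i.e. bitsandbytes 4-bit quantization plus model offloading. Assumptions: `PipelineQuantizationConfig` is available in the installed diffusers release, and the checkpoint id is a placeholder.

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers import PipelineQuantizationConfig

quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    },
    components_to_quantize=["transformer"],
)

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # placeholder; use the actual ckpt_id
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
# Model-level CPU offloading can be stacked on top of bitsandbytes quantization,
# which helps when both VRAM and RAM are tight.
pipe.enable_model_cpu_offload()

image = pipe("photo of a dog sitting beside a river").images[0]
```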
167
  ## Guidance on using `torch.compile()`
168
 
169
  If the user wants to additionally boost inference speed, then you should add the following just before
170
  inference:
171
 
 
172
  * ONLY, add the following when `bitsandbytes` was used for `quant_backend`: `torch._dynamo.config.capture_dynamic_output_shape_ops = True`.
173
+ * Finally, add `pipe.transformer.compile_repeated_blocks()`.
174
  * Add `pipe.vae.decode = torch.compile(pipe.vae.decode)` as a comment.
175
 
176
  In case no offloading was applied, then the line should be:
177
 
178
  ```py
179
+ pipe.transformer.compile_repeated_blocks(fullgraph=True)
180
  ```
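Putting the compile guidance together, a short hedged sketch of the no-offloading case (assumptions: the installed diffusers exposes `compile_repeated_blocks` on the transformer, and the checkpoint id is a placeholder):

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16  # placeholder ckpt_id
).to("cuda")

# ONLY needed when bitsandbytes was used as the quantization backend:
# torch._dynamo.config.capture_dynamic_output_shape_ops = True

# No offloading here, so fullgraph compilation of the repeated blocks is fine.
pipe.transformer.compile_repeated_blocks(fullgraph=True)
# Optionally, as the guidance suggests, leave VAE compilation as a comment:
# pipe.vae.decode = torch.compile(pipe.vae.decode)

image = pipe("photo of a dog sitting beside a river").images[0]
```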
181
 
182
  ## Other guidelines
183
 
184
+ * For the line of code that actually calls the `pipe`, always recommend that users verify the call arguments.
185
+ * When the available VRAM is comfortably greater than the pipeline loading memory, you should suggest using `pipe = pipe.to("cuda")`. But in
186
+ cases where the VRAM is only a tiny bit greater, you should suggest offloading. For example, if the available VRAM
187
+ is 32 GB and the pipeline loading memory is 31.5 GB, it's better to use offloading.
188
+ * If the user prefers not to use quantization but still wants to reduce memory, then suggest using:
189
+ `pipe.transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)` (see the sketch after this list).
190
  * Do NOT add any extra imports or lines of code that will not be used.
191
  * Do NOT try to be too creative about combining the optimization techniques laid out above.
192
  * Do NOT add extra arguments to the `pipe` call other than the `prompt`.
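A tiny hedged sketch of the layerwise-casting option referenced in the list above: weights are stored in FP8 but computed in BF16, which is distinct from FP8 quantization. The checkpoint id is a placeholder.

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16  # placeholder ckpt_id
).to("cuda")

# Store transformer weights in float8_e4m3fn; upcast to bfloat16 for compute.
pipe.transformer.enable_layerwise_casting(
    storage_dtype=torch.float8_e4m3fn,
    compute_dtype=torch.bfloat16,
)

image = pipe("photo of a dog sitting beside a river").images[0]
```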
 
206
  pipeline_loading_memory_GB: {pipeline_loading_memory}
207
  available_system_ram_GB: {available_system_ram}
208
  available_gpu_vram_GB: {available_gpu_vram}
209
+ enable_caching: {enable_caching}
210
+ enable_quantization: {enable_quantization}
211
  is_fp8_supported: {is_fp8_supported}
212
  enable_torch_compile: {enable_torch_compile}
213
  """
utils/pipeline_utils.py CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path
3
  import functools
4
  import os
5
  import safetensors.torch
6
- from huggingface_hub import model_info, hf_hub_download
7
  import tempfile
8
  import torch
9
  import functools
@@ -189,7 +189,4 @@ if __name__ == "__main__":
189
  safetensor_files = output["components"]
190
  print(f"{total_size_gb=} GB")
191
  print(f"{safetensor_files=}")
192
- print("\n")
193
- # total_size_gb, safetensor_files = _determine_memory_from_local_ckpt("LOCAL_DIR") # change me.
194
- # print(f"{total_size_gb=} GB")
195
- # print(f"{safetensor_files=}")
 
3
  import functools
4
  import os
5
  import safetensors.torch
6
+ from huggingface_hub import model_info
7
  import tempfile
8
  import torch
9
  import functools
 
189
  safetensor_files = output["components"]
190
  print(f"{total_size_gb=} GB")
191
  print(f"{safetensor_files=}")
192
+ print("\n")
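For context on `determine_pipe_loading_memory` used by app.py, here is a hedged, illustrative sketch of how loading memory can be estimated from Hub metadata. This is an assumption about the general approach, not the repo's actual implementation, and `estimate_loading_memory_gb` is a hypothetical helper name.

```py
from huggingface_hub import model_info

def estimate_loading_memory_gb(repo_id: str, use_fp32: bool = False) -> float:
    # Sum the sizes of all safetensors shards listed in the repo metadata.
    info = model_info(repo_id, files_metadata=True)
    total_bytes = sum(
        sibling.size or 0
        for sibling in info.siblings
        if sibling.rfilename.endswith(".safetensors")
    )
    # Most Diffusers checkpoints ship in 16-bit; roughly double the estimate for FP32.
    if use_fp32:
        total_bytes *= 2
    return total_bytes / 1024**3

print(f"{estimate_loading_memory_gb('black-forest-labs/FLUX.1-dev'):.2f} GB")
```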