sayakpaul (HF Staff) committed · verified
Commit 4e7f94b · 1 Parent(s): d891944

Sync from GitHub

Files changed (4)
  1. README.md +2 -3
  2. app.py +19 -8
  3. prompts.py +51 -28
  4. utils/pipeline_utils.py +2 -5
README.md CHANGED
@@ -10,12 +10,11 @@ pinned: false
10
  short_description: 'Optimize Diffusers Code on your hardware.'
11
  ---
12
 
13
- Still a WIP. Use an LLM to generate reasonable code snippets in a hardware-aware manner for Diffusers.
14
 
15
  ### Motivation
16
 
17
- Within the Diffusers, we support a bunch of optimization techniques (refer [here](https://huggingface.co/docs/diffusers/main/en/optimization/memory), [here](https://huggingface.co/docs/diffusers/main/en/optimization/cache), and [here](https://huggingface.co/docs/diffusers/main/en/optimization/fp16)). However, it can be
18
- daunting for our users to determine when to use what. Hence, this repository tries to take a stab
19
  at using an LLM to generate reasonable code snippets for a given pipeline checkpoint that respect
20
  the user's hardware configuration.
21
 
 
10
  short_description: 'Optimize Diffusers Code on your hardware.'
11
  ---
12
 
13
+ Use an LLM to generate reasonable code snippets in a hardware-aware manner for Diffusers. Still experimental.
14
 
15
  ### Motivation
16
 
17
+ Within Diffusers, we support a number of optimization techniques (refer [here](https://huggingface.co/docs/diffusers/main/en/optimization/memory), [here](https://huggingface.co/docs/diffusers/main/en/optimization/cache), and [here](https://huggingface.co/docs/diffusers/main/en/optimization/fp16)). However, it can be daunting for our users to determine when to use what. Hence, this repository takes a stab
 
18
  at using an LLM to generate reasonable code snippets for a given pipeline checkpoint that respect
19
  the user's hardware configuration.
20
 
app.py CHANGED
@@ -11,11 +11,13 @@ def get_output_code(
11
  repo_id,
12
  gemini_model_to_use,
13
  disable_bf16,
14
- enable_lossy,
 
15
  system_ram,
16
  gpu_vram,
17
  torch_compile_friendly,
18
  fp8_friendly,
 
19
  ):
20
  loading_mem_out = determine_pipe_loading_memory(repo_id, None, disable_bf16)
21
  load_memory = loading_mem_out["total_loading_memory_gb"]
@@ -36,7 +38,8 @@ def get_output_code(
36
  pipeline_loading_memory=load_memory,
37
  available_system_ram=system_ram,
38
  available_gpu_vram=gpu_vram,
39
- enable_lossy_outputs=enable_lossy,
 
40
  is_fp8_supported=fp8_friendly,
41
  enable_torch_compile=torch_compile_friendly,
42
  )
@@ -79,16 +82,19 @@ with gr.Blocks() as demo:
79
  disable_bf16 = gr.Checkbox(
80
  label="Disable BF16 (Use FP32)",
81
  value=False,
82
- info="Calculate using 32-bit precision instead of 16-bit.",
83
  )
84
  enable_lossy = gr.Checkbox(
85
- label="Allow Lossy Quantization", value=False, info="Consider 8-bit/4-bit quantization."
86
  )
87
  torch_compile_friendly = gr.Checkbox(
88
- label="torch.compile() friendly", value=False, info="Model is compatible with torch.compile."
89
  )
90
  fp8_friendly = gr.Checkbox(
91
- label="fp8 friendly", value=False, info="Model and hardware support FP8 precision."
92
  )
93
 
94
  with gr.Column(scale=1):
@@ -99,6 +105,7 @@ with gr.Blocks() as demo:
99
  repo_id,
100
  gemini_model_to_use,
101
  disable_bf16,
 
102
  enable_lossy,
103
  system_ram,
104
  gpu_vram,
@@ -114,6 +121,7 @@ with gr.Blocks() as demo:
114
  "gemini-2.5-pro",
115
  False,
116
  False,
 
117
  64,
118
  24,
119
  True,
@@ -124,6 +132,7 @@ with gr.Blocks() as demo:
124
  "gemini-2.5-flash",
125
  False,
126
  True,
 
127
  16,
128
  8,
129
  False,
@@ -134,6 +143,7 @@ with gr.Blocks() as demo:
134
  "gemini-2.5-pro",
135
  False,
136
  False,
 
137
  32,
138
  16,
139
  True,
@@ -149,8 +159,9 @@ with gr.Blocks() as demo:
149
  gr.Markdown(
150
  """
151
  - Try changing the model from Flash to Pro if the results are bad.
152
- - Try to be as specific as possible about your local machine.
153
  - As a rule of thumb, GPUs from the RTX 4090 and later are generally good for using `torch.compile()`.
 
154
  - To leverage FP8, the GPU needs to have a compute capability of at least 8.9.
155
  - Check out the following docs for optimization in Diffusers:
156
  * [Memory](https://huggingface.co/docs/diffusers/main/en/optimization/memory)
@@ -165,7 +176,7 @@ with gr.Blocks() as demo:
165
 
166
  gr.Markdown("---")
167
 
168
- with gr.Accordion("Generated Code (expand)", open=False):
169
  code_output = gr.Code(interactive=True, language="python")
170
 
171
  gr.Markdown(
 
11
  repo_id,
12
  gemini_model_to_use,
13
  disable_bf16,
14
+ enable_caching,
15
+ enable_quantization,
16
  system_ram,
17
  gpu_vram,
18
  torch_compile_friendly,
19
  fp8_friendly,
20
+ progress=gr.Progress(track_tqdm=True)
21
  ):
22
  loading_mem_out = determine_pipe_loading_memory(repo_id, None, disable_bf16)
23
  load_memory = loading_mem_out["total_loading_memory_gb"]
 
38
  pipeline_loading_memory=load_memory,
39
  available_system_ram=system_ram,
40
  available_gpu_vram=gpu_vram,
41
+ enable_caching=enable_caching,
42
+ enable_quantization=enable_quantization,
43
  is_fp8_supported=fp8_friendly,
44
  enable_torch_compile=torch_compile_friendly,
45
  )
 
82
  disable_bf16 = gr.Checkbox(
83
  label="Disable BF16 (Use FP32)",
84
  value=False,
85
+ info="Compute in 32-bit precision (caution ⚠️)",
86
+ )
87
+ enable_caching = gr.Checkbox(
88
+ label="Enable lossy caching", value=False, info="Consider applying caching for speed"
89
  )
90
  enable_lossy = gr.Checkbox(
91
+ label="Allow Lossy Quantization", value=False, info="Consider 8-bit/4-bit quantization"
92
  )
93
  torch_compile_friendly = gr.Checkbox(
94
+ label="torch.compile() friendly", value=False, info="Model is compatible with torch.compile"
95
  )
96
  fp8_friendly = gr.Checkbox(
97
+ label="fp8 friendly", value=False, info="Model and hardware support FP8 precision"
98
  )
99
 
100
  with gr.Column(scale=1):
 
105
  repo_id,
106
  gemini_model_to_use,
107
  disable_bf16,
108
+ enable_caching,
109
  enable_lossy,
110
  system_ram,
111
  gpu_vram,
 
121
  "gemini-2.5-pro",
122
  False,
123
  False,
124
+ False,
125
  64,
126
  24,
127
  True,
 
132
  "gemini-2.5-flash",
133
  False,
134
  True,
135
+ False,
136
  16,
137
  8,
138
  False,
 
143
  "gemini-2.5-pro",
144
  False,
145
  False,
146
+ False,
147
  32,
148
  16,
149
  True,
 
159
  gr.Markdown(
160
  """
161
  - Try changing the model from Flash to Pro if the results are bad.
162
+ - Please provide accurate VRAM and RAM details, as the suggestions depend on them.
163
  - As a rule of thumb, GPUs from the RTX 4090 and later are generally good for using `torch.compile()`.
164
+ - When lossy quantization isn't preferred, try enabling caching. Note that caching can still be lossy.
165
  - To leverage FP8, the GPU needs to have a compute capability of at least 8.9.
166
  - Check out the following docs for optimization in Diffusers:
167
  * [Memory](https://huggingface.co/docs/diffusers/main/en/optimization/memory)
 
176
 
177
  gr.Markdown("---")
178
 
179
+ with gr.Accordion("Generated Code 💻", open=True):
180
  code_output = gr.Code(interactive=True, language="python")
181
 
182
  gr.Markdown(
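Side note on the `progress=gr.Progress(track_tqdm=True)` parameter added to the handler above: Gradio injects a `Progress` object into any event handler that declares one as a default-valued argument, and `track_tqdm=True` mirrors tqdm bars from downstream calls into the UI. Below is a minimal, hedged sketch of that pattern; the handler and component names are illustrative stand-ins, not the actual app.py code.

```py
import time
import gradio as gr

def generate_code_stub(repo_id, progress=gr.Progress(track_tqdm=True)):
    # Gradio passes a Progress instance automatically; tqdm bars raised by
    # downstream calls are surfaced in the UI because track_tqdm=True.
    for _ in progress.tqdm(range(3), desc="Generating"):
        time.sleep(0.1)
    return f"# generated code for {repo_id}"

with gr.Blocks() as demo:
    repo_id = gr.Textbox(label="repo_id")
    out = gr.Code(language="python")
    gr.Button("Run").click(generate_code_stub, inputs=[repo_id], outputs=[out])

if __name__ == "__main__":
    demo.launch()
```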
prompts.py CHANGED
@@ -15,15 +15,15 @@ image = pipe("photo of a dog sitting beside a river").images[0]
15
  ```
16
 
17
  Your task will be to output reasonable inference code in Python from user-supplied information about their
18
- needs. More specifically, you will be provided with the following information (in no particular order):
19
 
20
  * `ckpt_id` of the diffusion pipeline
21
  * Loading memory of a diffusion pipeline in GB
22
  * Available system RAM in GB
23
  * Available GPU VRAM in GB
24
- * If the user can afford to have lossy outputs (the likes of quantization)
25
- * If FP8 is supported
26
- * If the available GPU supports the latest `torch.compile()` knobs
27
 
28
  There are three categories of system RAM, broadly:
29
 
@@ -59,22 +59,12 @@ onload_device = torch.device("cuda")
59
  pipe = DiffusionPipeline.from_pretrained(CKPT_ID, torch_dtype=torch.bfloat16)
60
 
61
  offload_dir = "DIRECTORY" # change me
62
- for name, module in pipe.components.items():
63
- if hasattr(component, "_supports_group_offloading") and component._supports_group_offloading:
64
- module.enable_group_offload(
65
- onload_device=onload_device,
66
- offload_type="leaf_level",
67
- use_stream=True,
68
- offload_to_disk_path=f"{offload_dir}/{name}"
69
- )
70
- elif isinstance(component, (PreTrainedModel, torch.nn.Module)):
71
- apply_group_offloading(
72
- module,
73
- onload_device=onload_device,
74
- offload_type="leaf_level",
75
- use_stream=True,
76
- offload_to_disk_path=f"{offload_dir}/{name}"
77
- )
78
 
79
  # Inference goes here.
80
  ...
@@ -104,6 +94,32 @@ satisfied by the available VRAM:
104
  pipe = pipe.to("cuda")
105
  ```
106

107
  ## Guidance on using quantization
108
 
109
  If the user specifies to use quantization, then you should default to using bitsandbytes 4bit. The code here
@@ -143,28 +159,34 @@ pipe = DiffusionPipeline.from_pretrained(CKPT_ID, quantization_config=quant_conf
143
  ...
144
  ```
145

146
  ## Guidance on using `torch.compile()`
147
 
148
  If the user wants to additionally boost inference speed, then you should add the following just before
149
  inference:
150
 
151
- * Add the following when offloading was applied: `torch._dynamo.config.recompile_limit = 1000`.
152
  * ONLY, add the following when `bitsandbytes` was used for `quant_backend`: `torch._dynamo.config.capture_dynamic_output_shape_ops = True`.
153
- * Finally, add `pipe.transformer.compile()`.
154
  * Add `pipe.vae.decode = torch.compile(pipe.vae.decode)` as a comment.
155
 
156
  In case no offloading was applied, then the line should be:
157
 
158
  ```py
159
- pipe.transformer.compile(fullgraph=True)
160
  ```
161
 
162
  ## Other guidelines
163
 
164
- * When the available VRAM > pipeline loading memory, you should suggest using `pipe = pipe.to("cuda")`.
165
- * If the user prefers not to use quantization and further reduce memory, then suggest using:
166
- `pipe.transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)`. Note
167
- that this is different from using FP8. In FP8, we use quantization like shown above.
168
  * Do NOT add any extra imports or lines of code that will not be used.
169
  * Do NOT try to be too creative about combining the optimization techniques laid out above.
170
  * Do NOT add extra arguments to the `pipe` call other than the `prompt`.
@@ -184,7 +206,8 @@ ckpt_id: {ckpt_id}
184
  pipeline_loading_memory_GB: {pipeline_loading_memory}
185
  available_system_ram_GB: {available_system_ram}
186
  available_gpu_vram_GB: {available_gpu_vram}
187
- enable_lossy_outputs: {enable_lossy_outputs}
 
188
  is_fp8_supported: {is_fp8_supported}
189
  enable_torch_compile: {enable_torch_compile}
190
  """
 
15
  ```
16
 
17
  Your task will be to output reasonable inference code in Python from user-supplied information about their
18
+ needs. More specifically, you will be provided with the following user information (in no particular order):
19
 
20
  * `ckpt_id` of the diffusion pipeline
21
  * Loading memory of a diffusion pipeline in GB
22
  * Available system RAM in GB
23
  * Available GPU VRAM in GB
24
+ * If the user can afford to have lossy outputs (either quantization or caching)
25
+ * If FP8 precision is supported
26
+ * If the available GPU is compatible with `torch.compile`
27
 
28
  There are three categories of system RAM, broadly:
29
 
 
59
  pipe = DiffusionPipeline.from_pretrained(CKPT_ID, torch_dtype=torch.bfloat16)
60
 
61
  offload_dir = "DIRECTORY" # change me
62
+ pipe.enable_group_offload(
63
+ onload_device=onload_device,
64
+ offload_type="leaf_level",
65
+ use_stream=True,
66
+ offload_to_disk_path=offload_dir
67
+ )
68
 
69
  # Inference goes here.
70
  ...
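For reference, here is a self-contained sketch of the same group-offloading recipe written per component. This is a hedged illustration rather than the prompt's exact snippet: it assumes a recent diffusers release where `enable_group_offload` / `apply_group_offloading` accept `offload_to_disk_path`, and `black-forest-labs/FLUX.1-dev` is only a placeholder checkpoint.

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.hooks import apply_group_offloading

CKPT_ID = "black-forest-labs/FLUX.1-dev"  # placeholder; use the actual ckpt_id
onload_device = torch.device("cuda")

pipe = DiffusionPipeline.from_pretrained(CKPT_ID, torch_dtype=torch.bfloat16)

offload_dir = "offload_dir"  # change me
for name, module in pipe.components.items():
    if not isinstance(module, torch.nn.Module):
        continue  # skip tokenizers, schedulers, etc.
    if hasattr(module, "enable_group_offload"):
        # Diffusers models (ModelMixin) expose the method directly.
        module.enable_group_offload(
            onload_device=onload_device,
            offload_type="leaf_level",
            use_stream=True,
            offload_to_disk_path=f"{offload_dir}/{name}",
        )
    else:
        # Other torch modules (e.g. text encoders) go through the functional API.
        apply_group_offloading(
            module,
            onload_device=onload_device,
            offload_type="leaf_level",
            use_stream=True,
            offload_to_disk_path=f"{offload_dir}/{name}",
        )

image = pipe("photo of a dog sitting beside a river").images[0]
image.save("output.png")
```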
 
94
  pipe = pipe.to("cuda")
95
  ```
96
 
97
+ When the user can tolerate lossy outputs (through quantization or caching), prefer combining offloading
98
+ with quantization or caching, as detailed below.
99
+
100
+ ## Guidance on using caching
101
+
102
+ When caching is enabled, you can leverage `FasterCache` like so:
103
+
104
+ ```py
105
+ config = FasterCacheConfig(
106
+ spatial_attention_block_skip_range=2,
107
+ spatial_attention_timestep_skip_range=(-1, 681),
108
+ current_timestep_callback=lambda: pipe.current_timestep,
109
+ attention_weight_callback=lambda _: 0.3,
110
+ unconditional_batch_skip_range=5,
111
+ unconditional_batch_timestep_skip_range=(-1, 781),
112
+ tensor_format="BCHW", # for video models, this should be BFCHW
113
+ )
114
+ pipe.transformer.enable_cache(config)
115
+
116
+ # Rest of the code goes here.
117
+ ...
118
+ ```
119
+
120
+ Recommend that users also consult the documentation:
121
+ https://huggingface.co/docs/diffusers/main/en/optimization/cache
122
+
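To make the snippet above runnable end to end, here is a hedged, self-contained sketch. Assumptions: `FasterCacheConfig` is importable from `diffusers` in the installed release, and `black-forest-labs/FLUX.1-dev` stands in for the user's checkpoint.

```py
import torch
from diffusers import DiffusionPipeline, FasterCacheConfig

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16  # placeholder ckpt_id
).to("cuda")

config = FasterCacheConfig(
    spatial_attention_block_skip_range=2,
    spatial_attention_timestep_skip_range=(-1, 681),
    current_timestep_callback=lambda: pipe.current_timestep,
    attention_weight_callback=lambda _: 0.3,
    unconditional_batch_skip_range=5,
    unconditional_batch_timestep_skip_range=(-1, 781),
    tensor_format="BCHW",  # for video models this should be "BFCHW"
)
pipe.transformer.enable_cache(config)

image = pipe("photo of a dog sitting beside a river").images[0]
image.save("output.png")
```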
123
  ## Guidance on using quantization
124
 
125
  If the user specifies to use quantization, then you should default to using bitsandbytes 4bit. The code here
 
159
  ...
160
  ```
161
 
162
+ **Some additional notes**:
163
+
164
+ * Offloading can be combined with quantization. However, this is only supported with `bitsandbytes`.
165
+ * If the VRAM and RAM are very low, consider combining quantization with offloading, as sketched below.
166
+
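A hedged sketch of the combination mentioned in the notes above, i.e. bitsandbytes 4-bit quantization plus model offloading. Assumptions: `PipelineQuantizationConfig` is available in the installed diffusers release, and the checkpoint id is a placeholder.

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers import PipelineQuantizationConfig

quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    },
    components_to_quantize=["transformer"],
)

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # placeholder; use the actual ckpt_id
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
# Model-level CPU offloading can be stacked on top of bitsandbytes quantization,
# which helps when both VRAM and RAM are tight.
pipe.enable_model_cpu_offload()

image = pipe("photo of a dog sitting beside a river").images[0]
```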
167
  ## Guidance on using `torch.compile()`
168
 
169
  If the user wants to additionally boost inference speed, then you should add the following just before
170
  inference:
171
 
 
172
  * ONLY, add the following when `bitsandbytes` was used for `quant_backend`: `torch._dynamo.config.capture_dynamic_output_shape_ops = True`.
173
+ * Finally, add `pipe.transformer.compile_repeated_blocks()`.
174
  * Add `pipe.vae.decode = torch.compile(pipe.vae.decode)` as a comment.
175
 
176
  In case no offloading was applied, then the line should be:
177
 
178
  ```py
179
+ pipe.transformer.compile_repeated_blocks(fullgraph=True)
180
  ```
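Putting the compile guidance together, a short hedged sketch of the no-offloading case (assumptions: the installed diffusers exposes `compile_repeated_blocks` on the transformer, and the checkpoint id is a placeholder):

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16  # placeholder ckpt_id
).to("cuda")

# ONLY needed when bitsandbytes was used as the quantization backend:
# torch._dynamo.config.capture_dynamic_output_shape_ops = True

# No offloading here, so fullgraph compilation of the repeated blocks is fine.
pipe.transformer.compile_repeated_blocks(fullgraph=True)
# Optionally, as the guidance suggests, leave VAE compilation as a comment:
# pipe.vae.decode = torch.compile(pipe.vae.decode)

image = pipe("photo of a dog sitting beside a river").images[0]
```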
181
 
182
  ## Other guidelines
183
 
184
+ * For the line of code that actually calls the `pipe`, always recommend that users verify the call arguments.
185
+ * When the available VRAM is comfortably greater than the pipeline loading memory, you should suggest using `pipe = pipe.to("cuda")`. But in
186
+ cases where the VRAM is only a tiny bit greater, you should suggest offloading. For example, if the available VRAM
187
+ is 32 GB and the pipeline loading memory is 31.5 GB, it's better to use offloading.
188
+ * If the user prefers not to use quantization but still wants to reduce memory, then suggest using:
189
+ `pipe.transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)` (see the sketch after this list).
190
  * Do NOT add any extra imports or lines of code that will not be used.
191
  * Do NOT try to be too creative about combining the optimization techniques laid out above.
192
  * Do NOT add extra arguments to the `pipe` call other than the `prompt`.
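A tiny hedged sketch of the layerwise-casting option referenced in the list above: weights are stored in FP8 but computed in BF16, which is distinct from FP8 quantization. The checkpoint id is a placeholder.

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16  # placeholder ckpt_id
).to("cuda")

# Store transformer weights in float8_e4m3fn; upcast to bfloat16 for compute.
pipe.transformer.enable_layerwise_casting(
    storage_dtype=torch.float8_e4m3fn,
    compute_dtype=torch.bfloat16,
)

image = pipe("photo of a dog sitting beside a river").images[0]
```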
 
206
  pipeline_loading_memory_GB: {pipeline_loading_memory}
207
  available_system_ram_GB: {available_system_ram}
208
  available_gpu_vram_GB: {available_gpu_vram}
209
+ enable_caching: {enable_caching}
210
+ enable_quantization: {enable_quantization}
211
  is_fp8_supported: {is_fp8_supported}
212
  enable_torch_compile: {enable_torch_compile}
213
  """
utils/pipeline_utils.py CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path
3
  import functools
4
  import os
5
  import safetensors.torch
6
- from huggingface_hub import model_info, hf_hub_download
7
  import tempfile
8
  import torch
9
  import functools
@@ -189,7 +189,4 @@ if __name__ == "__main__":
189
  safetensor_files = output["components"]
190
  print(f"{total_size_gb=} GB")
191
  print(f"{safetensor_files=}")
192
- print("\n")
193
- # total_size_gb, safetensor_files = _determine_memory_from_local_ckpt("LOCAL_DIR") # change me.
194
- # print(f"{total_size_gb=} GB")
195
- # print(f"{safetensor_files=}")
 
3
  import functools
4
  import os
5
  import safetensors.torch
6
+ from huggingface_hub import model_info
7
  import tempfile
8
  import torch
9
  import functools
 
189
  safetensor_files = output["components"]
190
  print(f"{total_size_gb=} GB")
191
  print(f"{safetensor_files=}")
192
+ print("\n")
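For context on `determine_pipe_loading_memory` used by app.py, here is a hedged, illustrative sketch of how loading memory can be estimated from Hub metadata. This is an assumption about the general approach, not the repo's actual implementation, and `estimate_loading_memory_gb` is a hypothetical helper name.

```py
from huggingface_hub import model_info

def estimate_loading_memory_gb(repo_id: str, use_fp32: bool = False) -> float:
    # Sum the sizes of all safetensors shards listed in the repo metadata.
    info = model_info(repo_id, files_metadata=True)
    total_bytes = sum(
        sibling.size or 0
        for sibling in info.siblings
        if sibling.rfilename.endswith(".safetensors")
    )
    # Most Diffusers checkpoints ship in 16-bit; roughly double the estimate for FP32.
    if use_fp32:
        total_bytes *= 2
    return total_bytes / 1024**3

print(f"{estimate_loading_memory_gb('black-forest-labs/FLUX.1-dev'):.2f} GB")
```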