Gianpaolo Macario committed
Commit: b63b631
Parent(s): ae24614
feat: add stablediffusion.py

Based on sample code at modal.com
- app.py +1 -1
- flux.py +230 -0
- pyproject.toml +4 -0
- stablediffusion.py +72 -0
- uv.lock +0 -0
app.py
CHANGED
@@ -95,4 +95,4 @@ with gr.Blocks() as demo:
         outputs=calc_output,
         api_name="calculate")
 
-demo.launch(mcp_server=True)
+demo.launch(mcp_server=True, share=True)
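The one-line app.py change adds Gradio's public share tunnel alongside the MCP server. A minimal sketch of the overall pattern (hypothetical tool function; assumes `gradio[mcp]` as pinned in pyproject.toml — the real app.py around the hunk above may differ):

import gradio as gr

def calculate(a: float, b: float) -> float:
    """Add two numbers."""  # with mcp_server=True, the docstring describes the MCP tool
    return a + b

with gr.Blocks() as demo:
    a = gr.Number(label="a")
    b = gr.Number(label="b")
    calc_output = gr.Number(label="result")
    gr.Button("Calculate").click(calculate, inputs=[a, b],
                                 outputs=calc_output, api_name="calculate")

# share=True tunnels the app through a public *.gradio.live URL,
# so the MCP endpoint is reachable from outside the local machine.
demo.launch(mcp_server=True, share=True)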
flux.py
ADDED
@@ -0,0 +1,230 @@
# Run Flux fast on H100s with torch.compile
#
# See https://modal.com/docs/examples/flux

# Setting up the image and dependencies

import time
from io import BytesIO
from pathlib import Path
import modal

# We'll make use of the full CUDA toolkit in this example, so we'll build our container image
# off of the nvidia/cuda base.

cuda_version = "12.4.0"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

cuda_dev_image = modal.Image.from_registry(
    f"nvidia/cuda:{tag}", add_python="3.11"
).entrypoint([])

# Now we install most of our dependencies with apt and pip.
# For Hugging Face's [Diffusers](https://github.com/huggingface/diffusers) library
# we install from GitHub source and so pin to a specific commit.
#
# PyTorch added faster attention kernels for Hopper GPUs in version 2.5,
# so we pin to that version to ensure we get the best performance on H100s.

diffusers_commit_sha = "81cf3b2f155f1de322079af28f625349ee21ec6b"

flux_image = (
    cuda_dev_image.apt_install(
        "git",
        "libglib2.0-0",
        "libsm6",
        "libxrender1",
        "libxext6",
        "ffmpeg",
        "libgl1",
    )
    .pip_install(
        "invisible_watermark==0.2.0",
        "transformers==4.44.0",
        "huggingface_hub[hf_transfer]==0.26.2",
        "accelerate==0.33.0",
        "safetensors==0.4.4",
        "sentencepiece==0.2.0",
        "torch==2.5.0",
        f"git+https://github.com/huggingface/diffusers.git@{diffusers_commit_sha}",
        "numpy<2",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": "/cache"})
)

# Later, we'll also use torch.compile to increase the speed further.
# Torch compilation needs to be re-executed when each new container starts,
# so we turn on some extra caching to reduce compile times for later containers.

flux_image = flux_image.env(
    {
        "TORCHINDUCTOR_CACHE_DIR": "/root/.inductor-cache",
        "TORCHINDUCTOR_FX_GRAPH_CACHE": "1",
    }
)

# Finally, we construct our Modal App, set its default image to the one we just constructed,
# and import FluxPipeline for downloading and running Flux.1.

app = modal.App(
    "example-flux",
    image=flux_image,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)

# @app.function(
#     image=modal.Image.debian_slim().pip_install("torch", "diffusers[torch]", "transformers", "ftfy"),
#     gpu="any",
# )

with flux_image.imports():
    import torch
    from diffusers.pipelines.flux.pipeline_flux import FluxPipeline

# Defining a parameterized Model inference class
#
# Next, we map the model's setup and inference code onto Modal.
#
# 1. We run the model setup in the method decorated with @modal.enter().
#    This includes loading the weights and moving them to the GPU,
#    along with an optional torch.compile step (see details below).
#    The @modal.enter() decorator ensures that this method runs only once,
#    when a new container starts, instead of in the path of every call.
#
# 2. We run the actual inference in methods decorated with @modal.method().

MINUTES = 60  # seconds
VARIANT = "schnell"  # or "dev", but note [dev] requires you to accept terms and conditions on HF
NUM_INFERENCE_STEPS = 4  # use ~50 for [dev], smaller for [schnell]


@app.cls(
    gpu="H100",  # fastest GPU on Modal
    scaledown_window=20 * MINUTES,
    timeout=60 * MINUTES,  # leave plenty of time for compilation
    volumes={  # add Volumes to store serializable compilation artifacts, see section on torch.compile below
        "/cache": modal.Volume.from_name("hf-hub-cache", create_if_missing=True),
        "/root/.nv": modal.Volume.from_name("nv-cache", create_if_missing=True),
        "/root/.triton": modal.Volume.from_name("triton-cache", create_if_missing=True),
        "/root/.inductor-cache": modal.Volume.from_name(
            "inductor-cache", create_if_missing=True
        ),
    },
)
class Model:
    compile: bool = (  # see section on torch.compile below for details
        modal.parameter(default=False)
    )

    @modal.enter()
    def enter(self):
        pipe = FluxPipeline.from_pretrained(
            f"black-forest-labs/FLUX.1-{VARIANT}", torch_dtype=torch.bfloat16
        ).to("cuda")  # move model to GPU
        self.pipe = optimize(pipe, compile=self.compile)

    @modal.method()
    def inference(self, prompt: str) -> bytes:
        print("🎨 generating image...")
        out = self.pipe(
            prompt,
            output_type="pil",
            num_inference_steps=NUM_INFERENCE_STEPS,
        ).images[0]  # type: ignore

        byte_stream = BytesIO()
        out.save(byte_stream, format="JPEG")
        return byte_stream.getvalue()

# Calling our inference function
#
# To generate an image we just need to call the Model's inference method with .remote appended to it.
# You can call .inference.remote from any Python environment that has access to your Modal credentials.
# The local environment will get back the image as bytes.
#
# Here, we wrap the call in a Modal local_entrypoint so that it can be run with modal run:
#
# modal run flux.py

# By default, we call inference twice to demonstrate how much faster the inference is after cold start.
# In our tests, clients received images in about 1.2 seconds. We save the output bytes to a temporary file.

@app.local_entrypoint()
def main(
    prompt: str = "a computer screen showing ASCII terminal art of the"
    " word 'Modal' in neon green. two programmers are pointing excitedly"
    " at the screen.",
    twice: bool = True,
    compile: bool = False,
):
    t0 = time.time()
    image_bytes = Model(compile=compile).inference.remote(prompt)
    print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")

    if twice:
        t0 = time.time()
        image_bytes = Model(compile=compile).inference.remote(prompt)
        print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")

    output_path = Path("/tmp") / "flux" / "output.jpg"
    output_path.parent.mkdir(exist_ok=True, parents=True)
    print(f"🎨 saving output to {output_path}")
    output_path.write_bytes(image_bytes)

# Speeding up Flux with torch.compile

def optimize(pipe, compile=True):
    # fuse QKV projections in Transformer and VAE
    pipe.transformer.fuse_qkv_projections()
    pipe.vae.fuse_qkv_projections()

    # switch memory layout to Torch's preferred, channels_last
    pipe.transformer.to(memory_format=torch.channels_last)
    pipe.vae.to(memory_format=torch.channels_last)

    if not compile:
        return pipe

    # set torch compile flags
    config = torch._inductor.config  # type: ignore
    config.disable_progress = False  # show progress bar
    config.conv_1x1_as_mm = True  # treat 1x1 convolutions as matrix muls
    # adjust autotuning algorithm
    config.coordinate_descent_tuning = True
    config.coordinate_descent_check_all_directions = True
    config.epilogue_fusion = False  # do not fuse pointwise ops into matmuls

    # tag the compute-intensive modules, the Transformer and VAE decoder, for compilation
    pipe.transformer = torch.compile(
        pipe.transformer, mode="max-autotune", fullgraph=True
    )
    pipe.vae.decode = torch.compile(
        pipe.vae.decode, mode="max-autotune", fullgraph=True
    )

    # trigger torch compilation
    print("🔦 running torch compilation (may take up to 20 minutes)...")

    pipe(
        "dummy prompt to trigger torch compilation",
        output_type="pil",
        num_inference_steps=NUM_INFERENCE_STEPS,  # use ~50 for [dev], smaller for [schnell]
    ).images[0]

    print("🔦 finished torch compilation")

    return pipe

# To run this script, use the command:
# modal run flux.py --prompt "a beautiful landscape with mountains and a river" --twice --compile

# This will generate an image based on the provided prompt, run it twice,
# and save the output to a file named `output.jpg` in the `/tmp/flux/` directory.
#
# Make sure to have the Modal CLI installed and configured with your API token.
# You can install it with:
# pip install modal

# EOF
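As the comments in flux.py note, inference.remote can be called from any Python environment that holds Modal credentials, not only through the local_entrypoint. A minimal sketch of that pattern (hypothetical file name; assumes the app has already been deployed with `modal deploy flux.py` so "example-flux" exists on Modal):

# call_flux.py -- look up the deployed class and invoke it remotely
from pathlib import Path

import modal

Model = modal.Cls.from_name("example-flux", "Model")  # resolve the deployed class

# instantiate with default parameters (compile=False) and run inference remotely
image_bytes = Model().inference.remote("a watercolor painting of a lighthouse")
Path("flux_from_anywhere.jpg").write_bytes(image_bytes)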
pyproject.toml
CHANGED
@@ -5,6 +5,10 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "diffusers>=0.33.1",
     "gradio[mcp]>=5.32.1",
+    "icecream>=2.1.4",
     "mcp>=1.9.0",
+    "modal>=1.0.2",
+    "torch>=2.7.1",
 ]
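The four added dependencies line up with the imports in the new files, and the uv.lock churn below suggests they were added through uv. A command along these lines (an assumption; editing pyproject.toml by hand and re-locking would produce the same diff) does it in one step:

uv add "diffusers>=0.33.1" "icecream>=2.1.4" "modal>=1.0.2" "torch>=2.7.1"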
stablediffusion.py
ADDED
@@ -0,0 +1,72 @@
# Run Stable Diffusion to generate images from text prompts.
# This script uses the Modal framework to run Stable Diffusion in a cloud environment.
# It requires the `modal` package and a Hugging Face token stored in a Modal secret.
# Make sure to set up the Modal environment and install the necessary dependencies.

# Usage: modal run stablediffusion.py

from icecream import ic
import io
import os

import modal

app = modal.App()

@app.function(
    image=modal.Image.debian_slim().pip_install(
        "icecream",
        "torch",
        "diffusers[torch]",
        "transformers",
        "ftfy",
    ),
    secrets=[modal.Secret.from_name("huggingface-secret")],
    gpu="any",
)
def run_stable_diffusion(prompt: str):
    from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        use_auth_token=os.environ["HF_TOKEN"],
    ).to("cuda")

    image = pipe(prompt, num_inference_steps=10).images[0]  # type: ignore

    buf = io.BytesIO()
    image.save(buf, format="PNG")
    img_bytes = buf.getvalue()

    return img_bytes


@app.local_entrypoint()
def main():
    prompt = "Wu-Tang Clan climbing Mount Everest"
    # prompt = "A robot dog walking down a vineyard"  # Example prompt

    # out_path = "/tmp/output.png"
    out_path = "stablediffusion_output.png"

    # ic(os.getcwd())
    # img_bytes = b"<image_bytes>"  # Placeholder for the actual image bytes

    print("DEBUG: Starting Stable Diffusion with prompt:", prompt)
    img_bytes = run_stable_diffusion.remote(prompt=prompt)
    print("DEBUG: Writing img_bytes length:", len(img_bytes))
    with open(out_path, "wb") as f:
        f.write(img_bytes)
    print("DEBUG: Image saved to", out_path)


if __name__ == "__main__":
    # `modal run stablediffusion.py` is the supported entrypoint; running the file
    # directly needs an explicit app context for the .remote call to work.
    with modal.enable_output(), app.run():
        main()
    print("Image saved to stablediffusion_output.png")
    print("Run `modal deploy` to deploy this app.")
    print("Run `modal serve` to serve this app with live reloading during development.")
    print("Run `modal run` to run this app in the cloud.")
    print("Run `modal app logs` to view the logs of this app.")
    print("Run `modal shell` to open a shell in the cloud environment.")
    print("Run `modal run --help` to see all available options.")
    print("Run `modal deploy --help` to see all available options for deployment.")
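Since this commit sits next to the Gradio MCP app in app.py, a natural follow-up is exposing the generator as a tool there. A sketch of how the two pieces could connect (hypothetical; assumes the app is deployed under a name, e.g. `modal.App("stablediffusion")`, so the function can be looked up):

import gradio as gr
import modal

# look up the deployed function (assumes `modal deploy stablediffusion.py`
# with the app renamed to modal.App("stablediffusion"))
run_sd = modal.Function.from_name("stablediffusion", "run_stable_diffusion")

def generate_image(prompt: str) -> str:
    """Generate an image from a text prompt via Stable Diffusion on Modal."""
    out_path = "generated.png"
    with open(out_path, "wb") as f:
        f.write(run_sd.remote(prompt))  # returns PNG bytes from the GPU container
    return out_path

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="prompt")
    image = gr.Image(label="result", type="filepath")
    gr.Button("Generate").click(generate_image, inputs=prompt, outputs=image,
                                api_name="generate_image")

demo.launch(mcp_server=True, share=True)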
uv.lock
CHANGED
The diff for this file is too large to render; see the raw diff.