Gianpaolo Macario committed
Commit: b63b631
Parent(s): ae24614
feat: add stablediffusion.py

Based on sample code at modal.com
- app.py +1 -1
- flux.py +230 -0
- pyproject.toml +4 -0
- stablediffusion.py +72 -0
- uv.lock +0 -0
app.py
CHANGED
@@ -95,4 +95,4 @@ with gr.Blocks() as demo:
         outputs=calc_output,
         api_name="calculate")
 
-demo.launch(mcp_server=True)
+demo.launch(mcp_server=True, share=True)
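The one-line app.py change adds Gradio's public share tunnel alongside the MCP server. A minimal sketch of the overall pattern (hypothetical tool function; assumes `gradio[mcp]` as pinned in pyproject.toml — the real app.py around the hunk above may differ):

import gradio as gr

def calculate(a: float, b: float) -> float:
    """Add two numbers."""  # with mcp_server=True, the docstring describes the MCP tool
    return a + b

with gr.Blocks() as demo:
    a = gr.Number(label="a")
    b = gr.Number(label="b")
    calc_output = gr.Number(label="result")
    gr.Button("Calculate").click(calculate, inputs=[a, b],
                                 outputs=calc_output, api_name="calculate")

# share=True tunnels the app through a public *.gradio.live URL,
# so the MCP endpoint is reachable from outside the local machine.
demo.launch(mcp_server=True, share=True)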
flux.py
ADDED
@@ -0,0 +1,230 @@
# Run Flux fast on H100s with torch.compile
#
# See https://modal.com/docs/examples/flux

# Setting up the image and dependencies

import time
from io import BytesIO
from pathlib import Path
import modal

# We'll make use of the full CUDA toolkit in this example, so we'll build our container image
# off of the nvidia/cuda base.

cuda_version = "12.4.0"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

cuda_dev_image = modal.Image.from_registry(
    f"nvidia/cuda:{tag}", add_python="3.11"
).entrypoint([])

# Now we install most of our dependencies with apt and pip.
# For Hugging Face's [Diffusers](https://github.com/huggingface/diffusers) library
# we install from GitHub source and so pin to a specific commit.
#
# PyTorch added faster attention kernels for Hopper GPUs in version 2.5,
# so we pin to that version to ensure we get the best performance on H100s.

diffusers_commit_sha = "81cf3b2f155f1de322079af28f625349ee21ec6b"

flux_image = (
    cuda_dev_image.apt_install(
        "git",
        "libglib2.0-0",
        "libsm6",
        "libxrender1",
        "libxext6",
        "ffmpeg",
        "libgl1",
    )
    .pip_install(
        "invisible_watermark==0.2.0",
        "transformers==4.44.0",
        "huggingface_hub[hf_transfer]==0.26.2",
        "accelerate==0.33.0",
        "safetensors==0.4.4",
        "sentencepiece==0.2.0",
        "torch==2.5.0",
        f"git+https://github.com/huggingface/diffusers.git@{diffusers_commit_sha}",
        "numpy<2",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": "/cache"})
)

# Later, we'll also use torch.compile to increase the speed further.
# Torch compilation needs to be re-executed when each new container starts,
# so we turn on some extra caching to reduce compile times for later containers.

flux_image = flux_image.env(
    {
        "TORCHINDUCTOR_CACHE_DIR": "/root/.inductor-cache",
        "TORCHINDUCTOR_FX_GRAPH_CACHE": "1",
    }
)

# Finally, we construct our Modal App, set its default image to the one we just constructed,
# and import FluxPipeline for downloading and running Flux.1.

app = modal.App(
    "example-flux",
    image=flux_image,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)

# @app.function(
#     image=modal.Image.debian_slim().pip_install("torch", "diffusers[torch]", "transformers", "ftfy"),
#     gpu="any",
# )

with flux_image.imports():
    import torch
    from diffusers.pipelines.flux.pipeline_flux import FluxPipeline

# Defining a parameterized Model inference class
#
# Next, we map the model's setup and inference code onto Modal.
#
# 1. We run the model setup in the method decorated with @modal.enter().
#    This includes loading the weights and moving them to the GPU,
#    along with an optional torch.compile step (see details below).
#    The @modal.enter() decorator ensures that this method runs only once,
#    when a new container starts, instead of in the path of every call.
#
# 2. We run the actual inference in methods decorated with @modal.method().

MINUTES = 60  # seconds
VARIANT = "schnell"  # or "dev", but note [dev] requires you to accept terms and conditions on HF
NUM_INFERENCE_STEPS = 4  # use ~50 for [dev], smaller for [schnell]


@app.cls(
    gpu="H100",  # fastest GPU on Modal
    scaledown_window=20 * MINUTES,
    timeout=60 * MINUTES,  # leave plenty of time for compilation
    volumes={  # add Volumes to store serializable compilation artifacts, see section on torch.compile below
        "/cache": modal.Volume.from_name("hf-hub-cache", create_if_missing=True),
        "/root/.nv": modal.Volume.from_name("nv-cache", create_if_missing=True),
        "/root/.triton": modal.Volume.from_name("triton-cache", create_if_missing=True),
        "/root/.inductor-cache": modal.Volume.from_name(
            "inductor-cache", create_if_missing=True
        ),
    },
)
class Model:
    compile: bool = (  # see section on torch.compile below for details
        modal.parameter(default=False)
    )

    @modal.enter()
    def enter(self):
        pipe = FluxPipeline.from_pretrained(
            f"black-forest-labs/FLUX.1-{VARIANT}", torch_dtype=torch.bfloat16
        ).to("cuda")  # move model to GPU
        self.pipe = optimize(pipe, compile=self.compile)

    @modal.method()
    def inference(self, prompt: str) -> bytes:
        print("🎨 generating image...")
        out = self.pipe(
            prompt,
            output_type="pil",
            num_inference_steps=NUM_INFERENCE_STEPS,
        ).images[0]  # type: ignore

        byte_stream = BytesIO()
        out.save(byte_stream, format="JPEG")
        return byte_stream.getvalue()

# Calling our inference function
#
# To generate an image we just need to call the Model's inference method with .remote appended to it.
# You can call .inference.remote from any Python environment that has access to your Modal credentials.
# The local environment will get back the image as bytes.
#
# Here, we wrap the call in a Modal local_entrypoint so that it can be run with modal run:
#
# modal run flux.py

# By default, we call inference twice to demonstrate how much faster the inference is after cold start.
# In our tests, clients received images in about 1.2 seconds. We save the output bytes to a temporary file.

@app.local_entrypoint()
def main(
    prompt: str = "a computer screen showing ASCII terminal art of the"
    " word 'Modal' in neon green. two programmers are pointing excitedly"
    " at the screen.",
    twice: bool = True,
    compile: bool = False,
):
    t0 = time.time()
    image_bytes = Model(compile=compile).inference.remote(prompt)
    print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")

    if twice:
        t0 = time.time()
        image_bytes = Model(compile=compile).inference.remote(prompt)
        print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")

    output_path = Path("/tmp") / "flux" / "output.jpg"
    output_path.parent.mkdir(exist_ok=True, parents=True)
    print(f"🎨 saving output to {output_path}")
    output_path.write_bytes(image_bytes)

# Speeding up Flux with torch.compile

def optimize(pipe, compile=True):
    # fuse QKV projections in Transformer and VAE
    pipe.transformer.fuse_qkv_projections()
    pipe.vae.fuse_qkv_projections()

    # switch memory layout to Torch's preferred, channels_last
    pipe.transformer.to(memory_format=torch.channels_last)
    pipe.vae.to(memory_format=torch.channels_last)

    if not compile:
        return pipe

    # set torch compile flags
    config = torch._inductor.config  # type: ignore
    config.disable_progress = False  # show progress bar
    config.conv_1x1_as_mm = True  # treat 1x1 convolutions as matrix muls
    # adjust autotuning algorithm
    config.coordinate_descent_tuning = True
    config.coordinate_descent_check_all_directions = True
    config.epilogue_fusion = False  # do not fuse pointwise ops into matmuls

    # tag the compute-intensive modules, the Transformer and VAE decoder, for compilation
    pipe.transformer = torch.compile(
        pipe.transformer, mode="max-autotune", fullgraph=True
    )
    pipe.vae.decode = torch.compile(
        pipe.vae.decode, mode="max-autotune", fullgraph=True
    )

    # trigger torch compilation
    print("🔦 running torch compilation (may take up to 20 minutes)...")

    pipe(
        "dummy prompt to trigger torch compilation",
        output_type="pil",
        num_inference_steps=NUM_INFERENCE_STEPS,  # use ~50 for [dev], smaller for [schnell]
    ).images[0]

    print("🔦 finished torch compilation")

    return pipe

# To run this script, use the command:
# modal run flux.py --prompt "a beautiful landscape with mountains and a river" --twice --compile

# This will generate an image based on the provided prompt, run it twice,
# and save the output to a file named `output.jpg` in the `/tmp/flux/` directory.
#
# Make sure to have the Modal CLI installed and configured with your API token.
# You can install it with:
# pip install modal

# EOF
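As the comments in flux.py note, inference.remote can be called from any Python environment that holds Modal credentials, not only through the local_entrypoint. A minimal sketch of that pattern (hypothetical file name; assumes the app has already been deployed with `modal deploy flux.py` so "example-flux" exists on Modal):

# call_flux.py -- look up the deployed class and invoke it remotely
from pathlib import Path

import modal

Model = modal.Cls.from_name("example-flux", "Model")  # resolve the deployed class

# instantiate with default parameters (compile=False) and run inference remotely
image_bytes = Model().inference.remote("a watercolor painting of a lighthouse")
Path("flux_from_anywhere.jpg").write_bytes(image_bytes)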
pyproject.toml
CHANGED
@@ -5,6 +5,10 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "diffusers>=0.33.1",
     "gradio[mcp]>=5.32.1",
+    "icecream>=2.1.4",
     "mcp>=1.9.0",
+    "modal>=1.0.2",
+    "torch>=2.7.1",
 ]
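The four added dependencies line up with the imports in the new files, and the uv.lock churn below suggests they were added through uv. A command along these lines (an assumption; editing pyproject.toml by hand and re-locking would produce the same diff) does it in one step:

uv add "diffusers>=0.33.1" "icecream>=2.1.4" "modal>=1.0.2" "torch>=2.7.1"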
stablediffusion.py
ADDED
@@ -0,0 +1,72 @@
# Run Stable Diffusion to generate images from text prompts.
# This script uses the Modal framework to run Stable Diffusion in a cloud environment.
# It requires the `modal` package and a Hugging Face token stored in a Modal secret.
# Make sure to set up the Modal environment and install the necessary dependencies.

# Usage: modal run stablediffusion.py

from icecream import ic
import io
import os

import modal

app = modal.App()

@app.function(
    image=modal.Image.debian_slim().pip_install(
        "icecream",
        "torch",
        "diffusers[torch]",
        "transformers",
        "ftfy",
    ),
    secrets=[modal.Secret.from_name("huggingface-secret")],
    gpu="any",
)
def run_stable_diffusion(prompt: str):
    from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        use_auth_token=os.environ["HF_TOKEN"],
    ).to("cuda")

    image = pipe(prompt, num_inference_steps=10).images[0]  # type: ignore

    buf = io.BytesIO()
    image.save(buf, format="PNG")
    img_bytes = buf.getvalue()

    return img_bytes


@app.local_entrypoint()
def main():
    prompt = "Wu-Tang Clan climbing Mount Everest"
    # prompt = "A robot dog walking down a vineyard"  # Example prompt

    # out_path = "/tmp/output.png"
    out_path = "stablediffusion_output.png"

    # ic(os.getcwd())
    # img_bytes = b"<image_bytes>"  # Placeholder for the actual image bytes

    print("DEBUG: Starting Stable Diffusion with prompt:", prompt)
    img_bytes = run_stable_diffusion.remote(prompt=prompt)
    print("DEBUG: Writing img_bytes length:", len(img_bytes))
    with open(out_path, "wb") as f:
        f.write(img_bytes)
    print("DEBUG: Image saved to", out_path)


if __name__ == "__main__":
    # `modal run stablediffusion.py` is the supported entrypoint; running the file
    # directly needs an explicit app context for the .remote call to work.
    with modal.enable_output(), app.run():
        main()
    print("Image saved to stablediffusion_output.png")
    print("Run `modal deploy` to deploy this app.")
    print("Run `modal serve` to serve this app with live reloading during development.")
    print("Run `modal run` to run this app in the cloud.")
    print("Run `modal app logs` to view the logs of this app.")
    print("Run `modal shell` to open a shell in the cloud environment.")
    print("Run `modal run --help` to see all available options.")
    print("Run `modal deploy --help` to see all available options for deployment.")
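Since this commit sits next to the Gradio MCP app in app.py, a natural follow-up is exposing the generator as a tool there. A sketch of how the two pieces could connect (hypothetical; assumes the app is deployed under a name, e.g. `modal.App("stablediffusion")`, so the function can be looked up):

import gradio as gr
import modal

# look up the deployed function (assumes `modal deploy stablediffusion.py`
# with the app renamed to modal.App("stablediffusion"))
run_sd = modal.Function.from_name("stablediffusion", "run_stable_diffusion")

def generate_image(prompt: str) -> str:
    """Generate an image from a text prompt via Stable Diffusion on Modal."""
    out_path = "generated.png"
    with open(out_path, "wb") as f:
        f.write(run_sd.remote(prompt))  # returns PNG bytes from the GPU container
    return out_path

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="prompt")
    image = gr.Image(label="result", type="filepath")
    gr.Button("Generate").click(generate_image, inputs=prompt, outputs=image,
                                api_name="generate_image")

demo.launch(mcp_server=True, share=True)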
uv.lock
CHANGED
The diff for this file is too large to render; see the raw diff.