Feat: App files
- app.py +308 -0
- requirements.txt +16 -0
- utils.py +268 -0
app.py
ADDED
@@ -0,0 +1,308 @@
#!/usr/bin/env python3
"""
Gradio Application for Stable Diffusion
Author: Shilpaj Bhalerao
Date: Feb 26, 2025
"""

import os
import torch
import gradio as gr
from tqdm.auto import tqdm
import numpy as np
from PIL import Image
from utils import (
    load_models, clear_gpu_memory, set_timesteps, latents_to_pil,
    vignette_loss, get_concept_embedding, load_concept_library, image_grid
)

# Hugging Face Space configuration
# The @space decorator wraps the demo-building function and enables the
# request queue so the Space stays within its limited resources
def space(create_fn):
    def wrapper(*args, **kwargs):
        demo = create_fn(*args, **kwargs)
        demo.queue(max_size=10)  # one request at a time by default
        return demo
    return wrapper

# Set device
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
if device == "mps":
    os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = "1"

# Load models
vae, tokenizer, text_encoder, unet, scheduler, pipe = load_models(device)

# Load concept library
concept_embeds, concept_tokens = load_concept_library(pipe)

# Define art style concepts
art_concepts = {
    "sketch_painting": "a sketch painting, pencil drawing, hand-drawn illustration",
    "oil_painting": "an oil painting, textured canvas, painterly technique",
    "watercolor": "a watercolor painting, fluid, soft edges",
    "digital_art": "digital art, computer generated, precise details",
    "comic_book": "comic book style, ink outlines, cel shading"
}

def generate_latents(prompt, seed, num_inference_steps, guidance_scale,
                     vignette_loss_scale, concept_style=None, concept_strength=0.5,
                     height=512, width=512):
    """
    Generate latents using the UNet model

    Args:
        prompt (str): Text prompt
        seed (int): Random seed
        num_inference_steps (int): Number of denoising steps
        guidance_scale (float): Scale for classifier-free guidance
        vignette_loss_scale (float): Scale for vignette loss
        concept_style (str, optional): Style concept to use
        concept_strength (float): Strength of concept influence (0.0-1.0)
        height (int): Image height
        width (int): Image width

    Returns:
        torch.Tensor: Generated latents
    """
    # Set the seed
    generator = torch.manual_seed(seed)
    batch_size = 1

    # Clear GPU memory
    clear_gpu_memory()

    # Get concept embedding if specified
    concept_embedding = None
    if concept_style:
        if concept_style in concept_tokens:
            # Use pre-trained concept embedding
            concept_embedding = concept_embeds[concept_style].unsqueeze(0).to(device)
        elif concept_style in art_concepts:
            # Generate concept embedding from text description
            concept_text = art_concepts[concept_style]
            concept_embedding = get_concept_embedding(concept_text, tokenizer, text_encoder, device)

    # Prep text
    text_input = tokenizer([prompt], padding="max_length", max_length=tokenizer.model_max_length,
                           truncation=True, return_tensors="pt")
    with torch.no_grad():
        text_embeddings = text_encoder(text_input.input_ids.to(device))[0]

    # Apply concept embedding influence if provided
    if concept_embedding is not None and concept_strength > 0:
        # Fix the dimension mismatch by adding a batch dimension to concept_embedding if needed
        if len(concept_embedding.shape) == 2 and len(text_embeddings.shape) == 3:
            concept_embedding = concept_embedding.unsqueeze(0)

        # Create weighted blend between original text embedding and concept
        if text_embeddings.shape == concept_embedding.shape:
            text_embeddings = (1 - concept_strength) * text_embeddings + concept_strength * concept_embedding

    # Unconditional embedding for classifier-free guidance
    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer(
        [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
    )
    with torch.no_grad():
        uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

    # Prep Scheduler
    set_timesteps(scheduler, num_inference_steps)

    # Prep latents
    latents = torch.randn(
        (batch_size, unet.in_channels, height // 8, width // 8),
        generator=generator,
    )
    latents = latents.to(device)
    latents = latents * scheduler.init_noise_sigma

    # Loop through diffusion process
    for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
        # Expand latents for classifier-free guidance
        latent_model_input = torch.cat([latents] * 2)
        sigma = scheduler.sigmas[i]
        latent_model_input = scheduler.scale_model_input(latent_model_input, t)

        # Predict the noise residual
        with torch.no_grad():
            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]

        # Perform classifier-free guidance
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # Apply additional guidance with vignette loss
        if vignette_loss_scale > 0 and i % 5 == 0:
            # Requires grad on the latents
            latents = latents.detach().requires_grad_()

            # Get the predicted x0
            latents_x0 = latents - sigma * noise_pred

            # Decode to image space
            denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5  # range (0, 1)

            # Calculate loss
            loss = vignette_loss(denoised_images) * vignette_loss_scale

            # Get gradient
            cond_grad = torch.autograd.grad(loss, latents)[0]

            # Modify the latents based on this gradient
            latents = latents.detach() - cond_grad * sigma**2

        # Step with scheduler
        latents = scheduler.step(noise_pred, t, latents).prev_sample

    return latents

def generate_image(prompt, seed=42, num_inference_steps=30, guidance_scale=7.5,
                   vignette_loss_scale=0.0, concept_style="none", concept_strength=0.5,
                   height=512, width=512):
    """
    Generate an image using Stable Diffusion

    Args:
        prompt (str): Text prompt
        seed (int): Random seed
        num_inference_steps (int): Number of denoising steps
        guidance_scale (float): Scale for classifier-free guidance
        vignette_loss_scale (float): Scale for vignette loss
        concept_style (str): Style concept to use
        concept_strength (float): Strength of concept influence (0.0-1.0)
        height (int): Image height
        width (int): Image width

    Returns:
        PIL.Image: Generated image
    """
    # Handle "none" concept style
    if concept_style == "none":
        concept_style = None

    # Generate latents
    latents = generate_latents(
        prompt=prompt,
        seed=seed,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        vignette_loss_scale=vignette_loss_scale,
        concept_style=concept_style,
        concept_strength=concept_strength,
        height=height,
        width=width
    )

    # Convert latents to image
    images = latents_to_pil(latents, vae)

    return images[0]

def generate_style_grid(prompt, seed=42, num_inference_steps=30, guidance_scale=7.5,
                        vignette_loss_scale=0.0, concept_strength=0.5):
    """
    Generate a grid of images with different style concepts

    Args:
        prompt (str): Text prompt
        seed (int): Random seed
        num_inference_steps (int): Number of denoising steps
        guidance_scale (float): Scale for classifier-free guidance
        vignette_loss_scale (float): Scale for vignette loss
        concept_strength (float): Strength of concept influence (0.0-1.0)

    Returns:
        PIL.Image: Grid of generated images
    """
    # List of styles to use
    styles = list(art_concepts.keys())

    # Generate images for each style
    images = []
    labels = []

    for i, style in enumerate(styles):
        # Generate image with this style
        latents = generate_latents(
            prompt=prompt,
            seed=seed + i,  # Use different seeds for variety
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            vignette_loss_scale=vignette_loss_scale,
            concept_style=style,
            concept_strength=concept_strength
        )

        # Convert latents to image
        style_images = latents_to_pil(latents, vae)
        images.append(style_images[0])
        labels.append(style)

    # Create grid
    grid = image_grid(images, 1, len(styles), labels)

    return grid

# Define Gradio interface
@space
def create_demo():
    with gr.Blocks(title="Guided Stable Diffusion with Styles") as demo:
        gr.Markdown("# Guided Stable Diffusion with Styles")

        with gr.Tab("Single Image Generation"):
            with gr.Row():
                with gr.Column():
                    prompt = gr.Textbox(label="Prompt", placeholder="A cat sitting on a chair")
                    seed = gr.Slider(minimum=0, maximum=10000, step=1, label="Seed", value=42)
                    num_inference_steps = gr.Slider(minimum=10, maximum=100, step=1, label="Inference Steps", value=30)
                    guidance_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.1, label="Guidance Scale", value=7.5)
                    vignette_loss_scale = gr.Slider(minimum=0.0, maximum=100.0, step=1.0, label="Vignette Loss Scale", value=0.0)

                    # Combine SD concept library tokens and art concept descriptions
                    all_styles = ["none"] + concept_tokens + list(art_concepts.keys())
                    concept_style = gr.Dropdown(choices=all_styles, label="Style Concept", value="none")
                    concept_strength = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, label="Concept Strength", value=0.5)

                    generate_btn = gr.Button("Generate Image")

                with gr.Column():
                    output_image = gr.Image(label="Generated Image", type="pil")

        with gr.Tab("Style Grid"):
            with gr.Row():
                with gr.Column():
                    grid_prompt = gr.Textbox(label="Prompt", placeholder="A dog running in the park")
                    grid_seed = gr.Slider(minimum=0, maximum=10000, step=1, label="Base Seed", value=42)
                    grid_num_inference_steps = gr.Slider(minimum=10, maximum=100, step=1, label="Inference Steps", value=30)
                    grid_guidance_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.1, label="Guidance Scale", value=7.5)
                    grid_vignette_loss_scale = gr.Slider(minimum=0.0, maximum=100.0, step=1.0, label="Vignette Loss Scale", value=0.0)
                    grid_concept_strength = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, label="Concept Strength", value=0.5)

                    grid_generate_btn = gr.Button("Generate Style Grid")

                with gr.Column():
                    output_grid = gr.Image(label="Style Grid", type="pil")

        # Set up event handlers
        generate_btn.click(
            generate_image,
            inputs=[prompt, seed, num_inference_steps, guidance_scale,
                    vignette_loss_scale, concept_style, concept_strength],
            outputs=output_image
        )

        grid_generate_btn.click(
            generate_style_grid,
            inputs=[grid_prompt, grid_seed, grid_num_inference_steps,
                    grid_guidance_scale, grid_vignette_loss_scale, grid_concept_strength],
            outputs=output_grid
        )

    return demo

# Launch the app
if __name__ == "__main__":
    demo = create_demo()
    demo.launch()
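For reference, a minimal sketch (not part of this commit) of driving the generation function outside the Gradio UI. Importing app loads the models as a module-level side effect, so this assumes the checkpoints download successfully; the parameter values below are only illustrative.

from app import generate_image  # model loading happens on import

image = generate_image(
    prompt="A cat sitting on a chair",
    seed=42,
    num_inference_steps=30,
    guidance_scale=7.5,
    vignette_loss_scale=20.0,      # non-zero so the vignette guidance branch runs
    concept_style="oil_painting",  # any art_concepts key or a loaded concept token
    concept_strength=0.5,
)
image.save("sample.png")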
requirements.txt
ADDED
@@ -0,0 +1,16 @@
# Core dependencies
torch>=1.7.0
torchvision>=0.8.0
diffusers>=0.12.0
transformers>=4.25.1
accelerate>=0.16.0
ftfy>=6.1.1
gradio>=3.20.0
numpy>=1.22.0
Pillow>=9.0.0
tqdm>=4.64.0
huggingface-hub>=0.12.0

# Optional dependencies for better performance
scipy>=1.9.0
matplotlib>=3.5.0
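As a quick sanity check before running the Space locally, a small sketch (not part of the commit) that reports whether the core pinned packages are installed; the subset of packages checked here is only illustrative.

from importlib.metadata import version, PackageNotFoundError

minimums = {"torch": "1.7.0", "diffusers": "0.12.0", "transformers": "4.25.1", "gradio": "3.20.0"}
for package, minimum in minimums.items():
    try:
        print(f"{package}: installed {version(package)}, requires >= {minimum}")
    except PackageNotFoundError:
        print(f"{package}: missing, requires >= {minimum}")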
utils.py
ADDED
@@ -0,0 +1,268 @@
#!/usr/bin/env python3
"""
Utility functions for the application
Author: Shilpaj Bhalerao
Date: Feb 26, 2025
"""

import torch
import gc
from PIL import Image, ImageDraw, ImageFont
from diffusers import StableDiffusionPipeline
from transformers import CLIPTokenizer, CLIPTextModel
import os

def load_models(device="cuda"):
    """
    Load the necessary models for stable diffusion

    Args:
        device (str): Device to load models on ('cuda', 'mps', or 'cpu')

    Returns:
        tuple: (vae, tokenizer, text_encoder, unet, scheduler, pipe)
    """
    from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel

    # Set device
    if device == "cuda" and not torch.cuda.is_available():
        device = "mps" if torch.backends.mps.is_available() else "cpu"
    if device == "mps":
        os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = "1"

    print(f"Loading models on {device}...")

    # Load the autoencoder model which will be used to decode the latents into image space
    vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")

    # Load the tokenizer and text encoder to tokenize and encode the text
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

    # The UNet model for generating the latents
    unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")

    # The noise scheduler
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)

    # Load the full pipeline for concept loading
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    )

    # Move models to device
    vae = vae.to(device)
    text_encoder = text_encoder.to(device)
    unet = unet.to(device)
    pipe = pipe.to(device)

    return vae, tokenizer, text_encoder, unet, scheduler, pipe

def clear_gpu_memory():
    """Clear GPU memory cache"""
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.empty_cache()

def set_timesteps(scheduler, num_inference_steps):
    """Set timesteps for the scheduler with MPS compatibility fix"""
    scheduler.set_timesteps(num_inference_steps)
    scheduler.timesteps = scheduler.timesteps.to(torch.float32)  # minor fix to ensure MPS compatibility

def pil_to_latent(input_im, vae, device):
    """
    Convert the image to latents

    Args:
        input_im: Input PIL image
        vae: VAE model
        device: Device to run on

    Returns:
        Latents from VAE's encoder
    """
    from torchvision import transforms as tfms

    # Single image -> single latent in a batch (so size 1, 4, 64, 64)
    with torch.no_grad():
        latent = vae.encode(tfms.ToTensor()(input_im).unsqueeze(0).to(device)*2-1)  # Note scaling
    return 0.18215 * latent.latent_dist.sample()

def latents_to_pil(latents, vae):
    """
    Convert the latents to images

    Args:
        latents: Latent tensor
        vae: VAE model

    Returns:
        list: PIL images
    """
    # batch of latents -> list of images
    latents = (1 / 0.18215) * latents
    with torch.no_grad():
        image = vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    pil_images = [Image.fromarray(image) for image in images]
    return pil_images

def image_grid(imgs, rows, cols, labels=None):
    """
    Create a grid of images with optional labels.

    Args:
        imgs (list): List of PIL images to be arranged in a grid
        rows (int): Number of rows in the grid
        cols (int): Number of columns in the grid
        labels (list, optional): List of label strings for each image

    Returns:
        PIL.Image: A single image with all input images arranged in a grid and labeled
    """
    assert len(imgs) == rows*cols, f"Number of images ({len(imgs)}) must equal rows*cols ({rows*cols})"

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols*w, rows*h + 30 if labels else rows*h))

    # Add padding at the bottom for labels if they exist
    label_height = 30 if labels else 0

    # Paste images
    for i, img in enumerate(imgs):
        grid.paste(img, box=(i%cols*w, i//cols*h))

    # Add labels if provided
    if labels:
        assert len(labels) == len(imgs), "Number of labels must match number of images"
        draw = ImageDraw.Draw(grid)

        # Try to use a standard font, fall back to default if not available
        try:
            font = ImageFont.truetype("arial.ttf", 14)
        except IOError:
            font = ImageFont.load_default()

        for i, label in enumerate(labels):
            # Position text under the image
            x = (i % cols) * w + 10
            y = (i // cols + 1) * h - 5

            # Draw black text with white outline for visibility
            # White outline (draw text in each direction)
            for offset in [(1,1), (-1,-1), (1,-1), (-1,1)]:
                draw.text((x+offset[0], y+offset[1]), label, fill=(255,255,255), font=font)

            # Main text (black)
            draw.text((x, y), label, fill=(0,0,0), font=font)

    return grid

def vignette_loss(images, vignette_strength=3.0, color_shift=[1.0, 0.5, 0.0]):
    """
    Creates a strong vignette effect (dark corners) and color shift.

    Args:
        images: Batch of images from VAE decoder (range 0-1)
        vignette_strength: How strong the darkening effect is (higher = more dramatic)
        color_shift: RGB color to shift the center toward [r, g, b]

    Returns:
        torch.Tensor: Loss value
    """
    batch_size, channels, height, width = images.shape

    # Create coordinate grid centered at 0 with range [-1, 1]
    y = torch.linspace(-1, 1, height).view(-1, 1).repeat(1, width).to(images.device)
    x = torch.linspace(-1, 1, width).view(1, -1).repeat(height, 1).to(images.device)

    # Calculate radius from center (normalized [0,1])
    radius = torch.sqrt(x.pow(2) + y.pow(2)) / 1.414

    # Vignette mask: dark at edges, bright in center
    vignette = torch.exp(-vignette_strength * radius)

    # Color shift target: shift center toward specified color
    color_tensor = torch.tensor(color_shift, dtype=torch.float32).view(1, 3, 1, 1).to(images.device)
    center_mask = 1.0 - radius.unsqueeze(0).unsqueeze(0)
    center_mask = torch.pow(center_mask, 2.0)  # Make the transition more dramatic

    # Target image with vignette and color shift
    target = images.clone()

    # Apply vignette (multiply all channels by vignette mask)
    for c in range(channels):
        target[:, c] = target[:, c] * vignette

    # Apply color shift in center
    for c in range(channels):
        # Shift toward target color more in center, less at edges
        color_offset = (color_tensor[:, c] - images[:, c]) * center_mask
        target[:, c] = target[:, c] + color_offset.squeeze(1)

    # Calculate loss - how different current image is from our target
    return torch.pow(images - target, 2).mean()

def get_concept_embedding(concept_text, tokenizer, text_encoder, device):
    """
    Generate CLIP embedding for a concept described in text

    Args:
        concept_text (str): Text description of the concept (e.g., "sketch painting")
        tokenizer: CLIP tokenizer
        text_encoder: CLIP text encoder
        device: Device to run on

    Returns:
        torch.Tensor: CLIP embedding for the concept
    """
    # Tokenize the concept text
    concept_tokens = tokenizer(
        concept_text,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt"
    ).input_ids.to(device)

    # Generate the embedding using the text encoder
    with torch.no_grad():
        concept_embedding = text_encoder(concept_tokens)[0]

    return concept_embedding

def load_concept_library(pipe):
    """
    Load textual inversion concepts from the SD concept library

    Args:
        pipe: StableDiffusionPipeline

    Returns:
        tuple: (dict mapping concept tokens to embeddings, list of concept tokens)
    """
    # Load textual inversion embeddings
    pipe.load_textual_inversion("sd-concepts-library/dreams")
    pipe.load_textual_inversion("sd-concepts-library/midjourney-style")
    pipe.load_textual_inversion("sd-concepts-library/moebius")
    pipe.load_textual_inversion("sd-concepts-library/style-of-marc-allante")
    pipe.load_textual_inversion("sd-concepts-library/wlop-style")

    # Extract the embeddings from the pipeline
    tokens = ['<meeg>', '<midjourney-style>', '<moebius>', '<Marc_Allante>', '<wlop-style>']
    token_ids = pipe.tokenizer.convert_tokens_to_ids(tokens)
    embeddings = pipe.text_encoder.get_input_embeddings().weight[token_ids].detach().cpu()

    # Create a dictionary with the embeddings
    learned_embeds = {}
    for i, token in enumerate(tokens):
        learned_embeds[token] = embeddings[i]

    # Save the embeddings for future use
    torch.save(learned_embeds, "learned_embeds.bin")
    print(f"Saved embeddings for tokens: {', '.join(tokens)}")

    return learned_embeds, tokens
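The gradient-guidance step in generate_latents relies on vignette_loss being differentiable with respect to the decoded images. A minimal sketch (not part of the commit) that checks this on a random batch, assuming utils.py is importable from the working directory:

import torch
from utils import vignette_loss

# Fake "decoded" batch in the 0-1 range that the loss expects
images = torch.rand(1, 3, 64, 64, requires_grad=True)

loss = vignette_loss(images, vignette_strength=3.0)
loss.backward()

# A finite loss and a non-zero gradient confirm the guidance signal flows back
print(f"loss={loss.item():.4f}, mean |grad|={images.grad.abs().mean().item():.6f}")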