# Hugging Face Spaces: WonwoongCho / IT-Blender
# Running on Zero
# File size: 6,108 Bytes

import gradio as gr
import torch
import numpy as np
import spaces
from PIL import Image
from huggingface_hub import hf_hub_download

from diffusers import FluxPipeline
from src.attention_processor import FluxBlendedAttnProcessor2_0

from src.utils_sample import set_seed, resize_and_add_margin
import os


def _set_blend_scale(pipe, scale):
    """Set the blended-attention scale on every IT-Blender processor of *pipe*."""
    for proc in pipe.transformer.attn_processors.values():
        # assumes the processor keeps the constructor's ``ba_scale`` argument
        # as an attribute of the same name -- TODO confirm against
        # src/attention_processor.py. If it does not, the scale chosen at
        # construction time stays in effect.
        if isinstance(proc, FluxBlendedAttnProcessor2_0) and hasattr(proc, "ba_scale"):
            proc.ba_scale = scale


@spaces.GPU
def process_image_and_text(image, text, pipe, seed=42, scale=None):
    """Generate one 512x512 image from a condition image and a text prompt.

    Args:
        image: Condition image (PIL). It is passed through
            ``resize_and_add_margin(..., target_size=512)`` before use.
        text: Text prompt forwarded to the pipeline as ``prompt``.
        pipe: Pipeline object that is called to run the generation.
        seed: Seed for the ``torch.Generator`` handed to the pipeline.
        scale: Optional blended-attention scale. ``None`` leaves the
            processors as they were configured.

    Returns:
        The first image of the pipeline output.

    Raises:
        gr.Error: If no condition image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of failing inside the
        # resize helper.
        raise gr.Error("Please provide a condition image.")

    if scale is not None:
        _set_blend_scale(pipe, float(scale))

    # image = Image.open(img_path).convert('RGB')
    image = resize_and_add_margin(image, target_size=512)

    # The seed used to be read from an undefined name; it is now an explicit
    # argument so every request controls its own generator.
    generator = torch.Generator().manual_seed(int(seed))

    # NOTE(review): ``it_blender_image`` is not a documented argument of the
    # stock diffusers FluxPipeline -- confirm the imported pipeline accepts it.
    out = pipe(
        prompt=text,
        height=512,
        width=512,
        max_sequence_length=256,
        generator=generator,
        it_blender_image=[image],
    ).images[0]

    return out


def get_samples():
    """Return the example rows shown in the UI.

    Each row is ``[image, scale, seed, text]`` where ``image`` is the sample
    file opened with PIL and resized to 512x512.
    """
    sample_list = [
        {
            "image": "assets/0.jpg",
            "scale": 0.6,
            "seed": 42,
            "text": "A photo of a monster cartoon character, imaginative, creative, design",
        },
        {
            "image": "assets/1.jpg",
            "scale": 0.6,
            "seed": 42,
            "text": "A photo of an owl cartoon character, imaginative, creative, design",
        },
        {
            "image": "assets/2.jpg",
            "scale": 0.6,
            "seed": 42,
            "text": "A photo of a dragon, imaginative, creative, design",
        },
        {
            "image": "assets/character1.jpg",
            "scale": 0.6,
            "seed": 42,
            "text": "A photo of a dragon, imaginative, creative, design",
        },
        {
            "image": "assets/character2.jpg",
            "scale": 0.6,
            "seed": 42,
            "text": "A photo of a dragon, imaginative, creative, design",
        },
        {
            "image": "assets/character3.jpg",
            "scale": 0.6,
            "seed": 42,
            "text": "A photo of a dragon, imaginative, creative, design",
        },
        {
            "image": "assets/graphic1.jpg",
            "scale": 0.7,
            "seed": 42,
            "text": "A photo of a woman, imaginative, creative, design",
        },
        {
            "image": "assets/product1.jpg",
            "scale": 0.8,
            "seed": 42,
            "text": "A photo of a motorcycle, imaginative, creative, design",
        },
    ]
    return [
        [
            Image.open(sample["image"]).resize((512, 512)),
            sample["scale"],
            sample["seed"],
            sample["text"],
        ]
        for sample in sample_list
    ]


# Markdown/HTML rendered at the top of the app: title plus paper, project page
# and GitHub badges.
# NOTE(review): confirm the arXiv badge links to the IT-Blender paper.
header = """
# 💡 IT-Blender / FLUX
<div style="text-align: center; display: flex; justify-content: left; gap: 5px;">
<a href="https://arxiv.org/abs/2411.15098"><img src="https://img.shields.io/badge/ArXiv-Paper-A42C25.svg" alt="arXiv"></a>
<a href="https://imagineforme.github.io/"><img alt="Build" src="https://img.shields.io/badge/Project%20Page-ITBlender-yellow"></a> 
<a href="https://github.com/WonwoongCho/IT-Blender"><img src="https://img.shields.io/badge/GitHub-Code-blue.svg?logo=github&" alt="GitHub"></a>
</div>
"""


def create_app(
    device: str = "cuda",
):
    """Load the pipeline, install the IT-Blender processors and build the UI.

    Args:
        device: Device the pipeline is moved to.

    Returns:
        The ``gr.Blocks`` application (not yet launched).
    """
    import warnings

    dtype = torch.bfloat16
    token = os.environ.get("HF_TOKEN")

    # Defaults shown in the UI. They are also the values used while the model
    # is being set up, because the Gradio components themselves only carry a
    # value inside an event handler.
    default_scale = 0.6
    default_seed = 42

    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev",
        torch_dtype=dtype,
        token=token,
    )
    pipe = pipe.to(device)

    set_seed(default_seed)

    # Replace the processors whose name contains "single" with blended
    # attention processors; every other processor is kept as is.
    blended_attn_procs = {}
    for name, current_proc in pipe.transformer.attn_processors.items():
        if "single" in name:
            blended_attn_procs[name] = FluxBlendedAttnProcessor2_0(
                3072, ba_scale=default_scale, num_ref=1
            )
        else:
            blended_attn_procs[name] = current_proc

    pipe.transformer.set_attn_processor(blended_attn_procs)
    pipe.to(dtype)

    model_path = hf_hub_download(
        repo_id="WonwoongCho/IT-Blender",
        filename="FLUX/it-blender.bin",
        token=token,
    )

    pretrained_blended_attn_weights = torch.load(
        model_path, map_location=pipe._execution_device
    )

    # Rename checkpoint keys to the transformer's state-dict names. The
    # leading integer of each key is shifted down by 21 to obtain the
    # single-transformer-block index, and the third "_"-separated token
    # selects the projection.
    key_changed_blended_attn_weights = {}
    for key, value in pretrained_blended_attn_weights.items():
        block_idx = int(key.split(".")[0]) - 21
        k_or_v = key.split("_")[2]
        changed_key = (
            f"single_transformer_blocks.{block_idx}"
            f".attn.processor.blended_attention_{k_or_v}_proj.weight"
        )
        key_changed_blended_attn_weights[changed_key] = value.to(dtype)

    # strict=False because only the blended-attention weights are supplied, so
    # missing keys are expected. Unexpected keys mean some checkpoint tensors
    # were not loaded at all, which is worth reporting.
    _missing_keys, unexpected_keys = pipe.transformer.load_state_dict(
        key_changed_blended_attn_weights, strict=False
    )
    if unexpected_keys:
        warnings.warn(
            f"{len(unexpected_keys)} IT-Blender weight(s) did not match any "
            f"transformer parameter, e.g. {unexpected_keys[0]!r}"
        )
    pipe = pipe.to(device)

    def _run(image, scale_value, seed_value, prompt):
        """Click handler: bind ``pipe`` and forward the UI values."""
        # A cleared number field arrives as None; fall back to the default.
        chosen_seed = default_seed if seed_value is None else int(seed_value)
        return process_image_and_text(
            image, prompt, pipe, seed=chosen_seed, scale=scale_value
        )

    with gr.Blocks() as app:
        gr.Markdown(header, elem_id="header")
        with gr.Row(equal_height=False):
            with gr.Column(variant="panel", elem_classes="inputPanel"):
                original_image = gr.Image(
                    type="pil", label="Condition Image", width=300, elem_id="input"
                )

                scale = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.01,
                    value=default_scale,
                    label="Guidance Scale",
                )
                seed = gr.Number(value=default_seed, label="seed", precision=0)
                text = gr.Textbox(
                    lines=2,
                    label="Text Prompt",
                    value="A photo of a dragon, imaginative, creative, design",
                    elem_id="text",
                )

                submit_btn = gr.Button("Run", elem_id="submit_btn")

            with gr.Column(variant="panel", elem_classes="outputPanel"):
                output_image = gr.Image(type="pil", elem_id="output")

        with gr.Row():
            # Example rows are [image, scale, seed, text], so all four
            # components are listed in the same order.
            gr.Examples(
                examples=get_samples(),
                inputs=[original_image, scale, seed, text],
                label="Examples",
            )

        # Only Gradio components may appear in ``inputs``; the pipeline is
        # captured by the ``_run`` closure instead.
        submit_btn.click(
            fn=_run,
            inputs=[original_image, scale, seed, text],
            outputs=output_image,
        )

    return app


if __name__ == "__main__":
    demo = create_app(device="cuda")
    demo.launch(debug=True, ssr_mode=False)