app.py CHANGED

@@ -5,124 +5,231 @@ from absl import app
 from ml_collections import config_flags
 import os

-import
+import spaces #[uncomment to use ZeroGPU]
 import torch
-import
-import utils
-import tempfile
-from absl import logging
-import builtins
-import einops
-import math
-import numpy as np
-import time
-from PIL import Image
+
+
+import os
 import random

+import numpy as np
+import torch
+import torch.nn.functional as F
+from torchvision.utils import save_image
+
+from absl import logging
+import ml_collections
+
+from diffusion.flow_matching import ODEEulerFlowMatchingSolver
+import utils
 import libs.autoencoder
 from libs.clip import FrozenCLIPEmbedder
-from
+from configs import t2i_512px_clip_dimr


-def unpreprocess(x):
+def unpreprocess(x: torch.Tensor) -> torch.Tensor:
+    x = 0.5 * (x + 1.0)
+    x.clamp_(0.0, 1.0)
+    return x

-def
+def cosine_similarity_torch(latent1: torch.Tensor, latent2: torch.Tensor) -> torch.Tensor:
+    latent1_flat = latent1.view(-1)
+    latent2_flat = latent2.view(-1)
+    cosine_similarity = F.cosine_similarity(
+        latent1_flat.unsqueeze(0), latent2_flat.unsqueeze(0), dim=1
+    )
+    return cosine_similarity
+
+def kl_divergence(latent1: torch.Tensor, latent2: torch.Tensor) -> torch.Tensor:
+    latent1_prob = F.softmax(latent1, dim=-1)
+    latent2_prob = F.softmax(latent2, dim=-1)
+    latent1_log_prob = torch.log(latent1_prob)
+    kl_div = F.kl_div(latent1_log_prob, latent2_prob, reduction="batchmean")
+    return kl_div
+
+def batch_decode(_z: torch.Tensor, decode, batch_size: int = 10) -> torch.Tensor:
     num_samples = _z.size(0)
     decoded_batches = []

     for i in range(0, num_samples, batch_size):
-        batch = _z[i:i + batch_size]
+        batch = _z[i : i + batch_size]
         decoded_batch = decode(batch)
         decoded_batches.append(decoded_batch)

-    return image_unprocessed
+    return torch.cat(decoded_batches, dim=0)

-def get_caption(llm, text_model, prompt_dict, batch_size):
+def get_caption(llm: str, text_model, prompt_dict: dict, batch_size: int):
     if batch_size == 3:
-        #
-        assert len(prompt_dict) == 2
+        # Only addition or only subtraction mode.
+        assert len(prompt_dict) == 2, "Expected 2 prompts for batch_size 3."
+        batch_prompts = list(prompt_dict.values()) + [" "]
     elif batch_size == 4:
-        #
-        assert len(prompt_dict) == 3
+        # Addition and subtraction mode.
+        assert len(prompt_dict) == 3, "Expected 3 prompts for batch_size 4."
+        batch_prompts = list(prompt_dict.values()) + [" "]
     elif batch_size >= 5:
-        #
-        assert len(prompt_dict) == 2
+        # Linear interpolation mode.
+        assert len(prompt_dict) == 2, "Expected 2 prompts for linear interpolation."
+        batch_prompts = [prompt_dict["prompt_1"]] + [" "] * (batch_size - 2) + [prompt_dict["prompt_2"]]
+    else:
+        raise ValueError(f"Unsupported batch_size: {batch_size}")

     if llm == "clip":
+        latent, latent_and_others = text_model.encode(batch_prompts)
+        context = latent_and_others["token_embedding"].detach()
     elif llm == "t5":
+        latent, latent_and_others = text_model.get_text_embeddings(batch_prompts)
+        context = (latent_and_others["token_embedding"] * 10.0).detach()
     else:
-        raise NotImplementedError
-    _con_mask = _latent_and_others['token_mask'].detach()
-    _batch_token = _latent_and_others['tokens'].detach()
-    _batch_caption = _batch_con
-    return (_con, _con_mask, _batch_token, _batch_caption)
+        raise NotImplementedError(f"Language model {llm} not supported.")

+    token_mask = latent_and_others["token_mask"].detach()
+    tokens = latent_and_others["tokens"].detach()
+    captions = batch_prompts
+
+    return context, token_mask, tokens, captions

+# Load configuration and initialize models.
+config_dict = t2i_512px_clip_dimr.get_config()
+config = ml_collections.ConfigDict(config_dict)

-if torch.cuda.is_available()
-else:
-    torch_dtype = torch.float32
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+logging.info(f"Using device: {device}")

-#
+# Freeze configuration.
+config = ml_collections.FrozenConfigDict(config)

+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
+MAX_IMAGE_SIZE = 1024 # Currently not used.
+
+# Load the main diffusion model.
+nnet_path = os.path.join("..", "..", "ckpt", "released_model", "t2i_512px_clip_dimr.pth")
+nnet = utils.get_nnet(**config.nnet)
+nnet = nnet.to(device)
+state_dict = torch.load(nnet_path, map_location=device)
+nnet.load_state_dict(state_dict)
+nnet.eval()
+
+# Initialize text model.
+llm = "clip"
+clip = FrozenCLIPEmbedder()
+clip.eval()
+clip.to(device)
+
+# Load autoencoder.
+autoencoder = libs.autoencoder.get_model(**config.autoencoder)
+autoencoder.to(device)
+
+
+@torch.cuda.amp.autocast()
+def encode(_batch: torch.Tensor) -> torch.Tensor:
+    """Encode a batch of images using the autoencoder."""
+    return autoencoder.encode(_batch)
+
+
+@torch.cuda.amp.autocast()
+def decode(_batch: torch.Tensor) -> torch.Tensor:
+    """Decode a batch of latent vectors using the autoencoder."""
+    return autoencoder.decode(_batch)


 @spaces.GPU #[uncomment to use ZeroGPU]
 def infer(
     prompt1,
     prompt2,
-    negative_prompt,
     seed,
     randomize_seed,
     guidance_scale,
     num_inference_steps,
+    num_of_interpolation,
+    save_gpu_memory=True,
     progress=gr.Progress(track_tqdm=True),
 ):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)

+    torch.manual_seed(seed)
+    if device.type == "cuda":
+        torch.cuda.manual_seed_all(seed)

-    #
-    # width=width,
-    # height=height,
-    # generator=generator,
-    # ).images[0]
+    # Only support interpolation in this implementation.
+    prompt_dict = {"prompt_1": prompt1, "prompt_2": prompt2}
+    for key, value in prompt_dict.items():
+        assert value is not None, f"{key} must not be None."
+    assert num_of_interpolation >= 5, "For linear interpolation, please sample at least five images."

-    #
+    # Get text embeddings and tokens.
+    _context, _token_mask, _token, _caption = get_caption(
+        llm, clip, prompt_dict=prompt_dict, batch_size=num_of_interpolation
+    )
+
+    with torch.no_grad():
+        _z_gaussian = torch.randn(num_of_interpolation, *config.z_shape, device=device)
+        _z_x0, _mu, _log_var = nnet(
+            _context, text_encoder=True, shape=_z_gaussian.shape, mask=_token_mask
+        )
+        _z_init = _z_x0.reshape(_z_gaussian.shape)
+
+        # Prepare the initial latent representations based on the number of interpolations.
+        if num_of_interpolation == 3:
+            # Addition or subtraction mode.
+            if config.prompt_a is not None:
+                assert config.prompt_s is None, "Only one of prompt_a or prompt_s should be provided."
+                z_init_temp = _z_init[0] + _z_init[1]
+            elif config.prompt_s is not None:
+                assert config.prompt_a is None, "Only one of prompt_a or prompt_s should be provided."
+                z_init_temp = _z_init[0] - _z_init[1]
+            else:
+                raise NotImplementedError("Either prompt_a or prompt_s must be provided for 3-sample mode.")
+            mean = z_init_temp.mean()
+            std = z_init_temp.std()
+            _z_init[2] = (z_init_temp - mean) / std
+
+        elif num_of_interpolation == 4:
+            z_init_temp = _z_init[0] + _z_init[1] - _z_init[2]
+            mean = z_init_temp.mean()
+            std = z_init_temp.std()
+            _z_init[3] = (z_init_temp - mean) / std
+
+        elif num_of_interpolation >= 5:
+            tensor_a = _z_init[0]
+            tensor_b = _z_init[-1]
+            num_interpolations = num_of_interpolation - 2
+            interpolations = [
+                tensor_a + (tensor_b - tensor_a) * (i / (num_interpolations + 1))
+                for i in range(1, num_interpolations + 1)
+            ]
+            _z_init = torch.stack([tensor_a] + interpolations + [tensor_b], dim=0)
+
+        else:
+            raise ValueError("Unsupported number of interpolations.")
+
+        assert guidance_scale > 1, "Guidance scale must be greater than 1."
+
+        has_null_indicator = hasattr(config.nnet.model_args, "cfg_indicator")
+        ode_solver = ODEEulerFlowMatchingSolver(
+            nnet,
+            bdv_model_fn=None,
+            step_size_type="step_in_dsigma",
+            guidance_scale=guidance_scale,
+        )
+        _z, _ = ode_solver.sample(
+            x_T=_z_init,
+            batch_size=num_of_interpolation,
+            sample_steps=num_inference_steps,
+            unconditional_guidance_scale=guidance_scale,
+            has_null_indicator=has_null_indicator,
+        )
+
+    if save_gpu_memory:
+        image_unprocessed = batch_decode(_z, decode)
+    else:
+        image_unprocessed = decode(_z)
+
+    samples = unpreprocess(image_unprocessed).contiguous()[0]
+
+    return samples, seed


 # examples = [
@@ -171,13 +278,6 @@ with gr.Blocks(css=css) as demo:
         result = gr.Image(label="Result", show_label=False)

         with gr.Accordion("Advanced Settings", open=False):
-            negative_prompt = gr.Text(
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
-            )
-
             seed = gr.Slider(
                 label="Seed",
                 minimum=0,
@@ -205,6 +305,14 @@ with gr.Blocks(css=css) as demo:
                 value=50, # Replace with defaults that work for your model
             )

+            num_of_interpolation = gr.Slider(
+                label="Number of images for interpolation",
+                minimum=5,
+                maximum=50,
+                step=1,
+                value=10, # Replace with defaults that work for your model
+            )
+
         gr.Examples(examples=examples, inputs=[prompt1, prompt2])
         gr.on(
             triggers=[run_button.click, prompt1.submit, prompt2.submit],
@@ -212,11 +320,11 @@ with gr.Blocks(css=css) as demo:
         inputs=[
             prompt1,
             prompt2,
-            negative_prompt,
             seed,
             randomize_seed,
             guidance_scale,
             num_inference_steps,
+            num_of_interpolation,
         ],
         outputs=[result, seed],
     )
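
Note: the heart of the new infer() path is a plain linear interpolation between the two endpoint latents before the flow-matching ODE solver is run. The standalone sketch below mirrors that step for reference; the helper name interpolate_latents and the 4x64x64 latent shape are illustrative only and are not part of app.py.

import torch

def interpolate_latents(z_a: torch.Tensor, z_b: torch.Tensor, n: int) -> torch.Tensor:
    """Stack z_a, (n - 2) evenly spaced blends, and z_b, as in the >= 5 branch above."""
    assert n >= 5, "the app samples at least five images for interpolation"
    steps = n - 2
    blends = [z_a + (z_b - z_a) * (i / (steps + 1)) for i in range(1, steps + 1)]
    return torch.stack([z_a] + blends + [z_b], dim=0)

# Example with an illustrative latent shape.
z_a, z_b = torch.randn(4, 64, 64), torch.randn(4, 64, 64)
frames = interpolate_latents(z_a, z_b, 10)
print(frames.shape)  # torch.Size([10, 4, 64, 64])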