PAQUITA1986 committed
Commit fd317e7 · verified · 1 Parent(s): af63c17

Upload app.py

Files changed (1)
  1. app.py +392 -0
app.py ADDED
@@ -0,0 +1,392 @@
+ # Adapted from https://github.com/luosiallen/latent-consistency-model
+ from __future__ import annotations
+
+ import argparse
+ from functools import partial
+ import os
+ import random
+ import time
+ from omegaconf import OmegaConf
+
+ import gradio as gr
+ import numpy as np
+
+ # intel_extension_for_pytorch is optional; it enables acceleration on Intel XPUs.
+ try:
+     import intel_extension_for_pytorch as ipex
+ except ImportError:
+     pass
+
+ from utils.lora import collapse_lora, monkeypatch_remove_lora
+ from utils.lora_handler import LoraHandler
+ from utils.common_utils import load_model_checkpoint
+ from utils.utils import instantiate_from_config
+ from scheduler.t2v_turbo_scheduler import T2VTurboScheduler
+ from pipeline.t2v_turbo_vc2_pipeline import T2VTurboVC2Pipeline
+
+ import torch
+ import torchvision
+
+ from concurrent.futures import ThreadPoolExecutor
+ import uuid
+
+ DESCRIPTION = """# T2V-Turbo 🚀
+
+ Our model is distilled from [VideoCrafter2](https://ailab-cvc.github.io/videocrafter2/).
+
+ T2V-Turbo learns a LoRA on top of the base model by aligning to the reward feedback from [HPSv2.1](https://github.com/tgxs002/HPSv2/tree/master) and the [InternVid2 Stage 2 Model](https://huggingface.co/OpenGVLab/InternVideo2-Stage2_1B-224p-f4).
+
+ T2V-Turbo-v2 improves the training techniques by finetuning the full base model and further aligns to [CLIPScore](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K).
+
+ T2V-Turbo is trained purely on WebVid-10M data, whereas T2V-Turbo-v2 carefully optimizes different learning objectives with a mixture of VidGen-1M and WebVid-10M data.
+
+ Moreover, T2V-Turbo-v2 supports distilling motion priors from the training videos.
+
+ [Project page for T2V-Turbo](https://t2v-turbo.github.io) 🥳
+
+ [Project page for T2V-Turbo-v2](https://t2v-turbo-v2.github.io) 🤓
+ """
+ if torch.cuda.is_available():
+     DESCRIPTION += "\n<p>Running on CUDA 😀</p>"
+ elif hasattr(torch, "xpu") and torch.xpu.is_available():
+     DESCRIPTION += "\n<p>Running on XPU 🤓</p>"
+ else:
+     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+
+ MAX_SEED = np.iinfo(np.int32).max
+ CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES") == "1"
+ USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE") == "1"
+
+
+ """
+ Operating System Options:
+ If you are using macOS, set device="mps";
+ if you are using Linux or Windows with an Nvidia GPU, set device="cuda";
+ if you are using Linux or Windows with an Intel Arc GPU, set device="xpu".
+ """
+ # device = "mps"  # macOS
+ # device = "xpu"  # Intel Arc GPU
+ device = "cuda"  # Linux & Windows
+
+
+ """
+ DTYPE Options:
+ To reduce GPU memory, you can set DTYPE=torch.float16,
+ but video quality might be compromised.
+ """
+ DTYPE = torch.bfloat16
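+ # NOTE: the dtype actually used for generation is chosen per request via the
+ # "torch.dtype" radio in the UI (see generate() below); its default is also bf16.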
+
+
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     return seed
+
+
+ def save_video(
+     vid_tensor, profile: gr.OAuthProfile | None, metadata: dict, root_path="./", fps=16
+ ):
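+     # vid_tensor is laid out as (c, t, h, w) with values in [-1, 1]; convert it
+     # to (t, h, w, c) uint8 in [0, 255], which torchvision.io.write_video expects.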
+     unique_name = str(uuid.uuid4()) + ".mp4"
+     unique_name = os.path.join(root_path, unique_name)
+
+     video = vid_tensor.detach().cpu()
+     video = torch.clamp(video.float(), -1.0, 1.0)
+     video = video.permute(1, 0, 2, 3)  # (c, t, h, w) -> (t, c, h, w)
+     video = (video + 1.0) / 2.0
+     video = (video * 255).to(torch.uint8).permute(0, 2, 3, 1)
+
+     torchvision.io.write_video(
+         unique_name, video, fps=fps, video_codec="h264", options={"crf": "10"}
+     )
+     return unique_name
+
+
+ def save_videos(
+     video_array, profile: gr.OAuthProfile | None, metadata: dict, fps: int = 16
+ ):
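+     # Encode each generated video on its own thread; the demo produces a single
+     # video per call, so only the first path is returned to the UI.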
+     paths = []
+     root_path = "./videos/"
+     os.makedirs(root_path, exist_ok=True)
+     with ThreadPoolExecutor() as executor:
+         paths = list(
+             executor.map(
+                 save_video,
+                 video_array,
+                 [profile] * len(video_array),
+                 [metadata] * len(video_array),
+                 [root_path] * len(video_array),
+                 [fps] * len(video_array),
+             )
+         )
+     return paths[0]
+
+
+ def generate(
+     prompt: str,
+     seed: int = 0,
+     guidance_scale: float = 7.5,
+     percentage: float = 0.3,
+     num_inference_steps: int = 4,
+     num_frames: int = 16,
+     fps: int = 16,
+     randomize_seed: bool = False,
+     param_dtype="bf16",
+     motion_gs: float = 0.05,
+     use_motion_cond: bool = False,
+     progress=gr.Progress(track_tqdm=True),
+     profile: gr.OAuthProfile | None = None,
+ ):
+     seed = randomize_seed_fn(seed, randomize_seed)
+     torch.manual_seed(seed)
+
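+     # Map the UI's dtype choice onto torch dtypes, then cast the UNet, text
+     # encoder, and VAE before running inference.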
+     if param_dtype == "bf16":
+         dtype = torch.bfloat16
+         unet.dtype = torch.bfloat16
+     elif param_dtype == "fp16":
+         dtype = torch.float16
+         unet.dtype = torch.float16
+     elif param_dtype == "fp32":
+         dtype = torch.float32
+         unet.dtype = torch.float32
+     else:
+         raise ValueError(f"Unknown dtype: {param_dtype}")
+
+     pipeline.unet.to(device, dtype)
+     pipeline.text_encoder.to(device, dtype)
+     pipeline.vae.to(device, dtype)
+     pipeline.to(device, dtype)
+
+     start_time = time.time()
+
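+     # Run the distilled pipeline: num_inference_steps consistency-model sampling
+     # steps (4-8 is typical here); lcm_origin_steps presumably matches the
+     # 200-step teacher schedule used during distillation.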
+     result = pipeline(
+         prompt=prompt,
+         frames=num_frames,
+         fps=fps,
+         guidance_scale=guidance_scale,
+         motion_gs=motion_gs,
+         use_motion_cond=use_motion_cond,
+         percentage=percentage,
+         num_inference_steps=num_inference_steps,
+         lcm_origin_steps=200,
+         num_videos_per_prompt=1,
+     )
+     paths = save_videos(
+         result,
+         profile,
+         metadata={
+             "prompt": prompt,
+             "seed": seed,
+             "guidance_scale": guidance_scale,
+             "num_inference_steps": num_inference_steps,
+         },
+         fps=fps,
+     )
+     print(f"Generation took {time.time() - start_time:.2f}s")
+     return paths, seed
+
+
+ examples = [
+     "An astronaut riding a horse.",
+     "Darth vader surfing in waves.",
+     "Robot dancing in times square.",
+     "Clown fish swimming through the coral reef.",
+     "Pikachu snowboarding.",
+     "With the style of van gogh, A young couple dances under the moonlight by the lake.",
+     "A young woman with glasses is jogging in the park wearing a pink headband.",
+     "Impressionist style, a yellow rubber duck floating on the wave on the sunset",
+     "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
+     "With the style of low-poly game art, A majestic, white horse gallops gracefully across a moonlit beach.",
+ ]
+
+
+ if __name__ == "__main__":
+     # Command-line arguments selecting the checkpoints and model version.
+     parser = argparse.ArgumentParser(description="Gradio demo for T2V-Turbo.")
+     parser.add_argument(
+         "--unet_dir",
+         type=str,
+         default="output/vlcm_vc2_mixed_vid_gen_128k_bs3_percen_0p2_mgs_max_0p1/checkpoint-10000/unet.pt",
+         help="Path to the UNet checkpoint.",
+     )
+     parser.add_argument(
+         "--base_model_dir",
+         type=str,
+         default="model_cache/VideoCrafter2_model.ckpt",
+         help="Path to the VideoCrafter2 checkpoint.",
+     )
+     parser.add_argument(
+         "--version",
+         required=True,
+         choices=["v1", "v2"],
+         help="Model version; v2 adds motion-condition support.",
+     )
+     parser.add_argument(
+         "--motion_gs",
+         default=0.05,
+         type=float,
+         help="Guidance scale for the motion condition.",
+     )
+
+     args = parser.parse_args()
+
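+     # Example launch (the unet path below is illustrative, not a real checkpoint):
+     #   python app.py --version v2 --unet_dir checkpoints/unet_mg.pt \
+     #       --base_model_dir model_cache/VideoCrafter2_model.ckpt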
+     config = OmegaConf.load("configs/inference_t2v_512_v2.0.yaml")
+     model_config = config.pop("model", OmegaConf.create())
+     pretrained_t2v = instantiate_from_config(model_config)
+     pretrained_t2v = load_model_checkpoint(pretrained_t2v, args.base_model_dir)
+
+     unet_config = model_config["params"]["unet_config"]
+     unet_config["params"]["use_checkpoint"] = False
+     unet_config["params"]["time_cond_proj_dim"] = 256
+
+     if args.version == "v2":
+         unet_config["params"]["motion_cond_proj_dim"] = 256
+     unet = instantiate_from_config(unet_config)
+
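+     # Checkpoints with "lora" in their path hold LoRA weights (T2V-Turbo v1):
+     # copy the base UNet weights in, apply the LoRA, then merge (collapse) it
+     # and strip the LoRA wrappers so inference runs on a plain UNet. Other
+     # checkpoints are full UNet state dicts and are loaded directly.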
+     if "lora" in args.unet_dir:
+         unet.load_state_dict(
+             pretrained_t2v.model.diffusion_model.state_dict(), strict=False
+         )
+
+         use_unet_lora = True
+         lora_manager = LoraHandler(
+             version="cloneofsimo",
+             use_unet_lora=use_unet_lora,
+             save_for_webui=True,
+             unet_replace_modules=["UNetModel"],
+         )
+         lora_manager.add_lora_to_model(
+             use_unet_lora,
+             unet,
+             lora_manager.unet_replace_modules,
+             lora_path=args.unet_dir,
+             dropout=0.1,
+             r=64,
+         )
+         collapse_lora(unet, lora_manager.unet_replace_modules)
+         monkeypatch_remove_lora(unet)
+     else:
+         unet.load_state_dict(torch.load(args.unet_dir, map_location=device))
+
+     unet.eval()
+     pretrained_t2v.model.diffusion_model = unet
+     scheduler = T2VTurboScheduler(
+         linear_start=model_config["params"]["linear_start"],
+         linear_end=model_config["params"]["linear_end"],
+     )
+     pipeline = T2VTurboVC2Pipeline(pretrained_t2v, scheduler, model_config)
+
+     pipeline.to(device)
+
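+     # Build the Gradio UI: a prompt box, the output video, and advanced options.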
+     with gr.Blocks(css="style.css") as demo:
+         gr.Markdown(DESCRIPTION)
+         gr.DuplicateButton(
+             value="Duplicate Space for private use",
+             elem_id="duplicate-button",
+             visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+         )
+         with gr.Group():
+             with gr.Row():
+                 prompt = gr.Text(
+                     label="Prompt",
+                     show_label=False,
+                     max_lines=1,
+                     placeholder="Enter your prompt",
+                     container=False,
+                 )
+                 run_button = gr.Button("Run", scale=0)
+             result_video = gr.Video(
+                 label="Generated Video", interactive=False, autoplay=True
+             )
+         with gr.Accordion("Advanced options", open=False):
+             seed = gr.Slider(
+                 label="Seed",
+                 minimum=0,
+                 maximum=MAX_SEED,
+                 step=1,
+                 value=0,
+                 randomize=True,
+             )
+             randomize_seed = gr.Checkbox(label="Randomize seed across runs", value=True)
+             dtype_choices = ["bf16", "fp16", "fp32"]
+             param_dtype = gr.Radio(
+                 dtype_choices,
+                 label="torch.dtype",
+                 value=dtype_choices[0],
+                 interactive=True,
+                 info="To save GPU memory, use fp16 or bf16. For better quality, use fp32.",
+             )
+             with gr.Row():
+                 percentage = gr.Slider(
+                     label="Percentage of steps to apply motion guidance (v2 w/ MG only)",
+                     minimum=0.0,
+                     maximum=0.5,
+                     step=0.05,
+                     value=0.3,
+                 )
+
+             with gr.Row():
+                 guidance_scale = gr.Slider(
+                     label="Guidance scale for base",
+                     minimum=2,
+                     maximum=14,
+                     step=0.1,
+                     value=7.5,
+                 )
+                 num_inference_steps = gr.Slider(
+                     label="Number of inference steps for base",
+                     minimum=4,
+                     maximum=50,
+                     step=1,
+                     value=8,
+                 )
+             with gr.Row():
+                 num_frames = gr.Slider(
+                     label="Number of Video Frames",
+                     minimum=16,
+                     maximum=48,
+                     step=8,
+                     value=16,
+                 )
+                 fps = gr.Slider(
+                     label="FPS",
+                     minimum=8,
+                     maximum=32,
+                     step=4,
+                     value=8,
+                 )
+
+         # Motion conditioning is a v2 feature (the v2 UNet adds the motion
+         # projection layer above), so enable it only for v2 checkpoints.
+         use_motion_cond = args.version == "v2"
+         generate = partial(
+             generate, use_motion_cond=use_motion_cond, motion_gs=args.motion_gs
+         )
+         gr.Examples(
+             examples=examples,
+             inputs=prompt,
+             outputs=result_video,
+             fn=generate,
+             cache_examples=CACHE_EXAMPLES,
+         )
+
+         gr.on(
+             triggers=[
+                 prompt.submit,
+                 run_button.click,
+             ],
+             fn=generate,
+             inputs=[
+                 prompt,
+                 seed,
+                 guidance_scale,
+                 percentage,
+                 num_inference_steps,
+                 num_frames,
+                 fps,
+                 randomize_seed,
+                 param_dtype,
+             ],
+             outputs=[result_video, seed],
+             api_name="run",
+         )
+
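+     # Serialize requests through Gradio's queue so concurrent users share the
+     # single GPU; api_open=False keeps the REST routes closed.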
+     demo.queue(api_open=False)
+     # demo.queue(max_size=20).launch()
+     demo.launch()