import os

import torch

from videogen_hub import MODEL_PATH


class LaVie():
    def __init__(self, model_path=os.path.join(MODEL_PATH, "lavie"), device="cuda"):
        """
        1. Downloads all necessary model weights from Hugging Face.
        2. Initializes the LaVie model with a specific model path and device.

        Args:
            model_path (str, optional): The path to the model checkpoints. Defaults to "MODEL_PATH/lavie".
            device (str, optional): The device to run the model on. Defaults to "cuda".
        """
        # Import the pipeline source code here to avoid dependency version issues.
        from videogen_hub.pipelines.lavie.lavie_src.base.pipelines.pipeline_videogen import VideoGenPipeline
        from videogen_hub.pipelines.lavie.lavie_src.base.download import find_model
        from videogen_hub.pipelines.lavie.lavie_src.base.models.unet import UNet3DConditionModel
        from diffusers.schedulers import DDPMScheduler
        from diffusers.models import AutoencoderKL
        from transformers import CLIPTokenizer, CLIPTextModel
        from huggingface_hub import snapshot_download
        from omegaconf import OmegaConf

        # Download the LaVie checkpoint and the Stable Diffusion weights it builds on.
        snapshot_download(repo_id="Vchitect/LaVie", local_dir=model_path)
        snapshot_download(repo_id="CompVis/stable-diffusion-v1-4",
                          local_dir=os.path.join(model_path, "stable-diffusion-v1-4"))
        snapshot_download(repo_id="stabilityai/stable-diffusion-x4-upscaler",
                          local_dir=os.path.join(model_path, "stable-diffusion-x4-upscaler"))

        torch.set_grad_enabled(False)
        self.device = device

        # Default sampling and DDPM scheduler settings.
        config = {
            "model_config": {
                "use_compile": False,
                "use_fp16": True,
                "run_time": 0,
                "guidance_scale": 7.5,
                "num_sampling_steps": 50
            },
            "scheduler_config": {
                "sample_method": "ddpm",
                "beta_start": 0.0001,
                "beta_end": 0.02,
                "beta_schedule": "linear"
            }
        }
        self.config = OmegaConf.create(config)

        # Inflate the 2D Stable Diffusion UNet to 3D, then load the LaVie base checkpoint on top.
        sd_path = os.path.join(model_path, "stable-diffusion-v1-4")
        unet = UNet3DConditionModel.from_pretrained_2d(sd_path, subfolder="unet").to(device, dtype=torch.float16)
        state_dict = find_model(os.path.join(model_path, "lavie_base.pt"))
        unet.load_state_dict(state_dict)

        vae = AutoencoderKL.from_pretrained(sd_path, subfolder="vae", torch_dtype=torch.float16).to(device)
        tokenizer_one = CLIPTokenizer.from_pretrained(sd_path, subfolder="tokenizer")
        text_encoder_one = CLIPTextModel.from_pretrained(sd_path, subfolder="text_encoder",
                                                         torch_dtype=torch.float16).to(device)  # huge
        scheduler = DDPMScheduler.from_pretrained(sd_path,
                                                  subfolder="scheduler",
                                                  beta_start=self.config.scheduler_config.beta_start,
                                                  beta_end=self.config.scheduler_config.beta_end,
                                                  beta_schedule=self.config.scheduler_config.beta_schedule)

        self.videogen_pipeline = VideoGenPipeline(vae=vae,
                                                  text_encoder=text_encoder_one,
                                                  tokenizer=tokenizer_one,
                                                  scheduler=scheduler,
                                                  unet=unet).to(device)
        self.videogen_pipeline.enable_xformers_memory_efficient_attention()
    def infer_one_video(self,
                        prompt: str = None,
                        size: list = [320, 512],
                        seconds: int = 2,
                        fps: int = 8,
                        seed: int = 42):
        """
        Generates a single video based on the provided prompt and parameters.

        Args:
            prompt (str, optional): The text prompt to generate the video from. Defaults to None.
            size (list, optional): The size of the video as [height, width]. Defaults to [320, 512].
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The frames per second of the video. Defaults to 8.
            seed (int, optional): The seed for random number generation. Defaults to 42.

        Returns:
            torch.Tensor: The generated video as a tensor.
        """
        if seed is not None:
            torch.manual_seed(seed)

        videos = self.videogen_pipeline(prompt,
                                        video_length=seconds * fps,
                                        height=size[0],
                                        width=size[1],
                                        num_inference_steps=self.config.model_config.num_sampling_steps,
                                        guidance_scale=self.config.model_config.guidance_scale).video
        return videos[0]
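
# Minimal usage sketch (not part of the original file): instantiate the wrapper and sample one
# clip. Assumes a CUDA device with enough memory for the fp16 weights and that the Hugging Face
# downloads above succeed; the example prompt is arbitrary.
if __name__ == "__main__":
    model = LaVie()  # first run downloads checkpoints into MODEL_PATH/lavie
    video = model.infer_one_video(
        prompt="a teddy bear walking on the beach",
        size=[320, 512],
        seconds=2,
        fps=8,
        seed=42,
    )
    # The returned tensor should cover seconds * fps = 16 frames; the exact axis order
    # is whatever VideoGenPipeline's .video field uses.
    print(video.shape)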