Spaces:
Runtime error
Runtime error
| import os | |
| from huggingface_hub import snapshot_download, hf_hub_download | |
| from videogen_hub import MODEL_PATH | |
class OpenSora12:
    """Text-to-video wrapper around the Open-Sora inference script.

    The constructor downloads the checkpoints into sub-directories of
    ``MODEL_PATH`` and builds an mmengine ``Config``; ``infer_one_video``
    overrides the per-request fields of that config and calls the inference
    entry point with it.
    """

    def __init__(self, device="gpu"):
        """
        1. Download the pretrained checkpoints into sub-directories of MODEL_PATH
           (OpenSora-STDiT-v3, STDiT3-XL_2, OpenSora-VAE-v1.2, t5-v1_1-xxl).
        2. Store the inference entry point and its configuration.

        Note: it seems that the model cannot run on cpu.

        Args:
            device: 'gpu' or 'cpu'. Accepted for interface compatibility only;
                this constructor does not read it.
        """
        # Imported here rather than at module level, presumably so that importing
        # this file does not require the Open-Sora dependencies -- TODO confirm.
        from mmengine import Config as mmengine_config
        from videogen_hub.pipelines.opensora.scripts.inference import main

        # Full repository snapshot; only the download side effect is needed,
        # the returned path is not used.
        snapshot_download(
            "hpcai-tech/OpenSora-STDiT-v3",
            local_dir=os.path.join(MODEL_PATH, "OpenSora-STDiT-v3"),
        )

        self.pipeline = main
        self.config = mmengine_config(self._default_config())
        self._download_checkpoints()

    @staticmethod
    def _default_config():
        """Return the default inference settings as a plain dict."""
        return {
            # Basic video frame settings
            "num_frames": 51,  # Total number of frames in a clip
            "frame_interval": 1,  # Interval between frames
            "fps": 24,  # Frames per second
            "image_size": [480, 854],  # Resolution of each frame (height, width)
            # Model configuration for multi-resolution and specific model parameters
            "multi_resolution": "STDiT2",  # Multi-resolution model type
            "model": {
                "type": "STDiT3-XL/2",  # Model type and size
                "from_pretrained": os.path.join(MODEL_PATH, "STDiT3-XL_2"),  # Path to pretrained checkpoint
                "file_name": "model.safetensors",  # Name of the model file
                "input_sq_size": 512,  # Input square size for the model
                "qk_norm": True,  # Whether to normalize query-key in attention
                "enable_flashattn": False,  # Enable flash attention mechanism, requires flash_attn package
                "enable_layernorm_kernel": False,  # Enable layer normalization kernel, requires apex package
            },
            # Variational Autoencoder (VAE) specific settings
            "vae": {
                "type": "OpenSoraVAE_V1_2",  # Type of the autoencoder
                "from_pretrained": "hpcai-tech/OpenSora-VAE-v1.2",  # Pretrained model from Hugging Face
                # "cache_dir": os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"),  # Local cache directory for model weights
                "micro_frame_size": 17,
                "micro_batch_size": 4,  # Batch size for processing
            },
            # Text encoder settings for embedding textual information
            "text_encoder": {
                "type": "t5",  # Text encoder model type
                "from_pretrained": "DeepFloyd/t5-v1_1-xxl",  # Pretrained model
                "cache_dir": os.path.join(MODEL_PATH, "t5-v1_1-xxl"),  # Cache directory
                "model_max_length": 300,  # Max length of text inputs
            },
            # Scheduler settings for diffusion models
            "scheduler": {
                "type": "rflow",  # Type of scheduler for the diffusion process
                "num_sampling_steps": 30,  # Number of sampling steps in diffusion
                "cfg_scale": 7.0,  # Scale for classifier-free guidance
                # "cfg_channel": 3,  # Number of channels for guidance
            },
            # Additional settings for processing and output
            "dtype": "bf16",  # Data type for computation (bfloat16)
            # "prompt_path": "./assets/texts/t2v_samples.txt",  # Path to text prompts
            "prompt_path": None,  # Path to text prompts
            "prompt": [
                "A beautiful sunset over the city"
            ],  # List of prompts for generation
            "batch_size": 1,  # Batch size for generation
            "seed": 42,  # Seed for random number generators
            "save_dir": "./samples/samples/",  # Directory to save generated samples
            "config": "sample.py",  # Path to this configuration file
            "prompt_as_path": False,  # Treat the prompt as a file path (True/False)
            "reference_path": None,  # Path to reference image/video for conditioning
            "loop": 1,  # Number of times to loop the processing
            "sample_name": None,  # Specific name for the generated sample
            "num_sample": 1,  # Number of samples to generate
            "aes": 6.5,
            "flow": None,
        }

    def _download_checkpoints(self):
        """Fetch the individual weight files into the directories named in ``self.config``."""
        hf_hub_download(
            repo_id="hpcai-tech/OpenSora-STDiT-v3",
            filename="model.safetensors",
            local_dir=self.config.model.from_pretrained,
        )
        hf_hub_download(
            repo_id="hpcai-tech/OpenSora-VAE-v1.2",
            filename="model.safetensors",
            local_dir=os.path.join(MODEL_PATH, "OpenSora-VAE-v1.2"),
        )
        # Only the first of the two weight shards is fetched here.
        hf_hub_download(
            repo_id="DeepFloyd/t5-v1_1-xxl",
            filename="pytorch_model-00001-of-00002.bin",
            local_dir=self.config.text_encoder.cache_dir,
        )

    def infer_one_video(
        self,
        prompt: str = None,
        size: list = None,
        seconds: int = 2,
        fps: int = 8,
        seed: int = 42,
    ):
        """
        Generates a single video based on the provided prompt and parameters.

        The request parameters are written into ``self.config`` (and stay there
        after the call) before the pipeline is invoked with that config.

        Args:
            prompt (str, optional): The text prompt to generate the video from. Defaults to None.
            size (list, optional): The resolution of the video, forwarded as the
                config's ``image_size``. Defaults to [320, 512] when omitted.
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The frames per second of the video. Defaults to 8.
            seed (int, optional): The seed for random number generation. Defaults to 42.

        Returns:
            torch.Tensor: The generated video as a float tensor on the CPU, laid
            out as [f, H, W, C] (assuming the pipeline yields [1, C, f, H, W]).
        """
        # A fresh list per call: a shared list default would be stored in the
        # config, so mutating the config would silently change later defaults.
        image_size = [320, 512] if size is None else list(size)

        self.config.num_frames = fps * seconds
        self.config.fps = fps
        self.config.seed = seed
        self.config.prompt = [prompt]
        self.config.image_size = image_size

        all_batch_samples = self.pipeline(self.config)
        # First sample of the first batch; expected to be torch.Size([1, C, f, H, W]).
        sample = all_batch_samples[0][0]

        # torch.Size([1, C, f, H, W]) -> torch.Size([f, H, W, C]); BFloat16 -> Float
        return sample.squeeze(0).permute(1, 2, 3, 0).cpu().float()