| | import os |
| | import torch |
| |
|
# Names exported by ``from <this module> import *``.
# PRECISION_TO_TYPE is listed next to PRECISIONS: it is a public module-level
# constant defined in this file and is not reachable through any other
# exported name, so omitting it hid it from star-imports.
__all__ = [
    "C_SCALE",
    "PROMPT_TEMPLATE",
    "MODEL_BASE",
    "PRECISIONS",
    "PRECISION_TO_TYPE",
    "NORMALIZATION_TYPE",
    "ACTIVATION_TYPE",
    "VAE_PATH",
    "TEXT_ENCODER_PATH",
    "TOKENIZER_PATH",
    "TEXT_PROJECTION",
    "DATA_TYPE",
    "NEGATIVE_PROMPT",
    "NEGATIVE_PROMPT_I2V",
    "FLOW_PATH_TYPE",
    "FLOW_PREDICT_TYPE",
    "FLOW_LOSS_WEIGHT",
    "FLOW_SNR_TYPE",
    "FLOW_SOLVER",
]
| |
|
# Maps each precision identifier to the corresponding torch dtype.
PRECISION_TO_TYPE = dict(
    fp32=torch.float32,
    fp16=torch.float16,
    bf16=torch.bfloat16,
)
| |
|
| | |
| | |
| | |
# Integer constant equal to 10 to the 15th power.
# NOTE(review): its purpose is not visible in this file - confirm with the
# code that consumes C_SCALE.
C_SCALE = 10**15
| |
|
| | |
| | |
| | |
# Chat-style template for image prompts: a system turn carrying the
# instruction, followed by a user turn whose single ``{}`` placeholder is
# filled with the caller's prompt via ``str.format``.
PROMPT_TEMPLATE_ENCODE = "".join(
    (
        "<|start_header_id|>system<|end_header_id|>\n\n",
        "Describe the image by detailing the color, shape, size, texture, quantity, text, "
        "spatial relationships of the objects and background:",
        "<|eot_id|>",
        "<|start_header_id|>user<|end_header_id|>\n\n",
        "{}",
        "<|eot_id|>",
    )
)
# Chat-style template for video prompts: a system turn listing five aspects
# to describe, followed by a user turn with a single ``{}`` placeholder.
# The numbered sentences are concatenated without separating spaces; that is
# kept as-is because the text is part of the runtime string (and the
# ``crop_start`` stored in PROMPT_TEMPLATE presumably depends on it).
PROMPT_TEMPLATE_ENCODE_VIDEO = "".join(
    (
        "<|start_header_id|>system<|end_header_id|>\n\n",
        "Describe the video by detailing the following aspects: ",
        "1. The main content and theme of the video.",
        "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.",
        "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.",
        "4. background environment, light, style and atmosphere.",
        "5. camera angles, movements, and transitions used in the video:",
        "<|eot_id|>",
        "<|start_header_id|>user<|end_header_id|>\n\n",
        "{}",
        "<|eot_id|>",
    )
)
| |
|
# Image-to-video variant of the image template: the system turn starts with
# an ``<image>`` marker line, the user turn holds the single ``{}``
# placeholder, and the string ends with an opened assistant header.
PROMPT_TEMPLATE_ENCODE_I2V = "".join(
    (
        "<|start_header_id|>system<|end_header_id|>\n\n",
        "<image>\n",
        "Describe the image by detailing the color, shape, size, texture, quantity, text, "
        "spatial relationships of the objects and background:",
        "<|eot_id|>",
        "<|start_header_id|>user<|end_header_id|>\n\n",
        "{}",
        "<|eot_id|>",
        "<|start_header_id|>assistant<|end_header_id|>\n\n",
    )
)
| |
|
# Image-to-video variant of the video template: ``<image>`` marker line, the
# five-aspect instruction, a user turn with the single ``{}`` placeholder,
# and a trailing opened assistant header.
# Unlike the other templates, the system turn's ``<|eot_id|>`` is followed by
# an extra "\n\n"; it is preserved because it is part of the runtime string.
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = "".join(
    (
        "<|start_header_id|>system<|end_header_id|>\n\n",
        "<image>\n",
        "Describe the video by detailing the following aspects according to the reference image: ",
        "1. The main content and theme of the video.",
        "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.",
        "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.",
        "4. background environment, light, style and atmosphere.",
        "5. camera angles, movements, and transitions used in the video:",
        "<|eot_id|>",
        "\n\n",
        "<|start_header_id|>user<|end_header_id|>\n\n",
        "{}",
        "<|eot_id|>",
        "<|start_header_id|>assistant<|end_header_id|>\n\n",
    )
)
| |
|
# Default negative prompts, stored as comma-separated phrases.
NEGATIVE_PROMPT = ", ".join(
    (
        "Aerial view",
        "aerial view",
        "overexposed",
        "low quality",
        "deformation",
        "a poor composition",
        "bad hands",
        "bad teeth",
        "bad eyes",
        "bad limbs",
        "distortion",
    )
)
# Shorter negative prompt kept under the I2V name.
NEGATIVE_PROMPT_I2V = ", ".join(
    (
        "deformation",
        "a poor composition and deformed video",
        "bad teeth",
        "bad eyes",
        "bad limbs",
    )
)
| |
|
# Registry of prompt templates keyed by template name. Every entry carries the
# template string and a ``crop_start`` integer; the two I2V entries also carry
# image-embedding bounds (image_emb_end - image_emb_start == image_emb_len ==
# 576) and a ``double_return_token_id``.
# NOTE(review): ``crop_start`` is presumably the number of leading template
# tokens dropped from the text-encoder output, and 271 presumably the token id
# of "\n\n" - confirm against the text-encoder code.
PROMPT_TEMPLATE = {
    "dit-llm-encode": dict(
        template=PROMPT_TEMPLATE_ENCODE,
        crop_start=36,
    ),
    "dit-llm-encode-video": dict(
        template=PROMPT_TEMPLATE_ENCODE_VIDEO,
        crop_start=95,
    ),
    "dit-llm-encode-i2v": dict(
        template=PROMPT_TEMPLATE_ENCODE_I2V,
        crop_start=36,
        image_emb_start=5,
        image_emb_end=581,
        image_emb_len=576,
        double_return_token_id=271,
    ),
    "dit-llm-encode-video-i2v": dict(
        template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V,
        crop_start=103,
        image_emb_start=5,
        image_emb_end=581,
        image_emb_len=576,
        double_return_token_id=271,
    ),
}
| |
|
| | |
# Precision identifiers (the same strings used as keys of PRECISION_TO_TYPE).
PRECISIONS = {
    "fp32",
    "fp16",
    "bf16",
}
# Normalization layer identifiers.
NORMALIZATION_TYPE = {
    "layer",
    "rms",
}
# Activation function identifiers.
ACTIVATION_TYPE = {
    "relu",
    "silu",
    "gelu",
    "gelu_tanh",
}
| |
|
| | |
# Root directory for checkpoints: taken from the MODEL_BASE environment
# variable when set, otherwise "./ckpts".
MODEL_BASE = os.environ.get("MODEL_BASE", "./ckpts")
| |
|
| | |
# Data type identifiers.
DATA_TYPE = {
    "image",
    "video",
    "image_video",
}
| |
|
| | |
# VAE checkpoint location keyed by VAE name, rooted at MODEL_BASE.
VAE_PATH = {
    "884-16c-hy": f"{MODEL_BASE}/hunyuan-video-t2v-720p/vae",
}
| |
|
| | |
# Text-encoder checkpoint location keyed by encoder name, rooted at
# MODEL_BASE. "llm" and "llm-i2v" point at the same directory.
TEXT_ENCODER_PATH = {
    "clipL": f"{MODEL_BASE}/clip_vit_large_patch14",
    **dict.fromkeys(("llm", "llm-i2v"), f"{MODEL_BASE}/llava-llama-3-8b"),
}
| |
|
| | |
# Tokenizer location keyed by encoder name, rooted at MODEL_BASE.
# "llm" and "llm-i2v" point at the same directory.
TOKENIZER_PATH = {
    "clipL": f"{MODEL_BASE}/clip_vit_large_patch14",
    **dict.fromkeys(("llm", "llm-i2v"), f"{MODEL_BASE}/llava-llama-3-8b"),
}
| |
|
# Text projection identifiers.
TEXT_PROJECTION = {"linear", "single_refiner"}
| |
|
| | |
# Option sets for the flow configuration, one set of string identifiers per
# setting.

# Path type identifiers.
FLOW_PATH_TYPE = {"linear", "gvp", "vp"}

# Prediction type identifiers.
FLOW_PREDICT_TYPE = {"velocity", "score", "noise"}

# Loss weighting identifiers.
FLOW_LOSS_WEIGHT = {"velocity", "likelihood"}

# SNR type identifiers.
FLOW_SNR_TYPE = {"lognorm", "uniform"}

# Solver identifiers (a single entry).
FLOW_SOLVER = {"euler"}