| import os
|
| import torch
|
|
|
# Public API of this module: the names bound by ``from <module> import *``.
# PRECISION_TO_TYPE is listed alongside PRECISIONS so that star-import
# consumers get the precision -> torch dtype mapping together with the set
# of precision names.
__all__ = [
    "C_SCALE",
    "PROMPT_TEMPLATE",
    "MODEL_BASE",
    "PRECISIONS",
    "PRECISION_TO_TYPE",
    "NORMALIZATION_TYPE",
    "ACTIVATION_TYPE",
    "VAE_PATH",
    "TEXT_ENCODER_PATH",
    "TOKENIZER_PATH",
    "TEXT_PROJECTION",
    "DATA_TYPE",
    "NEGATIVE_PROMPT",
    "NEGATIVE_PROMPT_I2V",
    "FLOW_PATH_TYPE",
    "FLOW_PREDICT_TYPE",
    "FLOW_LOSS_WEIGHT",
    "FLOW_SNR_TYPE",
    "FLOW_SOLVER",
]
|
|
|
# Maps a short precision name to the corresponding torch dtype.
PRECISION_TO_TYPE = dict(
    fp32=torch.float32,
    fp16=torch.float16,
    bf16=torch.bfloat16,
)
|
|
|
|
|
|
|
|
|
# Integer scale constant equal to 1_000_000_000_000_000 (ten to the 15th).
# NOTE(review): presumably a "peta" divisor for reporting compute figures;
# its consumers are not visible in this file -- confirm before relying on it.
C_SCALE = 10 ** 15
|
|
|
|
|
|
|
|
|
# Chat-style prompt templates. Each one contains exactly one ``{}``
# placeholder (inside the user turn) to be filled via ``str.format``.
#
# The four templates are assembled from shared fragments. The resulting
# strings must stay byte-for-byte stable.
# NOTE(review): the ``crop_start`` offsets in PROMPT_TEMPLATE are presumably
# measured against these exact strings -- confirm before editing any text,
# including the missing separators between the numbered items below.

_SYSTEM_HEADER = "<|start_header_id|>system<|end_header_id|>\n\n"
_USER_TURN = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
_ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>\n\n"
_IMAGE_TAG = "<image>\n"

# System instruction used by the image-description templates.
_IMAGE_INSTRUCTION = (
    "Describe the image by detailing the color, shape, size, texture, "
    "quantity, text, spatial relationships of the objects and background:"
    "<|eot_id|>"
)

# Numbered aspect list used by the video-description templates. The items
# are concatenated with no whitespace between them ("...video.2. The...").
_VIDEO_ASPECTS = (
    "1. The main content and theme of the video."
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    "4. background environment, light, style and atmosphere."
    "5. camera angles, movements, and transitions used in the video:"
    "<|eot_id|>"
)

# Text-only, image-style description prompt.
PROMPT_TEMPLATE_ENCODE = _SYSTEM_HEADER + _IMAGE_INSTRUCTION + _USER_TURN

# Text-only, video-style description prompt.
PROMPT_TEMPLATE_ENCODE_VIDEO = (
    _SYSTEM_HEADER
    + "Describe the video by detailing the following aspects: "
    + _VIDEO_ASPECTS
    + _USER_TURN
)

# Image-conditioned variant: adds an ``<image>`` tag to the system turn and
# ends with an open assistant header.
PROMPT_TEMPLATE_ENCODE_I2V = (
    _SYSTEM_HEADER
    + _IMAGE_TAG
    + _IMAGE_INSTRUCTION
    + _USER_TURN
    + _ASSISTANT_HEADER
)

# Image-conditioned video variant. Unlike PROMPT_TEMPLATE_ENCODE_VIDEO, the
# system turn here is followed by an extra "\n\n" before the user turn.
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    _SYSTEM_HEADER
    + _IMAGE_TAG
    + "Describe the video by detailing the following aspects according to the reference image: "
    + _VIDEO_ASPECTS
    + "\n\n"
    + _USER_TURN
    + _ASSISTANT_HEADER
)
|
|
|
# Default negative prompts, stored as comma-separated phrase lists.
NEGATIVE_PROMPT = ", ".join(
    (
        "Aerial view",
        "aerial view",
        "overexposed",
        "low quality",
        "deformation",
        "a poor composition",
        "bad hands",
        "bad teeth",
        "bad eyes",
        "bad limbs",
        "distortion",
    )
)

# Shorter negative prompt; the name suggests it is the image-to-video one.
NEGATIVE_PROMPT_I2V = ", ".join(
    (
        "deformation",
        "a poor composition and deformed video",
        "bad teeth",
        "bad eyes",
        "bad limbs",
    )
)
|
|
|
# Image-embedding fields that are identical for both "-i2v" entries.
# image_emb_len equals image_emb_end - image_emb_start (581 - 5 = 576).
# NOTE(review): double_return_token_id is presumably the tokenizer id of
# "\n\n" -- confirm against the tokenizer in use.
_I2V_IMAGE_FIELDS = {
    "image_emb_start": 5,
    "image_emb_end": 581,
    "image_emb_len": 576,
    "double_return_token_id": 271,
}

# Registry of prompt templates keyed by template name. Every entry carries
# the template string and a ``crop_start`` integer; the "-i2v" entries also
# carry the image-embedding fields above (each entry gets its own copy).
# NOTE(review): crop_start is presumably the number of leading template
# tokens to drop from the encoder output -- verify against the text encoder.
PROMPT_TEMPLATE = {
    "dit-llm-encode": dict(
        template=PROMPT_TEMPLATE_ENCODE,
        crop_start=36,
    ),
    "dit-llm-encode-video": dict(
        template=PROMPT_TEMPLATE_ENCODE_VIDEO,
        crop_start=95,
    ),
    "dit-llm-encode-i2v": {
        "template": PROMPT_TEMPLATE_ENCODE_I2V,
        "crop_start": 36,
        **_I2V_IMAGE_FIELDS,
    },
    "dit-llm-encode-video-i2v": {
        "template": PROMPT_TEMPLATE_ENCODE_VIDEO_I2V,
        "crop_start": 103,
        **_I2V_IMAGE_FIELDS,
    },
}
|
|
|
|
|
# Recognized precision names.
PRECISIONS = {
    "fp32",
    "fp16",
    "bf16",
}

# Recognized normalization-layer names.
NORMALIZATION_TYPE = {
    "layer",
    "rms",
}

# Recognized activation-function names.
ACTIVATION_TYPE = {
    "relu",
    "silu",
    "gelu",
    "gelu_tanh",
}
|
|
|
|
|
# Root directory for model checkpoints. Taken from the MODEL_BASE
# environment variable when it is set; otherwise "./ckpts".
MODEL_BASE = os.getenv("MODEL_BASE", default="./ckpts")

# Recognized data-type names.
DATA_TYPE = {
    "image",
    "video",
    "image_video",
}

# VAE checkpoint directory, keyed by VAE name.
VAE_PATH = {
    "884-16c-hy": MODEL_BASE + "/hunyuan-video-t2v-720p/vae",
}

# The "llm" and "llm-i2v" keys resolve to the same directory.
_CLIP_L_DIR = MODEL_BASE + "/clip_vit_large_patch14"
_LLAVA_LLAMA3_DIR = MODEL_BASE + "/llava-llama-3-8b"

# Text-encoder checkpoint directories, keyed by encoder name.
TEXT_ENCODER_PATH = {
    "clipL": _CLIP_L_DIR,
    "llm": _LLAVA_LLAMA3_DIR,
    "llm-i2v": _LLAVA_LLAMA3_DIR,
}

# Tokenizer directories. Currently the same locations as the encoders, but
# kept as a separate dict object.
TOKENIZER_PATH = {
    "clipL": _CLIP_L_DIR,
    "llm": _LLAVA_LLAMA3_DIR,
    "llm-i2v": _LLAVA_LLAMA3_DIR,
}
|
|
|
# Recognized text-projection names.
TEXT_PROJECTION = {"linear", "single_refiner"}
|
|
|
|
|
# Sets of recognized flow-related option names. How each option is
# consumed is not visible in this file.

# Recognized flow path types.
FLOW_PATH_TYPE = {"linear", "gvp", "vp"}

# Recognized flow prediction types.
FLOW_PREDICT_TYPE = {"velocity", "score", "noise"}

# Recognized flow loss-weighting names.
FLOW_LOSS_WEIGHT = {"velocity", "likelihood"}

# Recognized flow SNR type names.
FLOW_SNR_TYPE = {"lognorm", "uniform"}

# Recognized flow solver names (a single entry at present).
FLOW_SOLVER = {"euler"}