| | import os |
| | import torch |
| | import argparse |
| | from demo import GetAnchorVideos |
| | from datetime import datetime |
| |
|
def _parse_bool_flag(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is
    ``True`` because every non-empty string is truthy, so the flag could
    never be switched off by spelling out ``False``.

    Args:
        value: The raw token from the command line (or an actual bool).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised
            boolean spelling; argparse reports this as a usage error.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    # The empty string is kept as False: with the former ``type=bool`` it
    # was the only input that produced False, so existing callers that
    # relied on it keep working.
    if token in ('false', 'f', 'no', 'n', 'off', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value (true/false), got {value!r}"
    )


def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this script.

    Only ``--out_dir`` is required; every other option has a default (or
    defaults to ``None`` when no default is given).

    Returns:
        The configured ``argparse.ArgumentParser``. The caller is
        responsible for invoking ``parse_args`` on it.
    """
    parser = argparse.ArgumentParser()

    # ---- input / output ------------------------------------------------
    parser.add_argument('--video_path', type=str, help='Input path')
    parser.add_argument(
        '--out_dir', type=str, required=True, help='Output dir'
    )
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='The device to use'
    )
    parser.add_argument(
        '--exp_name',
        type=str,
        default=None,
        help='Experiment name, use video file name by default',
    )
    # NOTE(review): this help text is identical to --exp_name's and looks
    # copy-pasted; confirm what --save_name actually controls.
    parser.add_argument(
        '--save_name',
        type=str,
        default=None,
        help='Experiment name, use video file name by default',
    )
    parser.add_argument(
        '--seed', type=int, default=43, help='Random seed for reproducibility'
    )
    parser.add_argument(
        '--video_length', type=int, default=49, help='Length of the video frames'
    )
    parser.add_argument('--fps', type=int, default=10, help='Fps for saved video')
    parser.add_argument(
        '--stride', type=int, default=1, help='Sampling stride for input video'
    )
    parser.add_argument('--server_name', type=str, help='Server IP address')

    # ---- camera / rendering --------------------------------------------
    parser.add_argument(
        '--radius_scale',
        type=float,
        default=1.0,
        help='Scale factor for the spherical radius',
    )
    parser.add_argument('--camera', type=str, default='traj', help='traj or target')
    # The help text lists every mode the script entry point dispatches on.
    parser.add_argument(
        '--mode',
        type=str,
        default='gradual',
        help='gradual, direct, bullet, image, start_end or zoom',
    )
    parser.add_argument(
        '--mask', action='store_true', default=False, help='Clean the pcd if true'
    )
    parser.add_argument(
        '--traj_txt',
        type=str,
        help="Required for 'traj' camera, a txt file that specify camera trajectory",
    )
    parser.add_argument(
        '--target_pose',
        nargs=5,
        type=float,
        help="Required for 'target' mode, specify target camera pose, <theta phi r x y>",
    )
    parser.add_argument(
        '--near', type=float, default=0.0001, help='Near clipping plane distance'
    )
    parser.add_argument(
        '--far', type=float, default=10000.0, help='Far clipping plane distance'
    )
    parser.add_argument(
        '--height', type=int, default=480, help='Height'
    )
    parser.add_argument(
        '--width', type=int, default=720, help='width'
    )

    parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)
    parser.add_argument('--anchor_idx', type=int, default=0, help='One GT frame')
    # Boolean options take an explicit value (e.g. ``--near_far_estimated
    # false``); they are parsed by _parse_bool_flag rather than bool().
    parser.add_argument(
        '--near_far_estimated',
        type=_parse_bool_flag,
        default=True,
        help='Use estimated near and far values',
    )
    parser.add_argument(
        '--anchor_incre_res_input',
        type=_parse_bool_flag,
        default=True,
        help='Dont load heavy models if generating the higher resolution anchor video',
    )

    # ---- diffusion model -----------------------------------------------
    parser.add_argument(
        '--low_gpu_memory_mode',
        type=_parse_bool_flag,
        default=False,
        help='Enable low GPU memory mode',
    )
    parser.add_argument(
        '--model_name',
        type=str,
        default='/data/pretrained/CogVideoX-Fun-V1.1-5b-InP',
        help='Path to the model',
    )
    parser.add_argument(
        '--sampler_name',
        type=str,
        choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"],
        default='DDIM_Origin',
        help='Choose the sampler',
    )
    parser.add_argument(
        '--transformer_path',
        type=str,
        default="/data/pretrained/TrajectoryCrafter",
        help='Path to the pretrained transformer model',
    )
    parser.add_argument(
        '--load_size',
        type=int,
        nargs=2,
        default=[384, 672],
        help='Load size as [height, width]',
    )
    parser.add_argument(
        '--sample_size',
        type=int,
        nargs=2,
        default=[384, 672],
        help='Sample size as [height, width]',
    )
    parser.add_argument(
        '--depth_size',
        type=int,
        nargs=2,
        default=[480, 720],
        help='Depth size as [height, width]',
    )
    parser.add_argument(
        '--diffusion_guidance_scale',
        type=float,
        default=6.0,
        help='Guidance scale for inference',
    )
    parser.add_argument(
        '--diffusion_inference_steps',
        type=int,
        default=50,
        help='Number of inference steps',
    )
    parser.add_argument(
        '--prompt', type=str, default=None, help='Prompt for video generation'
    )
    parser.add_argument(
        '--negative_prompt',
        type=str,
        default="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion.",
        help='Negative prompt for video generation',
    )
    parser.add_argument(
        '--refine_prompt',
        type=str,
        default="The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
        help='Prompt for video generation',
    )
    parser.add_argument('--qwen_path', type=str, default="/data/pretrained/Qwen2.5-VL-7B-Instruct")

    # ---- depth estimation ----------------------------------------------
    parser.add_argument(
        '--unet_path',
        type=str,
        default="/data/pretrained/DepthCrafter",
        help='Path to the UNet model',
    )
    parser.add_argument(
        '--pre_train_path',
        type=str,
        default="/data/pretrained/stable-video-diffusion-img2vid",
        help='Path to the pre-trained model',
    )
    parser.add_argument(
        '--cpu_offload', type=str, default='model', help='CPU offload strategy'
    )
    parser.add_argument(
        '--depth_inference_steps', type=int, default=5, help='Number of inference steps'
    )
    parser.add_argument(
        '--depth_guidance_scale',
        type=float,
        default=1.0,
        help='Guidance scale for inference',
    )
    parser.add_argument(
        '--window_size', type=int, default=110, help='Window size for processing'
    )
    parser.add_argument(
        '--overlap', type=int, default=25, help='Overlap size for processing'
    )
    parser.add_argument(
        '--max_res', type=int, default=1024, help='Maximum resolution for processing'
    )

    # ---- initial offsets (all default to 0.0) --------------------------
    parser.add_argument('--init_dx', type=float, default=0.0)
    parser.add_argument('--init_dy', type=float, default=0.0)
    parser.add_argument('--init_dz', type=float, default=0.0)
    parser.add_argument('--init_theta', type=float, default=0.0)
    parser.add_argument('--init_phi', type=float, default=0.0)

    return parser
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: parse the CLI options, build the GetAnchorVideos
    # helper and run the inference routine selected by --mode.
    parser = get_parser()
    opts = parser.parse_args()

    # Maps each supported --mode value to the GetAnchorVideos method that
    # implements it.
    mode_to_method = {
        'gradual': 'infer_gradual',
        'direct': 'infer_direct',
        'bullet': 'infer_bullet',
        'image': 'infer_image',
        'start_end': 'infer_start_end',
        'zoom': 'infer_zoom',
    }

    # An unrecognised mode used to fall through every branch, so the script
    # exited successfully without running anything. Reject it up front,
    # before GetAnchorVideos is constructed.
    if opts.mode not in mode_to_method:
        parser.error(
            f"unsupported --mode {opts.mode!r}; "
            f"choose from: {', '.join(mode_to_method)}"
        )

    # weight_dtype is not a CLI option; it is attached to opts before opts is
    # passed to GetAnchorVideos.
    opts.weight_dtype = torch.bfloat16
    pvd = GetAnchorVideos(opts)
    getattr(pvd, mode_to_method[opts.mode])(opts)