import numpy as np
import torch
from PIL import Image
import sys
import os

# Add the project root directory to Python path (use absolute paths for robustness)
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(project_root)
sys.path.append(os.path.join(project_root, "libs"))
sys.path.append(os.path.join(project_root, "libs", "LGM"))
sys.path.append(os.path.join(project_root, "libs", "das"))
sys.path.append(os.path.join(project_root, "src"))

from sv3d.diffusers_sv3d import SV3DUNetSpatioTemporalConditionModel, StableVideo3DDiffusionPipeline
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from diffusers import AutoencoderKL, EulerDiscreteScheduler, DDIMScheduler
from diffusers.utils import export_to_gif, export_to_video
from kiui.cam import orbit_camera
from safetensors.torch import load_file
from omegaconf import OmegaConf

from LGM.core.models import LGM
from LGM.core.options import AllConfigs 
from LGM.core.gs import GaussianRenderer
from .track_utils.visualize_tracks import visualize_tracks
from .track_utils.preprocessing import track_first, find_and_remove_nearest_point
from .interpolate import interpolate_points
from das.models.pipelines import DiffusionAsShaderPipeline

import tyro
from tqdm import tqdm
from options import TestingConfig
from pipeline_traj import TrajPipeline
from model.spacetime import MDM_ST
from argparse import Namespace

def load_sv3d_pipeline(device, model_path="chenguolin/sv3d-diffusers"):
    unet = SV3DUNetSpatioTemporalConditionModel.from_pretrained(model_path, subfolder="unet")
    vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae")
    scheduler = EulerDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(model_path, subfolder="image_encoder")
    feature_extractor = CLIPImageProcessor.from_pretrained(model_path, subfolder="feature_extractor")
    pipeline = StableVideo3DDiffusionPipeline(
        image_encoder=image_encoder, feature_extractor=feature_extractor, 
        unet=unet, vae=vae,
        scheduler=scheduler,
    ).to(device)
    return pipeline
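
# Example usage (a minimal sketch; the exact __call__ signature of the custom
# StableVideo3DDiffusionPipeline is an assumption based on diffusers' video pipelines):
#   pipe = load_sv3d_pipeline(torch.device("cuda"))
#   frames = pipe(image, num_inference_steps=25).frames[0]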

def load_LGM(opt, device, lgm_ckpt_path="./checkpoints/lgm_fp16.safetensors"):
    model = LGM(opt)
    ckpt = load_file(lgm_ckpt_path, device='cpu')
    model.load_state_dict(ckpt, strict=False)
    model = model.half().to(device)
    model.eval()
    return model
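
# Sketch of how this loader is driven (mirrors gen_tracking_video below, where
# the 'big' LGM config is selected via tyro; the device choice is an assumption):
#   sys.argv = ['pipeline_track_gen.py', 'big']
#   opt = tyro.cli(AllConfigs)
#   lgm = load_LGM(opt, torch.device('cuda'))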

def load_diffusion(device, model_cfg_path, diffusion_ckpt_path, seed=0):
    schema = OmegaConf.structured(TestingConfig)
    cfg = OmegaConf.load(model_cfg_path)
    cfg = OmegaConf.merge(schema, cfg)
    n_training_frames = cfg.train_dataset.n_training_frames
    n_frames_interval = cfg.train_dataset.n_frames_interval
    norm_fac = cfg.train_dataset.norm_fac

    model = MDM_ST(cfg.pc_size, n_training_frames, n_feats=3, model_config=cfg.model_config).to(device)

    ckpt = load_file(diffusion_ckpt_path, device='cpu')
    model.load_state_dict(ckpt, strict=False)
    model.eval().requires_grad_(False)
    noise_scheduler = DDIMScheduler(num_train_timesteps=1000, prediction_type='sample', clip_sample=False)
    pipeline = TrajPipeline(model=model, scheduler=noise_scheduler)
    return pipeline
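
# The returned TrajPipeline is expected to generate the per-frame driving-point
# trajectories that gen_tracking_video later reads from '{base_dir}/gen_data.npy'
# (inferred from the file layout; the calling convention lives outside this module).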

def gen_tracking_video(base_dir):
    # Load the generated driving-point trajectories (24 frames are consumed) and
    # upsample them to 49 frames: new frame 2k+1 <- old frame k, new frame 2k+2
    # <- midpoint of old frames k and k+1; frames 0 and 48 duplicate their neighbours.
    animated_points = np.load(f'{base_dir}/gen_data.npy')
    animated_points = animated_points * 2  # rescale; presumably inverts the training-time normalization
    new_animate_points = np.zeros((49, 2048, 3))
    for i in range(47):
        if i % 2 == 0:
            new_animate_points[i + 1] = animated_points[i // 2]
        else:
            new_animate_points[i + 1] = (animated_points[i // 2] + animated_points[i // 2 + 1]) / 2
    new_animate_points[0] = new_animate_points[1]
    new_animate_points[48] = new_animate_points[47]
    animated_points = new_animate_points

    projection_matrix = np.load(f'{base_dir}/projection.npy')
    crop_info = np.load(f'{base_dir}/crop_info.npy')
    center = np.load(f'{base_dir}/center.npy')
    scale = np.load(f'{base_dir}/scale.npy')
    # Map the trajectories back into the Gaussian point cloud's coordinate frame
    # (inverse of the normalization performed in normalize_points below).
    animated_points = (animated_points / scale) + center

    print(animated_points.mean(), animated_points.std(), animated_points.max(), animated_points.min())  # debug stats

    device = torch.device("cuda")
    # Select the 'big' LGM config via tyro; argv is overridden deliberately.
    sys.argv = ['pipeline_track_gen.py', 'big']
    opt = tyro.cli(AllConfigs)

    # Enlarge the canvas by scale_factor at a fixed focal length, which widens
    # the FOV: tan(new_fovy / 2) = scale_factor * tan(fovy / 2).
    scale_factor = 2
    focal = 0.5 * opt.output_size / np.tan(np.deg2rad(opt.fovy) / 2)
    new_fovy_rad = 2 * np.arctan(scale_factor * 0.5 * opt.output_size / focal)
    opt.fovy = np.rad2deg(new_fovy_rad)
    opt.output_size *= scale_factor
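    # Worked example (hypothetical fovy = 60 deg): tan(30 deg) ~= 0.577, so
    # tan(new_fovy / 2) = 2 * 0.577 ~= 1.155 and new_fovy = 2 * arctan(1.155) ~= 98.2 deg.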

    gs = GaussianRenderer(opt)
    gaussians = gs.load_ply(f'{base_dir}/point_cloud.ply', compatible=True).to(device).float()
    idx = torch.from_numpy(np.load(f'{base_dir}/idx.npy')).to(device)
    gaussian_pos = gaussians[:, :3].contiguous()
    drive_x = gaussian_pos[idx]  # rest-pose positions of the 2048 driving points
    cdist = -1.0 * torch.cdist(gaussian_pos, drive_x)  # negated distances, [N, 2048]
    _, topk_index = torch.topk(cdist, 8, -1)  # 8 nearest driving points per Gaussian
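    # Each Gaussian is deformed by blending the motion of these 8 nearest driving
    # points inside interpolate_points (effectively a nearest-neighbour skinning;
    # inferred from interpolate_points' arguments).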

    cam_poses = torch.from_numpy(orbit_camera(0, 0, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)
    cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction
    cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
    cam_view_proj = cam_view @ gs.proj_matrix.to(device) # [V, 4, 4]
    cam_pos = - cam_poses[:, :3, 3] # [V, 3]

    pos = []  # per-frame deformed Gaussian positions

    for i in tqdm(range(49)):
        drive_current = torch.from_numpy(animated_points[i]).to(device).float()
        # Propagate the driving-point motion to every Gaussian through its 8-NN.
        ret_points, new_rotation = interpolate_points(gaussian_pos, gaussians[:, 7:11], drive_x, drive_current, topk_index)
        gaussians_new = gaussians.clone()  # used by the (currently commented-out) render below
        gaussians_new[:, :3] = ret_points
        gaussians_new[:, 7:11] = new_rotation
        pos.append(ret_points.cpu().numpy())

        # with torch.no_grad():
        #     ret = gs.render(gaussians_new.unsqueeze(0), cam_view.unsqueeze(0), cam_view_proj.unsqueeze(0), cam_pos.unsqueeze(0), scale_modifier=1)
        #     mask = (ret['alpha'][0,0].permute(1, 2, 0).contiguous().float().cpu().numpy() * 255.0).astype(np.uint8)
        #     image = (ret['image'][0, 0].permute(1, 2, 0).contiguous().float().cpu().numpy()*255.0).astype(np.uint8)
        #     image_save = np.concatenate([image, mask], axis=-1)

        #     h_begin, w_begin, res = crop_info[0], crop_info[1], crop_info[2]
        #     h_begin = h_begin - (256 * scale_factor - 256)
        #     w_begin = w_begin - (256 * scale_factor - 256)
        #     image_save = Image.fromarray(image_save).resize((res * scale_factor, res * scale_factor), Image.LANCZOS) 
    
    # Load the template track file and replace its tracks with projections of
    # the deformed points.
    template_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'templates', 'tracks_template.npy')
    track_template = np.load(template_path, allow_pickle=True)
    tracks = track_template.item()['tracks']
    tracks_output = tracks.copy()
    tracks_init = tracks[0, 0]  # template track points in the first frame
    track_idx = []  # projected-point index matched to each template point
    mask = np.zeros(tracks_init.shape[0], dtype=bool)  # True where a match was found
    
    for i in tqdm(range(49)):

        # points = animated_points[i]
        points = pos[i]

        # Project to clip space and apply the perspective divide.
        projected_points = (projection_matrix.T @ np.hstack((points, np.ones((points.shape[0], 1)))).T).T
        projected_points_weights = 1. / (projected_points[:, -1:] + 1e-8)
        projected_points = (projected_points * projected_points_weights)[:, :-1]

        # Map NDC [-1, 1] to pixel coordinates inside the square crop of side
        # res, then offset by the crop origin (w_begin, h_begin).
        h_begin, w_begin, res = crop_info[0], crop_info[1], crop_info[2]
        projected_points[:, :2] = ((projected_points[:, :2] + 1) * res - 1) / 2
        projected_points[:, 0] += w_begin
        projected_points[:, 1] += h_begin

        if i == 0:
            # Match each first-frame template point to its nearest projected
            # point; the matched indices are reused for all later frames.
            track_point_candidates = track_first(projected_points, (480, 720))  # full-frame size, presumably (H, W)
            for j in range(tracks_init.shape[0]):
                target = np.array([tracks_init[j, 0], tracks_init[j, 1]])
                candidate, track_point_candidates = find_and_remove_nearest_point(target, track_point_candidates)
                if candidate is not None:
                    track_idx.append(candidate[3].astype(np.int32))
                    mask[j] = True

        tracks_output[0, i, mask] = projected_points[track_idx]
        # Unmatched points keep their first-frame (x, y); their third channel is
        # pinned to a constant.
        tracks_output[0, i, ~mask, :2] = tracks_output[0, 0, ~mask, :2]
        tracks_output[0, i, ~mask, 2] = 2
    
    track_template.item()['tracks'] = tracks_output
    # track_template.item()['drag_points'] = np.stack(drag_points, axis=0)
    sub_name = 'tracks_gen'
    sub_dir = f'{base_dir}/{sub_name}'
    os.makedirs(sub_dir, exist_ok=True)

    np.save(f'{sub_dir}/tracks.npy', track_template)
    args = Namespace(tracks_dir=sub_dir, output_dir=sub_dir, output_fps=24, point_size=10, len_track=0, num_frames=49, video_path=None)
    visualize_tracks(tracks_dir=sub_dir, output_dir=sub_dir, args=args)

def load_das(gpu_id, output_dir):
    das = DiffusionAsShaderPipeline(gpu_id=gpu_id, output_dir=output_dir)
    return das

def normalize_points(output_dir, fluid=False):
    from .transform import transform2origin, shift2center
    import trimesh
    from torch_cluster import fps

    device = 'cuda'

    # Normalize the Gaussian point cloud to a unit-size box at the origin, then
    # shift it to the fixed grid center used downstream.
    pc_path = f'{output_dir}/point_cloud.ply'
    pc = trimesh.load_mesh(pc_path)
    points = np.array(pc.vertices)
    points, center, scale = transform2origin(points, size=1)
    N = 2048  # number of driving points to sample
    grid_center = [5, 5, 5]
    drag_size = [0.4, 0.4, 0.4]  # (unused here)

    points = shift2center(points, center=grid_center)
    points = torch.tensor(points, dtype=torch.float32, device=device).contiguous()
    np.save(f'{output_dir}/center.npy', center)
    np.save(f'{output_dir}/scale.npy', scale)
    # Farthest-point sampling of N driving points; save their indices so the
    # tracking stage can look up the same Gaussians.
    ratio_N = N / points.shape[0]
    idx = fps(points, ratio=ratio_N, random_start=True)
    points = points[idx].cpu().numpy()
    np.save(f'{output_dir}/idx.npy', idx.cpu().numpy())
    return points, center, scale
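

# Minimal end-to-end sketch (hypothetical driver; paths and call order are
# assumptions based on the artifacts each function reads and writes):
#   normalize_points('./outputs/example')    # writes center/scale/idx.npy
#   ... run the trajectory diffusion to produce gen_data.npy ...
#   gen_tracking_video('./outputs/example')  # writes tracks_gen/tracks.npy + video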