# Source: Hugging Face Space "4dgs-dpm", file app.py (commit f61284c, verified)
# Page-header text from the upload listing removed so this module parses.
"""
DPM-Splat: End-to-end pipeline for Video β†’ 4D Gaussian Splats
Combines VDPM inference with Dynamic 4DGS training in a single Gradio interface.
"""
import os
import sys
import shutil
import zipfile
import gc
import json
import glob
import time
from pathlib import Path
from datetime import datetime
import cv2
import numpy as np
import gradio as gr
import torch
import imageio
# Set memory optimization
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Add paths
sys.path.insert(0, str(Path(__file__).parent / "vdpm"))
sys.path.insert(0, str(Path(__file__).parent / "gs"))
# Import depth utilities
from vdpm.util.depth import write_depth_to_png
# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
gpu_name = torch.cuda.get_device_name(0)
gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
print(f"βœ“ GPU: {gpu_name} ({gpu_mem:.1f} GB)")
else:
print("⚠ No GPU detected - running on CPU (will be slow)")
# Configuration
VIDEO_SAMPLE_HZ = 1.0
# Set MAX_FRAMES based on VRAM
if device == "cuda":
if gpu_mem >= 20.0:
MAX_FRAMES = 32
elif gpu_mem >= 12.0:
MAX_FRAMES = 16
else:
MAX_FRAMES = 8
else:
MAX_FRAMES = 4
print(f"βœ“ Configured limit: {MAX_FRAMES} frames based on {gpu_mem:.1f} GB VRAM")
# Global model cache
_vdpm_model = None
def get_vdpm_model():
    """Return the VDPM model, loading weights on first use and caching globally."""
    global _vdpm_model
    if _vdpm_model is not None:
        print("βœ“ Using cached VDPM model")
        return _vdpm_model

    print("Loading VDPM model...")
    sys.stdout.flush()

    from hydra import compose, initialize
    from hydra.core.global_hydra import GlobalHydra
    from dpm.model import VDPM

    # Hydra allows only one active global instance; clear any stale state
    # left over from a previous load before re-initializing.
    hydra_state = GlobalHydra.instance()
    if hydra_state.is_initialized():
        hydra_state.clear()
    with initialize(config_path="vdpm/configs"):
        cfg = compose(config_name="visualise")
    model = VDPM(cfg).to(device)

    # Resolve weights on disk, downloading them on the first run only.
    cache_dir = os.path.expanduser("~/.cache/vdpm")
    os.makedirs(cache_dir, exist_ok=True)
    model_path = os.path.join(cache_dir, "vdpm_model.pt")
    _URL = "https://huggingface.co/edgarsucar/vdpm/resolve/main/model.pt"
    if os.path.exists(model_path):
        print(f"βœ“ Loading cached model from {model_path}")
        state = torch.load(model_path, map_location=device)
    else:
        print(f"Downloading VDPM model...")
        state = torch.hub.load_state_dict_from_url(_URL, file_name="vdpm_model.pt", progress=True, map_location=device)
        torch.save(state, model_path)
    model.load_state_dict(state, strict=True)
    model.eval()

    # Half precision halves VRAM use; BF16 needs compute capability >= 8 (Ampere+).
    if device == "cuda":
        if torch.cuda.get_device_capability()[0] >= 8:
            model = model.to(torch.bfloat16)
            print("βœ“ Using BF16 precision")
        else:
            model = model.half()
            print("βœ“ Using FP16 precision")

    _vdpm_model = model
    return model
def process_videos(video_files, target_dir):
    """Extract frames from each uploaded video, interleaved across cameras.

    Frames are written as ``target_dir/images/NNNNNN.png`` in the order
    [cam0_t0, cam1_t0, cam0_t1, cam1_t1, ...], and the view count is
    persisted to ``target_dir/meta.json``.

    Returns:
        (list of saved image paths, number of camera views)
    """
    frames_dir = target_dir / "images"
    frames_dir.mkdir(parents=True, exist_ok=True)

    num_views = len(video_files)
    readers = []
    sample_every = []
    for upload in video_files:
        # Gradio may hand us file objects (with .name) or plain paths.
        src = upload.name if hasattr(upload, 'name') else str(upload)
        cap = cv2.VideoCapture(src)
        fps = float(cap.get(cv2.CAP_PROP_FPS) or 30.0)
        readers.append(cap)
        # How many native frames to skip between samples at VIDEO_SAMPLE_HZ.
        sample_every.append(max(int(fps / max(VIDEO_SAMPLE_HZ, 1e-6)), 1))

    saved_paths = []
    saved_count = 0
    round_idx = 0
    any_alive = True
    # Round-robin over cameras so the saved sequence interleaves views.
    while any_alive:
        any_alive = False
        for cam, cap in enumerate(readers):
            if not cap.isOpened():
                continue
            ok, frame = cap.read()
            if not ok:
                cap.release()
                continue
            any_alive = True
            if round_idx % sample_every[cam] == 0:
                dest = frames_dir / f"{saved_count:06d}.png"
                cv2.imwrite(str(dest), frame)
                saved_paths.append(str(dest))
                saved_count += 1
        round_idx += 1

    for cap in readers:
        if cap.isOpened():
            cap.release()

    # Persist the view count for the downstream inference stage.
    with open(target_dir / "meta.json", "w") as f:
        json.dump({"num_views": num_views}, f)
    return saved_paths, num_views
def decode_poses(pose_enc: np.ndarray, image_hw: tuple) -> tuple:
    """Decode VGGT pose encodings into 4x4 extrinsics and 3x3 intrinsics.

    Falls back to identity extrinsics and a heuristic pinhole intrinsic
    (focal = max(H, W), principal point at the image center) when the
    ``vggt`` package is not importable.
    """
    try:
        from vggt.utils.pose_enc import pose_encoding_to_extri_intri
    except ImportError:
        print("Warning: vggt not available. Using identity poses.")
        n_imgs = pose_enc.shape[1]
        extrinsics = np.tile(np.eye(4, dtype=np.float32), (n_imgs, 1, 1))
        H, W = image_hw
        fx = fy = max(H, W)
        cx, cy = W / 2, H / 2
        K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)
        return extrinsics, np.tile(K, (n_imgs, 1, 1))

    enc = torch.from_numpy(pose_enc).float()
    extri, intri = pose_encoding_to_extri_intri(enc, image_hw)
    extri = extri[0].numpy()  # (N, 3, 4)
    intri = intri[0].numpy()  # (N, 3, 3)
    n_cams = extri.shape[0]
    # Promote each 3x4 extrinsic to homogeneous 4x4 by appending [0,0,0,1].
    last_row = np.tile(np.array([0, 0, 0, 1], dtype=np.float32).reshape(1, 1, 4), (n_cams, 1, 1))
    return np.concatenate([extri, last_row], axis=1), intri
def compute_depths(world_points: np.ndarray, extrinsics: np.ndarray, num_views: int) -> np.ndarray:
    """Project world points into each camera and keep the Z coordinate.

    Args:
        world_points: (T, V, H, W, 3) world-space 3D points.
        extrinsics: (N, 4, 4) world-to-camera transforms, ordered as
            frame-major interleaved views (index = t * num_views + v).
        num_views: Number of camera views.

    Returns:
        (T, V, H, W) float32 depth maps (Z in camera coordinates).
    """
    T, V, H, W, _ = world_points.shape
    depths = np.zeros((T, V, H, W), dtype=np.float32)
    for t in range(T):
        for v in range(V):
            cam_idx = t * num_views + v
            if cam_idx >= len(extrinsics):
                # Not enough cameras for this timestep: reuse frame 0's view.
                cam_idx = v
            w2c = extrinsics[cam_idx]
            rot = w2c[:3, :3]
            trans = w2c[:3, 3]
            # World -> camera: x_cam = R @ x_world + t, done row-wise.
            pts = world_points[t, v].reshape(-1, 3)
            cam_pts = pts @ rot.T + trans
            depths[t, v] = cam_pts[:, 2].reshape(H, W)
    return depths
def run_vdpm_inference(target_dir, progress):
    """Run VDPM inference and save outputs in output_4d.npz format.

    Loads frames under ``target_dir/images``, runs the (cached) VDPM model,
    and writes to ``target_dir``:
      - ``output_4d.npz``: points/confidences/images laid out for
        ``gs.train_dynamic`` ((num_timesteps, V*H*W, 3) points per timestep),
      - ``poses.npz``: raw pose encodings (only if the model predicts them),
      - ``depths.npz`` plus ``depths/*.png`` previews (same condition).

    Returns:
        (num_timesteps, num_views)
    """
    from vggt.utils.load_fn import load_and_preprocess_images
    model = get_vdpm_model()
    image_names = sorted(glob.glob(os.path.join(target_dir, "images", "*")))
    if not image_names:
        raise ValueError("No images found")
    # Load metadata (number of synchronized camera views written by process_videos)
    meta_path = target_dir / "meta.json"
    num_views = 1
    if meta_path.exists():
        with open(meta_path) as f:
            num_views = json.load(f).get("num_views", 1)
    # Limit frames to the VRAM budget, rounded down to a whole multiple of
    # num_views so every kept timestep has all of its views.
    if len(image_names) > MAX_FRAMES:
        limit = (MAX_FRAMES // num_views) * num_views
        if limit == 0:
            limit = num_views
        print(f"⚠ Limiting to {limit} frames")
        image_names = image_names[:limit]
    progress(0.15, desc=f"Loading {len(image_names)} images...")
    images = load_and_preprocess_images(image_names).to(device)
    # Store original images for visualization
    images_np = images.cpu().numpy()  # (S, 3, H, W)
    # Construct views. Frames are interleaved [cam0_t0, cam1_t0, cam0_t1, ...],
    # so camera index is i % num_views and timestep is i // num_views.
    views = []
    for i in range(len(image_names)):
        t_idx = i // num_views
        cam_idx = i % num_views
        views.append({
            "img": images[i].unsqueeze(0),
            "view_idxs": torch.tensor([[cam_idx, t_idx]], device=device, dtype=torch.long)
        })
    progress(0.2, desc="Running VDPM forward pass...")
    print(f"Running inference on {len(image_names)} images...")
    sys.stdout.flush()
    with torch.no_grad():
        # NOTE(review): autocast('cuda') is requested even when device == "cpu";
        # presumably a no-op there, but confirm on CPU-only builds.
        with torch.amp.autocast('cuda'):
            predictions = model.inference(views=views)
    # Extract results to numpy and release GPU-side tensors promptly.
    pts_list = [pm["pts3d"].detach().cpu().numpy() for pm in predictions["pointmaps"]]
    conf_list = [pm["conf"].detach().cpu().numpy() for pm in predictions["pointmaps"]]
    pose_enc = None
    if "pose_enc" in predictions:
        pose_enc = predictions["pose_enc"].detach().cpu().numpy()
    del predictions
    torch.cuda.empty_cache()
    world_points_raw = np.concatenate(pts_list, axis=0)  # (T, S, H, W, 3)
    world_points_conf_raw = np.concatenate(conf_list, axis=0)  # (T, S, H, W)
    T = world_points_raw.shape[0]  # number of query timesteps predicted
    S = world_points_raw.shape[1]  # total number of input images
    H, W = world_points_raw.shape[2:4]
    num_timesteps = S // num_views
    print(f"VDPM output shape: T={T}, S={S}, num_views={num_views}")
    progress(0.3, desc="Processing VDPM outputs...")
    # ========================================================================
    # Extract diagonal entries for 4DGS (each image at its natural timestep)
    # Format: (num_timesteps, num_views*H*W, 3) flattened for train_dynamic.py
    # ========================================================================
    world_points_4d = []
    world_points_conf_4d = []
    images_4d = []
    for t in range(num_timesteps):
        # Collect all views for this timestep
        pts_t = []
        conf_t = []
        imgs_t = []
        for v in range(num_views):
            img_idx = t * num_views + v
            if img_idx >= S:
                break
            # Use the pointmap at timestep query = img_idx (diagonal).
            # VDPM outputs: world_points_raw[query_t, input_img_idx, H, W, 3];
            # we want query_t == input_img_idx for single-view consistency.
            # min() clamps in case fewer query timesteps than images exist.
            query_t = min(img_idx, T - 1)
            pts_v = world_points_raw[query_t, img_idx]  # (H, W, 3)
            conf_v = world_points_conf_raw[query_t, img_idx]  # (H, W)
            img_v = images_np[img_idx]  # (3, H, W)
            pts_t.append(pts_v.reshape(-1, 3))  # (H*W, 3)
            conf_t.append(conf_v.reshape(-1))  # (H*W,)
            imgs_t.append(img_v)
        if pts_t:
            # Concatenate all views: (V*H*W, 3)
            world_points_4d.append(np.concatenate(pts_t, axis=0))
            world_points_conf_4d.append(np.concatenate(conf_t, axis=0))
            # Keep only the first view's image per timestep for visualization.
            images_4d.append(imgs_t[0])
    world_points_4d = np.stack(world_points_4d, axis=0)  # (T, N, 3) where N = V*H*W
    world_points_conf_4d = np.stack(world_points_conf_4d, axis=0)  # (T, N)
    images_4d = np.stack(images_4d, axis=0)  # (T, 3, H, W)
    print(f"4DGS format: world_points={world_points_4d.shape}, images={images_4d.shape}")
    progress(0.35, desc="Saving outputs...")
    # Save in output_4d.npz format (compatible with train_dynamic.py)
    np.savez_compressed(
        target_dir / "output_4d.npz",
        world_points=world_points_4d,
        world_points_conf=world_points_conf_4d,
        images=images_4d,
        num_views=num_views,
        num_timesteps=num_timesteps
    )
    if pose_enc is not None:
        np.savez_compressed(target_dir / "poses.npz", pose_enc=pose_enc)
    # ========================================================================
    # COMPUTE AND SAVE DEPTHS (requires camera poses)
    # ========================================================================
    if pose_enc is not None:
        print("Computing depth maps...")
        # Reshape for depth computation: (T, V, H, W, 3)
        world_points_for_depth = world_points_4d.reshape(num_timesteps, num_views, H, W, 3)
        extrinsics, intrinsics = decode_poses(pose_enc, (H, W))
        depths = compute_depths(world_points_for_depth, extrinsics, num_views)
        # Save depths as a compressed array alongside the other outputs
        np.savez_compressed(
            target_dir / "depths.npz",
            depths=depths,
            num_views=num_views,
            num_timesteps=num_timesteps
        )
        # Save per-view depth preview PNGs
        depths_dir = target_dir / "depths"
        depths_dir.mkdir(exist_ok=True)
        for t in range(depths.shape[0]):
            for v in range(depths.shape[1]):
                png_path = depths_dir / f"depth_t{t:04d}_v{v:02d}.png"
                write_depth_to_png(str(png_path), depths[t, v])
        print(f"βœ“ Saved {depths.shape[0] * depths.shape[1]} depth images")
    print(f"βœ“ VDPM complete: {num_timesteps} timesteps, {num_views} views")
    sys.stdout.flush()
    return num_timesteps, num_views
def run_4dgs_training(target_dir, output_dir, initial_iterations, subsequent_iterations, conf_threshold, progress):
    """Train time-varying 4D Gaussian Splats from the VDPM outputs.

    Reads ``output_4d.npz`` (and friends) from ``target_dir``, trains
    sequentially (canonical frame first, then per-frame position updates),
    and returns a dict with artifact paths and summary counts.
    """
    import warp as wp
    from gs.train_dynamic import load_dynamic_data, DynamicGaussianTrainer

    wp.init()
    banner = "=" * 50
    print(f"\n{banner}")
    print("[DYNAMIC 4D GAUSSIANS TRAINING]")
    print(f"Frame 0: {initial_iterations} iterations")
    print(f"Frames 1+: {subsequent_iterations} iterations each")
    print(banner)
    sys.stdout.flush()

    data = load_dynamic_data(str(target_dir))
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    trainer = DynamicGaussianTrainer(
        data=data,
        output_path=str(out_path),
        conf_threshold=conf_threshold,
        initial_iterations=initial_iterations,
        subsequent_iterations=subsequent_iterations,
        simultaneous_mode=False,
    )
    # Map the trainer's 0..1 progress into the pipeline's 0.4..0.9 window.
    trainer.train_sequential(
        progress_callback=lambda frac, desc: progress(0.4 + 0.5 * frac, desc=desc)
    )

    npz_file = out_path / "dynamic_gaussians.npz"
    mp4_file = out_path / "animation.mp4"
    gif_file = out_path / "animation.gif"
    print(f"βœ“ 4DGS training complete: {trainer.num_timesteps} frames, {trainer.num_points} Gaussians")
    sys.stdout.flush()
    return {
        'npz_path': str(npz_file) if npz_file.exists() else None,
        'mp4_path': str(mp4_file) if mp4_file.exists() else None,
        'gif_path': str(gif_file) if gif_file.exists() else None,
        'num_frames': trainer.num_timesteps,
        'num_points': trainer.num_points,
    }
def run_pipeline(video_files, initial_iterations, subsequent_iterations, conf_threshold, progress=gr.Progress()):
    """Run the full VDPM → 4DGS pipeline for one Gradio request.

    Args:
        video_files: Uploaded video file objects from the gr.File input.
        initial_iterations: Training iterations for the canonical frame.
        subsequent_iterations: Training iterations for each later frame.
        conf_threshold: Confidence-threshold percentage for point filtering.
        progress: Gradio progress reporter (injected by the framework).

    Returns:
        (zip_path, video_path, status_message) matching the three bound
        Gradio outputs; the first two are None on failure.
    """
    if not video_files:
        # BUG FIX: this early exit previously returned a 4-tuple, but the
        # click handler binds exactly three outputs (zip, video, status),
        # so Gradio raised instead of showing the message.
        return None, None, "❌ Please upload video file(s)"
    # Start from a clean memory state; a previous run may have left tensors.
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = Path(f"output/pipeline/run_{timestamp}")
    run_dir.mkdir(parents=True, exist_ok=True)
    try:
        # Step 1: Extract interleaved frames from the uploaded videos.
        progress(0.05, desc="Processing uploaded videos...")
        print("=" * 50)
        print("Processing Videos")
        print("=" * 50)
        sys.stdout.flush()
        image_paths, num_views = process_videos(video_files, run_dir)
        print(f"βœ“ Extracted {len(image_paths)} frames from {num_views} videos")
        sys.stdout.flush()
        # Step 2: VDPM inference
        progress(0.1, desc="Running VDPM inference...")
        print("=" * 50)
        print("Running VDPM Inference")
        print("=" * 50)
        sys.stdout.flush()
        num_timesteps, num_views = run_vdpm_inference(run_dir, progress)
        # Drop the cached model so 4DGS training has the whole GPU.
        global _vdpm_model
        _vdpm_model = None
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()
            print(f"βœ“ Cleared VRAM: {torch.cuda.memory_allocated()/1024**3:.2f} GB in use")
        sys.stdout.flush()
        # Step 3: 4DGS training
        progress(0.4, desc="Training 4D Gaussian Splats...")
        print("=" * 50)
        print("Training 4D Gaussian Splats")
        print("=" * 50)
        sys.stdout.flush()
        splat_dir = run_dir / "splats"
        results = run_4dgs_training(
            run_dir, splat_dir,
            int(initial_iterations),
            int(subsequent_iterations),
            float(conf_threshold),
            progress
        )
        # Step 4: Package everything the user might want into one ZIP.
        progress(0.95, desc="Packaging results...")
        zip_path = run_dir / "results.zip"
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            # Main artifacts (each may be missing if training skipped a step)
            if results['npz_path'] and Path(results['npz_path']).exists():
                zf.write(results['npz_path'], "dynamic_gaussians.npz")
            if results['mp4_path'] and Path(results['mp4_path']).exists():
                zf.write(results['mp4_path'], "animation.mp4")
            if results['gif_path'] and Path(results['gif_path']).exists():
                zf.write(results['gif_path'], "animation.gif")
            # Per-frame renders produced during training
            for frame_dir in splat_dir.glob("frame_*"):
                for subdir in ["renders", "training_renders"]:
                    render_dir = frame_dir / subdir
                    if render_dir.exists():
                        for img in render_dir.glob("*.png"):
                            rel_path = img.relative_to(splat_dir)
                            zf.write(img, f"renders/{rel_path}")
            # VDPM intermediate data
            for f in ["output_4d.npz", "poses.npz", "depths.npz", "meta.json"]:
                fp = run_dir / f
                if fp.exists():
                    zf.write(fp, f)
            # Input images (capped to keep the archive small)
            images_dir = run_dir / "images"
            if images_dir.exists():
                for img in sorted(images_dir.glob("*"))[:20]:  # Limit to first 20
                    zf.write(img, f"images/{img.name}")
            # Depth previews (same cap)
            depths_dir = run_dir / "depths"
            if depths_dir.exists():
                for img in sorted(depths_dir.glob("*.png"))[:20]:
                    zf.write(img, f"depths/{img.name}")
        progress(1.0, desc="Complete!")
        status = f"""βœ… Pipeline Complete!
πŸ“Š Results:
β€’ {results['num_frames']} timesteps Γ— {num_views} views
β€’ {results['num_points']:,} Gaussians
β€’ Animation: {'βœ“' if results['mp4_path'] else 'βœ—'}
πŸ“ Output: {run_dir}
πŸ“¦ Download the ZIP for all files"""
        # Return the MP4 path for the preview player (may be None).
        video_path = results.get('mp4_path')
        return str(zip_path), video_path, status
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"❌ Error: {str(e)}"
# ===== Gradio Interface =====
with gr.Blocks(title="DPM-Splat: 4D Gaussian Splatting", theme=gr.themes.Soft()) as app:
    # Header / description banner
    gr.Markdown("""
# 🎬 DPM-Splat: Video β†’ 4D Gaussian Splats
End-to-end pipeline combining **V-DPM** (Video Dynamic Point Maps) with **4D Gaussian Splatting**.
Upload synchronized videos to generate temporally consistent 4D reconstructions with time-varying Gaussians.
""")
    with gr.Row():
        with gr.Column(scale=1):
            # Left column: upload widget, training settings, run button, status.
            video_input = gr.File(
                label="πŸ“Ή Upload Videos",
                file_count="multiple",
                file_types=[".mp4", ".mov", ".avi", ".webm"]
            )
            gr.Markdown("*Upload 1-4 synchronized video files for multi-view reconstruction*")
            with gr.Accordion("βš™οΈ Training Settings", open=True):
                initial_iterations = gr.Slider(
                    minimum=100, maximum=10000, value=3000, step=100,
                    label="Frame 0 Iterations",
                    info="Training iterations for canonical frame (more = better base quality)"
                )
                subsequent_iterations = gr.Slider(
                    minimum=100, maximum=5000, value=500, step=100,
                    label="Subsequent Frame Iterations",
                    info="Training iterations for frames 1+ (positions only)"
                )
                conf_threshold = gr.Slider(
                    minimum=0, maximum=100, value=0, step=5,
                    label="Confidence Threshold (%)",
                    info="0% keeps all points, higher = filter low confidence"
                )
            run_btn = gr.Button("πŸš€ Run Pipeline", variant="primary", size="lg")
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                lines=8,
                value="Upload videos and click 'Run Pipeline' to begin."
            )
        with gr.Column(scale=2):
            # Right column: animation preview and downloadable ZIP.
            video_viewer = gr.Video(
                label="🎞️ 4D Gaussian Animation",
                height=500,
                autoplay=True,
                loop=True
            )
            download_btn = gr.File(label="πŸ“¦ Download Results (ZIP)")
    # Footer: documents the ZIP contents and the pipeline stages.
    gr.Markdown("""
---
### πŸ“‹ Output Contents
The downloaded ZIP contains:
- `dynamic_gaussians.npz` - All Gaussian parameters (positions per frame, shared scales/rotations/opacities/SHs)
- `animation.mp4` - Rendered video with smooth camera interpolation
- `renders/` - Per-frame training renders showing RGB and depth
- `output_4d.npz` - VDPM point tracks
- `poses.npz` - Camera poses
- `depths/` - Computed depth maps
- `images/` - Input frames
### 🎯 How It Works
1. **VDPM**: Extracts temporally consistent 3D point maps from video
2. **4DGS Training**:
- Train canonical frame (t=0) with all Gaussian parameters
- Train subsequent frames with position-only updates (shared appearance)
3. **Animation**: Smooth camera path through training viewpoints
**Local runs**: Results saved to `output/pipeline/run_TIMESTAMP/`
""")
    # Wire the button to the pipeline; outputs must stay a 3-tuple
    # (zip file, preview video, status text) to match run_pipeline's returns.
    run_btn.click(
        fn=run_pipeline,
        inputs=[video_input, initial_iterations, subsequent_iterations, conf_threshold],
        outputs=[download_btn, video_viewer, status_text]
    )
if __name__ == "__main__":
# Download model on startup
if device == "cuda":
print("Pre-loading VDPM model...")
try:
get_vdpm_model()
_vdpm_model = None # Free VRAM but keep file cached
gc.collect()
torch.cuda.empty_cache()
print("βœ“ Model pre-loaded and cached")
except Exception as e:
print(f"⚠ Failed to pre-load model: {e}")
app.queue().launch(share=True, show_error=True)