Spaces:

SteveZh
/

SATA

Running

SATA / src /Visualization /decode_z_sliding.py

zzysteve

Initial commit

5221c8c 9 days ago

19.3 kB

	"""
	Batch decode latent vectors to BVH files.

	Supports sliding-window decoding and multiple input formats. The Gradio demo
	uses decode_z_skel_to_bvh directly; the CLI keeps batch decoding support for
	local debugging.
	"""

	import argparse
	import os
	import numpy as np
	import torch
	from fairmotion.data import bvh
	from torch_geometric.data import Batch

	# Required conversion helpers
	from sata.conversions.graph_to_motion import hatD_recon_motion
	from sata.skel_pose_graph import SkelPoseGraph
	from sata.conversions.graph_to_motion import graph_2_skel
	from fairmotion.core import motion as motion_class

	from sata.utils.model_loading import load_model_by_type
	# Reuse shared SATA motion-data helpers.
	from sata.utils.motion_data import (
	SkelData,
	load_skeleton_and_tf_from_npz,
	create_graph_list_from_skeleton,
	fix_skeleton_coordinate_system,
	)
	from sata.utils.sliding_decode import decode_to_hatD_with_sliding_window

	# Default on-disk layout, rooted at a "visual" folder next to this script:
	#   visual/z.txt       one z file path per line
	#   visual/skel.txt    one skeleton npz path per line; the tf path is derived
	#                      by replacing "processed" with "joint_text_features"
	#   visual/bvh_output  OUTPUT_DIR
	VISUAL_BASE_DIR = os.path.join(os.path.dirname(__file__), "visual")
	Z_DIR, OUTPUT_DIR = (
	    os.path.join(VISUAL_BASE_DIR, leaf) for leaf in ("z", "bvh_output")
	)
	SKEL_DIR, SKEL_TF_DIR = (
	    os.path.join(VISUAL_BASE_DIR, "skel", leaf)
	    for leaf in ("processed", "joint_text_features")
	)


	_MODEL_TYPES = ("vae", "rvq")
	_LATENT_EXTENSIONS = ('.npy', '.npz', '.pt')


	def _load_latent_payload(z_path, model_type, device):
	    """Load VAE latents (float) or RVQ code indices (long) from z_path onto device."""
	    is_vae = model_type == "vae"

	    if z_path.endswith('.npy'):
	        tensor = torch.from_numpy(np.load(z_path))

	    elif z_path.endswith('.npz'):
	        # Prefer the keys that belong to the selected model type so an archive
	        # holding both latents and codes yields the right payload.
	        vae_keys = ['z', 'z_pred', 'sample']
	        rvq_keys = ['code_idx', 'codes', 'idx_pred']
	        preferred = vae_keys + rvq_keys if is_vae else rvq_keys + vae_keys
	        # The context manager closes the underlying zip file handle.
	        with np.load(z_path) as archive:
	            if not archive.files:
	                raise ValueError(f"npz file contains no arrays: {z_path}")
	            # Fall back to the first stored array when no known key exists.
	            data_key = next((k for k in preferred if k in archive.files),
	                            archive.files[0])
	            print(f" Loaded key from npz: {data_key}")
	            tensor = torch.from_numpy(archive[data_key])

	    elif z_path.endswith('.pt'):
	        # NOTE(security): torch.load unpickles the file; only load .pt files
	        # from trusted sources.
	        encoded_data = torch.load(z_path, map_location=device)

	        if isinstance(encoded_data, torch.Tensor):
	            # A bare tensor has no keys or metadata; use it directly.
	            tensor = encoded_data
	            print(" Loaded bare tensor from pt")
	        else:
	            if is_vae:
	                candidates = ('z_pred', 'z', 'sample')
	                missing_msg = "pt file does not contain any of: z_pred, z, sample"
	            else:
	                candidates = ('idx_pred', 'code_idx')
	                missing_msg = "pt file does not contain either idx_pred or code_idx"
	            data_key = next((k for k in candidates if k in encoded_data), None)
	            if data_key is None:
	                raise KeyError(missing_msg)
	            tensor = encoded_data[data_key]
	            print(f" Loaded key from pt: {data_key}")

	            # Print metadata when present.
	            if 'text' in encoded_data:
	                print(f" Text metadata: {encoded_data['text']}")
	            if 'm_len' in encoded_data:
	                print(f" Original length: {encoded_data['m_len']}")
	            if 'is_segment' in encoded_data and encoded_data['is_segment']:
	                print(" Note: this file contains a segment")

	    else:
	        raise ValueError(f"Unsupported file format: {z_path}. Only .npy, .npz, and .pt are supported")

	    # VAE latents are decoded as float, RVQ code indices must be long.
	    tensor = tensor.float() if is_vae else tensor.long()
	    return tensor.to(device)


	def _derive_tf_npz_path(skel_path):
	    """Map a skeleton npz path to its tf npz path (processed -> joint_text_features)."""
	    tf_npz_path = skel_path.replace('/processed/', '/joint_text_features/')
	    if tf_npz_path != skel_path:
	        return tf_npz_path

	    # The '/processed/' pattern did not match (e.g. other path separators);
	    # retry on the parent directory only, leaving the file name untouched.
	    parent, file_name = os.path.split(skel_path)
	    if 'processed' in parent:
	        tf_npz_path = os.path.join(
	            parent.replace('processed', 'joint_text_features'), file_name)
	    return tf_npz_path


	def _bvh_file_name(output_name):
	    """Return output_name with exactly one '.bvh' extension."""
	    if output_name.lower().endswith('.bvh'):
	        return output_name
	    return output_name + '.bvh'


	def decode_z_skel_to_bvh(z_path, skel_path, model_epoch, output_dir,
	                         output_name="decoded_motion.bvh", device="cuda:0",
	                         window_size=64, overlap=16, use_sliding_window=None,
	                         model_type="vae", model_state=None):
	    """
	    Decode a latent/code file with a target skeleton and save it as BVH.

	    Args:
	        z_path: path to a latent/code file (.npy, .npz, or .pt)
	            - VAE: latent vectors, first dimension is the frame count
	            - RVQ: code indices, first dimension is the frame count
	        skel_path: target skeleton .npz path. The matching tf file path is
	            derived by replacing "processed" with "joint_text_features".
	        model_epoch: model checkpoint name, for example "ckpt0"
	        output_dir: output directory path (created if missing)
	        output_name: output file name; ".bvh" is appended only when it is
	            not already present, default "decoded_motion.bvh"
	        device: compute device, default "cuda:0"
	        window_size: sliding-window size, default 64
	        overlap: sliding-window overlap, default 16
	        use_sliding_window: None selects automatically (sequence longer than
	            window_size); False forces single-pass decoding. Sliding windows
	            are only ever used when the sequence exceeds window_size.
	        model_type: "vae" or "rvq", default "vae"
	        model_state: optional preloaded (model, cfg, ms_dict); loaded on
	            demand via load_model_by_type when None

	    Returns:
	        output_path: saved BVH path

	    Raises:
	        ValueError: unknown model_type, unsupported z/skeleton file format,
	            or a latent file without frames.
	        KeyError: a .pt dictionary lacks the payload keys for model_type.
	    """
	    # 0. Validate cheap arguments before the (expensive) model load.
	    if model_type not in _MODEL_TYPES:
	        raise ValueError(f"Unsupported model_type: {model_type!r}. Expected 'vae' or 'rvq'")
	    if not z_path.endswith(_LATENT_EXTENSIONS):
	        raise ValueError(f"Unsupported file format: {z_path}. Only .npy, .npz, and .pt are supported")
	    if not skel_path.endswith('.npz'):
	        raise ValueError(f"Unsupported skeleton format: {skel_path}. Only .npz is supported")
	    is_vae = model_type == "vae"

	    # 1. Load the model or reuse a preloaded model state.
	    if model_state is not None:
	        model, cfg, ms_dict = model_state
	        print(f"[Model] Reusing preloaded model ({model_type.upper()})")
	    else:
	        print(f"Loading model: {model_epoch} ({model_type.upper()})")
	        model, cfg, ms_dict = load_model_by_type(model_type, model_epoch, device)
	    # Idempotent for an already-prepared model; guarantees the model lives on
	    # the same device as the payload and is in inference mode.
	    model = model.to(device)
	    model.eval()

	    # 2. Load latent vectors or code indices.
	    print(f"Loading {'latent vectors' if is_vae else 'code_idx'}: {z_path}")
	    latent_codes = _load_latent_payload(z_path, model_type, device)
	    print(f" {'z' if is_vae else 'code_idx'} shape: {latent_codes.shape}")
	    if latent_codes.dim() == 0 or latent_codes.shape[0] == 0:
	        raise ValueError(f"Latent file contains no frames: {z_path}")
	    data_length = latent_codes.shape[0]

	    # 3. Load the skeleton (NPZ plus matching tf file) and convert it to a graph.
	    print(f"Loading skeleton from NPZ: {skel_path}")
	    tf_npz_path = _derive_tf_npz_path(skel_path)
	    skel_data = load_skeleton_and_tf_from_npz(skel_path, tf_npz_path)
	    print(f" Skeleton joints: {skel_data.lo.shape[0]}")
	    print(f" tf shape: {skel_data.tf.shape}")

	    skel_graph = SkelPoseGraph(skel_data, None)
	    # NOTE(review): the result is not used below; the call is kept in case
	    # graph_2_skel validates the graph - confirm and remove if it does not.
	    saved_skel = graph_2_skel(Batch.from_data_list([skel_graph]).to(device), 1)[0]

	    # 4. Decide whether to use sliding-window decoding.
	    if use_sliding_window is None:
	        use_sliding_window = data_length > window_size
	    sliding = use_sliding_window and data_length > window_size

	    # 5. Decode to motion.
	    print("Decoding...")
	    out_rep_cfg = cfg["representation"]["out"]

	    with torch.no_grad():
	        if sliding:
	            print(f" Long sequence ({data_length} frames); using sliding-window decoding")

	            # Create a graph list of the required length from one skeleton graph.
	            from sata.utils.motion_data import create_graph_list_from_single_graph
	            src_graphs_list = create_graph_list_from_single_graph(skel_graph, data_length)

	            # Call the shared sliding-window decoder.
	            hatD, recon_batch, recon_frames, _ = decode_to_hatD_with_sliding_window(
	                model, latent_codes, src_graphs_list,
	                data_length, window_size, overlap, device, model_type
	            )
	        else:
	            if data_length > window_size:
	                print(f" Sliding window disabled ({data_length} frames); using single-pass decoding")
	            else:
	                print(f" Short sequence ({data_length} frames); using single-pass decoding")

	            recon_batch = Batch.from_data_list([skel_graph] * data_length).to(device)
	            recon_frames = data_length
	            if is_vae:
	                hatD = model.decode(latent_codes, recon_batch, data_length)
	            else:
	                hatD, _ = model.decode_from_codes(latent_codes, recon_batch, data_length)

	        # Reconstruct motion; the contact output is not needed for BVH export.
	        out_motion_list, _ = hatD_recon_motion(
	            hatD, recon_batch, out_rep_cfg, ms_dict, recon_frames
	        )
	    out_motion = out_motion_list[0]

	    print(f" Output motion frames: {out_motion.num_frames()}")

	    # 6. Save the BVH file.
	    if not os.path.exists(output_dir):
	        # exist_ok guards against another process creating it in between.
	        os.makedirs(output_dir, exist_ok=True)
	        print(f"Created output directory: {output_dir}")

	    # Set FPS to 20.
	    out_motion.fps = 20

	    # Fix coordinate system differences.
	    out_motion_fixed = fix_skeleton_coordinate_system(out_motion)
	    print(" [Info] Applied coordinate-system fix (Y-Z axis conversion)")

	    # Avoid "name.bvh.bvh" when the caller (or the default) already includes
	    # the extension.
	    output_path = os.path.join(output_dir, _bvh_file_name(output_name))
	    bvh.save(out_motion_fixed, output_path, rot_order="XYZ")
	    print(f"Saved to: {output_path}")

	    return output_path


	def _read_path_list(txt_path, label):
	    """Return (path, file stem) tuples for every non-empty, non-comment line of txt_path."""
	    if not os.path.isfile(txt_path):
	        print(f" Not found: {txt_path}")
	        return []

	    entries = []
	    # utf-8-sig drops a leading byte-order mark, which would otherwise become
	    # part of the first path (str.strip() does not remove U+FEFF).
	    with open(txt_path, 'r', encoding='utf-8-sig') as f:
	        for raw_line in f:
	            path = raw_line.strip()
	            if not path or path.startswith('#'):  # Skip empty lines and comments.
	                continue
	            # Use the file stem as the display name.
	            stem = os.path.splitext(os.path.basename(path))[0]
	            entries.append((path, stem))
	    print(f" Read {len(entries)} {label} files from {txt_path}")
	    return entries


	def scan_z_and_skeleton_files(visual_base_dir):
	    """
	    Read z and skeleton path lists from txt files and build all pairings.

	    Each list file holds one path per line; blank lines and lines starting
	    with '#' are ignored. A missing list file is reported and treated as
	    empty, which yields no pairings.

	    Args:
	        visual_base_dir: visual root containing z.txt and skel.txt

	    Returns:
	        pairs: list of tuple, [(z_path, z_name, skel_path, skel_name), ...],
	            the Cartesian product of both lists in file order (z-major)
	    """
	    z_files = _read_path_list(os.path.join(visual_base_dir, "z.txt"), "z")
	    skel_files = _read_path_list(os.path.join(visual_base_dir, "skel.txt"), "skeleton")

	    # Build the Cartesian product.
	    return [
	        (z_path, z_name, skel_path, skel_name)
	        for z_path, z_name in z_files
	        for skel_path, skel_name in skel_files
	    ]


	def batch_decode_all(model_epoch, visual_base_dir=VISUAL_BASE_DIR,
	                     output_dir=OUTPUT_DIR, device="cuda:0",
	                     window_size=64, overlap=16, model_type="vae"):
	    """
	    Batch decode every z/skeleton pairing listed in txt files.

	    The model is loaded once, on the first pair, and reused for all later
	    pairs. A failure while loading or decoding is recorded for that pair and
	    processing continues with the next one.

	    Args:
	        model_epoch: model checkpoint name
	        visual_base_dir: visual root containing z.txt and skel.txt; default visual/
	        output_dir: output directory; default visual/bvh_output
	        device: compute device
	        window_size: sliding-window size
	        overlap: sliding-window overlap
	        model_type: model type

	    Returns:
	        results: list of per-pair result dictionaries with the keys
	            'z_name', 'skel_name', 'output_name', 'output_path' (None on
	            failure), 'status' ('success' or 'failed') and, on failure,
	            'error'. Empty when no pairings were found.
	    """
	    banner = "=" * 70

	    # Read txt files and build pairings.
	    print(banner)
	    print("Reading path lists...")
	    print(f" z.txt: {os.path.join(visual_base_dir, 'z.txt')}")
	    print(f" skel.txt: {os.path.join(visual_base_dir, 'skel.txt')}")
	    print(banner)

	    pairs = scan_z_and_skeleton_files(visual_base_dir)

	    if not pairs:
	        print("No z or skeleton files found. Check the directory configuration.")
	        return []

	    # Count source files by path: two files in different folders may share a stem.
	    z_count = len({pair[0] for pair in pairs})
	    skel_count = len({pair[2] for pair in pairs})
	    print(f"Found {z_count} z files x {skel_count} skeleton files = {len(pairs)} pairs")
	    print()

	    # Assign each pair a unique output name so pairs whose stems collide do
	    # not silently overwrite each other's BVH file.
	    jobs = []
	    used_names = set()
	    for z_path, z_name, skel_path, skel_name in pairs:
	        base_name = f"{z_name}_{skel_name}"
	        output_name, serial = base_name, 1
	        while output_name in used_names:
	            serial += 1
	            output_name = f"{base_name}_{serial}"
	        used_names.add(output_name)
	        jobs.append((z_path, z_name, skel_path, skel_name, output_name))
	    total = len(jobs)

	    # Show all pairings.
	    print("Pairings to process:")
	    for i, job in enumerate(jobs, 1):
	        print(f" [{i}/{total}] {job[-1]}")
	    print(banner)
	    print()

	    # Process each pair.
	    results = []
	    model_state = None  # (model, cfg, ms_dict), loaded lazily and then reused.
	    for i, (z_path, z_name, skel_path, skel_name, output_name) in enumerate(jobs, 1):
	        print(banner)
	        print(f"Processing pair [{i}/{total}]: {output_name}")
	        print(banner)
	        print(f" z file: {z_path}")
	        print(f" skeleton file: {skel_path}")
	        print()

	        try:
	            if model_state is None:
	                # Loading inside the try keeps the per-pair failure reporting
	                # when the checkpoint cannot be loaded.
	                print(f"Loading model: {model_epoch} ({model_type.upper()})")
	                model, cfg, ms_dict = load_model_by_type(model_type, model_epoch, device)
	                model = model.to(device)
	                model.eval()
	                model_state = (model, cfg, ms_dict)

	            output_path = decode_z_skel_to_bvh(
	                z_path=z_path,
	                skel_path=skel_path,
	                model_epoch=model_epoch,
	                output_dir=output_dir,
	                output_name=output_name,
	                device=device,
	                window_size=window_size,
	                overlap=overlap,
	                use_sliding_window=None,  # Select automatically.
	                model_type=model_type,
	                model_state=model_state,
	            )
	        except Exception as e:
	            # Best-effort batch: record the failure and continue.
	            print(f"[{i}/{total}] Failed: {output_name}")
	            print(f" Error: {e}")
	            results.append({
	                'z_name': z_name,
	                'skel_name': skel_name,
	                'output_name': output_name,
	                'output_path': None,
	                'status': 'failed',
	                'error': str(e),
	            })
	        else:
	            results.append({
	                'z_name': z_name,
	                'skel_name': skel_name,
	                'output_name': output_name,
	                'output_path': output_path,
	                'status': 'success',
	            })
	            print(f"[{i}/{total}] Success: {output_name}")

	        print()

	    # Summary.
	    failures = [r for r in results if r['status'] == 'failed']
	    print(banner)
	    print("Batch processing complete")
	    print(banner)
	    print(f"Total pairs: {len(results)}")
	    print(f" Success: {len(results) - len(failures)}")
	    print(f" Failed: {len(failures)}")

	    if failures:
	        print()
	        print("Failed pairs:")
	        for r in failures:
	            print(f" - {r['output_name']}: {r.get('error', 'Unknown error')}")

	    print(banner)

	    return results


	if __name__ == "__main__":
	    # CLI entry point: parse options, run the batch decode, and exit with
	    # status 1 when any pair failed or an unexpected error occurred.
	    parser = argparse.ArgumentParser(description="Batch decode latent vectors to BVH files", formatter_class=argparse.RawDescriptionHelpFormatter, epilog="")

	    parser.add_argument("--model_type", type=str, default="vae", choices=["vae", "rvq"], help="Model type: vae or rvq (default: vae)")
	    parser.add_argument("--model_epoch", type=str, required=True, help="Model checkpoint name, for example ckpt0")
	    parser.add_argument("--visual_dir", type=str, default=VISUAL_BASE_DIR, help=f"Visual root containing z.txt and skel.txt (default: {VISUAL_BASE_DIR})")
	    parser.add_argument("--output_dir", type=str, default=OUTPUT_DIR, help=f"Output directory (default: {OUTPUT_DIR})")
	    parser.add_argument("--device", type=str, default="cuda:0", help="Compute device (default: cuda:0)")
	    parser.add_argument("--window_size", type=int, default=64, help="Sliding-window size in frames (default: 64)")
	    parser.add_argument("--overlap", type=int, default=16, help="Sliding-window overlap in frames (default: 16)")

	    args = parser.parse_args()

	    banner = "=" * 70
	    print(banner)
	    print("Batch latent-vector decoder (reads path lists from txt files)")
	    print(banner)
	    print(f"Model type: {args.model_type.upper()}")
	    print(f"Model: {args.model_epoch}")
	    print(f"z path list: {os.path.join(args.visual_dir, 'z.txt')}")
	    print(f"skeleton path list: {os.path.join(args.visual_dir, 'skel.txt')}")
	    print(f"Output directory: {args.output_dir}")
	    print(f"Device: {args.device}")
	    print(f"Sliding window: size={args.window_size}, overlap={args.overlap}")
	    print(banner)
	    print()

	    try:
	        results = batch_decode_all(
	            model_epoch=args.model_epoch,
	            visual_base_dir=args.visual_dir,
	            output_dir=args.output_dir,
	            device=args.device,
	            window_size=args.window_size,
	            overlap=args.overlap,
	            model_type=args.model_type
	        )
	    except Exception as e:
	        print()
	        print(banner)
	        print(f"Error: {e}")
	        print(banner)
	        import traceback
	        traceback.print_exc()
	        # SystemExit is used instead of the site-provided exit() helper,
	        # which is not guaranteed to exist (e.g. under "python -S").
	        raise SystemExit(1)

	    # Set exit status from the batch result.
	    if any(r['status'] == 'failed' for r in results):
	        raise SystemExit(1)