Upload folder using huggingface_hub

0a7036f verified 2 months ago

31.3 kB

	# Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
	import argparse
	import os
	import sys
	import time
	from functools import partial
	from PIL import Image
	from diffusers.video_processor import VideoProcessor
	from diffusers.utils import export_to_video

	import numpy as np
	import torch
	import torch.nn.functional as F
	from diffusers.pipelines.wan.pipeline_wan import prompt_clean
	from einops import rearrange
	from tqdm import tqdm

	sys.path.append(os.path.dirname(os.path.abspath(__file__)))

	from configs import VA_CONFIGS
	from distributed.fsdp import shard_model
	from distributed.util import _configure_model, init_distributed
	from modules.utils import (
	WanVAEStreamingWrapper,
	load_text_encoder,
	load_tokenizer,
	load_transformer,
	load_vae,
	)
	from utils import (
	FlowMatchScheduler,
	data_seq_to_patch,
	get_mesh_id,
	init_logger,
	logger,
	run_async_server_mode,
	save_async,
	)


	class VA_Server:

	def __init__(self, job_config):
	    """Build the schedulers and load every model component onto the local GPU.

	    Attributes read from ``job_config``: ``save_root``, ``param_dtype``,
	    ``local_rank``, ``snr_shift``, ``action_snr_shift``,
	    ``wan22_pretrained_model_name_or_path`` and ``env_type``.
	    """
	    self.cache_name = 'pos'
	    self.job_config = job_config
	    self.save_root = job_config.save_root
	    self.dtype = job_config.param_dtype
	    self.device = torch.device(f"cuda:{job_config.local_rank}")

	    # One flow-matching scheduler for the video latents, one for the actions.
	    self.scheduler = FlowMatchScheduler(
	        shift=self.job_config.snr_shift,
	        sigma_min=0.0,
	        extra_one_step=True,
	    )
	    self.action_scheduler = FlowMatchScheduler(
	        shift=self.job_config.action_snr_shift,
	        sigma_min=0.0,
	        extra_one_step=True,
	    )
	    for sched in (self.scheduler, self.action_scheduler):
	        sched.set_timesteps(1000, training=True)

	    # All sub-models live in sibling folders of the same checkpoint root and
	    # share the same dtype / device placement.
	    ckpt_root = job_config.wan22_pretrained_model_name_or_path
	    placement = dict(torch_dtype=self.dtype, torch_device=self.device)
	    vae_dir = os.path.join(ckpt_root, 'vae')

	    self.vae = load_vae(vae_dir, **placement)
	    self.streaming_vae = WanVAEStreamingWrapper(self.vae)

	    self.tokenizer = load_tokenizer(os.path.join(ckpt_root, 'tokenizer'))

	    self.text_encoder = load_text_encoder(
	        os.path.join(ckpt_root, 'text_encoder'),
	        **placement,
	    )

	    self.transformer = load_transformer(
	        os.path.join(ckpt_root, 'transformer'),
	        **placement,
	    )
	    self.transformer = _configure_model(
	        model=self.transformer,
	        shard_fn=partial(shard_model, device_id=job_config.local_rank),
	        param_dtype=self.dtype,
	        device=self.device,
	    )

	    self.env_type = job_config.env_type
	    # A second, independently loaded VAE is only created for the
	    # 'robotwin_tshape' environment; otherwise the attribute stays None.
	    self.streaming_vae_half = None
	    if self.env_type == 'robotwin_tshape':
	        self.streaming_vae_half = WanVAEStreamingWrapper(
	            load_vae(vae_dir, **placement))

	def _get_t5_prompt_embeds(
	    self,
	    prompt=None,
	    num_videos_per_prompt=1,
	    max_sequence_length=512,
	    device=None,
	    dtype=None,
	):
	    """Tokenize and encode ``prompt`` with the text encoder.

	    Args:
	        prompt: A string or a list of strings.
	        num_videos_per_prompt: How many times each embedding is duplicated.
	        max_sequence_length: Tokenizer padding / truncation length; also
	            the sequence length of the returned tensor.
	        device: Target device, defaults to ``self.device``.
	        dtype: Target dtype, defaults to ``self.dtype``.

	    Returns:
	        Tensor of shape
	        ``(len(prompt) * num_videos_per_prompt, max_sequence_length, dim)``
	        in which positions past each prompt's real token count are zero.
	    """
	    device = device or self.device
	    dtype = dtype or self.dtype

	    if isinstance(prompt, str):
	        prompt = [prompt]
	    cleaned = [prompt_clean(text) for text in prompt]
	    batch_size = len(cleaned)

	    tokens = self.tokenizer(
	        cleaned,
	        padding="max_length",
	        max_length=max_sequence_length,
	        truncation=True,
	        add_special_tokens=True,
	        return_attention_mask=True,
	        return_tensors="pt",
	    )
	    input_ids = tokens.input_ids
	    attention_mask = tokens.attention_mask
	    # Number of attended (non-padding) tokens per prompt.
	    valid_lengths = attention_mask.gt(0).sum(dim=1).long()

	    hidden = self.text_encoder(input_ids.to(device),
	                               attention_mask.to(device)).last_hidden_state
	    hidden = hidden.to(dtype=dtype, device=device)

	    # Keep only the attended tokens of each sample, then pad back to
	    # max_sequence_length with zeros.
	    padded = []
	    for sample, length in zip(hidden, valid_lengths):
	        kept = sample[:length]
	        filler = kept.new_zeros(max_sequence_length - kept.size(0),
	                                kept.size(1))
	        padded.append(torch.cat([kept, filler]))
	    prompt_embeds = torch.stack(padded, dim=0)

	    # Duplicate text embeddings for each generation per prompt, using an
	    # mps-friendly method (repeat + view).
	    _, seq_len, _ = prompt_embeds.shape
	    prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
	    return prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len,
	                              -1)

	def encode_prompt(
	    self,
	    prompt,
	    negative_prompt=None,
	    do_classifier_free_guidance=True,
	    num_videos_per_prompt=1,
	    prompt_embeds=None,
	    negative_prompt_embeds=None,
	    max_sequence_length=226,
	    device=None,
	    dtype=None,
	):
	    r"""Return ``(prompt_embeds, negative_prompt_embeds)`` for a prompt.

	    Args:
	        prompt: A string, a list of strings, or ``None`` when
	            ``prompt_embeds`` is supplied.
	        negative_prompt: A string or list of strings; a falsy value is
	            replaced by the empty string.
	        do_classifier_free_guidance: When true and no
	            ``negative_prompt_embeds`` were passed, the negative prompt is
	            encoded as well.
	        num_videos_per_prompt: Forwarded to ``_get_t5_prompt_embeds``.
	        prompt_embeds: Pre-computed embeddings; skips encoding ``prompt``.
	        negative_prompt_embeds: Pre-computed negative embeddings.
	        max_sequence_length: Forwarded to ``_get_t5_prompt_embeds``.
	        device: Target device, defaults to ``self.device``.
	        dtype: Target dtype, defaults to ``self.dtype``.

	    Returns:
	        A 2-tuple; the second item is whatever ``negative_prompt_embeds``
	        was passed in (possibly ``None``) when no negative encoding ran.

	    Raises:
	        TypeError: ``prompt`` and ``negative_prompt`` have different types.
	        ValueError: their batch sizes differ.
	    """
	    device = device or self.device
	    dtype = dtype or self.dtype

	    if isinstance(prompt, str):
	        prompt = [prompt]
	    batch_size = len(prompt) if prompt is not None else prompt_embeds.shape[0]

	    embed_kwargs = dict(
	        num_videos_per_prompt=num_videos_per_prompt,
	        max_sequence_length=max_sequence_length,
	        device=device,
	        dtype=dtype,
	    )

	    if prompt_embeds is None:
	        prompt_embeds = self._get_t5_prompt_embeds(prompt=prompt,
	                                                   **embed_kwargs)

	    # Nothing more to do unless guidance needs negative embeddings that the
	    # caller did not already provide.
	    if not (do_classifier_free_guidance and negative_prompt_embeds is None):
	        return prompt_embeds, negative_prompt_embeds

	    negative_prompt = negative_prompt or ""
	    if isinstance(negative_prompt, str):
	        negative_prompt = batch_size * [negative_prompt]

	    if prompt is not None and type(prompt) is not type(negative_prompt):
	        raise TypeError(
	            f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
	            f" {type(prompt)}.")
	    if batch_size != len(negative_prompt):
	        raise ValueError(
	            f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
	            f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
	            " the batch size of `prompt`.")

	    negative_prompt_embeds = self._get_t5_prompt_embeds(
	        prompt=negative_prompt, **embed_kwargs)
	    return prompt_embeds, negative_prompt_embeds

	def normalize_latents(
	    self,
	    latents: torch.Tensor,
	    latents_mean: torch.Tensor,
	    latents_std: torch.Tensor,
	) -> torch.Tensor:
	    """Shift ``latents`` by ``latents_mean`` and scale by ``latents_std``.

	    Both statistics are reshaped to ``(1, C, 1, 1, 1)`` so they broadcast
	    over the channel axis. Note that ``latents_std`` is *multiplied* in, so
	    a caller that wants a division must pass the reciprocal. The arithmetic
	    runs in float32 and the result is cast back to the dtype/device of the
	    input ``latents``.
	    """
	    per_channel = (1, -1, 1, 1, 1)
	    shift = latents_mean.view(*per_channel).to(device=latents.device)
	    scale = latents_std.view(*per_channel).to(device=latents.device)
	    centered = latents.float() - shift
	    return (centered * scale).to(latents)

	def preprocess_action(self, action):
	action_model_input = torch.from_numpy(action)
	CA, FA, HA = action_model_input.shape # C, F, H
	action_model_input_paded = F.pad(action_model_input,
	[0, 0, 0, 0, 0, 1],
	mode='constant',
	value=0)

	action_model_input = action_model_input_paded[
	self.job_config.inverse_used_action_channel_ids]

	if self.action_norm_method == 'quantiles':
	action_model_input = (action_model_input - self.actions_q01) / (
	self.actions_q99 - self.actions_q01 + 1e-6) * 2. - 1.
	else:
	raise NotImplementedError
	return action_model_input.unsqueeze(0).unsqueeze(-1) # B, C, F, H, W

	def postprocess_action(self, action):
	action = action.cpu() # B, C, F, H, W

	action = action[0, ..., 0] #C, F, H
	if self.action_norm_method == 'quantiles':
	action = (action + 1) / 2 * (self.actions_q99 - self.actions_q01 +
	1e-6) + self.actions_q01
	else:
	raise NotImplementedError
	action = action.squeeze(0).detach().cpu().numpy()
	return action[self.job_config.used_action_channel_ids]

	def _repeat_input_for_cfg(self, input_dict):
	if self.use_cfg:
	input_dict['noisy_latents'] = input_dict['noisy_latents'].repeat(2, 1, 1, 1, 1)
	input_dict['text_emb'] = torch.cat([self.prompt_embeds.to(self.dtype).clone(), self.negative_prompt_embeds.to(self.dtype).clone()], dim=0)
	input_dict['grid_id'] = input_dict['grid_id'][None].repeat(2, 1, 1)
	input_dict['timesteps'] = input_dict['timesteps'][None].repeat(2, 1)
	else:
	input_dict['grid_id'] = input_dict['grid_id'][None]
	input_dict['timesteps'] = input_dict['timesteps'][None]
	return input_dict

	def _prepare_latent_input(self,
	                          latent_model_input,
	                          action_model_input,
	                          latent_t=0,
	                          action_t=0,
	                          latent_cond=None,
	                          action_cond=None,
	                          frame_st_id=0,
	                          patch_size=(1, 2, 2)):
	    """Assemble the transformer input dict(s) for video and/or action tokens.

	    Args:
	        latent_model_input: Video latents or ``None`` to skip the video
	            entry. The tensor is stored by reference and may be written to
	            in place (see ``latent_cond``).
	        action_model_input: Action tensor or ``None`` to skip the action
	            entry. Also stored by reference and modified in place.
	        latent_t: Timestep value assigned to every video frame.
	        action_t: Timestep value assigned to every action frame.
	        latent_cond: If given, its first frame overwrites frame 0 of the
	            video latents and that frame's timestep is set to 0.
	        action_cond: Same as ``latent_cond`` for the action tensor.
	        frame_st_id: Frame offset forwarded to ``get_mesh_id``.
	        patch_size: Divisors applied to the last three latent dims before
	            building the video grid ids.

	    Returns:
	        Dict with key ``'latent_res_lst'`` and/or ``'action_res_lst'``, each
	        holding ``noisy_latents``, ``timesteps``, ``grid_id``, ``text_emb``.
	    """
	    logger.info(f"FRAME START ID: {frame_st_id}")
	    input_dict = dict()

	    if latent_model_input is not None:
	        frames, rows, cols = latent_model_input.shape[-3:]
	        video_entry = {
	            'noisy_latents':
	            latent_model_input,
	            'timesteps':
	            torch.ones([latent_model_input.shape[2]],
	                       dtype=torch.float32,
	                       device=self.device) * latent_t,
	            'grid_id':
	            get_mesh_id(frames // patch_size[0], rows // patch_size[1],
	                        cols // patch_size[2], 0, 1,
	                        frame_st_id).to(self.device),
	            'text_emb':
	            self.prompt_embeds.to(self.dtype).clone(),
	        }
	        input_dict['latent_res_lst'] = video_entry
	        if latent_cond is not None:
	            # Pin frame 0 to the conditioning latent and mark it as clean
	            # (timestep 0).
	            video_entry['noisy_latents'][:, :, 0:1] = latent_cond[:, :, 0:1]
	            video_entry['timesteps'][0:1] *= 0

	    if action_model_input is not None:
	        action_entry = {
	            'noisy_latents':
	            action_model_input,
	            'timesteps':
	            torch.ones([action_model_input.shape[2]],
	                       dtype=torch.float32,
	                       device=self.device) * action_t,
	            'grid_id':
	            get_mesh_id(action_model_input.shape[-3],
	                        action_model_input.shape[-2],
	                        action_model_input.shape[-1],
	                        1,
	                        1,
	                        frame_st_id,
	                        action=True).to(self.device),
	            'text_emb':
	            self.prompt_embeds.to(self.dtype).clone(),
	        }
	        input_dict['action_res_lst'] = action_entry

	        if action_cond is not None:
	            action_entry['noisy_latents'][:, :, 0:1] = action_cond[:, :, 0:1]
	            action_entry['timesteps'][0:1] *= 0
	        # Zero every action channel that is not flagged in action_mask.
	        action_entry['noisy_latents'][:, ~self.action_mask] *= 0

	    return input_dict

	def _encode_obs(self, obs):
	images = obs['obs']
	if not isinstance(images, list):
	images = [images]
	if len(images) < 1:
	return None
	videos = []
	for k_i, k in enumerate(self.job_config.obs_cam_keys):
	if self.env_type == 'robotwin_tshape':
	if k_i == 0: # camera high
	height_i, width_i = self.height, self.width
	else:
	height_i, width_i = self.height // 2, self.width // 2
	else:
	height_i, width_i = self.height, self.width

	history_video_k = torch.from_numpy(
	np.stack([each[k]
	for each in images])).float().permute(3, 0, 1, 2)
	history_video_k = F.interpolate(history_video_k,
	size=(height_i, width_i),
	mode='bilinear',
	align_corners=False).unsqueeze(0)
	videos.append(history_video_k)

	if self.env_type == 'robotwin_tshape':
	videos_high = videos[0] / 255.0 * 2.0 - 1.0
	videos_left_and_right = torch.cat(videos[1:],
	dim=0) / 255.0 * 2.0 - 1.0
	enc_out_high = self.streaming_vae.encode_chunk(
	videos_high.to(self.device).to(self.dtype))
	enc_out_left_and_right = self.streaming_vae_half.encode_chunk(
	videos_left_and_right.to(self.device).to(self.dtype))
	enc_out = torch.cat([
	torch.cat(enc_out_left_and_right.split(1, dim=0), dim=-1),
	enc_out_high
	],
	dim=-2)
	else:
	videos = torch.cat(videos, dim=0) / 255.0 * 2.0 - 1.0
	videos_chunk = videos.to(self.device).to(self.dtype)
	enc_out = self.streaming_vae.encode_chunk(videos_chunk)

	mu, logvar = torch.chunk(enc_out, 2, dim=1)
	latents_mean = torch.tensor(self.vae.config.latents_mean).to(mu.device)
	latents_std = torch.tensor(self.vae.config.latents_std).to(mu.device)
	mu_norm = self.normalize_latents(mu, latents_mean, 1.0 / latents_std)
	video_latent = torch.cat(mu_norm.split(1, dim=0), dim=-1)
	return video_latent

	def _reset(self, prompt=None):
	    """Reinitialise all per-episode state and optionally encode a new prompt.

	    Clears the transformer and streaming-VAE caches, recomputes the latent
	    grid size from the job config, allocates a fresh empty attention cache,
	    rebuilds the action mask and quantile statistics, encodes ``prompt``
	    and creates the experiment output directory.

	    Args:
	        prompt: Task description. When ``None`` both ``prompt_embeds`` and
	            ``negative_prompt_embeds`` are set to ``None`` and the
	            experiment is named ``"default"``.
	    """
	    logger.info('Reset.')
	    # CFG is active as soon as either the video or the action guidance
	    # scale exceeds 1; the transformer then runs with batch size 2.
	    self.use_cfg = (self.job_config.guidance_scale > 1) or (
	        self.job_config.action_guidance_scale > 1)

	    # Reset all parameters.
	    self.frame_st_id = 0
	    self.init_latent = None

	    # Clean the VAE and transformer caches.
	    self.transformer.clear_cache(self.cache_name)
	    self.streaming_vae.clear_cache()

	    self.action_per_frame = self.job_config.action_per_frame
	    self.height, self.width = self.job_config.height, self.job_config.width

	    # NOTE(review): 16 is presumably the VAE's spatial downsampling factor
	    # - confirm against the VAE config.
	    if self.env_type == 'robotwin_tshape':
	        self.latent_height = ((self.height // 16) * 3) // 2
	        self.latent_width = self.width // 16
	        self.streaming_vae_half.clear_cache()
	    else:
	        self.latent_height = self.height // 16
	        self.latent_width = self.width // 16 * len(
	            self.job_config.obs_cam_keys)

	    patch_size = self.job_config.patch_size
	    latent_token_per_chunk = (
	        self.job_config.frame_chunk_size * self.latent_height *
	        self.latent_width) // (patch_size[0] * patch_size[1] *
	                               patch_size[2])
	    action_token_per_chunk = (self.job_config.frame_chunk_size *
	                              self.action_per_frame)
	    self.transformer.create_empty_cache(
	        self.cache_name,
	        self.job_config.attn_window,
	        latent_token_per_chunk,
	        action_token_per_chunk,
	        dtype=self.dtype,
	        device=self.device,
	        batch_size=2 if self.use_cfg else 1,
	    )

	    # Boolean mask over all action channels; True marks a channel in use.
	    self.action_mask = torch.zeros([self.job_config.action_dim]).bool()
	    self.action_mask[self.job_config.used_action_channel_ids] = True

	    self.actions_q01 = torch.tensor(self.job_config.norm_stat['q01'],
	                                    dtype=torch.float32).reshape(-1, 1, 1)
	    self.actions_q99 = torch.tensor(self.job_config.norm_stat['q99'],
	                                    dtype=torch.float32).reshape(-1, 1, 1)
	    self.action_norm_method = self.job_config.action_norm_method

	    # Encode the prompt.
	    if prompt is None:
	        self.prompt_embeds = self.negative_prompt_embeds = None
	    else:
	        # The negative embeddings must exist whenever use_cfg is set,
	        # because _repeat_input_for_cfg concatenates them unconditionally
	        # in that case. Keying this on guidance_scale alone left them None
	        # when only action_guidance_scale was > 1.
	        self.prompt_embeds, self.negative_prompt_embeds = self.encode_prompt(
	            prompt=prompt,
	            negative_prompt=None,
	            do_classifier_free_guidance=self.use_cfg,
	            num_videos_per_prompt=1,
	            prompt_embeds=None,
	            negative_prompt_embeds=None,
	            max_sequence_length=512,
	            device=self.device,
	            dtype=self.dtype,
	        )

	    # NOTE(review): the raw prompt becomes part of a directory name; very
	    # long prompts or prompts containing path separators may need
	    # sanitising - left unchanged to keep output locations stable.
	    self.exp_name = f"{prompt}_{time.strftime('%Y%m%d_%H%M%S')}" if prompt else "default"
	    self.exp_save_root = os.path.join(self.save_root, 'real', self.exp_name)
	    os.makedirs(self.exp_save_root, exist_ok=True)
	    torch.cuda.empty_cache()

	def _infer(self, obs, frame_st_id=0):
	    """Denoise one chunk of video latents and then one chunk of actions.

	    Args:
	        obs: Observation dict; only encoded (via ``_encode_obs``) when
	            ``frame_st_id == 0`` to obtain the conditioning latent.
	        frame_st_id: Index of the first frame of this chunk. For the first
	            chunk (0) frame 0 of the latents is pinned to the encoded
	            observation and frame 0 of the actions is pinned to zeros.

	    Returns:
	        ``(actions, latents)`` where ``actions`` is the numpy array produced
	        by ``postprocess_action`` and ``latents`` is the denoised latent
	        tensor of this chunk.
	    """
	    frame_chunk_size = self.job_config.frame_chunk_size
	    is_first_chunk = frame_st_id == 0
	    if is_first_chunk:
	        init_latent = self._encode_obs(obs)
	        self.init_latent = init_latent

	    # NOTE(review): 48 is presumably the latent channel count of the VAE -
	    # confirm against the model config.
	    latents = torch.randn(1,
	                          48,
	                          frame_chunk_size,
	                          self.latent_height,
	                          self.latent_width,
	                          device=self.device,
	                          dtype=self.dtype)
	    actions = torch.randn(1,
	                          self.job_config.action_dim,
	                          frame_chunk_size,
	                          self.action_per_frame,
	                          1,
	                          device=self.device,
	                          dtype=self.dtype)

	    video_inference_step = self.job_config.num_inference_steps
	    action_inference_step = self.job_config.action_num_inference_steps
	    video_step = self.job_config.video_exec_step

	    self.scheduler.set_timesteps(video_inference_step)
	    self.action_scheduler.set_timesteps(action_inference_step)

	    # Append a trailing 0 timestep to each schedule; the extra step only
	    # refreshes the cache (see update_cache below).
	    timesteps = F.pad(self.scheduler.timesteps, (0, 1),
	                      mode='constant',
	                      value=0)
	    if video_step != -1:
	        # Optionally stop the video denoising after video_step steps.
	        timesteps = timesteps[:video_step]
	    action_timesteps = F.pad(self.action_scheduler.timesteps, (0, 1),
	                             mode='constant',
	                             value=0)

	    # Both CFG branches are stacked along the batch axis when enabled.
	    cfg_batch_size = 2 if self.use_cfg else 1

	    with (
	            torch.amp.autocast('cuda', dtype=self.dtype),
	            torch.no_grad(),
	    ):
	        # The conditioning tensors do not change between denoising steps,
	        # so they are built once instead of once per iteration.
	        latent_cond = None
	        action_cond = None
	        if is_first_chunk:
	            latent_cond = init_latent[:, :, 0:1].to(self.dtype)
	            action_cond = torch.zeros(
	                [
	                    1, self.job_config.action_dim, 1,
	                    self.action_per_frame, 1
	                ],
	                device=self.device,
	                dtype=self.dtype)

	        # 1. Video generation loop.
	        for i, t in enumerate(tqdm(timesteps)):
	            last_step = i == len(timesteps) - 1
	            input_dict = self._prepare_latent_input(
	                latents,
	                None,
	                t,
	                t,
	                latent_cond,
	                None,
	                frame_st_id=frame_st_id)

	            video_noise_pred = self.transformer(
	                self._repeat_input_for_cfg(input_dict['latent_res_lst']),
	                update_cache=1 if last_step else 0,
	                cache_name=self.cache_name,
	                action_mode=False)

	            if not last_step or video_step != -1:
	                video_noise_pred = data_seq_to_patch(
	                    self.job_config.patch_size,
	                    video_noise_pred,
	                    frame_chunk_size,
	                    self.latent_height,
	                    self.latent_width,
	                    batch_size=cfg_batch_size)
	                if self.job_config.guidance_scale > 1:
	                    # Batch index 0 is conditional, index 1 unconditional.
	                    video_noise_pred = video_noise_pred[1:] + self.job_config.guidance_scale * (
	                        video_noise_pred[:1] - video_noise_pred[1:])
	                else:
	                    video_noise_pred = video_noise_pred[:1]
	                latents = self.scheduler.step(video_noise_pred,
	                                              t,
	                                              latents,
	                                              return_dict=False)

	            if is_first_chunk:
	                # Re-pin the conditioning frame after the update.
	                latents[:, :, 0:1] = latent_cond

	        # 2. Action generation loop.
	        for i, t in enumerate(tqdm(action_timesteps)):
	            last_step = i == len(action_timesteps) - 1
	            input_dict = self._prepare_latent_input(
	                None,
	                actions,
	                t,
	                t,
	                None,
	                action_cond,
	                frame_st_id=frame_st_id)

	            action_noise_pred = self.transformer(
	                self._repeat_input_for_cfg(input_dict['action_res_lst']),
	                update_cache=1 if last_step else 0,
	                cache_name=self.cache_name,
	                action_mode=True)

	            if not last_step:
	                action_noise_pred = rearrange(action_noise_pred,
	                                              'b (f n) c -> b c f n 1',
	                                              f=frame_chunk_size)
	                if self.job_config.action_guidance_scale > 1:
	                    action_noise_pred = action_noise_pred[1:] + self.job_config.action_guidance_scale * (
	                        action_noise_pred[:1] - action_noise_pred[1:])
	                else:
	                    action_noise_pred = action_noise_pred[:1]
	                actions = self.action_scheduler.step(action_noise_pred,
	                                                     t,
	                                                     actions,
	                                                     return_dict=False)

	            if is_first_chunk:
	                actions[:, :, 0:1] = action_cond

	    # Unused action channels are forced to zero in the final result.
	    actions[:, ~self.action_mask] *= 0

	    save_async(latents, os.path.join(self.exp_save_root, f'latents_{frame_st_id}.pt'))
	    save_async(actions, os.path.join(self.exp_save_root, f'actions_{frame_st_id}.pt'))

	    actions = self.postprocess_action(actions)
	    torch.cuda.empty_cache()
	    return actions, latents

	def _compute_kv_cache(self, obs):
	    """Feed real observations and states through the model to fill the cache.

	    Reads ``obs['obs']`` (camera frames) and ``obs['state']`` (raw action
	    array for ``preprocess_action``). Runs the transformer once in video
	    mode and once in action mode with ``update_cache=2`` and then advances
	    ``self.frame_st_id`` by the number of latent frames processed.
	    """
	    self.transformer.clear_pred_cache(self.cache_name)
	    # Optional async dump of the raw observation, useful for debugging.
	    save_async(obs['obs'], os.path.join(self.exp_save_root, f'obs_data_{self.frame_st_id}.pt'))

	    latent_model_input = self._encode_obs(obs)
	    if self.frame_st_id == 0:
	        # On the very first call the stored initial latent is prepended
	        # along the frame axis (or used alone if nothing new was encoded).
	        if latent_model_input is None:
	            latent_model_input = self.init_latent
	        else:
	            latent_model_input = torch.cat(
	                [self.init_latent, latent_model_input], dim=2)

	    # Match the action tensor's dtype/device to the latents.
	    action_model_input = self.preprocess_action(
	        obs['state']).to(latent_model_input)
	    logger.info(
	        f"get KV cache obs: {latent_model_input.shape} {action_model_input.shape}"
	    )
	    input_dict = self._prepare_latent_input(latent_model_input,
	                                            action_model_input,
	                                            frame_st_id=self.frame_st_id)

	    with torch.amp.autocast('cuda', dtype=self.dtype), torch.no_grad():
	        # Video tokens first, then action tokens.
	        for entry_key, is_action in (('latent_res_lst', False),
	                                     ('action_res_lst', True)):
	            self.transformer(self._repeat_input_for_cfg(
	                input_dict[entry_key]),
	                             update_cache=2,
	                             cache_name=self.cache_name,
	                             action_mode=is_action)

	    torch.cuda.empty_cache()
	    self.frame_st_id += latent_model_input.shape[2]

	@torch.no_grad()
	def infer(self, obs):
	    """Dispatch one request according to the flags found in ``obs``.

	    * ``obs['reset']`` truthy: reset the server with ``obs['prompt']`` and
	      return an empty dict.
	    * ``obs['compute_kv_cache']`` truthy: update the KV cache from the
	      observation and return an empty dict.
	    * otherwise: run one inference chunk and return ``{'action': ...}``.

	    ``reset`` takes precedence over ``compute_kv_cache``.
	    """
	    prompt = obs.get('prompt', None)

	    if obs.get('reset', False):
	        logger.info("***************** Reset server ****************")
	        self._reset(prompt=prompt)
	        return dict()

	    if obs.get('compute_kv_cache', False):
	        logger.info(
	            "################# Compute KV Cache #################")
	        self._compute_kv_cache(obs)
	        return dict()

	    logger.info("################# Infer One Chunk #################")
	    predicted_action, _ = self._infer(obs, frame_st_id=self.frame_st_id)
	    return dict(action=predicted_action)

	def decode_one_video(self, latents, output_type):
	latents = latents.to(self.vae.dtype)
	latents_mean = (
	torch.tensor(self.vae.config.latents_mean)
	.view(1, self.vae.config.z_dim, 1, 1, 1)
	.to(latents.device, latents.dtype)
	)
	latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
	latents.device, latents.dtype
	)
	latents = latents / latents_std + latents_mean
	video = self.vae.decode(latents, return_dict=False)[0]
	video = self.video_processor.postprocess_video(video, output_type=output_type)
	return video

	def load_init_obs(self):
	    """Load one RGB PNG per camera key and wrap them as an initial observation.

	    For every key ``k`` in ``job_config.obs_cam_keys`` the file
	    ``<job_config.input_img_path>/<k>.png`` is read and converted to RGB.

	    Returns:
	        ``{'obs': [frame]}`` where ``frame`` maps each camera key to a numpy
	        array of the image.
	    """
	    frame = {}
	    for cam_key in self.job_config.obs_cam_keys:
	        image_path = os.path.join(self.job_config.input_img_path,
	                                  f"{cam_key}.png")
	        # Use the image as a context manager so the underlying file handle
	        # is closed deterministically instead of waiting for garbage
	        # collection.
	        with Image.open(image_path) as image:
	            frame[cam_key] = np.array(image.convert("RGB"))
	    return {'obs': [frame]}

	@torch.no_grad()
	def generate(self):
	    """Run offline generation and write the decoded video to ``demo.mp4``.

	    Resets with ``job_config.prompt``, loads the initial images, infers
	    ``job_config.num_chunks_to_infer`` chunks, frees the transformer, the
	    half-resolution VAE wrapper and the text encoder, then decodes all
	    latents and exports the video into ``self.save_root`` at 10 fps.
	    The instance cannot run inference again afterwards because those
	    attributes are deleted.
	    """
	    self.video_processor = VideoProcessor(vae_scale_factor=1)
	    self._reset(self.job_config.prompt)
	    init_obs = self.load_init_obs()

	    latent_chunks = []
	    action_chunks = []
	    for chunk_id in range(self.job_config.num_chunks_to_infer):
	        first_frame = chunk_id * self.job_config.frame_chunk_size
	        chunk_actions, chunk_latents = self._infer(init_obs,
	                                                   frame_st_id=first_frame)
	        latent_chunks.append(chunk_latents)
	        action_chunks.append(torch.from_numpy(chunk_actions))

	    pred_latent = torch.cat(latent_chunks, dim=2)
	    # The concatenated actions are not consumed further in this method.
	    pred_action = torch.cat(action_chunks, dim=1).flatten(1)

	    # Drop caches and the large models before decoding to free GPU memory.
	    self.transformer.clear_cache(self.cache_name)
	    self.streaming_vae.clear_cache()
	    if self.streaming_vae_half:
	        self.streaming_vae_half.clear_cache()
	    del self.transformer
	    del self.streaming_vae_half
	    del self.text_encoder
	    torch.cuda.empty_cache()

	    decoded_video = self.decode_one_video(pred_latent, 'np')[0]
	    output_path = os.path.join(self.save_root, "demo.mp4")
	    export_to_video(decoded_video, output_path, fps=10)

	def run(args):
	    """Build the server from the selected config and start the requested mode.

	    Args:
	        args: Parsed CLI namespace providing ``config_name``, ``port``,
	            ``save_root`` and optionally ``debug_infer_once``.

	    Raises:
	        ValueError: ``config.infer_mode`` is neither ``"i2va"`` nor
	            ``"server"``.
	    """
	    config = VA_CONFIGS[args.config_name]
	    port = config.port if args.port is None else args.port
	    if args.save_root is not None:
	        config.save_root = args.save_root

	    # Distributed settings come from the launcher's environment variables.
	    rank = int(os.getenv("RANK", 0))
	    local_rank = int(os.environ.get('LOCAL_RANK', 0))
	    world_size = int(os.environ.get("WORLD_SIZE", 1))
	    init_distributed(world_size, local_rank, rank)
	    config.rank = rank
	    config.local_rank = local_rank
	    config.world_size = world_size

	    model = VA_Server(config)

	    if getattr(args, "debug_infer_once", False):
	        from utils.Simple_Remote_Infer.deploy.msgpack_numpy import unpackb
	        from pathlib import Path

	        # Replay three recorded requests against the local model. The paths
	        # can be swapped for other captures (e.g. step0_call2 / step0_call3).
	        replay_steps = (
	            ("reset", "debug/place_fan/call1_reset.msgpack"),
	            ("first infer", "debug/place_fan/call2.msgpack"),
	            ("kv cache", "debug/place_fan/call3.msgpack"),
	        )
	        for label, capture in replay_steps:
	            logger.info(f"***************** debug_infer_once: {label} ****************")
	            request = unpackb(Path(capture).read_bytes())
	            model.infer(request)
	        # The flag is documented as "run ... then exit", so do not fall
	        # through into server / i2va mode after the replay.
	        return

	    if config.infer_mode == "i2va":
	        logger.info("****************************USE I2AV mode****************************")
	        model.generate()
	    elif config.infer_mode == "server":
	        logger.info("****************************USE Server mode****************************")
	        run_async_server_mode(model, local_rank, config.host, port)
	    else:
	        raise ValueError(f"Unknown infer mode: {config.infer_mode}")

	def main():
	    """Parse the command-line options, call ``run`` and log completion.

	    Options: ``--config-name`` (default ``'robotwin'``), ``--port``,
	    ``--save_root`` and the ``--debug_infer_once`` switch.
	    """
	    option_table = (
	        ("--config-name",
	         dict(type=str,
	              required=False,
	              default='robotwin',
	              help="config name.")),
	        ("--port", dict(type=int, default=None, help='(start) port')),
	        ("--save_root", dict(type=str, default=None, help='save root')),
	        ("--debug_infer_once",
	         dict(
	             action="store_true",
	             help="Run one infer with dummy observation then exit (for debugging infer() without WebSocket client)",
	         )),
	    )

	    parser = argparse.ArgumentParser()
	    for flag, spec in option_table:
	        parser.add_argument(flag, **spec)

	    run(parser.parse_args())
	    logger.info("Finish all process!!!!!!!!!!!!")


	# Script entry point: init_logger() is called before main() so that the
	# module-level logger is initialised by the time main() and run() log.
	if __name__ == "__main__":
	init_logger()
	main()