Upload demo_deploy.py

#1
Files changed (1) hide show
  1. demo_deploy.py +273 -0
demo_deploy.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from control_joints import *
2
+ from read_joints import *
3
+ from realsense import *
4
+ import os
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ import time
9
+ import argparse
10
+ import sys
11
+ import json
12
+ # get current workspace
13
+ sys.path.insert(0, 'policy/ACT_DP_multitask') # Add the path to the policy directory
14
+ from policy_anyrobot import ACTDiffusionPolicy
15
+ from t5_encoder import T5Embedder
16
+ from utils import convert_weight
17
+ import yaml
18
+
19
def get_image(cameras):
    """Grab the latest color frame from every camera.

    Polls each camera three times (later reads overwrite earlier ones, so
    only the freshest frame per camera is kept) and also dumps each frame to
    disk for offline inspection.

    Args:
        cameras: iterable of camera objects exposing ``get_latest_image()``
            (returns a BGR uint8 H x W x 3 array, or None when no frame is
            ready) and a ``name`` attribute.

    Returns:
        dict mapping camera name -> RGB uint8 H x W x 3 image.
    """
    obs_image = dict()
    for i in range(3):
        for cam in cameras:
            color_image = cam.get_latest_image()  # BGR 0-255, H W 3, or None
            if color_image is not None:
                obs_image[cam.name] = color_image[:, :, ::-1]  # BGR -> RGB
                # Save the raw BGR frame for debugging.
                filename = f"{cam.name}_image_{i}.png"
                cv2.imwrite(filename, color_image)
                # BUGFIX: f-string had no placeholder; report the actual file.
                print(f"Saved image: {filename}")
    return obs_image  # images from all cameras, keyed by camera name
34
+
35
def get_observation(obs_image, joint):
    """Pack camera images and joint readings into a policy observation.

    Each image is transposed from H x W x C to C x H x W; the joint values
    are passed through untouched under the 'qpos' key.
    """
    # Map the policy's camera keys onto the deployment camera names.
    name_map = {
        'cam_high': 'head_camera',
        'cam_left_wrist': 'left_camera',
        'cam_right_wrist': 'right_camera',
    }
    images = {policy_key: np.moveaxis(obs_image[cam_key], -1, 0)
              for policy_key, cam_key in name_map.items()}
    return {'images': images, 'qpos': joint}
46
+
47
def encode_obs(observations, camera_names):
    """Convert an observation dict into policy-ready tensors.

    Args:
        observations: dict with 'images' (per-camera arrays, 0-255) and
            'qpos' (joint values).
        camera_names: ordered list of camera keys to stack.

    Returns:
        image_data: float tensor (1, n_views, *image_shape) scaled to [0, 1].
        qpos_data: float32 tensor (1, D) of joint positions.
    """
    views = np.stack([observations['images'][name] for name in camera_names], axis=0)
    image_data = torch.from_numpy(views).unsqueeze(0).float() / 255.0
    qpos_data = torch.from_numpy(np.array(observations['qpos'], dtype=np.float32)).unsqueeze(0)
    return image_data, qpos_data  # no batch dimension beyond the leading 1
55
+
56
def get_model(config_file, ckpt_file, device):
    """Build the policy from a JSON config and load checkpoint weights.

    Args:
        config_file: path to a JSON policy-config file.
        ckpt_file: path to a torch checkpoint with 'state_dict' and 'stats'.
        device: torch device to move the policy onto.

    Returns:
        (policy, camera_names): the policy in eval mode with observation
        normalization stats applied, and the camera-name list from the config.
    """
    with open(config_file, "r", encoding="utf-8") as file:
        policy_config = json.load(file)
    print(f"Loading policy config from {config_file}")
    policy = ACTDiffusionPolicy(policy_config)
    print(f"Loading model from {ckpt_file}")
    # BUGFIX: the checkpoint was torch.load()-ed twice (once for the weights,
    # once for the stats); load it once and reuse.
    checkpoint = torch.load(ckpt_file)
    policy.load_state_dict(convert_weight(checkpoint["state_dict"]))
    policy.to(device)
    policy.eval()
    stats = checkpoint["stats"]
    print('Resetting observation normalization stats')
    policy.reset_obs(stats, norm_type=policy_config["norm_type"])
    camera_names = policy_config["camera_names"]
    return policy, camera_names
70
+
71
def get_language_encoder(device):
    """Instantiate the T5 text embedder; return its tokenizer and encoder.

    NOTE(review): the pretrained-model path is machine-specific — confirm it
    exists on the deployment host.
    """
    model_path = '/data/gjx/.cache/huggingface/hub/models--google--t5-v1_1-xxl/snapshots/3db67ab1af984cf10548a73467f0e5bca2aaaeb2'
    # model_path = 'policy/weights/RDT/t5-v1_1-xxl'
    config_path = os.path.join('policy/ACT_DP_multitask', "base.yaml")
    with open(config_path, "r") as fp:
        config = yaml.safe_load(fp)
    embedder = T5Embedder(
        from_pretrained=model_path,
        model_max_length=config["dataset"]["tokenizer_max_length"],
        device=device,
        use_offload_folder=None,
    )
    return embedder.tokenizer, embedder.model
85
+
86
def get_language_embed(tokenizer, text_encoder, language_instruction, device):
    """Encode an instruction string into a single embedding vector.

    Tokenizes the instruction, runs the frozen text encoder, and mean-pools
    the token axis of the last hidden state.

    Returns:
        1-D CPU tensor of shape (hidden_size,).
    """
    encoded = tokenizer(
        language_instruction,
        return_tensors="pt",
        padding="longest",
        truncation=True,
    )
    token_ids = encoded["input_ids"].to(device).view(1, -1)
    with torch.no_grad():
        hidden = text_encoder(token_ids).last_hidden_state
    # Pool over tokens to a single vector on CPU.
    return hidden.detach().cpu().squeeze(0).mean(0)
98
+
99
def test_code():
    """Smoke-test model loading and one dummy inference on CPU.

    Uses random tensors in place of real camera/joint data, so it can run
    without the robot attached. (The original kept a large commented-out
    hardware bring-up section here — joint reader, RealSense enumeration,
    camera start/warm-up; removed as dead code.)
    """
    print("Testing the code...")
    device = 'cpu'
    ckpt_dir = 'policy/ACT_DP_multitask/checkpoints/real_fintune_50_2000/act_dp'
    config_path = os.path.join(ckpt_dir, 'policy_config.json')
    ckpt_path = os.path.join(ckpt_dir, 'policy_lastest_seed_0.ckpt')
    policy, cameras_name = get_model(config_path, ckpt_path, device)

    instruction_file = 'instruction.txt'
    with open(instruction_file, 'r') as f:
        instruction = f.readline().strip().strip('"')
    print(f"Instruction: {instruction}")
    tokenizer, text_encoder = get_language_encoder(device)  # language encoder
    task_emb = get_language_embed(tokenizer, text_encoder, instruction, device)  # (hidden_size,) tensor
    # BUGFIX: left_can/right_can were undefined globals here (NameError when
    # test_code() runs before the __main__ player selection); use explicit
    # test defaults instead.
    left_can, right_can = "can_left_1", "can_right_1"
    controller = ControlJoints(left_can=left_can, right_can=right_can)
    image = torch.rand(1, 3, 3, 480, 640)  # dummy (1, n_views, C, H, W) tensor
    qpos = torch.rand(1, 14)  # dummy joint-position tensor
    actions = policy.get_action(qpos.float().to(device), image.float().to(device), task_emb.float().to(device))
    print(f"Actions shape: {actions.shape}")
145
+
146
+
147
if __name__ == "__main__":

    # test_code()  # Uncomment to run the offline smoke test
    # Report the CUDA/cuDNN environment for debugging deployment hosts.
    print(torch.backends.cudnn.enabled)
    print(torch.backends.cudnn.version())
    print(torch.version.cuda)
    print(torch.__version__)

    # Command-line arguments.
    parser = argparse.ArgumentParser(description="Deploy the action for a specific player")
    parser.add_argument("--ckpt_path", type=str, default='policy/ACT_DP_multitask/checkpoints/real_fintune_50_2000/act_dp')
    args = parser.parse_args()  # parse once up front instead of inline below

    # Read the PLAYER environment variable to pick the robot rig.
    # BUGFIX: the comments and ValueError messages in this section were
    # mojibake (mis-encoded text); restored readable English.
    player_value = os.getenv("PLAYER")
    if player_value is None:
        raise ValueError("Environment variable PLAYER is not set")
    try:
        player_value = int(player_value)
    except ValueError:
        raise ValueError("Environment variable PLAYER must be an integer")

    # Select cameras (by RealSense serial) and CAN bus names per player.
    if player_value == 1:
        print("Player 1")
        cameras = [
            RealSenseCam("337322073280", "left_camera"),
            RealSenseCam("337322074191", "head_camera"),
            RealSenseCam("337122072617", "right_camera"),
        ]
        left_can, right_can = "can_left_1", "can_right_1"
    elif player_value == 2:
        print("Player 2")
        cameras = [
            RealSenseCam("250122079815", "left_camera"),
            RealSenseCam("048522073543", "head_camera"),
            RealSenseCam("030522070109", "right_camera"),
        ]
        left_can, right_can = "can_left_2", "can_right_2"
    else:
        raise ValueError("Invalid PLAYER value; must be 1 or 2")
    reader = JointReader(left_can=left_can, right_can=right_can)  # joint-state reader

    # ==== Get RGB ====
    # Enumerate connected RealSense devices and report them.
    ctx = rs.context()
    if len(ctx.devices) > 0:
        print("Found RealSense devices:")
        for d in ctx.devices:
            name = d.get_info(rs.camera_info.name)
            serial_number = d.get_info(rs.camera_info.serial_number)
            print(f"Device: {name}, Serial Number: {serial_number}")
    else:
        print("No Intel RealSense devices connected")

    # Start all camera streams.
    for cam in cameras:
        cam.start()

    # Warm up the streams so the first policy frame is fresh.
    for i in range(10):
        print(f"Warm up: {i}", end="\r")
        for cam in cameras:
            color_image = cam.get_latest_image()
        time.sleep(0.15)

    # Load model.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ckpt_dir = args.ckpt_path
    config_path = os.path.join(ckpt_dir, 'policy_config.json')
    # alternative checkpoint: 'policy_lastest_seed_0.ckpt'
    ckpt_path = os.path.join(ckpt_dir, 'policy_epoch_600_seed_0.ckpt')
    policy, camera_names = get_model(config_path, ckpt_path, device)
    print('camera_names:', camera_names)

    # Read the language instruction and embed it once.
    instruction_file = 'instruction.txt'
    with open(instruction_file, 'r') as f:
        instruction = f.readline().strip()
    print(f"Using instruction: {instruction}")
    tokenizer, text_encoder = get_language_encoder(device)  # language encoder
    print('Loading language tokenizer and encoder...')
    task_emb = get_language_embed(tokenizer, text_encoder, instruction, device)  # (hidden_size,) tensor

    # ==== Observe / act control loop ====
    controller = ControlJoints(left_can=left_can, right_can=right_can)
    max_timestep = 600
    step = 0
    while step < max_timestep:
        obs_image = get_image(cameras)
        joint = reader.get_joint_value()

        observation = get_observation(obs_image, joint)
        image, qpos = encode_obs(observation, camera_names)
        actions = policy.get_action(qpos.float().to(device), image.float().to(device), task_emb.float().to(device))
        print(f"Step: {step}/{max_timestep}, Action: {actions.shape}")
        # Execute only the first 30 predicted actions, then re-observe.
        for action in actions[0:30]:
            controller.control(action)
            step += 1
            time.sleep(0.05)
    # (Dead commented-out debug snippets — image grabbing, joint readout,
    # zero-position control — removed.)