# robo-eval / sim_eval.py
import os

import dill
import h5py
import imageio
import numpy as np
import torch
def get_text_tokens(cfg, tokenizer, text_model, goal, model=None):
"""
Get the text tokens/embeddings for the goal.
If a `model` with `encode_text_goal` is provided, use it so callers don't need a buffer.
"""
if model is not None:
return model.encode_text_goal(goal, tokenizer=tokenizer, text_model=text_model)
# fallback to legacy behaviour
    if cfg.dataset.encode_with_t5:
        goal_ = np.zeros((cfg.max_block_size, cfg.n_embd), dtype=np.float32)
        input_ids = tokenizer(goal, return_tensors="pt").input_ids
        goal_t = text_model.encoder(input_ids).last_hidden_state.detach().cpu().numpy() ## Get the goal embedding
        n = min(goal_t.shape[1], cfg.max_block_size)
        goal_[:n, :] = goal_t[0][:n] ## Copy up to max_block_size tokens; shorter goals stay zero-padded
    else:
        ## Legacy character-level encoding needs a buffer for the vocabulary lookup
        raise RuntimeError("Text encoding without a model requires a buffer; pass model into get_text_tokens")
    return np.expand_dims(goal_, axis=0)
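# Usage sketch (shapes are an assumption based on the padding above, not a tested API):
# with cfg.dataset.encode_with_t5 set, the fallback path always returns a zero-padded
# array of shape (1, cfg.max_block_size, cfg.n_embd), e.g.
#   txt_goal = get_text_tokens(cfg, tokenizer, text_model, "put the carrot on the plate")
#   assert txt_goal.shape == (1, cfg.max_block_size, cfg.n_embd)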
def get_blocked_mask(cfg, targets=None, T=0):
    ## Compute a blocked attention mask over [CLS | stacked obs patches | goal text | goal image patches]
    c = 192 ## Number of patches/channels per image
    mask = torch.ones((1 + (c * cfg.policy.obs_stacking) + T + c,), device=cfg.device) ## 1-D mask over all tokens
    if targets is not None:
        r = torch.rand(1).item() ## Sample once so the two masking branches are mutually exclusive
        if r > 0.66:
            mask[1 + (c * cfg.policy.obs_stacking): 1 + (c * cfg.policy.obs_stacking) + T] = 0 ## Mask goal string
        elif r > 0.33:
            mask[1 + (c * cfg.policy.obs_stacking) + T:] = 0 ## Mask goal image
    return mask
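# Mask layout sketch (assuming cfg.policy.obs_stacking=2 and T=16 goal-text tokens;
# c=192 image patches is hard-coded above):
#   [ 1 CLS | 2*192 obs patches | 16 text tokens | 192 goal-image patches ]
# Observation patches are never masked; during training (targets is not None) roughly a
# third of calls zero the text span and another third zero the goal-image span, so the
# policy learns to follow either goal modality on its own.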
def eval_model_in_sim(cfg, model, device, log_dir, env, env_unwrapped,
wandb, iter_, tokenizer=None, text_model=None):
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
print("Evaluating model in sim environment")
from collections import deque
from einops import rearrange
rewards = []
for j in range(cfg.sim.eval_episodes): ## Better to eval over a few different goal configurations
obs, reset_info = env.reset()
obs_ = get_image_from_maniskill2_obs_dict(env_unwrapped, obs)[:,:,:3]
obs_hist = deque(maxlen=cfg.policy.obs_stacking)
last_action = np.zeros(cfg.action_dim) # Track last action taken
for _ in range(cfg.policy.obs_stacking):
obs_hist.append(obs_)
instruction = env_unwrapped.get_language_instruction()
# print("Reset info", reset_info)
print("Instruction", instruction)
frames = []
done, truncated, timeLimit, t = False, False, 100, 0
txt_goal = get_text_tokens(cfg, tokenizer, text_model, instruction, model=model)
while not (done or truncated or (t > timeLimit)):
# action[:3]: delta xyz; action[3:6]: delta rotation in axis-angle representation;
# action[6:7]: gripper (the meaning of open / close depends on robot URDF)
            image = np.stack(obs_hist, axis=-1) # stack the history along a new trailing axis
            image = rearrange(image, 'h w c t -> h w (c t)') # fold the stacked frames into the channel dimension
obs_state = torch.tensor(model.preprocess_state(image), dtype=torch.float32)
goal_state = torch.tensor(model.preprocess_goal_image(image[:,:,:3]), dtype=torch.float32)
# Prepare last_action tensor if available
last_action_tensor = None
if last_action is not None:
last_action_tensor = torch.tensor(last_action[:cfg.action_dim], dtype=torch.float32).unsqueeze(0).to(device)
            action, loss = model.forward(obs_state.unsqueeze(0).to(device),
                                         torch.as_tensor(txt_goal).to(device),
                                         goal_state.unsqueeze(0).to(device),
                                         mask_=True, ## Masks goal image
                                         pose=torch.tensor([[obs["extra"]["tcp_pose"]]], dtype=torch.float32).to(device),
                                         last_action=last_action_tensor,
                                         )
action = model.decode_action(action[0]).cpu().detach().numpy() ## Add in the gripper close action
last_action = action.copy() # Store for next iteration
## If the actions are stacked into a longer vector execute the sequence of actions
for step_ in range(cfg.policy.action_stacking):
act_ = action[cfg.action_dim*step_:(cfg.action_dim*(step_+1))]
obs, reward, done, truncated, info = env.step(act_)
                image = get_image_from_maniskill2_obs_dict(env_unwrapped, obs)
                image = image[:,:,:3] ## Keep only the RGB channels
                obs_hist.append(image) ## Update the history buffer with the new observation
                # Store the original image for video before stacking/processing
                frames.append(image)
                reward = -np.linalg.norm(info["eof_to_obj1_diff"]) ## Shaped reward: negative distance between gripper and object
rewards.append(reward)
t=t+1
if done or truncated:
break
episode_stats = info.get('episode_stats', {})
episode_stats['rewards'] = np.mean(rewards)
# print("Episode stats", episode_stats)
    print(f"avg reward {episode_stats['rewards']:.8f}")
    if not cfg.testing and wandb is not None:
        wandb.log({"avg reward": np.mean(rewards)})
    path_ = os.path.join(log_dir, f"simple-env-{iter_}.mp4")
    imageio.mimsave(path_, frames, fps=20)
episode_stats['video_url'] = path_
    if not cfg.testing and wandb is not None:
try:
wandb.log({"example": wandb.Video(path_)})
except Exception as e:
print(f"Warning: failed to log video to wandb: {e}")
return episode_stats
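# Note on action stacking (a sketch of the convention used in the loop above): the model
# emits one flat vector of length cfg.action_dim * cfg.policy.action_stacking that is
# executed as consecutive per-step chunks, e.g. with action_dim=7 and action_stacking=3:
#   for k in range(3):
#       env.step(action[k * 7:(k + 1) * 7])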
import gymnasium as gym
# --- History Stacking Wrapper ---
class DictWrapper(gym.ObservationWrapper):
    """
    A wrapper that extracts the observation stored under a specific key of the
    observation dictionary and exposes it as a plain image observation.
    """
    def __init__(self, env, obs_key=""):
        super().__init__(env) ## Register the wrapped env so action_space etc. pass through
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(256,256,3), # Assuming the observation is a 256x256 RGB image
            dtype=np.uint8)
        self._obs_key = obs_key
    def observation(self, observation):
        """
        Extract the image stored under the observation key. Note that step() and
        reset() are overridden below, so this hook is only used by callers that
        invoke it directly.
        """
        return observation[self._obs_key]
def step(self, action):
"""
Step the environment and return the observation from the specified key.
"""
        obs, reward, done, info = self.env.step(action) ## LIBERO does not return truncated
        return obs[self._obs_key][::-1, :, :], reward, done, False, obs ## LIBERO's agentview render is vertically flipped, so flip it back; pass the full obs dict through as info
def reset(self, **kwargs):
"""
Reset the environment and return the observation from the specified key.
"""
obs = self.env.reset()
return obs[self._obs_key][::-1, :, :], obs
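# Usage sketch (raw_env is a hypothetical LIBERO env handle): expose plain agentview
# frames and stack a short history, matching how eval_libero builds its env below:
#   env_ = FrameStackObservation(DictWrapper(raw_env, obs_key="agentview_image"), 4)
#   obs, info = env_.reset()   # obs: (4, 256, 256, 3), newest frame last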
def eval_libero(model, device, cfg, iter_=0, log_dir="./",
tokenizer=None, text_model=None, wandb=None):
from libero.libero import benchmark
from libero.libero.envs import OffScreenRenderEnv, DenseRewardEnv
from libero.libero.utils import get_libero_path
from gymnasium.wrappers import FrameStackObservation
from einops import rearrange
benchmark_dict = benchmark.get_benchmark_dict()
task_suite_name = cfg.sim.task_set # can also choose libero_spatial, libero_object, etc.
task_suite = benchmark_dict[task_suite_name]()
# Load initial states and goal images from Hugging Face dataset if provided
init_states_dataset = None
if hasattr(cfg.sim, 'libero_init_state_hf_repo') and cfg.sim.libero_init_state_hf_repo:
print(f"Loading initial states from Hugging Face: {cfg.sim.libero_init_state_hf_repo}")
from datasets import load_dataset
init_states_dataset = load_dataset(cfg.sim.libero_init_state_hf_repo, split='train')
print(f"Loaded dataset with {len(init_states_dataset)} entries")
elif hasattr(cfg.sim, 'libero_init_state_file') and cfg.sim.libero_init_state_file:
print(f"Loading initial states from HDF5: {cfg.sim.libero_init_state_file}")
init_states_dataset = h5py.File(hydra.utils.get_original_cwd()+cfg.sim.libero_init_state_file, 'r')
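    ## Expected HDF5 layout, inferred from the reads below (an assumption, not a published spec):
    ##   /<task_description>/demo_<i>/init_state  -> flattened simulator state vector
    ##   /<task_description>/demo_<i>/goal_img    -> (H, W, 3) uint8 goal image (optional)
    ## where <task_description> is the language instruction with spaces replaced by underscores.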
# retrieve a specific task
tasks = cfg.sim.eval_tasks
for task_id in tasks:
task = task_suite.get_task(task_id)
task_name = task.name
instruction = task.language
task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
print(f"[info] retrieving task {task_id} from suite {task_suite_name}, the " + \
f"language instruction is {instruction}, and the bddl file is {task_bddl_file}")
# step over the environment
env_args = {
"bddl_file_name": task_bddl_file,
"camera_heights": 256,
"camera_widths": 256
}
env = DenseRewardEnv(**env_args) # env = OffScreenRenderEnv(**env_args)
env.seed(0)
# Load initial states from dataset if available, otherwise use default
task_description = instruction.replace(" ", "_")
task_demos = None
if init_states_dataset is not None:
if isinstance(init_states_dataset, h5py.File):
# HDF5 format
if task_description in init_states_dataset:
task_grp = init_states_dataset[task_description]
num_init_states = len(task_grp.keys())
print(f"Loaded {num_init_states} initial states from HDF5 for task: {task_description}")
else:
task_grp = None
init_states = task_suite.get_task_init_states(task_id)
num_init_states = len(init_states)
print(f"Using default initial states for task: {task_description}")
else:
# Hugging Face dataset format
task_demos = [item for item in init_states_dataset if item.get('task_description') == task_description]
num_init_states = len(task_demos)
if num_init_states > 0:
print(f"Loaded {num_init_states} initial states from HF dataset for task: {task_description}")
else:
init_states = task_suite.get_task_init_states(task_id)
num_init_states = len(init_states)
print(f"Using default initial states for task: {task_description}")
else:
            init_states = task_suite.get_task_init_states(task_id) # for benchmarking purposes, we fix a set of initial states
num_init_states = len(init_states)
print(f"Using default initial states for task: {task_description}")
# for init_state_id in range(len(init_states)):
for init_state_id in range(min(2, num_init_states)): ## Just do a couple different initializations for eval
# Load init_state and goal_img from dataset or use default
if init_states_dataset is not None:
if isinstance(init_states_dataset, h5py.File):
# HDF5 format
if task_grp is not None:
demo_key = f"demo_{init_state_id}"
if demo_key in task_grp:
init_state = task_grp[demo_key]['init_state'][()]
goal_img = task_grp[demo_key]['goal_img'][()] if 'goal_img' in task_grp[demo_key] else None
print(f"Loaded init_state and goal_img from HDF5 for {demo_key}")
                        else:
                            init_states = task_suite.get_task_init_states(task_id) ## Fall back to suite defaults when this demo is missing from the HDF5 group
                            init_state = init_states[init_state_id]
                            goal_img = None
else:
init_state = init_states[init_state_id]
goal_img = None
else:
# Hugging Face dataset format
if task_demos and init_state_id < len(task_demos):
demo = task_demos[init_state_id]
init_state = np.array(demo['init_state'])
goal_img = np.array(demo['goal_img']) if 'goal_img' in demo and demo['goal_img'] is not None else None
print(f"Loaded init_state and goal_img from HF dataset for demo {init_state_id}")
else:
init_state = init_states[init_state_id]
goal_img = None
else:
init_state = init_states[init_state_id]
goal_img = None
env.reset()
env.set_init_state(init_state)
env_ = FrameStackObservation(DictWrapper(env, obs_key="agentview_image"), cfg.policy.obs_stacking) ## Stacking the observations
obs, info = env_.reset()
            mask = get_blocked_mask(cfg, targets=None, T=0) ## Blocked mask (currently unused here; model.forward applies its own masking via mask_=True)
txt_goal = get_text_tokens(cfg, tokenizer, text_model, instruction, model=model)
# Use goal image from HDF5 if available, otherwise use first observation
if goal_img is not None:
image_goal = goal_img
print(f"Using goal image from HDF5, shape: {image_goal.shape}")
else:
                image_goal = obs[0] ## First stacked frame; right after reset all frames in the stack are identical
print("Using first observation as goal image")
frames = []
rewards = []
infos = []
last_action = np.zeros(cfg.action_dim) # Track last action taken
            done, truncated, timeLimit, t, wait_steps = False, False, 400, 0, 0
while not (done or truncated or (t > (timeLimit + wait_steps))):
                ## Reshape the image and stack the history on the last channel dimension
                if t < wait_steps: ## Let objects stabilize before acting
obs, reward, done, truncated, info = env_.step([0,0,0,0,0,0,0])
t += 1
continue
                obs = rearrange(obs, 't h w c -> h w (t c)', c=3, t=cfg.policy.obs_stacking) ## Stack the frame history into the channel dimension
obs_state = model.preprocess_state(obs)
goal_state = model.preprocess_goal_image(image_goal)
pose_ = model.encode_pose(torch.tensor([[np.concatenate(
(info["robot0_eef_pos"],
info["robot0_eef_quat"][:3],
[(info["robot0_gripper_qpos"][0])]), axis=-1)]],
dtype=torch.float32)).to(device)
# Prepare last_action tensor if available
last_action_tensor = None
if last_action is not None:
last_action_tensor = model.encode_action(torch.tensor([last_action[:cfg.action_dim]], dtype=torch.float32)).to(device)
                action, loss = model.forward(torch.as_tensor(np.array([obs_state])).to(device),
                                             torch.as_tensor(txt_goal).to(device),
                                             torch.as_tensor(np.array([goal_state])).to(device),
                                             mask_=True,
                                             pose=pose_,
                                             last_action=last_action_tensor,
                                             )
action = model.decode_action(action[0]).cpu().detach().numpy()
last_action = action.copy() # Store for next iteration
## If the actions are stacked into a longer vector execute the sequence of actions
for step_ in range(cfg.policy.action_stacking):
act_ = action[cfg.action_dim*step_:(cfg.action_dim*(step_+1))]
## Need to process LIBERO gripper action [0, 1] -> [-1, 1], then invert, https://github.com/moojink/openvla-oft/blob/e4287e94541f459edc4feabc4e181f537cd569a8/experiments/robot/libero/run_libero_eval.py#L265
                    ## If the model is the ReplayModel don't do this conversion
# if not hasattr(model, 'trajectory_loaded'):
# act_[6] = ((act_[6] - 0.5) * 2) # * -1.0
obs, reward, done, truncated, info = env_.step(act_)
                    # Store the current frame for the video
                    image = obs[-1] ## Newest frame in the stacked history
frames.append(image)
rewards.append(reward)
infos.append(info)
t=t+1
# print(f"Step {t}, reward: {reward:.4f}, done: {done}, truncated: {truncated}")
                    if done or truncated:
                        break
                if done:
                    print("Episode finished with success after {} timesteps".format(t))
                    break
            path_ = os.path.join(log_dir, f"libero-{iter_}-task-id-{task_id}-init-id-{init_state_id}.mp4")
            imageio.mimsave(path_, frames, fps=20)
episode_stats = info.get('episode_stats', {})
episode_stats['rewards'] = np.mean(rewards)
episode_stats['video_url'] = path_
print(f"avg reward {np.mean(rewards):.8f}")
            if not cfg.testing and wandb is not None:
                wandb.log({"avg reward_" + str(task_id): np.mean(rewards)})
                wandb.log({"example": wandb.Video(path_)})
env.close()
# Close HDF5 file if it was opened
if init_states_dataset is not None and isinstance(init_states_dataset, h5py.File):
init_states_dataset.close()
print("Closed HDF5 file")
return episode_stats
import hydra
from omegaconf import DictConfig
@hydra.main(config_path="./conf", config_name="64pix-pose")
def my_main(cfg: DictConfig):
log_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
cfg.dataset.load_dataset = "skip"
    model_dir = hydra.utils.get_original_cwd() + "/mini-grp/miniGRP.pth"
    print("Loading model from:", model_dir)
    if cfg.model.type == "dataset":
## load the dataset
from mini_shuffel_buffer import CircularBuffer
from mock_grp_model import ReplayModel
cfg.dataset.load_dataset = True
model_ = ReplayModel(cfg)
dataset_buffer = CircularBuffer(cfg.dataset.buffer_size, cfg, model=model_)
model_.set_dataset(dataset_buffer)
else:
from grp_model import GRP
model_ = torch.load(model_dir, pickle_module=dill)
tokenizer = None
text_model = None
if cfg.dataset.encode_with_t5: ## Load T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained(cfg.dataset.t5_version)
text_model = T5ForConditionalGeneration.from_pretrained(cfg.dataset.t5_version)
if "libero" in cfg.simEval:
results = eval_libero(model_.to(cfg.device), device=cfg.device, cfg=cfg,
iter_=0, tokenizer=tokenizer, text_model=text_model, wandb=None,
log_dir=log_dir)
if "simple_env" in cfg.simEval:
import simpler_env
task_name = "widowx_carrot_on_plate" # @param ["google_robot_pick_coke_can", "google_robot_move_near", "google_robot_open_drawer", "google_robot_close_drawer", "widowx_spoon_on_towel", "widowx_carrot_on_plate", "widowx_stack_cube", "widowx_put_eggplant_in_basket"]
        env = simpler_env.make(task_name)
        env_unwrapped = env.env.env.env ## simpler_env stacks several gymnasium wrappers; reach through to the base env
results = eval_model_in_sim(cfg, model_.to(cfg.device), device=cfg.device, log_dir=log_dir,
env=env, env_unwrapped=env_unwrapped,
wandb=None, iter_=0, tokenizer=tokenizer, text_model=text_model)
print("results:", results)
# cbuffer.save(cfg.dataset.to_name)
if __name__ == "__main__":
    my_main()