Spaces:

gberseth
/

robo-eval

Runtime error

App Files Files Community

Neo-X committed on Jan 6

Commit

f3d825f

1 Parent(s): daef8c2

Adding config.yaml to .gitignore and updating app.py to load GRP model and config correctly.

Browse files

Files changed (6) hide show

.gitignore +1 -0
README.md +7 -0
app.py +69 -28
requests.csv +2 -0
results.csv +2 -0
sim_eval.py +273 -0

.gitignore CHANGED Viewed

@@ -14,3 +14,4 @@ temp.gif
 miniGRP.pth
 __pycache__/
 mini_grp.egg-info/

 miniGRP.pth
 __pycache__/
 mini_grp.egg-info/
+conf/config.yaml

README.md CHANGED Viewed

@@ -9,3 +9,10 @@ short_description: test robot models
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Submit your files in the following form:
+miniGRP.pth
+conf/config.yaml
+grp_model.py

app.py CHANGED Viewed

@@ -11,11 +11,10 @@ import time
 HF_TOKEN = os.environ.get("HF_TOKEN")
 REQUESTS_DATASET = "gberseth/rl-leaderboard-requests" # REPLACE THIS
 RESULTS_DATASET = "gberseth/rl-leaderboard-results"   # REPLACE THIS
-ENV_NAME = "CartPole-v1"  # The Gym environment to evaluate
 EVAL_EPISODES = 10        # How many times to run the agent
 # Authenticate
-login(token=HF_TOKEN)
 api = HfApi()
 def evaluate_policy(model_id):
@@ -27,35 +26,69 @@ def evaluate_policy(model_id):
         # 1. Download the model repository
         # We look for a file named "ppo_cartpole.zip" or just standard "model.zip"
         # Adjust 'allow_patterns' to match what you require users to submit.
-        repo_path = snapshot_download(repo_id=model_id, allow_patterns=["*.zip"])
         # Find the .zip file in the downloaded folder
         model_file = None
         for root, dirs, files in os.walk(repo_path):
             for file in files:
-                if file.endswith(".zip"):
                     model_file = os.path.join(root, file)
-                    break
         if not model_file:
-            return None, "Error: No .zip model file found in repo."
         # 2. Load the PPO Agent
         # custom_objects map may be needed if python versions differ, but usually fine for PPO
-        from mini_shuffel_buffer import CircularBuffer
         import torch
         # ------------
         # Train and test splits
         # Loading data
         # create RLDS dataset builder
-        log_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
         cfg.dataset.load_dataset = "skip"
-        cBuffer = CircularBuffer(cfg.dataset.buffer_size, cfg)
-        model_dir = hydra.utils.get_original_cwd()+"/mini-grp/miniGRP.pth"
-        print ("Loading model from:", model_dir)
-        model_ = torch.load(model_dir)
         # model_._cgf = cfg
         # model = PPO.load(model_file)
         # 3. Run Evaluation Loop
@@ -67,9 +100,9 @@ def evaluate_policy(model_id):
             text_model = T5ForConditionalGeneration.from_pretrained(cfg.dataset.t5_version)
         if "libero" in cfg.simEval:
-            results = eval_libero(cBuffer, model_.to(cfg.device), device=cfg.device, cfg=cfg,
                             iter_=0, tokenizer=tokenizer, text_model=text_model, wandb=None,
-                            log_dir=log_dir)
         if "simple_env" in cfg.simEval:
             import simpler_env
             task_name = "widowx_carrot_on_plate"  # @param ["google_robot_pick_coke_can", "google_robot_move_near", "google_robot_open_drawer", "google_robot_close_drawer", "widowx_spoon_on_towel", "widowx_carrot_on_plate", "widowx_stack_cube", "widowx_put_eggplant_in_basket"]
@@ -79,9 +112,10 @@ def evaluate_policy(model_id):
                 del env
             env = simpler_env.make(task_name)
             env_unwrapped = env.env.env.env ## Updated gymnasium wrapper adds lots of wrappers.
-            results = eval_model_in_sim(cfg, model_.to(cfg.device), device=cfg.device, log_dir=log_dir,
                                     env=env, env_unwrapped=env_unwrapped,
-                                    buffer=cBuffer, wandb=None, iter_=0, tokenizer=tokenizer, text_model=text_model)
             print("results:", results)
         # cbuffer.save(cfg.dataset.to_name)
@@ -111,7 +145,8 @@ def run_evaluation_loop():
     # 2. Filter for Pending Submissions
     # Assuming columns: [model_id, status, submitted_by]
-    pending_rows = requests_df[requests_df["status"] == "Pending"]
     if len(pending_rows) == 0:
         return "No pending submissions."
@@ -167,18 +202,24 @@ def run_evaluation_loop():
     return f"Processed {model_id}: Score {score}"
-# --- GRADIO UI (To keep the Space running) ---
-with gr.Blocks() as demo:
-    gr.Markdown("# RL Evaluation Backend")
-    gr.Markdown("This space runs in the background to evaluate new submissions.")
-    # A button to manually trigger eval (useful for debugging)
-    eval_btn = gr.Button("Run Evaluator Now")
-    output = gr.Textbox(label="Logs")
-    eval_btn.click(fn=run_evaluation_loop, outputs=output)
-    # Auto-run every 60 seconds (requires Gradio 'live' updates or external scheduler)
-    # In a real deployment, you might use a simplified cron loop or `gradio.Timer`
-demo.queue().launch()

 HF_TOKEN = os.environ.get("HF_TOKEN")
 REQUESTS_DATASET = "gberseth/rl-leaderboard-requests" # REPLACE THIS
 RESULTS_DATASET = "gberseth/rl-leaderboard-results"   # REPLACE THIS
 EVAL_EPISODES = 10        # How many times to run the agent
 # Authenticate
+# login(token=HF_TOKEN)
 api = HfApi()
 def evaluate_policy(model_id):
         # 1. Download the model repository
         # We look for a file named "ppo_cartpole.zip" or just standard "model.zip"
         # Adjust 'allow_patterns' to match what you require users to submit.
+        repo_path = snapshot_download(repo_id=model_id, allow_patterns=["*.pth", "*.pt", "*.zip", "*.yaml", "*.py"])
         # Find the .zip file in the downloaded folder
         model_file = None
         for root, dirs, files in os.walk(repo_path):
             for file in files:
+                if file.endswith(".pth"):
                     model_file = os.path.join(root, file)
+                if file.endswith("model.py"):
+                    grp_file_path = os.path.join(root, file)
+                if file.endswith(".yaml") or file.endswith(".yalm"):
+                    hydra_config_file_path = os.path.join(root, file)
         if not model_file:
+            return None, "Error: No .pth model file found in repo."
         # 2. Load the PPO Agent
         # custom_objects map may be needed if python versions differ, but usually fine for PPO
         import torch
         # ------------
         # Train and test splits
         # Loading data
         # create RLDS dataset builder
+        # log_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
+        ## Load the hydra config
+        from omegaconf import OmegaConf
+        cfg = OmegaConf.load(hydra_config_file_path)
         cfg.dataset.load_dataset = "skip"
+        ## Load the GRP model from the file downloaded in the snapshot
+        # Dynamically load the module
+        import importlib.util, sys
+        # module_name = "GRP"
+        # spec = importlib.util.spec_from_file_location(module_name, grp_file_path)
+        # if spec is None:
+        #     print(f"Could not find a spec for module {module_name} at {grp_file_path}")
+        #     return None
+        # module = importlib.util.module_from_spec(spec)
+        # sys.modules[module_name] = module
+        # try:
+        #     spec.loader.exec_module(module)
+        #     print(f"Successfully loaded module: {module_name}")
+        # except Exception as e:
+        #     print(f"Error executing module {module_name}: {e}")        # module_name = "GRP"
+        # spec = importlib.util.spec_from_file_location(module_name, grp_file_path)
+        # if spec is None:
+        #     print(f"Could not find a spec for module {module_name} at {grp_file_path}")
+        #     return None
+        # module = importlib.util.module_from_spec(spec)
+        # sys.modules[module_name] = module
+        # try:
+        #     spec.loader.exec_module(module)
+        #     print(f"Successfully loaded module: {module_name}")
+        # except Exception as e:
+        #     print(f"Error executing module {module_name}: {e}")
+        sys.path.insert(0, repo_path+"/")
+        from grp_model import GRP
+        model_ = torch.load(model_file)
         # model_._cgf = cfg
         # model = PPO.load(model_file)
+        print("Memory used by the model:", torch.cuda.memory_allocated(cfg.device) / 1e6, "MB") ## This to the database later.
         # 3. Run Evaluation Loop
             text_model = T5ForConditionalGeneration.from_pretrained(cfg.dataset.t5_version)
         if "libero" in cfg.simEval:
+            results = eval_libero(model_.to(cfg.device), device=cfg.device, cfg=cfg,
                             iter_=0, tokenizer=tokenizer, text_model=text_model, wandb=None,
+                            log_dir="./")
         if "simple_env" in cfg.simEval:
             import simpler_env
             task_name = "widowx_carrot_on_plate"  # @param ["google_robot_pick_coke_can", "google_robot_move_near", "google_robot_open_drawer", "google_robot_close_drawer", "widowx_spoon_on_towel", "widowx_carrot_on_plate", "widowx_stack_cube", "widowx_put_eggplant_in_basket"]
                 del env
             env = simpler_env.make(task_name)
             env_unwrapped = env.env.env.env ## Updated gymnasium wrapper adds lots of wrappers.
+            from sim_eval import eval_model_in_sim
+            results = eval_model_in_sim(cfg, model_.to(cfg.device), device=cfg.device, log_dir="./",
                                     env=env, env_unwrapped=env_unwrapped,
+                                    wandb=None, iter_=0, tokenizer=tokenizer, text_model=text_model)
             print("results:", results)
         # cbuffer.save(cfg.dataset.to_name)
     # 2. Filter for Pending Submissions
     # Assuming columns: [model_id, status, submitted_by]
+    # pending_rows = requests_df[requests_df["status"] == "Pending"]
+    pending_rows = requests_df[requests_df["status"].isin(["Pending", "In Progress", "Failed"])]
     if len(pending_rows) == 0:
         return "No pending submissions."
     return f"Processed {model_id}: Score {score}"
+# # --- GRADIO UI (To keep the Space running) ---
+# with gr.Blocks() as demo:
+#     gr.Markdown("# RL Evaluation Backend")
+#     gr.Markdown("This space runs in the background to evaluate new submissions.")
+#     # A button to manually trigger eval (useful for debugging)
+#     eval_btn = gr.Button("Run Evaluator Now")
+#     output = gr.Textbox(label="Logs")
+#     eval_btn.click(fn=run_evaluation_loop, outputs=output)
+#     # Auto-run every 60 seconds (requires Gradio 'live' updates or external scheduler)
+#     # In a real deployment, you might use a simplified cron loop or `gradio.Timer`
+# demo.queue().launch()
+if __name__ == "__main__":
+    # while True:
+    log = run_evaluation_loop()
+    print(log)
+    # time.sleep(60)  # Check every 60 seconds

requests.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ model_id,status,created_at,submitted_by
2	+ gberseth/mini-grp,Done,2026-01-05 23:02:20,

results.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ model_id,mean_reward,status,completed_at
2	+ gberseth/mini-grp,-0.49810627,Success,

sim_eval.py ADDED Viewed

	@@ -0,0 +1,273 @@

+def get_text_tokens(cfg, tokenizer, text_model, goal, model=None):
+    """
+    Get the text tokens/embeddings for the goal.
+    If a `model` with `encode_text_goal` is provided, use it so callers don't need a buffer.
+    """
+    if model is not None:
+        return model.encode_text_goal(goal, tokenizer=tokenizer, text_model=text_model)
+    # fallback to legacy behaviour
+    if cfg.dataset.encode_with_t5:
+        goal_ = np.zeros((cfg.max_block_size, cfg.n_embd), dtype=np.float32)
+        input_ids = tokenizer(goal, return_tensors="pt").input_ids
+        goal_t = text_model.encoder(input_ids).last_hidden_state.detach().cpu().numpy() ## Get the goal embedding
+        goal_[:len(goal_t[0]), :] = goal_t[0][:cfg.max_block_size] ## Overwrite just the zeros up to the size of this vector, smaller vectors will have < max_block_size
+    else:
+        goal_ = " " * cfg.max_block_size
+        goal_ = goal[:cfg.max_block_size] + goal_[len(goal):cfg.max_block_size]
+        # legacy buffer-based encoding is not available here
+        raise RuntimeError("Text encoding without model requires a buffer; pass model into get_text_tokens")
+    return np.expand_dims(goal_, axis=0)
+def get_blocked_mask(cfg, targets=None, T=0):
+    ## Compute blocked masks
+    c=192 ## Number of patches/channels in the image
+    mask = torch.ones((1 + (c * cfg.policy.obs_stacking) + T + c, ), device=cfg.device) ## (1, T)
+    if targets is None:
+        pass
+    elif (torch.rand(1)[0] > 0.66):
+        mask[1 + (c * cfg.policy.obs_stacking): 1 + (c * cfg.policy.obs_stacking) + T] = torch.zeros((1,T), device=cfg.device) ## Mask goal string
+    elif (torch.rand(1)[0] > 0.33):
+        mask[1 + (c * cfg.policy.obs_stacking) + T: 1 + (c * cfg.policy.obs_stacking) + T + c] = torch.zeros((1,c), device=cfg.device) ## Mask goal image
+def eval_model_in_sim(cfg, model, device, log_dir, env, env_unwrapped,
+                      wandb, iter_, tokenizer=None, text_model=None):
+    from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
+    print("Evaluating model in sim environment")
+    from collections import deque
+    from einops import rearrange
+    rewards = []
+    for j in range(cfg.sim.eval_episodes): ## Better to eval over a few different goal configurations
+        obs, reset_info = env.reset()
+        obs_ = get_image_from_maniskill2_obs_dict(env_unwrapped, obs)[:,:,:3]
+        obs_hist = deque(maxlen=cfg.policy.obs_stacking)
+        obs_hist.append(obs_)
+        obs_hist.append(obs_)
+        obs_hist.append(obs_)
+        instruction = env_unwrapped.get_language_instruction()
+        # print("Reset info", reset_info)
+        print("Instruction", instruction)
+        frames = []
+        done, truncated, timeLimit, t = False, False, 100, 0
+        txt_goal = get_text_tokens(cfg, tokenizer, text_model, instruction, model=model)
+        while not (done or truncated or (t > timeLimit)):
+            # action[:3]: delta xyz; action[3:6]: delta rotation in axis-angle representation;
+            # action[6:7]: gripper (the meaning of open / close depends on robot URDF)
+            image = get_image_from_maniskill2_obs_dict(env_unwrapped, obs)
+            image = image[:,:,:3] ## Remove last dimension of image color
+            obs_hist.append(image) ## Add the new observation to the history buffer
+            # obs = [obs_["image"] for obs_ in obs] # obs is a list of dicts
+            image = np.stack(obs_hist, axis=-1)  # stack along the last dimension
+            image = rearrange(image, 'h w c t -> h w (c t)')  # add batch dimension
+            obs_state = model.preprocess_state(image).to(device)
+            goal_state = model.preprocess_goal_image(image[:,:,:3]).to(device)
+            action, loss = model.forward(torch.tensor(obs_state.unsqueeze(0), dtype=torch.float32).to(device)
+                                ,torch.tensor(txt_goal).to(device)
+                                ,torch.tensor(goal_state.unsqueeze(0), dtype=torch.float32).to(device),
+                                mask_=True, ## Masks goal image
+                                pose=torch.tensor([[obs["extra"]["tcp_pose"]]], dtype=torch.float32).to(device),
+                                )
+            action = model.decode_action(action[0]).cpu().detach().numpy() ## Add in the gripper close action
+            obs, reward, done, truncated, info = env.step(action)
+            reward = -np.linalg.norm(info["eof_to_obj1_diff"])
+            frames.append(image)
+            rewards.append(reward)
+            t=t+1
+    episode_stats = info.get('episode_stats', {})
+    episode_stats['rewards'] = np.mean(rewards)
+    # print("Episode stats", episode_stats)
+    # print(f"avg reward {np.mean(episode_stats['rewards']):.8f}")
+    if not cfg.testing:
+        wandb.log({"avg reward": np.mean(rewards)})
+    import moviepy.editor as mpy
+    clip = mpy.ImageSequenceClip(list(frames), fps=20)
+    path_ = log_dir+"/sim-env-"+str(iter_)+".mp4"
+    # clip.write_videofile(path_, fps=20, audio=False, logger=None) ## Getting weird Nonetype issues. Will need to fix version issue later.
+    if not cfg.testing:
+        wandb.log({"example": wandb.Video(path_)})
+    return episode_stats
+import gymnasium as gym
+# --- Dict Observation Key Wrapper ---
+class DictWrapper(gym.ObservationWrapper):
+    # from gymnasium.spaces import Box
+    """
+    A wrapper that grabs the observation from a specific key in the dictionary.
+    """
+    def __init__(self, env, obs_key=""):
+        # gym.Wrapper.__init__(self, env)
+        self.env = env
+        self.observation_space = gym.spaces.Box(
+            low=0,
+            high=255,
+            shape=(128,128,3),  # Assuming the observation is an image of size 128x128 with 3 color channels
+            dtype=np.uint8)
+        self._obs_key = obs_key
+    def observation(self, observation):
+        """
+        This method is called by the gym.ObservationWrapper after the environment's
+        step or reset methods return an observation.
+        """
+        # Add the new observation to the history buffer
+        return observation[self._obs_key]
+    def step(self, action):
+        """
+        Step the environment and return the observation from the specified key.
+        """
+        obs, reward, done, info = self.env.step(action) ## LIBERO does not return truncated
+        return obs[self._obs_key][::-1, :, :], reward, done, False, obs ## Not sure why the image was upside down.
+    def reset(self, **kwargs):
+        """
+        Reset the environment and return the observation from the specified key.
+        """
+        obs = self.env.reset()
+        return obs[self._obs_key][::-1, :, :], obs
+def eval_libero(model, device, cfg, iter_=0, log_dir="./",
+                tokenizer=None, text_model=None, wandb=None):
+        # cfg, model, device, log_dir, env, env_unwrapped, buffer,
+        #               wandb, iter_, tokenizer=None, text_model=None):
+    from libero.libero import benchmark
+    from libero.libero.envs import OffScreenRenderEnv, DenseRewardEnv
+    import os
+    from libero.libero.utils import get_libero_path
+    from gymnasium.wrappers import FrameStackObservation
+    from einops import rearrange
+    benchmark_dict = benchmark.get_benchmark_dict()
+    task_suite_name = "libero_90" # can also choose libero_spatial, libero_object, etc.
+    task_suite = benchmark_dict[task_suite_name]()
+    # retrieve a specific task
+    tasks = cfg.sim.eval_tasks
+    for task_id in tasks:
+        task = task_suite.get_task(task_id)
+        task_name = task.name
+        instruction = task.language
+        task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
+        print(f"[info] retrieving task {task_id} from suite {task_suite_name}, the " + \
+            f"language instruction is {instruction}, and the bddl file is {task_bddl_file}")
+        # step over the environment
+        env_args = {
+            "bddl_file_name": task_bddl_file,
+            "camera_heights": 128,
+            "camera_widths": 128
+        }
+        env = DenseRewardEnv(**env_args)
+        env.seed(0)
+        init_states = task_suite.get_task_init_states(task_id) # for benchmarking purpose, we fix the a set of initial states
+        init_state_id = 0
+        env.set_init_state(init_states[init_state_id])
+        env = FrameStackObservation(DictWrapper(env, obs_key="agentview_image"), cfg.policy.obs_stacking) ## Stacking the observations
+        obs, info = env.reset()
+        mask = get_blocked_mask(cfg, targets=None, T=0) ## Get the blocked mask
+        txt_goal = get_text_tokens(cfg, tokenizer, text_model, instruction, model=model)
+        image_goal = obs.reshape((128, 128, 3*cfg.policy.obs_stacking))[:,:,:3] ## Assuming the observation is an image of size 128x128 with 3 color channels
+        frames = []
+        rewards = []
+        infos = []
+        for step_ in range(250):
+            ## Reshape the image to the correct size and stack the hostory on the last channel dimension
+            image = obs[0]
+            # obs = obs.reshape((128, 128, 3*cfg.policy.obs_stacking)) ## Assuming the observation is an image of size 128x128 with 3 color channels
+            obs = rearrange(obs, 't h w c -> h w (t c)', c=3, t=cfg.policy.obs_stacking) ## Rearranging the image to have the stacked history in the last channel dimension
+            # image = obs[:,:,:3] ## Remove the last dimension of the image color
+            obs_state = model.preprocess_state(obs)
+            goal_state = model.preprocess_goal_image(image_goal)
+            action, loss = model.forward(torch.tensor(np.array([obs_state])).to(device)
+                        ,torch.tensor(txt_goal).to(device)
+                        ,torch.tensor(np.array([goal_state])).to(device), ## Not the correct goal image... Should mask this.
+                        mask_=True,
+                        pose=torch.tensor([[np.concatenate( (info["robot0_eef_pos"],
+                                                           info["robot0_eef_quat"][:3],
+                                                            [(info["robot0_gripper_qpos"][0] - info["robot0_gripper_qpos"][0]) < 0.005 ]), axis=-1)]], dtype=torch.float32).to(device),
+                        morphology=torch.tensor([0], dtype=torch.uint8).to(device) ## Morphology is 0 for arm, 1 for A1}
+                        )
+            action = model.decode_action(action[0,0,:7]).cpu().detach().numpy() ## Add in the gripper close action
+            frames.append(image)
+            x = env.step(action)
+            obs, reward, done, truncated, info = x
+            rewards.append(reward)
+            infos.append(info)
+            if done:
+                print("Episode finished after {} timesteps".format(step_))
+                break
+        print(f"avg reward {np.mean(rewards):.8f}")
+        if not cfg.testing:
+            wandb.log({"avg reward_"+str(task_id): np.mean(rewards)})
+        import moviepy.editor as mpy
+        clip = mpy.ImageSequenceClip(list(frames), fps=20)
+        path_ = log_dir+"/sim-libero-90-"+str(task_id)+"-"+str(iter_)+".mp4"
+        clip.write_videofile(path_, fps=20)
+        if not cfg.testing:
+            wandb.log({"example": wandb.Video(path_)})
+        env.close()
+import hydra
+from omegaconf import DictConfig
+from mini_grp import *
+@hydra.main(config_path="./conf", config_name="libero-simpleEnv-64pix-pose")
+def my_main(cfg: DictConfig):
+    from mini_shuffel_buffer import CircularBuffer
+    import torch
+    # ------------
+    # Train and test splits
+    # Loading data
+    # create RLDS dataset builder
+    log_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
+    cfg.dataset.load_dataset = "skip"
+    # model = GRP(cfg)
+    # model_ = torch.load("/home/gberseth/playground/mini_grp/miniGRP.pth")
+    model_dir = hydra.utils.get_original_cwd()+"/mini-grp/miniGRP.pth"
+    print ("Loading model from:", model_dir)
+    model_ = torch.load(model_dir)
+    # model_._cgf = cfg
+    tokenizer = None
+    text_model = None
+    if cfg.dataset.encode_with_t5: ## Load T5 model
+        from transformers import T5Tokenizer, T5ForConditionalGeneration
+        tokenizer = T5Tokenizer.from_pretrained(cfg.dataset.t5_version)
+        text_model = T5ForConditionalGeneration.from_pretrained(cfg.dataset.t5_version)
+    if "libero" in cfg.simEval:
+        results = eval_libero(model_.to(cfg.device), device=cfg.device, cfg=cfg,
+                          iter_=0, tokenizer=tokenizer, text_model=text_model, wandb=None,
+                          log_dir=log_dir)
+    if "simple_env" in cfg.simEval:
+        import simpler_env
+        task_name = "widowx_carrot_on_plate"  # @param ["google_robot_pick_coke_can", "google_robot_move_near", "google_robot_open_drawer", "google_robot_close_drawer", "widowx_spoon_on_towel", "widowx_carrot_on_plate", "widowx_stack_cube", "widowx_put_eggplant_in_basket"]
+        if 'env' in locals():
+            print("Closing existing env")
+            env.close()
+            del env
+        env = simpler_env.make(task_name)
+        env_unwrapped = env.env.env.env ## Updated gymnasium wrapper adds lots of wrappers.
+        results = eval_model_in_sim(cfg, model_.to(cfg.device), device=cfg.device, log_dir=log_dir,
+                                env=env, env_unwrapped=env_unwrapped,
+                                wandb=None, iter_=0, tokenizer=tokenizer, text_model=text_model)
+        print("results:", results)
+    # cbuffer.save(cfg.dataset.to_name)
+if __name__ == "__main__":
+    ## Script entry point: Hydra builds cfg from ./conf and calls my_main.
+    ## NOTE(review): the @hydra.main wrapper may not pass my_main's return value through,
+    ## in which case this prints None -- confirm against the Hydra version in use.
+    results = my_main()
+    print("results:", results)