Anthony Liang committed on
Commit
cc5bab9
·
1 Parent(s): 3e462dd
Files changed (4) hide show
  1. app.py +70 -52
  2. eval_utils.py +110 -6
  3. eval_viz_utils.py +1 -1
  4. samplers/eval/confusion_matrix.py +27 -27
app.py CHANGED
@@ -75,15 +75,17 @@ _server_state = {
75
  }
76
 
77
 
78
- def discover_available_models(base_url: str = "http://40.119.56.66", port_range: tuple = (8000, 8010)) -> List[Tuple[str, str]]:
 
 
79
  """Discover available models by pinging ports in the specified range.
80
-
81
  Returns:
82
  List of tuples: [(server_url, model_name), ...]
83
  """
84
  available_models = []
85
  start_port, end_port = port_range
86
-
87
  for port in range(start_port, end_port + 1):
88
  server_url = f"{base_url.rstrip('/')}:{port}"
89
  try:
@@ -108,7 +110,7 @@ def discover_available_models(base_url: str = "http://40.119.56.66", port_range:
108
  except requests.exceptions.RequestException:
109
  # Port not available, continue
110
  continue
111
-
112
  return available_models
113
 
114
 
@@ -116,7 +118,7 @@ def get_model_info_for_url(server_url: str) -> Optional[str]:
116
  """Get formatted model info for a given server URL."""
117
  if not server_url:
118
  return None
119
-
120
  try:
121
  model_info_url = server_url.rstrip("/") + "/model_info"
122
  model_info_response = requests.get(model_info_url, timeout=5.0)
@@ -325,7 +327,7 @@ def process_single_video(
325
  # Get server URL from state if not provided
326
  if not server_url:
327
  server_url = _server_state.get("server_url")
328
-
329
  if not server_url:
330
  return None, "Please select a model from the dropdown above and ensure it's connected."
331
 
@@ -435,7 +437,7 @@ def process_two_videos(
435
  # Get server URL from state if not provided
436
  if not server_url:
437
  server_url = _server_state.get("server_url")
438
-
439
  if not server_url:
440
  return "Please select a model from the dropdown above and ensure it's connected.", None, None
441
 
@@ -560,7 +562,7 @@ def process_two_videos(
560
  # - Video A as reference trajectory
561
  # - Video B as similar trajectory
562
  # diff_trajectory is None in inference mode (only need similarity between ref and sim)
563
-
564
  # Create SimilaritySample with Video A as ref and Video B as sim
565
  similarity_sample = SimilaritySample(
566
  ref_trajectory=trajectory_a,
@@ -601,8 +603,6 @@ def process_two_videos(
601
  return f"Error processing videos: {str(e)}", None, None
602
 
603
 
604
-
605
-
606
  # Create Gradio interface
607
  try:
608
  # Try with theme (Gradio 4.0+)
@@ -633,10 +633,10 @@ with demo:
633
  None,
634
  {}, # Empty mapping
635
  )
636
-
637
  _server_state["base_url"] = base_url
638
  models = discover_available_models(base_url, port_range=(8000, 8010))
639
-
640
  if not models:
641
  return (
642
  gr.update(choices=[], value=None),
@@ -645,7 +645,7 @@ with demo:
645
  None,
646
  {}, # Empty mapping
647
  )
648
-
649
  # Format choices: show model_name in dropdown
650
  # Store mapping of model_name to URL in state
651
  choices = []
@@ -653,17 +653,17 @@ with demo:
653
  for url, name in models:
654
  choices.append(name)
655
  url_map[name] = url
656
-
657
  # Auto-select first model
658
  selected_choice = choices[0] if choices else None
659
  selected_url = url_map.get(selected_choice) if selected_choice else None
660
-
661
  # Get model info for selected model
662
  model_info_text = get_model_info_for_url(selected_url) if selected_url else ""
663
  status_text = f"✅ Found {len(models)} model(s). Auto-selected first model."
664
-
665
  _server_state["server_url"] = selected_url
666
-
667
  return (
668
  gr.update(choices=choices, value=selected_choice),
669
  gr.update(value=status_text, visible=True),
@@ -680,23 +680,25 @@ with demo:
680
  gr.update(value="", visible=True),
681
  None,
682
  )
683
-
684
  # Get URL from mapping
685
  server_url = url_mapping.get(model_choice) if url_mapping else None
686
-
687
  if not server_url:
688
  return (
689
- gr.update(value="Could not find server URL for selected model. Please rediscover models.", visible=True),
 
 
690
  gr.update(value="", visible=True),
691
  None,
692
  )
693
-
694
  # Get model info
695
  model_info_text = get_model_info_for_url(server_url) or ""
696
  status, health_data, _ = check_server_health(server_url)
697
-
698
  _server_state["server_url"] = server_url
699
-
700
  return (
701
  gr.update(value=status, visible=True),
702
  gr.update(value=model_info_text, visible=True),
@@ -706,16 +708,16 @@ with demo:
706
  # Use Gradio's built-in Sidebar component (collapsible by default)
707
  with gr.Sidebar():
708
  gr.Markdown("### 🔧 Model Configuration")
709
-
710
  base_url_input = gr.Textbox(
711
  label="Base Server URL",
712
  placeholder="http://40.119.56.66",
713
  value="http://40.119.56.66",
714
  interactive=True,
715
  )
716
-
717
  discover_btn = gr.Button("🔍 Discover Models", variant="primary", size="lg")
718
-
719
  model_dropdown = gr.Dropdown(
720
  label="Select Model",
721
  choices=[],
@@ -723,11 +725,9 @@ with demo:
723
  interactive=True,
724
  info="Models will be discovered on ports 8000-8010",
725
  )
726
-
727
- server_status = gr.Markdown(
728
- "Click 'Discover Models' to find available models"
729
- )
730
-
731
  gr.Markdown("---")
732
  gr.Markdown("### 📋 Model Information")
733
  model_info_display = gr.Markdown("")
@@ -848,7 +848,9 @@ with demo:
848
  gr.update(visible=False),
849
  )
850
 
851
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
 
 
852
  if video_path:
853
  # Build metadata text
854
  metadata_lines = []
@@ -937,7 +939,9 @@ with demo:
937
  if dataset is None:
938
  return gr.update(visible=False), gr.update(visible=False)
939
 
940
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
 
 
941
  if video_path:
942
  # Build metadata text
943
  metadata_lines = []
@@ -1009,7 +1013,13 @@ with demo:
1009
 
1010
  analyze_single_btn.click(
1011
  fn=process_single_video,
1012
- inputs=[single_video_input, task_text_input, server_url_state, fps_input_single, use_frame_steps_single],
 
 
 
 
 
 
1013
  outputs=[progress_plot, info_output],
1014
  api_name="process_single_video",
1015
  )
@@ -1103,7 +1113,7 @@ with demo:
1103
  with gr.Row():
1104
  video_a_display = gr.Video(label="Video A", height=400)
1105
  video_b_display = gr.Video(label="Video B", height=400)
1106
-
1107
  # Result text at the bottom
1108
  result_text = gr.Markdown("")
1109
 
@@ -1161,7 +1171,9 @@ with demo:
1161
  gr.update(visible=False),
1162
  )
1163
 
1164
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
 
 
1165
  if video_path:
1166
  # Build metadata text
1167
  metadata_lines = []
@@ -1246,7 +1258,9 @@ with demo:
1246
  if dataset is None:
1247
  return gr.update(visible=False), gr.update(visible=False)
1248
 
1249
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
 
 
1250
  if video_path:
1251
  # Build metadata text
1252
  metadata_lines = []
@@ -1302,7 +1316,9 @@ with demo:
1302
  gr.update(visible=False),
1303
  )
1304
 
1305
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
 
 
1306
  if video_path:
1307
  # Build metadata text
1308
  metadata_lines = []
@@ -1387,7 +1403,9 @@ with demo:
1387
  if dataset is None:
1388
  return gr.update(visible=False), gr.update(visible=False)
1389
 
1390
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
 
 
1391
  if video_path:
1392
  # Build metadata text
1393
  metadata_lines = []
@@ -1405,13 +1423,9 @@ with demo:
1405
  return gr.update(visible=False), gr.update(visible=False)
1406
 
1407
  # Video A dataset selection handlers
1408
- dataset_name_a.change(
1409
- fn=update_config_choices_a, inputs=[dataset_name_a], outputs=[config_name_a]
1410
- )
1411
 
1412
- refresh_configs_btn_a.click(
1413
- fn=update_config_choices_a, inputs=[dataset_name_a], outputs=[config_name_a]
1414
- )
1415
 
1416
  load_dataset_btn_a.click(
1417
  fn=load_dataset_a,
@@ -1454,13 +1468,9 @@ with demo:
1454
  )
1455
 
1456
  # Video B dataset selection handlers
1457
- dataset_name_b.change(
1458
- fn=update_config_choices_b, inputs=[dataset_name_b], outputs=[config_name_b]
1459
- )
1460
 
1461
- refresh_configs_btn_b.click(
1462
- fn=update_config_choices_b, inputs=[dataset_name_b], outputs=[config_name_b]
1463
- )
1464
 
1465
  load_dataset_btn_b.click(
1466
  fn=load_dataset_b,
@@ -1504,7 +1514,15 @@ with demo:
1504
 
1505
  analyze_dual_btn.click(
1506
  fn=process_two_videos,
1507
- inputs=[video_a_input, video_b_input, task_text_dual, prediction_type, server_url_state, fps_input_dual, use_frame_steps_dual],
 
 
 
 
 
 
 
 
1508
  outputs=[result_text, video_a_display, video_b_display],
1509
  api_name="process_two_videos",
1510
  )
 
75
  }
76
 
77
 
78
+ def discover_available_models(
79
+ base_url: str = "http://40.119.56.66", port_range: tuple = (8000, 8010)
80
+ ) -> List[Tuple[str, str]]:
81
  """Discover available models by pinging ports in the specified range.
82
+
83
  Returns:
84
  List of tuples: [(server_url, model_name), ...]
85
  """
86
  available_models = []
87
  start_port, end_port = port_range
88
+
89
  for port in range(start_port, end_port + 1):
90
  server_url = f"{base_url.rstrip('/')}:{port}"
91
  try:
 
110
  except requests.exceptions.RequestException:
111
  # Port not available, continue
112
  continue
113
+
114
  return available_models
115
 
116
 
 
118
  """Get formatted model info for a given server URL."""
119
  if not server_url:
120
  return None
121
+
122
  try:
123
  model_info_url = server_url.rstrip("/") + "/model_info"
124
  model_info_response = requests.get(model_info_url, timeout=5.0)
 
327
  # Get server URL from state if not provided
328
  if not server_url:
329
  server_url = _server_state.get("server_url")
330
+
331
  if not server_url:
332
  return None, "Please select a model from the dropdown above and ensure it's connected."
333
 
 
437
  # Get server URL from state if not provided
438
  if not server_url:
439
  server_url = _server_state.get("server_url")
440
+
441
  if not server_url:
442
  return "Please select a model from the dropdown above and ensure it's connected.", None, None
443
 
 
562
  # - Video A as reference trajectory
563
  # - Video B as similar trajectory
564
  # diff_trajectory is None in inference mode (only need similarity between ref and sim)
565
+
566
  # Create SimilaritySample with Video A as ref and Video B as sim
567
  similarity_sample = SimilaritySample(
568
  ref_trajectory=trajectory_a,
 
603
  return f"Error processing videos: {str(e)}", None, None
604
 
605
 
 
 
606
  # Create Gradio interface
607
  try:
608
  # Try with theme (Gradio 4.0+)
 
633
  None,
634
  {}, # Empty mapping
635
  )
636
+
637
  _server_state["base_url"] = base_url
638
  models = discover_available_models(base_url, port_range=(8000, 8010))
639
+
640
  if not models:
641
  return (
642
  gr.update(choices=[], value=None),
 
645
  None,
646
  {}, # Empty mapping
647
  )
648
+
649
  # Format choices: show model_name in dropdown
650
  # Store mapping of model_name to URL in state
651
  choices = []
 
653
  for url, name in models:
654
  choices.append(name)
655
  url_map[name] = url
656
+
657
  # Auto-select first model
658
  selected_choice = choices[0] if choices else None
659
  selected_url = url_map.get(selected_choice) if selected_choice else None
660
+
661
  # Get model info for selected model
662
  model_info_text = get_model_info_for_url(selected_url) if selected_url else ""
663
  status_text = f"✅ Found {len(models)} model(s). Auto-selected first model."
664
+
665
  _server_state["server_url"] = selected_url
666
+
667
  return (
668
  gr.update(choices=choices, value=selected_choice),
669
  gr.update(value=status_text, visible=True),
 
680
  gr.update(value="", visible=True),
681
  None,
682
  )
683
+
684
  # Get URL from mapping
685
  server_url = url_mapping.get(model_choice) if url_mapping else None
686
+
687
  if not server_url:
688
  return (
689
+ gr.update(
690
+ value="Could not find server URL for selected model. Please rediscover models.", visible=True
691
+ ),
692
  gr.update(value="", visible=True),
693
  None,
694
  )
695
+
696
  # Get model info
697
  model_info_text = get_model_info_for_url(server_url) or ""
698
  status, health_data, _ = check_server_health(server_url)
699
+
700
  _server_state["server_url"] = server_url
701
+
702
  return (
703
  gr.update(value=status, visible=True),
704
  gr.update(value=model_info_text, visible=True),
 
708
  # Use Gradio's built-in Sidebar component (collapsible by default)
709
  with gr.Sidebar():
710
  gr.Markdown("### 🔧 Model Configuration")
711
+
712
  base_url_input = gr.Textbox(
713
  label="Base Server URL",
714
  placeholder="http://40.119.56.66",
715
  value="http://40.119.56.66",
716
  interactive=True,
717
  )
718
+
719
  discover_btn = gr.Button("🔍 Discover Models", variant="primary", size="lg")
720
+
721
  model_dropdown = gr.Dropdown(
722
  label="Select Model",
723
  choices=[],
 
725
  interactive=True,
726
  info="Models will be discovered on ports 8000-8010",
727
  )
728
+
729
+ server_status = gr.Markdown("Click 'Discover Models' to find available models")
730
+
 
 
731
  gr.Markdown("---")
732
  gr.Markdown("### 📋 Model Information")
733
  model_info_display = gr.Markdown("")
 
848
  gr.update(visible=False),
849
  )
850
 
851
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
852
+ dataset, index, dataset_name
853
+ )
854
  if video_path:
855
  # Build metadata text
856
  metadata_lines = []
 
939
  if dataset is None:
940
  return gr.update(visible=False), gr.update(visible=False)
941
 
942
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
943
+ dataset, index, dataset_name
944
+ )
945
  if video_path:
946
  # Build metadata text
947
  metadata_lines = []
 
1013
 
1014
  analyze_single_btn.click(
1015
  fn=process_single_video,
1016
+ inputs=[
1017
+ single_video_input,
1018
+ task_text_input,
1019
+ server_url_state,
1020
+ fps_input_single,
1021
+ use_frame_steps_single,
1022
+ ],
1023
  outputs=[progress_plot, info_output],
1024
  api_name="process_single_video",
1025
  )
 
1113
  with gr.Row():
1114
  video_a_display = gr.Video(label="Video A", height=400)
1115
  video_b_display = gr.Video(label="Video B", height=400)
1116
+
1117
  # Result text at the bottom
1118
  result_text = gr.Markdown("")
1119
 
 
1171
  gr.update(visible=False),
1172
  )
1173
 
1174
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
1175
+ dataset, index, dataset_name
1176
+ )
1177
  if video_path:
1178
  # Build metadata text
1179
  metadata_lines = []
 
1258
  if dataset is None:
1259
  return gr.update(visible=False), gr.update(visible=False)
1260
 
1261
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
1262
+ dataset, index, dataset_name
1263
+ )
1264
  if video_path:
1265
  # Build metadata text
1266
  metadata_lines = []
 
1316
  gr.update(visible=False),
1317
  )
1318
 
1319
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
1320
+ dataset, index, dataset_name
1321
+ )
1322
  if video_path:
1323
  # Build metadata text
1324
  metadata_lines = []
 
1403
  if dataset is None:
1404
  return gr.update(visible=False), gr.update(visible=False)
1405
 
1406
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
1407
+ dataset, index, dataset_name
1408
+ )
1409
  if video_path:
1410
  # Build metadata text
1411
  metadata_lines = []
 
1423
  return gr.update(visible=False), gr.update(visible=False)
1424
 
1425
  # Video A dataset selection handlers
1426
+ dataset_name_a.change(fn=update_config_choices_a, inputs=[dataset_name_a], outputs=[config_name_a])
 
 
1427
 
1428
+ refresh_configs_btn_a.click(fn=update_config_choices_a, inputs=[dataset_name_a], outputs=[config_name_a])
 
 
1429
 
1430
  load_dataset_btn_a.click(
1431
  fn=load_dataset_a,
 
1468
  )
1469
 
1470
  # Video B dataset selection handlers
1471
+ dataset_name_b.change(fn=update_config_choices_b, inputs=[dataset_name_b], outputs=[config_name_b])
 
 
1472
 
1473
+ refresh_configs_btn_b.click(fn=update_config_choices_b, inputs=[dataset_name_b], outputs=[config_name_b])
 
 
1474
 
1475
  load_dataset_btn_b.click(
1476
  fn=load_dataset_b,
 
1514
 
1515
  analyze_dual_btn.click(
1516
  fn=process_two_videos,
1517
+ inputs=[
1518
+ video_a_input,
1519
+ video_b_input,
1520
+ task_text_dual,
1521
+ prediction_type,
1522
+ server_url_state,
1523
+ fps_input_dual,
1524
+ use_frame_steps_dual,
1525
+ ],
1526
  outputs=[result_text, video_a_display, video_b_display],
1527
  api_name="process_two_videos",
1528
  )
eval_utils.py CHANGED
@@ -15,8 +15,112 @@ import numpy as np
15
  import requests
16
  import torch
17
 
18
- from rfm.data.dataset_types import PreferenceSample, SimilaritySample, ProgressSample, Trajectory
19
- from rfm.data.datasets.helpers import linspace_subsample_frames, pad_trajectory_to_max_frames_np
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  def extract_answer_from_text(text: str) -> str:
@@ -219,10 +323,10 @@ async def post_batch_npy_async(
219
 
220
  async def parse_npy_form_data(form_data: Any) -> Tuple[Dict[str, np.ndarray], Dict[str, Any]]:
221
  """Parse multipart form data to extract numpy arrays and other data.
222
-
223
  Args:
224
  form_data: FastAPI form data from request.form()
225
-
226
  Returns:
227
  Tuple of (numpy_arrays dict, other_data dict)
228
  """
@@ -271,7 +375,7 @@ def reconstruct_payload_from_npy(
271
  other_data: Dictionary of other form data
272
  trajectory_keys: List of trajectory keys to process (default: common keys)
273
  convert_embeddings_to_torch: Whether to convert embeddings to torch tensors
274
-
275
  Returns:
276
  List of reconstructed sample dictionaries
277
  """
@@ -284,7 +388,7 @@ def reconstruct_payload_from_npy(
284
  "traj_diff_trajectory",
285
  "trajectory",
286
  ]
287
-
288
  samples = []
289
 
290
  # Process each sample
 
15
  import requests
16
  import torch
17
 
18
+ from dataset_types import PreferenceSample, SimilaritySample, ProgressSample, Trajectory
19
+
20
+
21
def pad_trajectory_to_max_frames_np(
    frames: np.ndarray, progress: List[float], max_frames: int, pad_from: str = "right"
) -> Tuple[np.ndarray, List[float]]:
    """Pad trajectory frames and progress to max_frames by repeating an edge frame.

    Padding repeats the LAST frame/progress value when ``pad_from="right"``
    (the default) or the FIRST frame/progress value when ``pad_from="left"``.

    Args:
        frames: Trajectory frames, shape (N, ...).
        progress: Progress values, one per frame.
        max_frames: Target number of frames.
        pad_from: "right" to append copies of the last frame, "left" to
            prepend copies of the first frame.

    Returns:
        Tuple[np.ndarray, List[float]]: (padded_frames, padded_progress).
        The inputs are returned unchanged when no padding is needed.
    """
    current_frames = frames.shape[0]

    if current_frames >= max_frames:
        # Already long enough; nothing to do.
        return frames, progress

    if pad_from == "left":
        pad_frame = frames[0:1]  # slice (not index) to keep the leading axis
        pad_progress = progress[0]
    else:
        pad_frame = frames[-1:]
        pad_progress = progress[-1]

    # How many frames we still need to reach max_frames
    frames_to_pad = max_frames - current_frames

    # Repeat the chosen edge frame and attach it on the requested side
    if pad_from == "left":
        padded_frames = np.concatenate([np.repeat(pad_frame, frames_to_pad, axis=0), frames], axis=0)
        padded_progress = [pad_progress] * frames_to_pad + progress
    else:
        padded_frames = np.concatenate([frames, np.repeat(pad_frame, frames_to_pad, axis=0)], axis=0)
        padded_progress = progress + [pad_progress] * frames_to_pad

    return padded_frames, padded_progress
59
+
60
+
61
def linspace_subsample_frames(
    frames: np.ndarray, num_frames: int = 8, end_idx: Optional[int] = None
) -> Tuple[np.ndarray, List[int]]:
    """Uniformly subsample frames from a trajectory and return the indices.

    Takes the full trajectory (e.g., 64 frames) and uniformly subsamples
    num_frames from it. The first and last frames are always included,
    except when ``num_frames == 1``, in which case only the last frame is
    taken.

    Args:
        frames: Full trajectory frames (N frames). Anything indexable by a
            list of indices; numpy arrays use ``shape[0]``, otherwise ``len``.
        num_frames: Number of frames to subsample (default: 8).
        end_idx: Optional end index to subsample up to (if None, uses
            total_frames - 1).

    Returns:
        Tuple[np.ndarray, List[int]]: (subsampled_frames, subsampled_indices).
    """
    if hasattr(frames, "shape"):
        total_frames = frames.shape[0]
    else:
        total_frames = len(frames)

    if total_frames <= 0:
        # Empty trajectory: nothing to subsample.
        return frames, []

    # Use end_idx if provided, otherwise use the full trajectory
    if end_idx is not None:
        end_idx = min(end_idx, total_frames - 1)
        frames_to_subsample = frames[: end_idx + 1]
        effective_total = end_idx + 1
    else:
        frames_to_subsample = frames
        effective_total = total_frames

    if effective_total <= num_frames:
        # Fewer (or equal) frames than requested: return all of them.
        indices = list(range(effective_total))
        return frames_to_subsample, indices

    # Special case: a single requested frame always takes the last frame.
    if num_frames == 1:
        indices = [effective_total - 1]
        return frames_to_subsample[indices], indices

    # Evenly spaced indices from 0 to effective_total-1, inclusive
    indices = np.rint(np.linspace(0, effective_total - 1, num_frames)).astype(int).tolist()

    # Enforce first and last explicitly
    indices[0] = 0
    indices[-1] = effective_total - 1

    # Rounding can in principle produce out-of-order or out-of-range values;
    # clamp to keep indices non-decreasing and within bounds.
    for k in range(1, len(indices)):
        if indices[k] < indices[k - 1]:
            indices[k] = indices[k - 1]
        if indices[k] >= effective_total:
            indices[k] = effective_total - 1

    return frames_to_subsample[indices], indices
124
 
125
 
126
  def extract_answer_from_text(text: str) -> str:
 
323
 
324
  async def parse_npy_form_data(form_data: Any) -> Tuple[Dict[str, np.ndarray], Dict[str, Any]]:
325
  """Parse multipart form data to extract numpy arrays and other data.
326
+
327
  Args:
328
  form_data: FastAPI form data from request.form()
329
+
330
  Returns:
331
  Tuple of (numpy_arrays dict, other_data dict)
332
  """
 
375
  other_data: Dictionary of other form data
376
  trajectory_keys: List of trajectory keys to process (default: common keys)
377
  convert_embeddings_to_torch: Whether to convert embeddings to torch tensors
378
+
379
  Returns:
380
  List of reconstructed sample dictionaries
381
  """
 
388
  "traj_diff_trajectory",
389
  "trajectory",
390
  ]
391
+
392
  samples = []
393
 
394
  # Process each sample
eval_viz_utils.py CHANGED
@@ -180,7 +180,7 @@ def extract_frames(video_path: str, fps: float = 1.0, max_frames: int = 64) -> n
180
 
181
  # Clamp to [1, total_frames]
182
  desired_frames = max(1, min(desired_frames, total_frames))
183
-
184
  # IMPORTANT: Cap at max_frames to prevent memory issues
185
  # This is critical when fps is high or videos are long
186
  if desired_frames > max_frames:
 
180
 
181
  # Clamp to [1, total_frames]
182
  desired_frames = max(1, min(desired_frames, total_frames))
183
+
184
  # IMPORTANT: Cap at max_frames to prevent memory issues
185
  # This is critical when fps is high or videos are long
186
  if desired_frames > max_frames:
samplers/eval/confusion_matrix.py CHANGED
@@ -60,7 +60,7 @@ class ConfusionMatrixSampler(RFMBaseSampler):
60
 
61
  def _generate_all_sample_indices(self) -> list[dict]:
62
  """Generate all possible task-trajectory pair sample indices.
63
-
64
  If multiple data sources exist, samples N random trajectories from each data source.
65
  Prioritizes different video tasks first, then prioritizes different language instructions
66
  when creating pairs.
@@ -73,7 +73,7 @@ class ConfusionMatrixSampler(RFMBaseSampler):
73
 
74
  # Sample trajectories per data source (prioritizing different video tasks)
75
  sampled_trajectories, stats = self._sample_trajectories_by_data_source()
76
-
77
  rank_0_print(
78
  f"Processing {len(sampled_trajectories)} trajectories for confusion matrix analysis",
79
  verbose=self.verbose,
@@ -88,7 +88,7 @@ class ConfusionMatrixSampler(RFMBaseSampler):
88
 
89
  # Create task-trajectory pairs with prioritized language instruction pairing
90
  video_task_count = Counter()
91
-
92
  for traj_idx in sampled_trajectories:
93
  traj = self.dataset[traj_idx]
94
  video_task = traj["task"]
@@ -98,7 +98,7 @@ class ConfusionMatrixSampler(RFMBaseSampler):
98
  # continue
99
 
100
  video_task_count[video_task] += 1
101
-
102
  # Pair this trajectory with all language tasks (shuffled for variety)
103
  traj_id = traj.get("id", str(traj_idx))
104
  for lang_task in shuffled_lang_tasks:
@@ -117,15 +117,15 @@ class ConfusionMatrixSampler(RFMBaseSampler):
117
  rank_0_print(f"Generated {len(sample_indices)} task-trajectory pairs", verbose=self.verbose)
118
  rank_0_print(f" Video tasks sampled: {dict(video_task_count)}", verbose=self.verbose)
119
  rank_0_print(f" Trajectories per video task: {dict(sorted(video_task_count.items()))}", verbose=self.verbose)
120
-
121
  return sample_indices
122
 
123
  def _sample_trajectories_by_data_source(self) -> Tuple[list[int], dict]:
124
  """Sample N random trajectories from each data source, prioritizing different video tasks.
125
-
126
  When sampling N trajectories, first selects one trajectory from each unique video task,
127
  then repeats in round-robin fashion until N trajectories are sampled.
128
-
129
  Returns:
130
  Tuple of (list of sampled trajectory indices, stats dictionary)
131
  """
@@ -135,7 +135,7 @@ class ConfusionMatrixSampler(RFMBaseSampler):
135
  "by_task": Counter(),
136
  "traj_to_task": {},
137
  }
138
-
139
  # Group robot trajectories by data source, then by video task
140
  trajectories_by_source_and_task = defaultdict(lambda: defaultdict(list))
141
  for traj_idx in self.robot_trajectories:
@@ -143,7 +143,7 @@ class ConfusionMatrixSampler(RFMBaseSampler):
143
  data_source = traj.get("data_source", "unknown")
144
  video_task = traj.get("task", "unknown")
145
  trajectories_by_source_and_task[data_source][video_task].append(traj_idx)
146
-
147
  rank_0_print(
148
  f"Found {len(trajectories_by_source_and_task)} data sources: {list(trajectories_by_source_and_task.keys())}",
149
  verbose=self.verbose,
@@ -154,17 +154,17 @@ class ConfusionMatrixSampler(RFMBaseSampler):
154
  # Shuffle trajectories within each task for randomization
155
  for task in tasks_to_indices:
156
  self._local_random.shuffle(tasks_to_indices[task])
157
-
158
  # Get all unique tasks for this data source
159
  all_tasks = list(tasks_to_indices.keys())
160
  self._local_random.shuffle(all_tasks) # Randomize task order too
161
-
162
  source_stats = {
163
  "total_available": sum(len(indices) for indices in tasks_to_indices.values()),
164
  "tasks_available": {task: len(indices) for task, indices in tasks_to_indices.items()},
165
  "tasks_sampled": Counter(),
166
  }
167
-
168
  if self.n_trajectories_per_source is None:
169
  # Use all available trajectories
170
  sampled_from_source = []
@@ -172,7 +172,7 @@ class ConfusionMatrixSampler(RFMBaseSampler):
172
  sampled_from_source.extend(indices)
173
  source_stats["tasks_sampled"][task] = len(indices)
174
  stats["by_task"][task] += len(indices)
175
-
176
  rank_0_print(
177
  f" Data source '{data_source}': Using all {len(sampled_from_source)} trajectories",
178
  verbose=self.verbose,
@@ -181,18 +181,18 @@ class ConfusionMatrixSampler(RFMBaseSampler):
181
  # Sample N trajectories using round-robin to prioritize different tasks
182
  n_to_sample = min(self.n_trajectories_per_source, source_stats["total_available"])
183
  sampled_from_source = []
184
-
185
  # Round-robin sampling: first get one from each task, then repeat
186
  task_iterators = {task: iter(indices) for task, indices in tasks_to_indices.items()}
187
  task_list = all_tasks.copy()
188
  round_idx = 0
189
-
190
  while len(sampled_from_source) < n_to_sample:
191
  # If we've gone through all tasks once, reshuffle for next round
192
  if round_idx >= len(task_list):
193
  round_idx = 0
194
  self._local_random.shuffle(task_list)
195
-
196
  # Try to get one trajectory from current task
197
  task = task_list[round_idx]
198
  try:
@@ -206,9 +206,9 @@ class ConfusionMatrixSampler(RFMBaseSampler):
206
  if not task_list:
207
  break # All tasks exhausted
208
  continue
209
-
210
  round_idx += 1
211
-
212
  rank_0_print(
213
  f" Data source '{data_source}': Sampled {len(sampled_from_source)} out of {source_stats['total_available']} trajectories",
214
  verbose=self.verbose,
@@ -217,13 +217,13 @@ class ConfusionMatrixSampler(RFMBaseSampler):
217
  f" Tasks sampled: {dict(sorted(source_stats['tasks_sampled'].items()))}",
218
  verbose=self.verbose,
219
  )
220
-
221
  # Track trajectory to task mapping for stats
222
  for traj_idx in sampled_from_source:
223
  traj = self.dataset[traj_idx]
224
  traj_id = traj.get("id", str(traj_idx))
225
  stats["traj_to_task"][traj_id] = traj.get("task", "unknown")
226
-
227
  sampled_indices.extend(sampled_from_source)
228
  stats["by_source"][data_source] = source_stats
229
 
@@ -231,33 +231,33 @@ class ConfusionMatrixSampler(RFMBaseSampler):
231
 
232
  def _print_sampling_stats(self, stats: dict):
233
  """Print detailed statistics about sampled trajectories.
234
-
235
  Args:
236
  stats: Statistics dictionary from _sample_trajectories_by_data_source
237
  """
238
  if not self.verbose:
239
  return
240
-
241
  rank_0_print("\n=== Confusion Matrix Sampling Statistics ===", verbose=self.verbose)
242
-
243
  # Overall task statistics
244
  rank_0_print(f"\nOverall trajectories per video task:", verbose=self.verbose)
245
  for task, count in sorted(stats["by_task"].items()):
246
  rank_0_print(f" {task}: {count} trajectories", verbose=self.verbose)
247
-
248
  # Per data source statistics
249
  rank_0_print(f"\nPer data source breakdown:", verbose=self.verbose)
250
  for data_source, source_stats in stats["by_source"].items():
251
  rank_0_print(f" Data source: {data_source}", verbose=self.verbose)
252
  rank_0_print(f" Total available: {source_stats['total_available']}", verbose=self.verbose)
253
  rank_0_print(f" Tasks available: {len(source_stats['tasks_available'])}", verbose=self.verbose)
254
- for task, count in sorted(source_stats['tasks_available'].items()):
255
- sampled_count = source_stats['tasks_sampled'].get(task, 0)
256
  rank_0_print(
257
  f" {task}: {sampled_count}/{count} trajectories sampled",
258
  verbose=self.verbose,
259
  )
260
-
261
  rank_0_print("=" * 50, verbose=self.verbose)
262
 
263
  def _generate_sample_from_indices(self, sample_idx_info: dict) -> PreferenceSample:
 
60
 
61
  def _generate_all_sample_indices(self) -> list[dict]:
62
  """Generate all possible task-trajectory pair sample indices.
63
+
64
  If multiple data sources exist, samples N random trajectories from each data source.
65
  Prioritizes different video tasks first, then prioritizes different language instructions
66
  when creating pairs.
 
73
 
74
  # Sample trajectories per data source (prioritizing different video tasks)
75
  sampled_trajectories, stats = self._sample_trajectories_by_data_source()
76
+
77
  rank_0_print(
78
  f"Processing {len(sampled_trajectories)} trajectories for confusion matrix analysis",
79
  verbose=self.verbose,
 
88
 
89
  # Create task-trajectory pairs with prioritized language instruction pairing
90
  video_task_count = Counter()
91
+
92
  for traj_idx in sampled_trajectories:
93
  traj = self.dataset[traj_idx]
94
  video_task = traj["task"]
 
98
  # continue
99
 
100
  video_task_count[video_task] += 1
101
+
102
  # Pair this trajectory with all language tasks (shuffled for variety)
103
  traj_id = traj.get("id", str(traj_idx))
104
  for lang_task in shuffled_lang_tasks:
 
117
  rank_0_print(f"Generated {len(sample_indices)} task-trajectory pairs", verbose=self.verbose)
118
  rank_0_print(f" Video tasks sampled: {dict(video_task_count)}", verbose=self.verbose)
119
  rank_0_print(f" Trajectories per video task: {dict(sorted(video_task_count.items()))}", verbose=self.verbose)
120
+
121
  return sample_indices
122
 
123
  def _sample_trajectories_by_data_source(self) -> Tuple[list[int], dict]:
124
  """Sample N random trajectories from each data source, prioritizing different video tasks.
125
+
126
  When sampling N trajectories, first selects one trajectory from each unique video task,
127
  then repeats in round-robin fashion until N trajectories are sampled.
128
+
129
  Returns:
130
  Tuple of (list of sampled trajectory indices, stats dictionary)
131
  """
 
135
  "by_task": Counter(),
136
  "traj_to_task": {},
137
  }
138
+
139
  # Group robot trajectories by data source, then by video task
140
  trajectories_by_source_and_task = defaultdict(lambda: defaultdict(list))
141
  for traj_idx in self.robot_trajectories:
 
143
  data_source = traj.get("data_source", "unknown")
144
  video_task = traj.get("task", "unknown")
145
  trajectories_by_source_and_task[data_source][video_task].append(traj_idx)
146
+
147
  rank_0_print(
148
  f"Found {len(trajectories_by_source_and_task)} data sources: {list(trajectories_by_source_and_task.keys())}",
149
  verbose=self.verbose,
 
154
  # Shuffle trajectories within each task for randomization
155
  for task in tasks_to_indices:
156
  self._local_random.shuffle(tasks_to_indices[task])
157
+
158
  # Get all unique tasks for this data source
159
  all_tasks = list(tasks_to_indices.keys())
160
  self._local_random.shuffle(all_tasks) # Randomize task order too
161
+
162
  source_stats = {
163
  "total_available": sum(len(indices) for indices in tasks_to_indices.values()),
164
  "tasks_available": {task: len(indices) for task, indices in tasks_to_indices.items()},
165
  "tasks_sampled": Counter(),
166
  }
167
+
168
  if self.n_trajectories_per_source is None:
169
  # Use all available trajectories
170
  sampled_from_source = []
 
172
  sampled_from_source.extend(indices)
173
  source_stats["tasks_sampled"][task] = len(indices)
174
  stats["by_task"][task] += len(indices)
175
+
176
  rank_0_print(
177
  f" Data source '{data_source}': Using all {len(sampled_from_source)} trajectories",
178
  verbose=self.verbose,
 
181
  # Sample N trajectories using round-robin to prioritize different tasks
182
  n_to_sample = min(self.n_trajectories_per_source, source_stats["total_available"])
183
  sampled_from_source = []
184
+
185
  # Round-robin sampling: first get one from each task, then repeat
186
  task_iterators = {task: iter(indices) for task, indices in tasks_to_indices.items()}
187
  task_list = all_tasks.copy()
188
  round_idx = 0
189
+
190
  while len(sampled_from_source) < n_to_sample:
191
  # If we've gone through all tasks once, reshuffle for next round
192
  if round_idx >= len(task_list):
193
  round_idx = 0
194
  self._local_random.shuffle(task_list)
195
+
196
  # Try to get one trajectory from current task
197
  task = task_list[round_idx]
198
  try:
 
206
  if not task_list:
207
  break # All tasks exhausted
208
  continue
209
+
210
  round_idx += 1
211
+
212
  rank_0_print(
213
  f" Data source '{data_source}': Sampled {len(sampled_from_source)} out of {source_stats['total_available']} trajectories",
214
  verbose=self.verbose,
 
217
  f" Tasks sampled: {dict(sorted(source_stats['tasks_sampled'].items()))}",
218
  verbose=self.verbose,
219
  )
220
+
221
  # Track trajectory to task mapping for stats
222
  for traj_idx in sampled_from_source:
223
  traj = self.dataset[traj_idx]
224
  traj_id = traj.get("id", str(traj_idx))
225
  stats["traj_to_task"][traj_id] = traj.get("task", "unknown")
226
+
227
  sampled_indices.extend(sampled_from_source)
228
  stats["by_source"][data_source] = source_stats
229
 
 
231
 
232
  def _print_sampling_stats(self, stats: dict):
233
  """Print detailed statistics about sampled trajectories.
234
+
235
  Args:
236
  stats: Statistics dictionary from _sample_trajectories_by_data_source
237
  """
238
  if not self.verbose:
239
  return
240
+
241
  rank_0_print("\n=== Confusion Matrix Sampling Statistics ===", verbose=self.verbose)
242
+
243
  # Overall task statistics
244
  rank_0_print(f"\nOverall trajectories per video task:", verbose=self.verbose)
245
  for task, count in sorted(stats["by_task"].items()):
246
  rank_0_print(f" {task}: {count} trajectories", verbose=self.verbose)
247
+
248
  # Per data source statistics
249
  rank_0_print(f"\nPer data source breakdown:", verbose=self.verbose)
250
  for data_source, source_stats in stats["by_source"].items():
251
  rank_0_print(f" Data source: {data_source}", verbose=self.verbose)
252
  rank_0_print(f" Total available: {source_stats['total_available']}", verbose=self.verbose)
253
  rank_0_print(f" Tasks available: {len(source_stats['tasks_available'])}", verbose=self.verbose)
254
+ for task, count in sorted(source_stats["tasks_available"].items()):
255
+ sampled_count = source_stats["tasks_sampled"].get(task, 0)
256
  rank_0_print(
257
  f" {task}: {sampled_count}/{count} trajectories sampled",
258
  verbose=self.verbose,
259
  )
260
+
261
  rank_0_print("=" * 50, verbose=self.verbose)
262
 
263
  def _generate_sample_from_indices(self, sample_idx_info: dict) -> PreferenceSample: