Anthony Liang committed on
Commit
3e462dd
·
1 Parent(s): e498336

getting rid of dependencies

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +4 -4
  2. dataset_types.py +78 -0
  3. eval_utils.py +400 -0
  4. eval_viz_utils.py +205 -0
  5. requirements.txt +1 -2
  6. samplers/README.md +182 -0
  7. samplers/__init__.py +23 -0
  8. samplers/__pycache__/__init__.cpython-310.pyc +0 -0
  9. samplers/__pycache__/__init__.cpython-311.pyc +0 -0
  10. samplers/__pycache__/base.cpython-310.pyc +0 -0
  11. samplers/__pycache__/base.cpython-311.pyc +0 -0
  12. samplers/__pycache__/confusion_matrix.cpython-310.pyc +0 -0
  13. samplers/__pycache__/confusion_matrix.cpython-311.pyc +0 -0
  14. samplers/__pycache__/pref.cpython-310.pyc +0 -0
  15. samplers/__pycache__/pref.cpython-311.pyc +0 -0
  16. samplers/__pycache__/progress.cpython-310.pyc +0 -0
  17. samplers/__pycache__/progress.cpython-311.pyc +0 -0
  18. samplers/__pycache__/progress_default.cpython-310.pyc +0 -0
  19. samplers/__pycache__/progress_default.cpython-311.pyc +0 -0
  20. samplers/__pycache__/quality_preference.cpython-310.pyc +0 -0
  21. samplers/__pycache__/quality_preference.cpython-311.pyc +0 -0
  22. samplers/__pycache__/reward_alignment.cpython-310.pyc +0 -0
  23. samplers/__pycache__/reward_alignment.cpython-311.pyc +0 -0
  24. samplers/__pycache__/roboarena.cpython-310.pyc +0 -0
  25. samplers/__pycache__/sim.cpython-310.pyc +0 -0
  26. samplers/__pycache__/sim.cpython-311.pyc +0 -0
  27. samplers/__pycache__/success_failure.cpython-310.pyc +0 -0
  28. samplers/__pycache__/success_failure.cpython-311.pyc +0 -0
  29. samplers/base.py +753 -0
  30. samplers/eval/__pycache__/base_pref.cpython-310.pyc +0 -0
  31. samplers/eval/__pycache__/base_pref.cpython-311.pyc +0 -0
  32. samplers/eval/__pycache__/confusion_matrix.cpython-310.pyc +0 -0
  33. samplers/eval/__pycache__/confusion_matrix.cpython-311.pyc +0 -0
  34. samplers/eval/__pycache__/progress_default.cpython-310.pyc +0 -0
  35. samplers/eval/__pycache__/progress_default.cpython-311.pyc +0 -0
  36. samplers/eval/__pycache__/progress_policy_ranking.cpython-310.pyc +0 -0
  37. samplers/eval/__pycache__/progress_policy_ranking.cpython-311.pyc +0 -0
  38. samplers/eval/__pycache__/quality_preference.cpython-310.pyc +0 -0
  39. samplers/eval/__pycache__/quality_preference.cpython-311.pyc +0 -0
  40. samplers/eval/__pycache__/reward_alignment.cpython-310.pyc +0 -0
  41. samplers/eval/__pycache__/reward_alignment.cpython-311.pyc +0 -0
  42. samplers/eval/__pycache__/roboarena_quality_preference.cpython-310.pyc +0 -0
  43. samplers/eval/__pycache__/roboarena_quality_preference.cpython-311.pyc +0 -0
  44. samplers/eval/__pycache__/similarity_score.cpython-310.pyc +0 -0
  45. samplers/eval/__pycache__/similarity_score.cpython-311.pyc +0 -0
  46. samplers/eval/base_pref.py +73 -0
  47. samplers/eval/confusion_matrix.py +299 -0
  48. samplers/eval/progress_policy_ranking.py +231 -0
  49. samplers/eval/quality_preference.py +219 -0
  50. samplers/eval/reward_alignment.py +147 -0
app.py CHANGED
@@ -25,9 +25,9 @@ import numpy as np
25
  import requests
26
  from typing import Any, List, Optional, Tuple
27
 
28
- from rfm.data.dataset_types import Trajectory, ProgressSample, PreferenceSample, SimilaritySample
29
- from rfm.evals.eval_utils import build_payload, post_batch_npy
30
- from rfm.evals.eval_viz_utils import create_combined_progress_success_plot, extract_frames
31
  from datasets import load_dataset as load_dataset_hf, get_dataset_config_names
32
 
33
  logger = logging.getLogger(__name__)
@@ -514,7 +514,7 @@ def process_two_videos(
514
 
515
  elif prediction_type == "progress":
516
  # Create ProgressSamples for both videos
517
- from rfm.data.dataset_types import ProgressSample
518
 
519
  progress_sample_a = ProgressSample(
520
  trajectory=trajectory_a,
 
25
  import requests
26
  from typing import Any, List, Optional, Tuple
27
 
28
+ from dataset_types import Trajectory, ProgressSample, PreferenceSample, SimilaritySample
29
+ from eval_utils import build_payload, post_batch_npy
30
+ from eval_viz_utils import create_combined_progress_success_plot, extract_frames
31
  from datasets import load_dataset as load_dataset_hf, get_dataset_config_names
32
 
33
  logger = logging.getLogger(__name__)
 
514
 
515
  elif prediction_type == "progress":
516
  # Create ProgressSamples for both videos
517
+ from dataset_types import ProgressSample
518
 
519
  progress_sample_a = ProgressSample(
520
  trajectory=trajectory_a,
dataset_types.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dataclasses for RFM model dataset trajectory structures.
4
+ Defines the standard format for HuggingFace dataset trajectories.
5
+ """
6
+
7
+ from typing import Any, Union, List, Dict, Tuple, Optional
8
+
9
+ import numpy as np
10
+ from pydantic import BaseModel, ConfigDict
11
+ import torch
12
+
13
+
14
+ class Trajectory(BaseModel):
15
+ """Trajectory structure containing frames, metadata, and progress information."""
16
+
17
+ # Core trajectory fields
18
+ frames: Union[List[str], np.ndarray, None] = None
19
+ frames_shape: Optional[Tuple] = None
20
+
21
+ # If embeddings are precomputed
22
+ embeddings_path: Optional[str] = None
23
+ video_embeddings: Union[torch.Tensor, np.ndarray, None] = None
24
+ text_embedding: Union[torch.Tensor, np.ndarray, None] = None
25
+
26
+ id: Optional[str] = None
27
+ task: Optional[str] = None
28
+ lang_vector: Union[np.ndarray, List[float], None] = None
29
+ data_source: Optional[str] = None
30
+ quality_label: Optional[str] = None
31
+ is_robot: Optional[bool] = None
32
+
33
+ # Progress and metadata
34
+ # Can be List[float] for continuous progress, np.ndarray, or List[np.ndarray] for C51 discrete distributions
35
+ target_progress: Optional[Union[List[float], List[torch.Tensor], torch.Tensor, None]] = None
36
+ partial_success: Optional[Union[float, torch.Tensor]] = None # float for continuous, Tensor for C51 discrete
37
+ success_label: Optional[List[float]] = None # Success labels for each frame (1.0 for success, 0.0 for failure)
38
+ metadata: Optional[Dict[str, Any]] = None
39
+ data_gen_strategy: Optional[str] = None
40
+
41
+ model_config = ConfigDict(arbitrary_types_allowed=True)
42
+
43
+
44
+ class ProgressSample(BaseModel):
45
+ """Sample structure for progress evaluation."""
46
+
47
+ trajectory: Trajectory
48
+ sample_type: str = "progress"
49
+ data_gen_strategy: Optional[str] = None
50
+ resample_attempts: int = 1
51
+
52
+
53
+ class PreferenceSample(BaseModel):
54
+ """Sample structure for preference prediction: chosen vs rejected where chosen is preferred."""
55
+
56
+ # Trajectories
57
+ chosen_trajectory: Trajectory
58
+ rejected_trajectory: Trajectory
59
+
60
+ sample_type: str = "preference"
61
+ data_gen_strategy: Optional[str] = None
62
+ resample_attempts: int = 1
63
+
64
+
65
+ class SimilaritySample(BaseModel):
66
+ """Sample structure for similarity scoring: traj_sim and traj_diff ranked against o^ref."""
67
+
68
+ # Trajectories
69
+ ref_trajectory: Trajectory # o^ref
70
+ sim_trajectory: Trajectory # Similar trajectory
71
+ diff_trajectory: Optional[Trajectory] = None # Different trajectory (optional in inference mode)
72
+
73
+ sample_type: str = "similarity"
74
+ data_gen_strategy: Optional[str] = None
75
+ resample_attempts: int = 1
76
+
77
+
78
+ SampleType = Union[PreferenceSample, SimilaritySample, ProgressSample]
eval_utils.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ import torch
6
+ import io
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Any, Dict, List, Union, Optional, Tuple
11
+ from datetime import datetime
12
+
13
+ import aiohttp
14
+ import numpy as np
15
+ import requests
16
+ import torch
17
+
18
+ from rfm.data.dataset_types import PreferenceSample, SimilaritySample, ProgressSample, Trajectory
19
+ from rfm.data.datasets.helpers import linspace_subsample_frames, pad_trajectory_to_max_frames_np
20
+
21
+
22
+ def extract_answer_from_text(text: str) -> str:
23
+ """Extract answer from text using <ans> tags."""
24
+ m = re.search(r"<ans>(.*?)</ans>", text, re.DOTALL)
25
+ ans = m.group(1).strip() if m else ""
26
+ return ans
27
+
28
+
29
+ def raw_dict_to_sample(
30
+ raw_data: Union[Tuple[Dict[str, Any], Dict[str, Any]], Dict[str, Any]],
31
+ max_frames: int = 16,
32
+ sample_type: str = "progress",
33
+ ) -> Union[ProgressSample, PreferenceSample]:
34
+ """
35
+ Convert raw data dictionary to a ProgressSample or PreferenceSample.
36
+
37
+ Args:
38
+ raw_data: Dict with 'frames', 'task', 'id', 'metadata', 'video_embeddings', 'text_embedding' or Tuple of (Dict[str, Any], Dict[str, Any])
39
+ max_frames: Maximum number of frames to use (default: 16)
40
+ sample_type: Either "progress" or "preference" (default: "progress")
41
+
42
+ Returns:
43
+ ProgressSample or PreferenceSample
44
+ """
45
+
46
+ def _build_trajectory(raw_data: Dict[str, Any], num_frames: int) -> Trajectory:
47
+ processed_item: Dict[str, Any] = {}
48
+
49
+ # Process frames
50
+ frames_array = raw_data["frames"]
51
+
52
+ # Ensure we have the correct shape: (T, H, W, C)
53
+ if len(frames_array.shape) != 4:
54
+ raise ValueError(f"Expected 4D array (T, H, W, C), got shape {frames_array.shape}")
55
+
56
+ # Convert from CxHxW to HxWxC if needed
57
+ if frames_array.shape[1] == 3:
58
+ frames_array = np.transpose(frames_array, (0, 2, 3, 1))
59
+
60
+ frames_array, _ = linspace_subsample_frames(frames_array, num_frames)
61
+ dummy_progress = [0.0] * len(frames_array)
62
+ frames_array, _ = pad_trajectory_to_max_frames_np(frames_array, dummy_progress, num_frames, pad_from="right")
63
+
64
+ if frames_array.size == 0:
65
+ raise ValueError("No frames processed for example")
66
+
67
+ processed_item["frames"] = frames_array
68
+ processed_item["frames_shape"] = frames_array.shape
69
+ processed_item["task"] = raw_data["task"]
70
+ processed_item["lang_vector"] = None
71
+ processed_item["metadata"] = raw_data.get("metadata", None)
72
+
73
+ # Process video embeddings using same helper functions
74
+ video_embeddings = raw_data.get("video_embeddings")
75
+ if video_embeddings is not None:
76
+ video_embeddings, _ = linspace_subsample_frames(video_embeddings, num_frames)
77
+ dummy_progress_emb = [0.0] * len(video_embeddings)
78
+ video_embeddings, _ = pad_trajectory_to_max_frames_np(
79
+ video_embeddings, dummy_progress_emb, num_frames, pad_from="right"
80
+ )
81
+
82
+ text_embedding = raw_data.get("text_embedding")
83
+
84
+ # Convert to tensors if they are numpy arrays
85
+ if video_embeddings is not None and isinstance(video_embeddings, np.ndarray):
86
+ video_embeddings = torch.tensor(video_embeddings)
87
+ if text_embedding is not None and isinstance(text_embedding, np.ndarray):
88
+ text_embedding = torch.tensor(text_embedding)
89
+
90
+ processed_item["video_embeddings"] = video_embeddings
91
+ processed_item["text_embedding"] = text_embedding
92
+ processed_item["video_shape"] = video_embeddings.shape if video_embeddings is not None else None
93
+ processed_item["text_shape"] = text_embedding.shape if text_embedding is not None else None
94
+
95
+ trajectory = Trajectory(**processed_item)
96
+ return trajectory
97
+
98
+ if sample_type == "progress":
99
+ assert isinstance(raw_data, dict), "raw_data must be a dictionary"
100
+ trajectory = _build_trajectory(raw_data=raw_data, num_frames=max_frames)
101
+ return ProgressSample(trajectory=trajectory)
102
+ elif sample_type == "preference":
103
+ assert isinstance(raw_data, tuple), "raw_data must be a tuple"
104
+ assert len(raw_data) == 2, "raw_data must be a tuple of two dictionaries"
105
+ trajectories: List[Trajectory] = []
106
+ for trajectory_data in raw_data:
107
+ trajectory = _build_trajectory(raw_data=trajectory_data, num_frames=max_frames)
108
+ trajectories.append(trajectory)
109
+ return PreferenceSample(chosen_trajectory=trajectories[0], rejected_trajectory=trajectories[1])
110
+ else:
111
+ raise ValueError(f"Unsupported sample_type: {sample_type}")
112
+
113
+
114
+ def build_payload(
115
+ samples: list[PreferenceSample | SimilaritySample | ProgressSample],
116
+ ) -> tuple[dict[str, Any], list[dict[str, Any]]]:
117
+ """Build a payload with numpy array handling.
118
+
119
+ Args:
120
+ samples: List of samples to convert
121
+
122
+ Returns:
123
+ Tuple of (files, sample_data) where:
124
+ - files: Dict of numpy arrays converted to .npy format
125
+ - sample_data: List of sample dictionaries with numpy arrays replaced by file references
126
+ """
127
+ files = {}
128
+ sample_data = []
129
+
130
+ for sample_idx, sample in enumerate(samples):
131
+ # Copy the original sample and handle numpy arrays
132
+ processed_sample = sample.model_dump().copy()
133
+
134
+ # Handle trajectory objects with numpy arrays
135
+ for key in [
136
+ "chosen_trajectory",
137
+ "rejected_trajectory",
138
+ "reference_trajectory",
139
+ "traj_sim_trajectory",
140
+ "traj_diff_trajectory",
141
+ "trajectory",
142
+ ]:
143
+ if key in processed_sample and isinstance(processed_sample[key], dict):
144
+ trajectory = processed_sample[key]
145
+
146
+ # Convert numpy arrays to .npy files
147
+ numpy_fields = ["frames", "lang_vector", "video_embeddings", "text_embedding"]
148
+ for field_name in numpy_fields:
149
+ # if it is a tensor, first convert it to a numpy array
150
+ if field_name in trajectory and isinstance(trajectory[field_name], torch.Tensor):
151
+ trajectory[field_name] = trajectory[field_name].numpy()
152
+
153
+ if field_name in trajectory and isinstance(trajectory[field_name], np.ndarray):
154
+ # Convert numpy array to .npy file
155
+ buf = io.BytesIO()
156
+ np.save(buf, trajectory[field_name])
157
+ buf.seek(0)
158
+ file_key = f"sample_{sample_idx}_{key}_{field_name}"
159
+ files[file_key] = (
160
+ f"sample_{sample_idx}_{key}_{field_name}.npy",
161
+ buf,
162
+ "application/octet-stream",
163
+ )
164
+ trajectory[field_name] = {"__numpy_file__": file_key}
165
+
166
+ sample_data.append(processed_sample)
167
+
168
+ return files, sample_data
169
+
170
+
171
+ def post_batch(url: str, payload: dict[str, Any], timeout_s: float = 120.0) -> dict[str, Any]:
172
+ """POST a batch payload to the evaluation server and return parsed JSON."""
173
+ resp = requests.post(url.rstrip("/") + "/evaluate_batch", json=payload, timeout=timeout_s)
174
+ resp.raise_for_status()
175
+ return resp.json()
176
+
177
+
178
+ def post_batch_npy(
179
+ url: str, files: dict[str, Any], sample_data: list[dict[str, Any]], timeout_s: float = 120.0
180
+ ) -> dict[str, Any]:
181
+ """POST batch using .npy format for numpy arrays."""
182
+ # Convert sample_data to form data
183
+ data = {f"sample_{i}": json.dumps(sample) for i, sample in enumerate(sample_data)}
184
+
185
+ # Send as multipart form data
186
+ resp = requests.post(url.rstrip("/") + "/evaluate_batch_npy", files=files, data=data, timeout=timeout_s)
187
+ resp.raise_for_status()
188
+ return resp.json()
189
+
190
+
191
+ async def post_batch_npy_async(
192
+ session: aiohttp.ClientSession,
193
+ url: str,
194
+ files: dict[str, Any],
195
+ sample_data: list[dict[str, Any]],
196
+ timeout_s: float = 120.0,
197
+ ) -> dict[str, Any]:
198
+ """Async version of post_batch_npy using aiohttp."""
199
+ # Create FormData for aiohttp
200
+ form_data = aiohttp.FormData()
201
+
202
+ # Add files
203
+ for key, (filename, file_obj, content_type) in files.items():
204
+ form_data.add_field(key, file_obj, filename=filename, content_type=content_type)
205
+
206
+ # Add sample data
207
+ for i, sample in enumerate(sample_data):
208
+ form_data.add_field(f"sample_{i}", json.dumps(sample))
209
+
210
+ headers = {"Connection": "close"}
211
+ # Send as multipart form data using aiohttp
212
+ timeout = aiohttp.ClientTimeout(total=timeout_s)
213
+ async with session.post(
214
+ url.rstrip("/") + "/evaluate_batch_npy", data=form_data, timeout=timeout, headers=headers
215
+ ) as resp:
216
+ resp.raise_for_status()
217
+ return await resp.json()
218
+
219
+
220
+ async def parse_npy_form_data(form_data: Any) -> Tuple[Dict[str, np.ndarray], Dict[str, Any]]:
221
+ """Parse multipart form data to extract numpy arrays and other data.
222
+
223
+ Args:
224
+ form_data: FastAPI form data from request.form()
225
+
226
+ Returns:
227
+ Tuple of (numpy_arrays dict, other_data dict)
228
+ """
229
+ numpy_arrays = {}
230
+ other_data = {}
231
+
232
+ for key, value in form_data.items():
233
+ # Check if this is a file upload (UploadFile object)
234
+ if hasattr(value, "filename") and value.filename:
235
+ # This is a file upload
236
+ if value.filename.endswith(".npy"):
237
+ # Load .npy file (await async read)
238
+ content = await value.read()
239
+ buf = io.BytesIO(content)
240
+ array = np.load(buf)
241
+ numpy_arrays[key] = array
242
+ else:
243
+ # Non-.npy file, skip for now
244
+ continue
245
+ else:
246
+ # This is a string value (form field)
247
+ try:
248
+ # Try to parse as JSON
249
+ other_data[key] = json.loads(value)
250
+ except (json.JSONDecodeError, TypeError):
251
+ # Keep as string if not JSON
252
+ other_data[key] = value
253
+
254
+ return numpy_arrays, other_data
255
+
256
+
257
+ def reconstruct_payload_from_npy(
258
+ numpy_arrays: Dict[str, np.ndarray],
259
+ other_data: Dict[str, Any],
260
+ trajectory_keys: Optional[List[str]] = None,
261
+ convert_embeddings_to_torch: bool = False,
262
+ ) -> List[Dict[str, Any]]:
263
+ """Reconstruct the original payload structure from .npy files and form data.
264
+
265
+ The client sends data in this format:
266
+ - Files: sample_0_chosen_trajectory_frames.npy, sample_0_trajectory_frames.npy, etc.
267
+ - Data: sample_0, sample_1, etc. (each containing the full sample JSON with numpy file references)
268
+
269
+ Args:
270
+ numpy_arrays: Dictionary of numpy arrays loaded from .npy files
271
+ other_data: Dictionary of other form data
272
+ trajectory_keys: List of trajectory keys to process (default: common keys)
273
+ convert_embeddings_to_torch: Whether to convert embeddings to torch tensors
274
+
275
+ Returns:
276
+ List of reconstructed sample dictionaries
277
+ """
278
+ if trajectory_keys is None:
279
+ trajectory_keys = [
280
+ "chosen_trajectory",
281
+ "rejected_trajectory",
282
+ "reference_trajectory",
283
+ "traj_sim_trajectory",
284
+ "traj_diff_trajectory",
285
+ "trajectory",
286
+ ]
287
+
288
+ samples = []
289
+
290
+ # Process each sample
291
+ for i in range(len(other_data)):
292
+ sample_key = f"sample_{i}"
293
+ if sample_key in other_data:
294
+ # Get the sample data - might already be parsed or might be a string
295
+ sample_data = other_data[sample_key]
296
+ if isinstance(sample_data, str):
297
+ # Parse the sample JSON if it's a string
298
+ sample_data = json.loads(sample_data)
299
+
300
+ # Replace numpy file references with actual arrays
301
+ for key, value in sample_data.items():
302
+ if key in trajectory_keys:
303
+ if isinstance(value, dict):
304
+ for traj_key, traj_value in value.items():
305
+ if isinstance(traj_value, dict) and traj_value.get("__numpy_file__"):
306
+ # Replace with actual numpy array
307
+ file_key = traj_value["__numpy_file__"]
308
+ if file_key in numpy_arrays:
309
+ value[traj_key] = numpy_arrays[file_key]
310
+
311
+ # Convert embeddings to torch if requested
312
+ if convert_embeddings_to_torch and traj_key in ["video_embeddings", "text_embedding"]:
313
+ if traj_key in value and value[traj_key] is not None:
314
+ if isinstance(value[traj_key], np.ndarray):
315
+ value[traj_key] = torch.tensor(value[traj_key])
316
+ elif isinstance(value[traj_key], list):
317
+ value[traj_key] = torch.tensor(value[traj_key])
318
+
319
+ samples.append(sample_data)
320
+
321
+ return samples
322
+
323
+
324
+ def find_video_files(directory: str) -> list[str]:
325
+ """Find all video files in a directory.
326
+
327
+ Args:
328
+ directory: Path to directory containing video files
329
+
330
+ Returns:
331
+ List of paths to video files
332
+ """
333
+ video_extensions = {".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv", ".m4v"}
334
+ video_files = []
335
+
336
+ directory_path = Path(directory)
337
+ if not directory_path.is_dir():
338
+ return []
339
+
340
+ for file_path in directory_path.iterdir():
341
+ if file_path.is_file() and file_path.suffix.lower() in video_extensions:
342
+ video_files.append(str(file_path))
343
+
344
+ video_files.sort()
345
+ return video_files
346
+
347
+
348
+ def infer_task_from_video_name(video_path: str) -> str:
349
+ """Infer task name from video filename.
350
+
351
+ Task is everything before the comma (if comma exists), or everything before success/fail/failure.
352
+
353
+ Args:
354
+ video_path: Path to video file
355
+
356
+ Returns:
357
+ Inferred task name
358
+ """
359
+ video_name = Path(video_path).stem # Get filename without extension
360
+
361
+ # If there's a comma, task is everything before the comma
362
+ if "," in video_name:
363
+ task_part = video_name.split(",")[0]
364
+ else:
365
+ # Otherwise, split by underscore and remove success/fail/failure suffixes
366
+ parts = video_name.split("_")
367
+ filtered_parts = []
368
+ for part in parts:
369
+ part_lower = part.lower()
370
+ if part_lower not in ["success", "fail", "failure"]:
371
+ filtered_parts.append(part)
372
+
373
+ if not filtered_parts:
374
+ return "Complete the task"
375
+
376
+ task_part = "_".join(filtered_parts)
377
+
378
+ # Split by underscore and join with spaces
379
+ task_words = task_part.split("_")
380
+ task = " ".join(task_words)
381
+
382
+ if task:
383
+ # Capitalize first letter of first word, keep rest as is
384
+ task = task[0].upper() + task[1:] if len(task) > 1 else task.upper()
385
+ else:
386
+ task = "Complete the task"
387
+
388
+ return task
389
+
390
+
391
+ def setup_output_directory(output_dir: Optional[str], video_path: Optional[str] = None) -> str:
392
+ """Create output directory and return path."""
393
+ if output_dir:
394
+ save_dir = output_dir
395
+ else:
396
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
397
+ save_dir = os.path.join(".", f"eval_outputs/{timestamp}")
398
+
399
+ os.makedirs(save_dir, exist_ok=True)
400
+ return save_dir
eval_viz_utils.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Utility functions for visualization in RFM evaluations.
4
+ """
5
+
6
+ from typing import Optional
7
+ import os
8
+ import logging
9
+ import tempfile
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import decord
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def create_combined_progress_success_plot(
18
+ progress_pred: np.ndarray,
19
+ num_frames: int,
20
+ success_binary: Optional[np.ndarray] = None,
21
+ success_probs: Optional[np.ndarray] = None,
22
+ success_labels: Optional[np.ndarray] = None,
23
+ is_discrete_mode: bool = False,
24
+ title: Optional[str] = None,
25
+ loss: Optional[float] = None,
26
+ pearson: Optional[float] = None,
27
+ ) -> plt.Figure:
28
+ """Create a combined plot with progress, success binary, and success probabilities.
29
+
30
+ This function creates a unified plot with 1 subplot (progress only) or 3 subplots
31
+ (progress, success binary, success probs), similar to the one used in compile_results.py.
32
+
33
+ Args:
34
+ progress_pred: Progress predictions array
35
+ num_frames: Number of frames
36
+ success_binary: Optional binary success predictions
37
+ success_probs: Optional success probability predictions
38
+ success_labels: Optional ground truth success labels
39
+ is_discrete_mode: Whether progress is in discrete mode (deprecated, kept for compatibility)
40
+ title: Optional title for the plot (if None, auto-generated from loss/pearson)
41
+ loss: Optional loss value to display in title
42
+ pearson: Optional pearson correlation to display in title
43
+
44
+ Returns:
45
+ matplotlib Figure object
46
+ """
47
+ # Determine if we should show success plots
48
+ has_success_binary = success_binary is not None and len(success_binary) == len(progress_pred)
49
+
50
+ if has_success_binary:
51
+ # Three subplots: progress, success (binary), success_probs
52
+ fig, axs = plt.subplots(1, 3, figsize=(15, 3.5))
53
+ ax = axs[0] # Progress subplot
54
+ ax2 = axs[1] # Success subplot (binary)
55
+ ax3 = axs[2] # Success probs subplot
56
+ else:
57
+ # Single subplot: progress only
58
+ fig, ax = plt.subplots(figsize=(6, 3.5))
59
+ ax2 = None
60
+ ax3 = None
61
+
62
+ # Plot progress
63
+ ax.plot(progress_pred, linewidth=2)
64
+ ax.set_ylabel("Progress")
65
+
66
+ # Build title
67
+ if title is None:
68
+ title_parts = ["Progress"]
69
+ if loss is not None:
70
+ title_parts.append(f"Loss: {loss:.3f}")
71
+ if pearson is not None:
72
+ title_parts.append(f"Pearson: {pearson:.2f}")
73
+ title = ", ".join(title_parts)
74
+ fig.suptitle(title)
75
+
76
+ # Set y-limits and ticks (always continuous since discrete is converted before this function)
77
+ ax.set_ylim(0, 1)
78
+ ax.spines["right"].set_visible(False)
79
+ ax.spines["top"].set_visible(False)
80
+ y_ticks = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
81
+ ax.set_yticks(y_ticks)
82
+
83
+ # Setup success binary subplot
84
+ if ax2 is not None:
85
+ ax2.step(range(len(success_binary)), success_binary, where="post", linewidth=2, label="Predicted", color="blue")
86
+ # Add ground truth success labels as green line if available
87
+ if success_labels is not None and len(success_labels) == len(success_binary):
88
+ ax2.step(
89
+ range(len(success_labels)),
90
+ success_labels,
91
+ where="post",
92
+ linewidth=2,
93
+ label="Ground Truth",
94
+ color="green",
95
+ )
96
+ ax2.set_ylabel("Success (Binary)")
97
+ ax2.set_ylim(-0.05, 1.05)
98
+ ax2.spines["right"].set_visible(False)
99
+ ax2.spines["top"].set_visible(False)
100
+ ax2.set_yticks([0, 1])
101
+ ax2.legend()
102
+
103
+ # Setup success probs subplot if available
104
+ if ax3 is not None and success_probs is not None:
105
+ ax3.plot(range(len(success_probs)), success_probs, linewidth=2, label="Success Prob", color="purple")
106
+ # Add ground truth success labels as green line if available
107
+ if success_labels is not None and len(success_labels) == len(success_probs):
108
+ ax3.step(
109
+ range(len(success_labels)),
110
+ success_labels,
111
+ where="post",
112
+ linewidth=2,
113
+ label="Ground Truth",
114
+ color="green",
115
+ linestyle="--",
116
+ )
117
+ ax3.set_ylabel("Success Probability")
118
+ ax3.set_ylim(-0.05, 1.05)
119
+ ax3.spines["right"].set_visible(False)
120
+ ax3.spines["top"].set_visible(False)
121
+ ax3.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
122
+ ax3.legend()
123
+
124
+ plt.tight_layout()
125
+ return fig
126
+
127
+
128
+ def extract_frames(video_path: str, fps: float = 1.0, max_frames: int = 64) -> np.ndarray:
129
+ """Extract frames from video file as numpy array (T, H, W, C).
130
+
131
+ Supports both local file paths and URLs (e.g., HuggingFace Hub URLs).
132
+ Uses the provided ``fps`` to control how densely frames are sampled from
133
+ the underlying video, but caps the total number of frames at ``max_frames``
134
+ to prevent memory issues.
135
+
136
+ Args:
137
+ video_path: Path to video file or URL
138
+ fps: Frames per second to extract (default: 1.0)
139
+ max_frames: Maximum number of frames to extract (default: 64). This prevents
140
+ memory issues with long videos or high FPS settings.
141
+
142
+ Returns:
143
+ numpy array of shape (T, H, W, C) containing extracted frames, or None if error
144
+ """
145
+ if video_path is None:
146
+ return None
147
+
148
+ if isinstance(video_path, tuple):
149
+ video_path = video_path[0]
150
+
151
+ # Check if it's a URL or local file
152
+ is_url = video_path.startswith(("http://", "https://"))
153
+ is_local_file = os.path.exists(video_path) if not is_url else False
154
+
155
+ if not is_url and not is_local_file:
156
+ logger.warning(f"Video path does not exist: {video_path}")
157
+ return None
158
+
159
+ try:
160
+ # decord.VideoReader can handle both local files and URLs
161
+ vr = decord.VideoReader(video_path, num_threads=1)
162
+ total_frames = len(vr)
163
+
164
+ # Determine native FPS; fall back to a reasonable default if unavailable
165
+ try:
166
+ native_fps = float(vr.get_avg_fps())
167
+ except Exception:
168
+ native_fps = 1.0
169
+
170
+ # If user-specified fps is invalid or None, default to native fps
171
+ if fps is None or fps <= 0:
172
+ fps = native_fps
173
+
174
+ # Compute how many frames we want based on desired fps
175
+ # num_frames ≈ total_duration * fps = total_frames * (fps / native_fps)
176
+ if native_fps > 0:
177
+ desired_frames = int(round(total_frames * (fps / native_fps)))
178
+ else:
179
+ desired_frames = total_frames
180
+
181
+ # Clamp to [1, total_frames]
182
+ desired_frames = max(1, min(desired_frames, total_frames))
183
+
184
+ # IMPORTANT: Cap at max_frames to prevent memory issues
185
+ # This is critical when fps is high or videos are long
186
+ if desired_frames > max_frames:
187
+ logger.warning(
188
+ f"Requested {desired_frames} frames but capping at {max_frames} "
189
+ f"to prevent memory issues (video has {total_frames} frames at {native_fps:.2f} fps, "
190
+ f"requested extraction at {fps:.2f} fps)"
191
+ )
192
+ desired_frames = max_frames
193
+
194
+ # Evenly sample indices to match the desired number of frames
195
+ if desired_frames == total_frames:
196
+ frame_indices = list(range(total_frames))
197
+ else:
198
+ frame_indices = np.linspace(0, total_frames - 1, desired_frames, dtype=int).tolist()
199
+
200
+ frames_array = vr.get_batch(frame_indices).asnumpy() # Shape: (T, H, W, C)
201
+ del vr
202
+ return frames_array
203
+ except Exception as e:
204
+ logger.error(f"Error extracting frames from {video_path}: {e}")
205
+ return None
requirements.txt CHANGED
@@ -26,8 +26,7 @@ watchfiles # For file watching during development
26
 
27
  # RFM package (installed from git repository)
28
  # For local development, you can also install with: pip install -e ../ (from parent directory)
29
- --ignore-requires-python
30
- git+https://github.com/aliang8/reward_fm.git@anthony_working
31
 
32
  # Make sure a newer version of gradio is installed
33
  gradio==4.44.0
 
26
 
27
  # RFM package (installed from git repository)
28
  # For local development, you can also install with: pip install -e ../ (from parent directory)
29
+ # git+https://github.com/aliang8/reward_fm.git@anthony_working
 
30
 
31
  # Make sure a newer version of gradio is installed
32
  gradio==4.44.0
samplers/README.md ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sampler Strategies Documentation
2
+
3
+ This document summarizes the data generation strategies used by each sampler type in the RFM data pipeline.
4
+
5
+ ## Overview
6
+
7
+ The codebase contains three main sampler types:
8
+ - **SimSampler**: Generates similarity scoring samples
9
+ - **PrefSampler**: Generates preference prediction samples
10
+ - **ProgressSampler**: Generates progress prediction samples
11
+
12
+ Each sampler implements multiple strategies for generating training data, with automatic retry logic and strategy rebalancing on failure.
13
+
14
+ ---
15
+
16
+ ## SimSampler (Similarity Scoring)
17
+
18
+ The `SimSampler` creates similarity scoring samples where two trajectories (`o^1` and `o^2`) are ranked against a reference trajectory (`o^ref`). The goal is to learn that `o^1` should be ranked higher than `o^2`.
19
+
20
+ ### Strategies
21
+
22
+ #### 1. **REWIND**
23
+ - **Description**: Creates a similarity sample where `o^1` is an optimal trajectory from the same task, and `o^2` is a rewound (subsampled) version of the reference trajectory.
24
+ - **Purpose**: Learn to distinguish between optimal and suboptimal trajectories from the same task.
25
+ - **Implementation**:
26
+ - `traj_sim`: Optimal trajectory from same task (via `_get_same_task_optimal`)
27
+ - `traj_diff`: Rewound trajectory from reference (via `subsample_rewind`)
28
+
29
+ #### 2. **SUBOPTIMAL**
30
+ - **Description**: Creates a similarity sample where `o^1` is an optimal trajectory from the same task, and `o^2` is a suboptimal/failure trajectory from the same task.
31
+ - **Purpose**: Learn to distinguish between optimal and suboptimal trajectories from the same task.
32
+ - **Conditions**: Only available when:
33
+ - Data source is in the failure category (`is_failure_ds`)
34
+ - Probability is boosted by 2x for failure category data sources
35
+ - **Implementation**:
36
+ - `traj_sim`: Optimal trajectory from same task (via `_get_same_task_optimal`)
37
+ - `traj_diff`: Suboptimal trajectory from same task (via `_get_same_task_suboptimal`)
38
+
39
+ #### 3. **PAIRED_HUMAN_ROBOT**
40
+ - **Description**: Creates a similarity sample where `o^1` is a paired human/robot trajectory (opposite type from reference, same task), and `o^2` is from a different task.
41
+ - **Purpose**: Learn to distinguish between same-task and different-task trajectories, leveraging paired human/robot demonstrations.
42
+ - **Conditions**: Only available when:
43
+ - Data source is in the paired category (`is_paired_ds`)
44
+ - Paired human/robot data exists for the task
45
+ - Probability is boosted by 2x for paired category data sources
46
+ - **Implementation**:
47
+ - `traj_sim`: Paired human/robot trajectory (via `_get_paired_human_robot_traj`)
48
+ - `traj_diff`: Trajectory from different task (via `_get_different_video_traj`)
49
+
50
+ ### Strategy Selection
51
+ - Strategies are selected probabilistically based on `similarity_strategy_ratio` configuration
52
+ - Probabilities are rebalanced when strategies fail
53
+ - Strategies are removed after 4 consecutive failures
54
+ - Maximum 10 total attempts per sample generation
55
+
56
+ ### Reference Trajectory Requirements
57
+ - For non-RoboArena: Must have `quality_label == "successful"`
58
+ - For RoboArena: Must have `partial_success` field present
59
+
60
+ ---
61
+
62
+ ## PrefSampler (Preference Prediction)
63
+
64
+ The `PrefSampler` creates preference prediction samples with a chosen (preferred) trajectory and a rejected (suboptimal) trajectory.
65
+
66
+ ### Strategies
67
+
68
+ #### 1. **REWIND**
69
+ - **Description**: Uses the same optimal trajectory for both chosen and rejected, but applies rewind subsampling to the rejected trajectory.
70
+ - **Purpose**: Learn that full trajectories are preferred over truncated/rewound versions.
71
+ - **Implementation**:
72
+ - `chosen_trajectory`: Original optimal trajectory (forward subsampling)
73
+ - `rejected_trajectory`: Same trajectory with `subsample_rewind` strategy
74
+
75
+ #### 2. **SUBOPTIMAL**
76
+ - **Description**: Uses an optimal trajectory as chosen and a suboptimal/failure trajectory from the same task as rejected.
77
+ - **Purpose**: Learn to prefer optimal trajectories over suboptimal ones from the same task.
78
+ - **Conditions**: Only available when suboptimal trajectories exist for the task
79
+ - **Implementation**:
80
+ - `chosen_trajectory`: Optimal trajectory
81
+ - `rejected_trajectory`: Suboptimal trajectory from same task (via `_get_same_task_suboptimal`)
82
+
83
+ #### 3. **DIFFERENT_TASK**
84
+ - **Description**: Uses an optimal trajectory as chosen and a trajectory from a completely different task as rejected.
85
+ - **Purpose**: Learn that trajectories from the same task are preferred over trajectories from different tasks.
86
+ - **Implementation**:
87
+ - `chosen_trajectory`: Optimal trajectory
88
+ - `rejected_trajectory`: Trajectory from different task (via `_get_different_video_traj`)
89
+ - **Note**: Rejected trajectory's `target_progress` is set to `[0.0]` for all timesteps
90
+
91
+ #### 4. **REVERSE_PROGRESS**
92
+ - **Description**: Uses the same optimal trajectory for both chosen and rejected, but applies reverse uniform sampling to the rejected trajectory.
93
+ - **Purpose**: Learn that forward progress is preferred over reverse progress.
94
+ - **Implementation**:
95
+ - `chosen_trajectory`: Original optimal trajectory (forward subsampling)
96
+ - `rejected_trajectory`: Same trajectory with `subsample_reverse` strategy
97
+
98
+ #### 5. **ROBOARENA_PARTIAL_SUCCESS**
99
+ - **Description**: Uses two trajectories from the same task with different `partial_success` values. The trajectory with higher `partial_success` becomes chosen, and the one with lower `partial_success` becomes rejected.
100
+ - **Purpose**: Learn to prefer trajectories with higher partial success scores (RoboArena-specific).
101
+ - **Conditions**: Only available for RoboArena trajectories (has `partial_success` field and data_source contains "roboarena")
102
+ - **Implementation**:
103
+ - Finds a different trajectory from same task (via `_get_different_partial_success_traj`)
104
+ - Swaps trajectories if found trajectory has higher `partial_success`
105
+ - `chosen_trajectory`: Trajectory with higher `partial_success`
106
+ - `rejected_trajectory`: Trajectory with lower `partial_success`
107
+
108
+ ### Special Handling
109
+ - **Non-successful trajectories**: If a trajectory has `quality_label != "successful"` (and is not RoboArena), it is automatically used as the rejected trajectory, with an optimal trajectory from the same task as the chosen trajectory.
110
+
111
+ ### Strategy Selection
112
+ - Strategies are selected probabilistically based on `preference_strategy_ratio` configuration
113
+ - Probabilities are rebalanced when strategies fail
114
+ - Strategies are removed after 3 consecutive failures
115
+ - Maximum 10 total attempts per sample generation
116
+
117
+ ---
118
+
119
+ ## ProgressSampler (Progress Prediction)
120
+
121
+ The `ProgressSampler` creates progress prediction samples from a single trajectory, applying different subsampling strategies to create training data.
122
+
123
+ ### Strategies
124
+
125
+ #### 1. **DIFFERENT_TASK_INSTRUCTION**
126
+ - **Description**: Uses a trajectory from a different task, but keeps the original task's embeddings and instruction.
127
+ - **Purpose**: Learn that progress should be 0.0 when the trajectory doesn't match the task instruction.
128
+ - **Implementation**:
129
+ - Gets trajectory from different task (via `_get_different_task_instruction`)
130
+ - Replaces embeddings with original task's embeddings
131
+ - Sets `target_progress = [0.0]` for all timesteps
132
+ - Uses forward subsampling
133
+
134
+ #### 2. **FORWARD_PROGRESS**
135
+ - **Description**: Samples the same trajectory with forward direction (start < middle < end).
136
+ - **Purpose**: Learn normal forward progress patterns.
137
+ - **Implementation**:
138
+ - Uses same trajectory with `subsample_forward` strategy
139
+ - Progress increases from start to end
140
+
141
+ #### 3. **REVERSE_PROGRESS**
142
+ - **Description**: Samples the same trajectory with reverse direction (end < middle < start).
143
+ - **Purpose**: Learn to handle reverse progress scenarios.
144
+ - **Implementation**:
145
+ - Uses same trajectory with `subsample_reverse` strategy
146
+ - Progress decreases from start to end
147
+
148
+ #### 4. **REWIND**
149
+ - **Description**: Samples the same trajectory with rewind direction (start < end < middle).
150
+ - **Purpose**: Learn to handle non-monotonic progress patterns.
151
+ - **Implementation**:
152
+ - Uses same trajectory with `subsample_rewind` strategy
153
+ - Progress pattern: increases, then decreases
154
+
155
+ ### Strategy Selection
156
+ - Strategies are selected probabilistically based on `progress_strategy_ratio` configuration
157
+ - Probabilities are rebalanced when strategies fail
158
+ - Failed strategies are immediately removed (no retry count threshold)
159
+ - Maximum 10 total attempts per sample generation
160
+
161
+ ---
162
+
163
+ ## Common Features
164
+
165
+ ### Retry Logic
166
+ All samplers implement retry logic with:
167
+ - Maximum attempt limits (typically 10 attempts)
168
+ - Strategy-specific retry counts (3-4 attempts per strategy)
169
+ - Automatic strategy removal after consecutive failures
170
+ - Probability rebalancing when strategies are removed
171
+
172
+ ### Subsample Strategies
173
+ Common subsampling strategies used across samplers:
174
+ - `subsample_forward`: Normal forward sampling (start → end)
175
+ - `subsample_reverse`: Reverse sampling (end → start)
176
+ - `subsample_rewind`: Rewind sampling (start → end → start)
177
+
178
+ ### Data Source Filtering
179
+ - Strategies may be filtered or boosted based on data source categories:
180
+ - **Failure category**: Boosts SUBOPTIMAL strategy probability by 2x
181
+ - **Paired category**: Boosts PAIRED_HUMAN_ROBOT strategy probability by 2x
182
+ - **RoboArena**: Special handling for `partial_success` field
samplers/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rfm.data.samplers.base import RFMBaseSampler
2
+ from rfm.data.samplers.pref import PrefSampler
3
+ from rfm.data.samplers.sim import SimSampler
4
+ from rfm.data.samplers.progress import ProgressSampler
5
+ from rfm.data.samplers.eval.confusion_matrix import ConfusionMatrixSampler
6
+ from rfm.data.samplers.eval.progress_policy_ranking import ProgressPolicyRankingSampler
7
+ from rfm.data.samplers.eval.reward_alignment import RewardAlignmentSampler
8
+ from rfm.data.samplers.eval.quality_preference import QualityPreferenceSampler
9
+ from rfm.data.samplers.eval.roboarena_quality_preference import RoboArenaQualityPreferenceSampler
10
+ from rfm.data.samplers.eval.similarity_score import SimilarityScoreSampler
11
+
12
+ __all__ = [
13
+ "RFMBaseSampler",
14
+ "PrefSampler",
15
+ "SimSampler",
16
+ "ProgressSampler",
17
+ "ConfusionMatrixSampler",
18
+ "ProgressPolicyRankingSampler",
19
+ "RewardAlignmentSampler",
20
+ "QualityPreferenceSampler",
21
+ "RoboArenaQualityPreferenceSampler",
22
+ "SimilarityScoreSampler",
23
+ ]
samplers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
samplers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.2 kB). View file
 
samplers/__pycache__/base.cpython-310.pyc ADDED
Binary file (19.8 kB). View file
 
samplers/__pycache__/base.cpython-311.pyc ADDED
Binary file (33.1 kB). View file
 
samplers/__pycache__/confusion_matrix.cpython-310.pyc ADDED
Binary file (4.58 kB). View file
 
samplers/__pycache__/confusion_matrix.cpython-311.pyc ADDED
Binary file (7.71 kB). View file
 
samplers/__pycache__/pref.cpython-310.pyc ADDED
Binary file (10.2 kB). View file
 
samplers/__pycache__/pref.cpython-311.pyc ADDED
Binary file (16.9 kB). View file
 
samplers/__pycache__/progress.cpython-310.pyc ADDED
Binary file (5.11 kB). View file
 
samplers/__pycache__/progress.cpython-311.pyc ADDED
Binary file (8.45 kB). View file
 
samplers/__pycache__/progress_default.cpython-310.pyc ADDED
Binary file (3.98 kB). View file
 
samplers/__pycache__/progress_default.cpython-311.pyc ADDED
Binary file (6.48 kB). View file
 
samplers/__pycache__/quality_preference.cpython-310.pyc ADDED
Binary file (5.49 kB). View file
 
samplers/__pycache__/quality_preference.cpython-311.pyc ADDED
Binary file (9.6 kB). View file
 
samplers/__pycache__/reward_alignment.cpython-310.pyc ADDED
Binary file (4.5 kB). View file
 
samplers/__pycache__/reward_alignment.cpython-311.pyc ADDED
Binary file (7.15 kB). View file
 
samplers/__pycache__/roboarena.cpython-310.pyc ADDED
Binary file (4.82 kB). View file
 
samplers/__pycache__/sim.cpython-310.pyc ADDED
Binary file (11.6 kB). View file
 
samplers/__pycache__/sim.cpython-311.pyc ADDED
Binary file (19.6 kB). View file
 
samplers/__pycache__/success_failure.cpython-310.pyc ADDED
Binary file (4.27 kB). View file
 
samplers/__pycache__/success_failure.cpython-311.pyc ADDED
Binary file (7.05 kB). View file
 
samplers/base.py ADDED
@@ -0,0 +1,753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from typing import Optional, Dict, Any, List, Set, Tuple, Union
3
+
4
+ import numpy as np
5
+ import random
6
+ import torch
7
+ from random import Random
8
+ from datasets import Dataset
9
+
10
+ from rfm.configs.experiment_configs import DataConfig
11
+ from rfm.data.datasets.helpers import (
12
+ load_frames_from_npz,
13
+ get_segment_indices_with_middle,
14
+ compute_progress_from_segment,
15
+ pad_trajectory_to_max_frames_torch,
16
+ pad_trajectory_to_max_frames_np,
17
+ compute_success_labels,
18
+ create_trajectory_from_dict,
19
+ load_embeddings_from_path,
20
+ linspace_subsample_frames,
21
+ convert_continuous_to_discrete_bins,
22
+ )
23
+ from rfm.data.dataset_types import Trajectory
24
+ from rfm.utils.logger import get_logger
25
+
26
+ logger = get_logger()
27
+
28
+
29
+ class RFMBaseSampler:
30
+ """Base sampler class that provides trajectory retrieval functions for generating samples."""
31
+
32
+ def __init__(
33
+ self,
34
+ config: DataConfig,
35
+ dataset: Dataset,
36
+ combined_indices: Dict[str, Any],
37
+ dataset_success_cutoff_map: Optional[Dict[str, float]] = None,
38
+ verbose: bool = True,
39
+ random_seed: int = 42,
40
+ ):
41
+ """Initialize sampler with dataset and indices.
42
+
43
+ Args:
44
+ config: Configuration object
45
+ dataset: The loaded dataset
46
+ combined_indices: Dictionary of combined indices from dataset loading
47
+ dataset_success_cutoff_map: Dictionary mapping dataset names to success cutoff percentages
48
+ verbose: Verbose flag
49
+ random_seed: Random seed for deterministic sampling. Creates a local Random instance to avoid affecting global random state.
50
+ """
51
+ self.config = config
52
+ self.dataset = dataset
53
+ self.verbose = verbose
54
+ self.dataset_success_cutoff_map = dataset_success_cutoff_map or {}
55
+ self._local_random = Random(random_seed)
56
+
57
+ self._cached_ids = self.dataset["id"]
58
+ self._cached_is_robot = self.dataset["is_robot"]
59
+
60
+ # Build indices from combined_indices
61
+ self._build_indices(combined_indices)
62
+
63
+ def _build_indices(self, combined_indices):
64
+ """Build all index mappings from combined_indices.
65
+
66
+ Args:
67
+ combined_indices: Dictionary of combined indices from dataset loading
68
+ """
69
+ # Initialize index mappings from the loaded indices
70
+ self.robot_trajectories = combined_indices["robot_trajectories"]
71
+ self.human_trajectories = combined_indices["human_trajectories"]
72
+ self.optimal_by_task = combined_indices["optimal_by_task"]
73
+ self.suboptimal_by_task = combined_indices["suboptimal_by_task"]
74
+ self.quality_indices = combined_indices["quality_indices"]
75
+ self.task_indices = combined_indices["task_indices"]
76
+ self.source_indices = combined_indices["source_indices"]
77
+ self.partial_success_indices = combined_indices["partial_success_indices"]
78
+ self.paired_human_robot_by_task = combined_indices["paired_human_robot_by_task"]
79
+ self.tasks_with_multiple_quality_labels = combined_indices["tasks_with_multiple_quality_labels"]
80
+
81
+ # Build mapping from data source -> available task instructions
82
+ self._build_tasks_by_data_source()
83
+
84
+ def _build_tasks_by_data_source(self):
85
+ """Cache mapping from data_source to available task instructions."""
86
+ self.tasks_by_data_source: Dict[str, List[str]] = {}
87
+
88
+ all_tasks = self.dataset["task"]
89
+ all_sources = self.dataset["data_source"]
90
+
91
+ source_to_tasks: Dict[str, Set[str]] = {}
92
+ for task, source in zip(all_tasks, all_sources):
93
+ if task is None or source is None:
94
+ continue
95
+ if source not in source_to_tasks:
96
+ source_to_tasks[source] = set()
97
+ source_to_tasks[source].add(task)
98
+
99
+ self.tasks_by_data_source = {source: list(tasks) for source, tasks in source_to_tasks.items()}
100
+
101
+ def _generate_sample(self, item):
102
+ """Generate a sample from an item.
103
+
104
+ This method should be overridden by subclasses to implement their specific
105
+ sample generation logic.
106
+
107
+ Args:
108
+ item: An item from the dataset (typically a trajectory dict)
109
+
110
+ Returns:
111
+ A sample object (e.g., PreferenceSample, SimilaritySample, ProgressSample)
112
+ """
113
+ raise NotImplementedError("Subclasses must implement _generate_sample")
114
+
115
+ def _get_same_task_optimal(self, ref_traj: dict) -> dict | None:
116
+ """Get optimal trajectory from same task (different from ref).
117
+
118
+ Args:
119
+ ref_traj: Reference trajectory
120
+
121
+ Returns:
122
+ Same task optimal trajectory dict or None if not available
123
+ """
124
+ task_name = ref_traj["task"]
125
+ same_task_optimal_indices = self.optimal_by_task.get(task_name, [])
126
+ if not same_task_optimal_indices:
127
+ logger.trace(f"[BASE SAMPLER] _get_same_task_optimal: No optimal indices for task '{task_name}'")
128
+ return None
129
+
130
+ # Use cached IDs to check without loading full trajectories
131
+ chosen_id = ref_traj["id"]
132
+ random_idx = random.choice(same_task_optimal_indices)
133
+
134
+ # Retry if the selected trajectory has the same ID as ref
135
+ max_retries = min(10, len(same_task_optimal_indices))
136
+ retries = 0
137
+ while self._cached_ids[random_idx] == chosen_id and retries < max_retries:
138
+ random_idx = random.choice(same_task_optimal_indices)
139
+ retries += 1
140
+
141
+ # If still matches after retries, fall back to filtering
142
+ if self._cached_ids[random_idx] == chosen_id:
143
+ filtered_indices = [idx for idx in same_task_optimal_indices if self._cached_ids[idx] != chosen_id]
144
+ if filtered_indices:
145
+ random_idx = random.choice(filtered_indices)
146
+ else:
147
+ # No other trajectories available
148
+ logger.trace(
149
+ f"[BASE SAMPLER] _get_same_task_optimal: All trajectories have same ID '{chosen_id}' for task '{task_name}'"
150
+ )
151
+ return None
152
+
153
+ result = self.dataset[random_idx]
154
+ logger.trace(
155
+ f"[BASE SAMPLER] _get_same_task_optimal: Found trajectory {result.get('id', 'unknown')} for task '{task_name}'"
156
+ )
157
+ return result
158
+
159
+ def _get_same_task_suboptimal(self, ref_traj: dict) -> dict | None:
160
+ """Get suboptimal trajectory from same task.
161
+
162
+ For trajectories with partial_success, uses partial_success logic instead of quality_label logic.
163
+
164
+ Args:
165
+ ref_traj: Reference trajectory
166
+
167
+ Returns:
168
+ Suboptimal trajectory dict or None if not available
169
+ """
170
+ # Check if this trajectory uses partial_success
171
+ use_partial_success = ref_traj.get("partial_success") is not None
172
+
173
+ if use_partial_success:
174
+ # For trajectories with partial_success, use partial_success logic
175
+ return self._get_different_partial_success_traj(ref_traj)
176
+
177
+ # For trajectories without partial_success, use the standard suboptimal logic
178
+ task_name = ref_traj["task"]
179
+ same_task_suboptimal_indices = self.suboptimal_by_task.get(task_name, [])
180
+ if not same_task_suboptimal_indices:
181
+ logger.trace(f"[BASE SAMPLER] _get_same_task_suboptimal: No suboptimal indices for task '{task_name}'")
182
+ return None
183
+
184
+ # Use cached IDs to check without loading full trajectories
185
+ chosen_id = ref_traj["id"]
186
+ random_idx = random.choice(same_task_suboptimal_indices)
187
+
188
+ # Retry if the selected trajectory has the same ID as ref
189
+ max_retries = min(10, len(same_task_suboptimal_indices))
190
+ retries = 0
191
+ while self._cached_ids[random_idx] == chosen_id and retries < max_retries:
192
+ random_idx = random.choice(same_task_suboptimal_indices)
193
+ retries += 1
194
+
195
+ # If still matches after retries, fall back to filtering
196
+ if self._cached_ids[random_idx] == chosen_id:
197
+ filtered_indices = [idx for idx in same_task_suboptimal_indices if self._cached_ids[idx] != chosen_id]
198
+ if filtered_indices:
199
+ random_idx = random.choice(filtered_indices)
200
+ else:
201
+ # No other trajectories available
202
+ logger.trace(
203
+ f"[BASE SAMPLER] _get_same_task_suboptimal: All trajectories have same ID '{chosen_id}' for task '{task_name}'"
204
+ )
205
+ return None
206
+
207
+ result = self.dataset[random_idx]
208
+ logger.trace(
209
+ f"[BASE SAMPLER] _get_same_task_suboptimal: Found trajectory {result.get('id', 'unknown')} for task '{task_name}'"
210
+ )
211
+ return result
212
+
213
+ def _get_different_video_traj(self, ref_traj: dict) -> dict | None:
214
+ """Get trajectory from different task.
215
+
216
+ Args:
217
+ ref_traj: Reference trajectory
218
+
219
+ Returns:
220
+ Different task trajectory dict or None if not available
221
+ """
222
+ same_source_prob = self.config.traj_same_source_prob
223
+ data_source = ref_traj.get("data_source")
224
+ other_tasks = []
225
+
226
+ if data_source and data_source in self.tasks_by_data_source and random.random() < same_source_prob:
227
+ other_tasks = [task for task in self.tasks_by_data_source[data_source] if task != ref_traj["task"]]
228
+
229
+ if not other_tasks:
230
+ other_tasks = [task for task in self.optimal_by_task.keys() if task != ref_traj["task"]]
231
+
232
+ if not other_tasks:
233
+ logger.trace(
234
+ f"[BASE SAMPLER] _get_different_video_traj: No other tasks available (ref task: '{ref_traj['task']}')"
235
+ )
236
+ return None
237
+
238
+ # Try up to 2 times to find a valid task
239
+ max_retries = 2
240
+ other_task_indices = None
241
+ other_task = None
242
+
243
+ for attempt in range(max_retries):
244
+ other_task = random.choice(other_tasks)
245
+ if other_task not in self.optimal_by_task:
246
+ logger.trace(
247
+ f"[BASE SAMPLER] _get_different_video_traj: Attempt {attempt + 1}/{max_retries}: Task '{other_task}' not found in optimal_by_task"
248
+ )
249
+ continue
250
+
251
+ other_task_indices = self.optimal_by_task[other_task]
252
+ if not other_task_indices:
253
+ logger.trace(
254
+ f"[BASE SAMPLER] _get_different_video_traj: Attempt {attempt + 1}/{max_retries}: Task '{other_task}' has no optimal indices"
255
+ )
256
+ continue
257
+
258
+ # Found a valid task with indices
259
+ break
260
+
261
+ if other_task_indices is None or not other_task_indices:
262
+ logger.trace(
263
+ f"[BASE SAMPLER] _get_different_video_traj: Failed to find valid task after {max_retries} attempts"
264
+ )
265
+ return None
266
+
267
+ other_idx = random.choice(other_task_indices)
268
+ result = self.dataset[other_idx]
269
+ logger.trace(
270
+ f"[BASE SAMPLER] _get_different_video_traj: Found trajectory {result.get('id', 'unknown')} from task '{other_task}'"
271
+ )
272
+ return result
273
+
274
+ def _get_different_task_instruction(self, ref_traj: dict) -> dict | None:
275
+ """Get the same trajectory but with a different task instruction.
276
+
277
+ Args:
278
+ ref_traj: Reference trajectory
279
+
280
+ Returns:
281
+ Trajectory dict with different task instruction or None if not available
282
+ """
283
+ same_source_prob = self.config.traj_same_source_prob
284
+ data_source = ref_traj.get("data_source")
285
+ candidate_tasks = []
286
+
287
+ if data_source and data_source in self.tasks_by_data_source and random.random() < same_source_prob:
288
+ candidate_tasks = [task for task in self.tasks_by_data_source[data_source] if task != ref_traj["task"]]
289
+
290
+ if not candidate_tasks:
291
+ candidate_tasks = [task for task in self.optimal_by_task.keys() if task != ref_traj["task"]]
292
+
293
+ if not candidate_tasks:
294
+ logger.trace(
295
+ f"[BASE SAMPLER] _get_different_task_instruction: No candidate tasks available (ref task: '{ref_traj['task']}')"
296
+ )
297
+ return None
298
+
299
+ other_task = random.choice(candidate_tasks)
300
+
301
+ # Get embeddings_path and lang_vector from a random trajectory with the other_task
302
+ other_task_indices = self.optimal_by_task.get(other_task, [])
303
+ if not other_task_indices:
304
+ logger.trace(f"[BASE SAMPLER] _get_different_task_instruction: Task '{other_task}' has no optimal indices")
305
+ return None
306
+
307
+ other_task_idx = random.choice(other_task_indices)
308
+ other_task_traj = self.dataset[other_task_idx]
309
+
310
+ # Create a copy of the trajectory with the task changed
311
+ # Use embeddings_path and lang_vector from the other_task trajectory
312
+ new_traj = ref_traj.copy()
313
+ new_traj["task"] = other_task
314
+ # Get embeddings_path and lang_vector from a random trajectory with the other_task
315
+ if "embeddings_path" in other_task_traj:
316
+ new_traj["embeddings_path"] = other_task_traj["embeddings_path"]
317
+ if "lang_vector" in other_task_traj:
318
+ new_traj["lang_vector"] = other_task_traj["lang_vector"]
319
+ return new_traj
320
+
321
+ def _get_paired_human_robot_traj(self, ref_traj: dict) -> dict | None:
322
+ """Get paired human/robot trajectory for the same task.
323
+
324
+ Given a reference trajectory, if it's a robot trajectory, returns a human trajectory
325
+ from the same task. If it's a human trajectory, returns a robot trajectory from the
326
+ same task.
327
+
328
+ Args:
329
+ ref_traj: Reference trajectory (can be robot or human)
330
+
331
+ Returns:
332
+ Paired trajectory dict (opposite type) or None if not available
333
+ """
334
+ task = ref_traj["task"]
335
+ is_robot = ref_traj.get("is_robot", True)
336
+
337
+ if task not in self.paired_human_robot_by_task:
338
+ logger.trace(
339
+ f"[BASE SAMPLER] _get_paired_human_robot_traj: Task '{task}' not in paired_human_robot_by_task"
340
+ )
341
+ return None
342
+
343
+ task_pairs = self.paired_human_robot_by_task[task]
344
+
345
+ # Get opposite type
346
+ opposite_key = "human" if is_robot else "robot"
347
+ opposite_indices = task_pairs.get(opposite_key, [])
348
+
349
+ if not opposite_indices:
350
+ logger.trace(f"[BASE SAMPLER] _get_paired_human_robot_traj: No {opposite_key} indices for task '{task}'")
351
+ return None
352
+
353
+ # Sample a paired trajectory and verify it's different from reference
354
+ chosen_id = ref_traj["id"]
355
+ available_indices = opposite_indices.copy()
356
+ paired_traj = None
357
+
358
+ # Add retry limit to prevent infinite loops
359
+ max_retries = min(len(available_indices), 10)
360
+ retries = 0
361
+
362
+ logger.trace(
363
+ f"[BASE SAMPLER] _get_paired_human_robot_traj: Looking for {opposite_key} trajectory (chosen_id: {chosen_id}, available: {len(available_indices)})"
364
+ )
365
+
366
+ while (paired_traj is None or paired_traj.get("id") == chosen_id) and retries < max_retries:
367
+ retries += 1
368
+
369
+ if not available_indices:
370
+ logger.trace(
371
+ f"[BASE SAMPLER] _get_paired_human_robot_traj: No more available indices after {retries} retries"
372
+ )
373
+ return None
374
+
375
+ paired_idx = random.choice(available_indices)
376
+ paired_traj = self.dataset[paired_idx]
377
+
378
+ # If it matches, remove this index and try again
379
+ if paired_traj.get("id") == chosen_id:
380
+ available_indices = [idx for idx in available_indices if idx != paired_idx]
381
+ paired_traj = None
382
+ continue
383
+
384
+ # If we exhausted retries without finding a valid trajectory, return None
385
+ if paired_traj is None or paired_traj.get("id") == chosen_id:
386
+ logger.trace(
387
+ f"[BASE SAMPLER] _get_paired_human_robot_traj: Failed to find valid paired trajectory after {max_retries} retries"
388
+ )
389
+ return None
390
+
391
+ logger.trace(
392
+ f"[BASE SAMPLER] _get_paired_human_robot_traj: Found paired trajectory {paired_traj.get('id', 'unknown')} on retry {retries}"
393
+ )
394
+ return paired_traj
395
+
396
+ def _get_different_partial_success_traj(self, ref_traj: dict) -> dict | None:
397
+ """Get trajectory from same task with different partial_success.
398
+
399
+ Finds trajectories with either higher or lower partial_success than the reference,
400
+ using absolute difference for threshold checking.
401
+
402
+ Args:
403
+ ref_traj: Reference trajectory
404
+
405
+ Returns:
406
+ Trajectory dict with different partial_success from same task or None if not available
407
+ """
408
+ task_name = ref_traj["task"]
409
+ ref_partial_success = ref_traj.get("partial_success")
410
+
411
+ # Check if partial_success is available
412
+ if ref_partial_success is None:
413
+ logger.trace(
414
+ f"[BASE SAMPLER] _get_different_partial_success_traj: No partial_success for trajectory {ref_traj.get('id', 'unknown')}"
415
+ )
416
+ return None
417
+
418
+ # Get minimum threshold from config
419
+ min_threshold = getattr(self.config, "partial_success_threshold", 0.2)
420
+
421
+ # Get all trajectories from the same task
422
+ same_task_indices = self.task_indices.get(task_name, [])
423
+ if not same_task_indices:
424
+ logger.trace(
425
+ f"[BASE SAMPLER] _get_different_partial_success_traj: No trajectories found for task '{task_name}'"
426
+ )
427
+ return None
428
+
429
+ # Filter to trajectories with different partial_success that meet the threshold requirement
430
+ # Uses absolute difference to allow both higher and lower partial_success
431
+ chosen_id = ref_traj["id"]
432
+ candidate_indices = []
433
+
434
+ for idx in same_task_indices:
435
+ # Skip if same trajectory
436
+ if self._cached_ids[idx] == chosen_id:
437
+ continue
438
+
439
+ # Get partial_success for this trajectory
440
+ traj_dict = self.dataset[idx]
441
+ traj_partial_success = traj_dict.get("partial_success", None)
442
+
443
+ if traj_partial_success is None:
444
+ logger.trace(
445
+ f"[BASE SAMPLER] _get_different_partial_success_traj: No partial_success for trajectory {traj_dict.get('id', 'unknown')}, task '{task_name}'"
446
+ )
447
+ continue
448
+
449
+ # Include if partial_success differs from reference by at least the threshold (using abs)
450
+ partial_success_diff = abs(ref_partial_success - traj_partial_success)
451
+ if partial_success_diff >= min_threshold:
452
+ candidate_indices.append(idx)
453
+
454
+ if not candidate_indices:
455
+ logger.trace(
456
+ f"[BASE SAMPLER] _get_different_partial_success_traj: No trajectories with different partial_success (threshold: {min_threshold}) for task '{task_name}' (ref: {ref_partial_success})"
457
+ )
458
+ return None
459
+
460
+ # Randomly select from candidates
461
+ selected_idx = random.choice(candidate_indices)
462
+ result = self.dataset[selected_idx]
463
+ result_partial_success = result.get("partial_success")
464
+ # If ref_partial_success is 1.0, direction is always "lower" since 1.0 is the maximum
465
+ if ref_partial_success == 1.0:
466
+ direction = "lower"
467
+ else:
468
+ direction = "higher" if result_partial_success > ref_partial_success else "lower"
469
+ logger.trace(
470
+ f"[BASE SAMPLER] _get_different_partial_success_traj: Found trajectory {result.get('id', 'unknown')} with partial_success {result_partial_success} ({direction} than {ref_partial_success}, abs diff: {abs(ref_partial_success - result_partial_success):.3f}, threshold: {min_threshold})"
471
+ )
472
+ return result
473
+
474
+ def _get_subsample_indices(
475
+ self, data, direction: str = "bidirectional", max_frames: int = None
476
+ ) -> Optional[Tuple[int, int, int]]:
477
+ """Get start, middle, and end indices for subsample strategy.
478
+
479
+ Samples three random frames from the trajectory. The relationship between indices
480
+ follows three main scenarios:
481
+ 1. start < middle < end: forward progress - normal forward progression through trajectory
482
+ 2. start < end < middle: rewind progress - forward from start to end, then continues to middle (simulating rewind/backtrack)
483
+ 3. end < middle < start: reverse progress - backward from start through middle to end (full backward traversal)
484
+
485
+ Args:
486
+ data: Trajectory data (frames or embeddings) to sample from
487
+ direction: Sampling direction - "forward" (start < middle < end),
488
+ "reverse" (end < middle < start),
489
+ "rewind" (start < end < middle),
490
+ or "bidirectional" (any of the 3 orderings)
491
+ max_frames: Maximum number of frames to subsample. If 1, returns only start. If 2, returns start and end.
492
+
493
+ Returns:
494
+ Tuple of (start_idx, middle_idx, end_idx), or None if insufficient frames
495
+ For max_frames == 1: returns (start_idx, None, None)
496
+ For max_frames == 2: returns (start_idx, None, end_idx)
497
+ """
498
+ num_frames_total = len(data) if hasattr(data, "__len__") else data.shape[0]
499
+
500
+ # Handle edge cases for max_frames == 1 or 2
501
+ if max_frames == 1:
502
+ # Randomly sample 1 frame
503
+ random_idx = random.randint(0, num_frames_total - 1)
504
+ logger.trace(f"[BASE SAMPLER] _get_subsample_indices: max_frames=1, randomly sampled idx={random_idx}")
505
+ return (random_idx, None, None)
506
+
507
+ if max_frames == 2:
508
+ # Sample 2 frames: either forward (start < end) or reverse (end < start)
509
+ # No rewind possible with only 2 frames
510
+ if direction == "reverse":
511
+ # Reverse: sample end first, then start (end < start)
512
+ end_idx = random.randint(0, num_frames_total - 2)
513
+ start_idx = random.randint(end_idx + 1, num_frames_total - 1)
514
+ else:
515
+ # Forward: sample start first, then end (start < end)
516
+ start_idx = random.randint(0, num_frames_total - 2)
517
+ end_idx = random.randint(start_idx + 1, num_frames_total - 1)
518
+ logger.trace(
519
+ f"[BASE SAMPLER] _get_subsample_indices: max_frames=2, start_idx={start_idx}, end_idx={end_idx}, direction={direction}"
520
+ )
521
+ return (start_idx, None, end_idx)
522
+
523
+ if num_frames_total < 3:
524
+ logger.trace(f"[BASE SAMPLER] _get_subsample_indices: Not enough frames ({num_frames_total})")
525
+ return None
526
+
527
+ # Sample three random distinct frames
528
+ frame_indices = sorted(random.sample(range(num_frames_total), 3))
529
+ frame1_idx, frame2_idx, frame3_idx = frame_indices
530
+
531
+ # Determine start, middle, and end based on direction
532
+ # We only care about 3 cases:
533
+ # 1. start < middle < end: forward progress
534
+ # 2. start < end < middle: rewind progress
535
+ # 3. end < middle < start: reverse progress
536
+
537
+ if direction == "forward":
538
+ # Case 1: start < middle < end
539
+ start_idx = frame1_idx
540
+ middle_idx = frame2_idx
541
+ end_idx = frame3_idx
542
+ elif direction == "reverse":
543
+ # Case 3: end < middle < start
544
+ end_idx = frame1_idx
545
+ middle_idx = frame2_idx
546
+ start_idx = frame3_idx
547
+ elif direction == "rewind":
548
+ # Case 2: start < end < middle
549
+ start_idx = frame1_idx
550
+ end_idx = frame2_idx
551
+ middle_idx = frame3_idx
552
+ else: # bidirectional (default)
553
+ # Randomly choose from the 3 cases
554
+ pattern = random.choice([1, 2, 3])
555
+ if pattern == 1: # start < middle < end: forward progress
556
+ start_idx = frame1_idx
557
+ middle_idx = frame2_idx
558
+ end_idx = frame3_idx
559
+ elif pattern == 2: # start < end < middle: rewind progress
560
+ start_idx = frame1_idx
561
+ end_idx = frame2_idx
562
+ middle_idx = frame3_idx
563
+ else: # pattern == 3: end < middle < start: reverse progress
564
+ end_idx = frame1_idx
565
+ middle_idx = frame2_idx
566
+ start_idx = frame3_idx
567
+
568
+ logger.trace(
569
+ f"[BASE SAMPLER] _get_subsample_indices: Selected indices start={start_idx}, middle={middle_idx}, end={end_idx} "
570
+ f"from {num_frames_total} total frames (direction: {direction})"
571
+ )
572
+ return start_idx, middle_idx, end_idx
573
+
574
    def _get_traj_from_data(
        self,
        traj: dict | Trajectory,
        subsample_strategy: str | None = None,
        frame_indices: List[int] | None = None,
        metadata: Dict[str, Any] | None = None,
    ) -> Trajectory:
        """Load, subsample, and optionally pad trajectory data and create a Trajectory object.

        Pipeline: load frames or precomputed embeddings, choose frame indices
        (explicit list, subsample strategy, or all frames), compute per-frame
        progress targets, uniformly downsample to max_frames, pad, compute
        success labels, optionally discretize, and assemble a Trajectory.

        Args:
            traj: Trajectory dict or Trajectory object
            subsample_strategy: Optional strategy for subsampling ("subsample_forward", "subsample_reverse", "subsample_rewind", or None for default/bidirectional). Ignored if frame_indices is provided.
            frame_indices: Optional list of specific frame indices to use. If provided, subsample_strategy is ignored.
            metadata: Optional metadata dict to merge into trajectory metadata.

        Returns:
            Trajectory object with loaded and subsampled data (padded), or None
            when a subsample strategy cannot produce valid indices.
            NOTE(review): the annotation says ``Trajectory`` but the strategy
            path can return None — callers should handle that case.
        """
        # Initialize variables
        frames = None
        video_embeddings = None
        text_embedding = None
        data = None

        if isinstance(traj, Trajectory):
            # If already a Trajectory, just return it
            return traj

        # Load from dict
        # Check if text_embedding is already provided in the dict (for samplers that need to override it)
        if "text_embedding" in traj and traj["text_embedding"] is not None:
            text_embedding = traj["text_embedding"]

        if self.config.load_embeddings and traj.get("embeddings_path"):
            # Embedding mode: work on precomputed video embeddings instead of raw frames.
            embeddings = load_embeddings_from_path(traj["embeddings_path"])
            video_embeddings = embeddings["video_embeddings"]
            # Only use loaded text_embedding if not already provided in dict
            if text_embedding is None:
                text_embedding = embeddings["text_embedding"]
            data = video_embeddings
        else:
            # Frame mode: "frames" may be a path to an npz archive or in-memory data.
            if isinstance(traj["frames"], str):
                frames = load_frames_from_npz(traj["frames"])
            else:
                frames = traj["frames"]
            data = frames

        # Get total frames for progress computation
        if hasattr(data, "shape"):
            num_frames_total = data.shape[0]
        else:
            num_frames_total = len(data)

        # Per-data-source success cutoff, falling back to the global maximum.
        ds_key = traj["data_source"]
        success_cutoff = self.dataset_success_cutoff_map.get(ds_key, self.config.max_success)

        # Determine which indices to use (construct indices first, then subsample uniformly)
        if frame_indices is not None:
            # Use provided frame indices directly
            indices = frame_indices
        elif subsample_strategy is not None:
            # Use subsampling strategy
            # Get subsample indices (handles edge cases for max_frames == 1 or 2)
            if subsample_strategy == "subsample_forward":
                strategy_indices = self._get_subsample_indices(
                    data, direction="forward", max_frames=self.config.max_frames
                )
            elif subsample_strategy == "subsample_reverse":
                strategy_indices = self._get_subsample_indices(
                    data, direction="reverse", max_frames=self.config.max_frames
                )
            elif subsample_strategy == "subsample_rewind":
                strategy_indices = self._get_subsample_indices(
                    data, direction="rewind", max_frames=self.config.max_frames
                )
            else:
                strategy_indices = self._get_subsample_indices(
                    data, direction="bidirectional", max_frames=self.config.max_frames
                )

            if strategy_indices is None:
                logger.trace("[BASE SAMPLER] _get_traj_from_data: Failed to get uniform sample indices")
                return None

            start_idx, middle_idx, end_idx = strategy_indices

            logger.trace(
                f"[BASE SAMPLER] _get_traj_from_data: Subsampling trajectory with strategy: {subsample_strategy}, start_idx: {start_idx}, middle_idx: {middle_idx}, end_idx: {end_idx}"
            )

            # Use middle_idx only for rewind strategy (requires at least 3 frames)
            use_middle = subsample_strategy == "subsample_rewind" and middle_idx is not None and num_frames_total >= 3

            # Use get_segment_indices_with_middle to construct indices
            indices = get_segment_indices_with_middle(
                num_frames_total=num_frames_total,
                start_idx=start_idx,
                end_idx=end_idx,
                middle_idx=middle_idx if use_middle else None,
                max_frames=self.config.max_frames,
            )
        else:
            # No subsampling strategy or indices provided - use all frames
            indices = list(range(num_frames_total))

        # Extract data using indices
        # NOTE(review): assumes `data` supports fancy indexing with a list
        # (e.g. numpy array / torch tensor); a plain Python list would not.
        subsampled = data[indices]

        # Get partial_success early to pass to compute_progress_from_segment
        partial_success = traj.get("partial_success")

        # Compute progress
        target_progress = compute_progress_from_segment(
            num_frames_total=num_frames_total,
            frame_indices=indices,
            progress_pred_type=self.config.progress_pred_type,
            success_cutoff=success_cutoff,
            partial_success=partial_success,
        )

        # Subsample uniformly if needed (if we have more frames than max_frames)
        current_frame_count = len(subsampled) if hasattr(subsampled, "__len__") else subsampled.shape[0]
        if current_frame_count > self.config.max_frames:
            subsampled, frame_indices_subsample = linspace_subsample_frames(subsampled, self.config.max_frames)
            # Update indices and target_progress
            # (only remap progress when it is per-frame aligned with the data)
            if target_progress and len(target_progress) == current_frame_count:
                target_progress = [target_progress[idx] for idx in frame_indices_subsample]
            indices = [indices[idx] for idx in frame_indices_subsample] if isinstance(indices, list) else indices

        # Pad if needed
        if target_progress:
            if self.config.load_embeddings:
                subsampled, target_progress = pad_trajectory_to_max_frames_torch(
                    subsampled, target_progress, self.config.max_frames
                )
            else:
                subsampled, target_progress = pad_trajectory_to_max_frames_np(
                    subsampled, target_progress, self.config.max_frames
                )

        # Update frames_shape
        frames_shape = subsampled.shape if hasattr(subsampled, "shape") else tuple()

        # Set frames or video_embeddings
        if self.config.load_embeddings:
            video_embeddings = subsampled
        else:
            frames = subsampled

        # Compute success labels
        success_label = compute_success_labels(
            target_progress=target_progress,
            data_source=traj["data_source"],
            dataset_success_percent=self.dataset_success_cutoff_map,
            max_success=self.config.max_success,
            quality_label=traj.get("quality_label"),
        )

        # Convert partial_success and target_progress to discrete bins if in discrete mode
        if self.config.progress_loss_type.lower() == "discrete":
            if partial_success is not None:
                partial_success = convert_continuous_to_discrete_bins(
                    [partial_success], self.config.progress_discrete_bins
                )[0]
            target_progress = convert_continuous_to_discrete_bins(target_progress, self.config.progress_discrete_bins)

        trajectory = create_trajectory_from_dict(
            traj,
            overrides={
                "frames": frames,
                "frames_shape": frames_shape,
                "video_embeddings": video_embeddings,
                "text_embedding": text_embedding,
                "target_progress": target_progress,
                "success_label": success_label,
                "partial_success": partial_success,
                "metadata": metadata,
            },
        )
        return trajectory
samplers/eval/__pycache__/base_pref.cpython-310.pyc ADDED
Binary file (2.15 kB). View file
 
samplers/eval/__pycache__/base_pref.cpython-311.pyc ADDED
Binary file (3.22 kB). View file
 
samplers/eval/__pycache__/confusion_matrix.cpython-310.pyc ADDED
Binary file (8.56 kB). View file
 
samplers/eval/__pycache__/confusion_matrix.cpython-311.pyc ADDED
Binary file (15.7 kB). View file
 
samplers/eval/__pycache__/progress_default.cpython-310.pyc ADDED
Binary file (2.74 kB). View file
 
samplers/eval/__pycache__/progress_default.cpython-311.pyc ADDED
Binary file (6.93 kB). View file
 
samplers/eval/__pycache__/progress_policy_ranking.cpython-310.pyc ADDED
Binary file (6.34 kB). View file
 
samplers/eval/__pycache__/progress_policy_ranking.cpython-311.pyc ADDED
Binary file (11.2 kB). View file
 
samplers/eval/__pycache__/quality_preference.cpython-310.pyc ADDED
Binary file (4.38 kB). View file
 
samplers/eval/__pycache__/quality_preference.cpython-311.pyc ADDED
Binary file (7.59 kB). View file
 
samplers/eval/__pycache__/reward_alignment.cpython-310.pyc ADDED
Binary file (4.24 kB). View file
 
samplers/eval/__pycache__/reward_alignment.cpython-311.pyc ADDED
Binary file (6.6 kB). View file
 
samplers/eval/__pycache__/roboarena_quality_preference.cpython-310.pyc ADDED
Binary file (2.9 kB). View file
 
samplers/eval/__pycache__/roboarena_quality_preference.cpython-311.pyc ADDED
Binary file (4.71 kB). View file
 
samplers/eval/__pycache__/similarity_score.cpython-310.pyc ADDED
Binary file (4.35 kB). View file
 
samplers/eval/__pycache__/similarity_score.cpython-311.pyc ADDED
Binary file (6.66 kB). View file
 
samplers/eval/base_pref.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+
3
+ import numpy as np
4
+
5
+ from rfm.data.dataset_types import PreferenceSample, Trajectory
6
+ from rfm.data.samplers.base import RFMBaseSampler
7
+
8
+
9
class BaseQualityPreferenceSampler(RFMBaseSampler):
    """Base class for quality preference samplers.

    Subclasses implement `_generate_all_sample_indices` to decide how
    trajectories are paired. This base class supplies the shared
    `_generate_sample_from_indices` logic that loads and processes the
    paired trajectories.
    """

    @staticmethod
    def _build_metadata(traj: Dict[str, Any]) -> Dict[str, Any]:
        # Collect the per-trajectory metadata recorded on each preference sample.
        meta = {
            "quality_label": traj["quality_label"],
            "data_source": traj["data_source"],
            "task": traj["task"],
            "id": traj["id"],
            "video_path": traj["frames"],
        }
        # Add partial_success if available
        if traj.get("partial_success") is not None:
            meta["partial_success"] = traj.get("partial_success")
        return meta

    def _generate_sample_from_indices(self, sample_idx_info: Dict[str, Any]) -> PreferenceSample:
        """Generate a single sample from stored indices."""
        chosen_traj = self.dataset[sample_idx_info["chosen_traj_idx"]]
        rejected_traj = self.dataset[sample_idx_info["rejected_traj_idx"]]

        # Load/process chosen first, then rejected (order preserved for RNG state).
        chosen_trajectory = self._get_traj_from_data(
            traj=chosen_traj,
            metadata=self._build_metadata(chosen_traj),
        )
        rejected_trajectory = self._get_traj_from_data(
            traj=rejected_traj,
            metadata=self._build_metadata(rejected_traj),
        )

        # Create preference sample
        return PreferenceSample(
            chosen_trajectory=chosen_trajectory,
            rejected_trajectory=rejected_trajectory,
            data_gen_strategy=getattr(self, "data_gen_strategy", "quality_preference"),
        )

    def __len__(self):
        return len(self.sample_indices)

    def __getitem__(self, idx):
        return self._generate_sample_from_indices(self.sample_indices[idx])
samplers/eval/confusion_matrix.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Data generator for confusion matrix analysis.
4
+ """
5
+
6
+ import random
7
+ import torch
8
+ from collections import Counter, defaultdict
9
+ from typing import Tuple
10
+
11
+ from rfm.data.dataset_types import PreferenceSample, ProgressSample
12
+ from rfm.data.samplers.base import RFMBaseSampler
13
+ from rfm.utils.distributed import rank_0_print
14
+ from sentence_transformers import SentenceTransformer
15
+
16
+
17
class ConfusionMatrixSampler(RFMBaseSampler):
    """
    Data generator that creates task-trajectory pairs for confusion matrix analysis.

    For each unique task, creates samples with each trajectory to analyze
    how well the model can distinguish between different tasks.

    If multiple data sources are present, samples N random trajectories from each data source
    and prioritizes different language instructions by randomizing the pairing order.
    """

    def __init__(self, n_trajectories_per_source: int = 5, **kwargs):
        """Initialize confusion matrix sampler.

        Args:
            n_trajectories_per_source: Number of trajectories to sample from each data source.
                If None, uses all available trajectories.
            **kwargs: Additional arguments passed to parent class.
        """
        super().__init__(**kwargs)
        self.n_trajectories_per_source = n_trajectories_per_source

        # Load sentence transformer model and precompute embeddings for all unique tasks
        self.sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
        self.sentence_model.eval()

        # Precompute language embeddings for all unique tasks
        unique_tasks = list(self.task_indices.keys())
        rank_0_print(f"Precomputing language embeddings for {len(unique_tasks)} unique tasks", verbose=self.verbose)
        # task string -> torch tensor embedding; used at sample time instead of re-encoding
        self.task_embeddings = {}
        for task in unique_tasks:
            embedding = self.sentence_model.encode(task)
            self.task_embeddings[task] = torch.tensor(embedding)
        rank_0_print(f"Precomputed {len(self.task_embeddings)} language embeddings", verbose=self.verbose)

        # Free up the model after precomputation (no longer needed)
        del self.sentence_model

        self.sample_indices = self._generate_all_sample_indices()

        # NOTE(review): unlike the other prints above, this one is not gated by
        # verbose=self.verbose — confirm whether that is intentional.
        rank_0_print(
            f"Generated {len(self.sample_indices)} confusion matrix sample indices from {len(self.robot_trajectories)} trajectories and {len(self.task_indices)} tasks"
        )

    def _generate_all_sample_indices(self) -> list[dict]:
        """Generate all possible task-trajectory pair sample indices.

        If multiple data sources exist, samples N random trajectories from each data source.
        Prioritizes different video tasks first, then prioritizes different language instructions
        when creating pairs.
        """
        sample_indices = []

        # Get unique tasks (these will be the language instructions)
        unique_lang_tasks = list(self.task_indices.keys())
        rank_0_print(f"Found {len(unique_lang_tasks)} unique language tasks: {unique_lang_tasks}", verbose=self.verbose)

        # Sample trajectories per data source (prioritizing different video tasks)
        sampled_trajectories, stats = self._sample_trajectories_by_data_source()

        rank_0_print(
            f"Processing {len(sampled_trajectories)} trajectories for confusion matrix analysis",
            verbose=self.verbose,
        )

        # Print statistics about sampled trajectories
        self._print_sampling_stats(stats)

        # Shuffle language tasks once for round-robin pairing
        # (uses the sampler-local RNG so results are reproducible per seed)
        shuffled_lang_tasks = unique_lang_tasks.copy()
        self._local_random.shuffle(shuffled_lang_tasks)

        # Create task-trajectory pairs with prioritized language instruction pairing
        video_task_count = Counter()

        for traj_idx in sampled_trajectories:
            traj = self.dataset[traj_idx]
            video_task = traj["task"]

            # # Limit the number of video trajectories for each task to 5
            # if video_task_count[video_task] >= 5:
            #     continue

            video_task_count[video_task] += 1

            # Pair this trajectory with all language tasks (shuffled for variety)
            traj_id = traj.get("id", str(traj_idx))
            for lang_task in shuffled_lang_tasks:
                sample_indices.append({
                    "traj_idx": traj_idx,
                    "lang_task": lang_task,
                    "video_task": video_task,
                    "video_path": traj["frames"],
                    "id": traj_id,
                })

        # Shuffle final sample indices to further randomize the order
        self._local_random.shuffle(sample_indices)

        # Print statistics about pairs created
        rank_0_print(f"Generated {len(sample_indices)} task-trajectory pairs", verbose=self.verbose)
        rank_0_print(f" Video tasks sampled: {dict(video_task_count)}", verbose=self.verbose)
        rank_0_print(f" Trajectories per video task: {dict(sorted(video_task_count.items()))}", verbose=self.verbose)

        return sample_indices

    def _sample_trajectories_by_data_source(self) -> Tuple[list[int], dict]:
        """Sample N random trajectories from each data source, prioritizing different video tasks.

        When sampling N trajectories, first selects one trajectory from each unique video task,
        then repeats in round-robin fashion until N trajectories are sampled.

        Returns:
            Tuple of (list of sampled trajectory indices, stats dictionary)
        """
        sampled_indices = []
        # stats["by_source"]: per-source availability/sampling counts
        # stats["by_task"]: Counter of sampled trajectories per video task
        # stats["traj_to_task"]: trajectory id -> video task (for reporting)
        stats = {
            "by_source": {},
            "by_task": Counter(),
            "traj_to_task": {},
        }

        # Group robot trajectories by data source, then by video task
        trajectories_by_source_and_task = defaultdict(lambda: defaultdict(list))
        for traj_idx in self.robot_trajectories:
            traj = self.dataset[traj_idx]
            data_source = traj.get("data_source", "unknown")
            video_task = traj.get("task", "unknown")
            trajectories_by_source_and_task[data_source][video_task].append(traj_idx)

        rank_0_print(
            f"Found {len(trajectories_by_source_and_task)} data sources: {list(trajectories_by_source_and_task.keys())}",
            verbose=self.verbose,
        )

        # Sample N trajectories from each data source, prioritizing different tasks
        for data_source, tasks_to_indices in trajectories_by_source_and_task.items():
            # Shuffle trajectories within each task for randomization
            for task in tasks_to_indices:
                self._local_random.shuffle(tasks_to_indices[task])

            # Get all unique tasks for this data source
            all_tasks = list(tasks_to_indices.keys())
            self._local_random.shuffle(all_tasks)  # Randomize task order too

            source_stats = {
                "total_available": sum(len(indices) for indices in tasks_to_indices.values()),
                "tasks_available": {task: len(indices) for task, indices in tasks_to_indices.items()},
                "tasks_sampled": Counter(),
            }

            if self.n_trajectories_per_source is None:
                # Use all available trajectories
                sampled_from_source = []
                for task, indices in tasks_to_indices.items():
                    sampled_from_source.extend(indices)
                    source_stats["tasks_sampled"][task] = len(indices)
                    stats["by_task"][task] += len(indices)

                rank_0_print(
                    f" Data source '{data_source}': Using all {len(sampled_from_source)} trajectories",
                    verbose=self.verbose,
                )
            else:
                # Sample N trajectories using round-robin to prioritize different tasks
                n_to_sample = min(self.n_trajectories_per_source, source_stats["total_available"])
                sampled_from_source = []

                # Round-robin sampling: first get one from each task, then repeat
                # (iterators walk each task's pre-shuffled trajectory list)
                task_iterators = {task: iter(indices) for task, indices in tasks_to_indices.items()}
                task_list = all_tasks.copy()
                round_idx = 0

                while len(sampled_from_source) < n_to_sample:
                    # If we've gone through all tasks once, reshuffle for next round
                    if round_idx >= len(task_list):
                        round_idx = 0
                        self._local_random.shuffle(task_list)

                    # Try to get one trajectory from current task
                    task = task_list[round_idx]
                    try:
                        traj_idx = next(task_iterators[task])
                        sampled_from_source.append(traj_idx)
                        source_stats["tasks_sampled"][task] += 1
                        stats["by_task"][task] += 1
                    except StopIteration:
                        # This task is exhausted, remove it from rotation
                        # (round_idx intentionally not advanced: it now points
                        # at the next task after the pop)
                        task_list.pop(round_idx)
                        if not task_list:
                            break  # All tasks exhausted
                        continue

                    round_idx += 1

                rank_0_print(
                    f" Data source '{data_source}': Sampled {len(sampled_from_source)} out of {source_stats['total_available']} trajectories",
                    verbose=self.verbose,
                )
                rank_0_print(
                    f" Tasks sampled: {dict(sorted(source_stats['tasks_sampled'].items()))}",
                    verbose=self.verbose,
                )

            # Track trajectory to task mapping for stats
            for traj_idx in sampled_from_source:
                traj = self.dataset[traj_idx]
                traj_id = traj.get("id", str(traj_idx))
                stats["traj_to_task"][traj_id] = traj.get("task", "unknown")

            sampled_indices.extend(sampled_from_source)
            stats["by_source"][data_source] = source_stats

        return sampled_indices, stats

    def _print_sampling_stats(self, stats: dict):
        """Print detailed statistics about sampled trajectories.

        Args:
            stats: Statistics dictionary from _sample_trajectories_by_data_source
        """
        if not self.verbose:
            return

        rank_0_print("\n=== Confusion Matrix Sampling Statistics ===", verbose=self.verbose)

        # Overall task statistics
        rank_0_print(f"\nOverall trajectories per video task:", verbose=self.verbose)
        for task, count in sorted(stats["by_task"].items()):
            rank_0_print(f" {task}: {count} trajectories", verbose=self.verbose)

        # Per data source statistics
        rank_0_print(f"\nPer data source breakdown:", verbose=self.verbose)
        for data_source, source_stats in stats["by_source"].items():
            rank_0_print(f" Data source: {data_source}", verbose=self.verbose)
            rank_0_print(f" Total available: {source_stats['total_available']}", verbose=self.verbose)
            rank_0_print(f" Tasks available: {len(source_stats['tasks_available'])}", verbose=self.verbose)
            for task, count in sorted(source_stats['tasks_available'].items()):
                sampled_count = source_stats['tasks_sampled'].get(task, 0)
                rank_0_print(
                    f" {task}: {sampled_count}/{count} trajectories sampled",
                    verbose=self.verbose,
                )

        rank_0_print("=" * 50, verbose=self.verbose)

    def _generate_sample_from_indices(self, sample_idx_info: dict) -> PreferenceSample:
        """Generate a single task-trajectory sample from stored indices."""
        traj_idx = sample_idx_info["traj_idx"]
        lang_task = sample_idx_info["lang_task"]
        video_task = sample_idx_info["video_task"]
        video_path = sample_idx_info["video_path"]

        video_traj = self.dataset[traj_idx]

        # Look up precomputed embedding instead of encoding
        text_embedding = self.task_embeddings[lang_task]

        metadata = {
            "id": video_traj["id"],
            "lang_task": lang_task,
            "video_task": video_task,
            "video_path": video_path,
        }

        # Override task and text_embedding in the trajectory dict
        # (the pairing deliberately mismatches language task vs. video task)
        video_traj_with_task = video_traj.copy()
        video_traj_with_task["task"] = lang_task
        video_traj_with_task["text_embedding"] = text_embedding

        sample_trajectory = self._get_traj_from_data(
            traj=video_traj_with_task,
            metadata=metadata,
        )

        sample = ProgressSample(trajectory=sample_trajectory)
        return sample

    def __len__(self):
        return len(self.sample_indices)

    def __getitem__(self, idx):
        return self._generate_sample_from_indices(self.sample_indices[idx])
samplers/eval/progress_policy_ranking.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any, Optional
2
+ from itertools import cycle
3
+
4
+ import numpy as np
5
+ from collections import defaultdict
6
+ from rfm.data.dataset_types import ProgressSample
7
+ from rfm.data.samplers.base import RFMBaseSampler
8
+ from rfm.utils.logger import get_logger
9
+
10
+ logger = get_logger()
11
+
12
+
13
class ProgressPolicyRankingSampler(RFMBaseSampler):
    """Dataset that generates progress samples for policy ranking by selecting N
    trajectories per quality label for tasks with multiple quality labels.

    For non-RoboArena data the grouping key is ``quality_label``; for RoboArena
    data (detected via a non-None ``partial_success`` field) the key is the
    rounded ``partial_success`` value and trajectories are drawn circularly
    across the groups.
    """

    def __init__(
        self,
        num_examples_per_quality_pr: int = 5,
        num_partial_successes: Optional[int] = None,
        frame_step: int = 1,
        use_frame_steps: bool = True,
        max_tasks: Optional[int] = None,
        **kwargs,
    ):
        """Initialize the sampler and precompute all sample indices.

        Args:
            num_examples_per_quality_pr: Max trajectories sampled per quality
                label per task (non-RoboArena mode).
            num_partial_successes: Total trajectories sampled per task via
                circular sampling across partial_success groups (RoboArena
                mode). ``None`` means no limit (all trajectories are taken).
            frame_step: Stride between successive prefix lengths when
                ``use_frame_steps`` is True.
            use_frame_steps: If True, emit one sample per growing frame prefix
                (0:frame_step, 0:2*frame_step, ...); otherwise one sample per
                whole trajectory.
            max_tasks: If set (> 0), randomly limit the number of eligible
                tasks.
            **kwargs: Forwarded to :class:`RFMBaseSampler`.
        """
        super().__init__(**kwargs)

        self.num_examples_per_quality_pr = num_examples_per_quality_pr
        self.num_partial_successes = num_partial_successes
        self.frame_step = frame_step
        self.use_frame_steps = use_frame_steps
        self.max_tasks = max_tasks
        logger.info(f"ProgressPolicyRankingSampler initialized with {len(self.robot_trajectories)} trajectories")

        self.sample_indices = self._generate_all_sample_indices()

        logger.info(f"Generated {len(self.sample_indices)} sample indices")

    def _generate_all_sample_indices(self) -> List[Dict[str, Any]]:
        """Generate sample indices for tasks with multiple quality labels/partial_success values.

        For non-RoboArena: groups by task and quality_label.
        For RoboArena: groups by task and rounded partial_success values.

        If use_frame_steps=True, generates subsequence samples like
        reward_alignment (0:frame_step, 0:2*frame_step, etc.).
        If use_frame_steps=False, generates one sample per trajectory.
        """
        # Detect RoboArena data from the first trajectory.
        is_roboarena = False
        if self.robot_trajectories:
            first_traj = self.dataset[self.robot_trajectories[0]]
            is_roboarena = first_traj.get("partial_success") is not None

        # Group trajectories by task and grouping key (quality_label or partial_success).
        task_to_key_to_trajs = defaultdict(lambda: defaultdict(list))
        for traj_idx in self.robot_trajectories:
            traj = self.dataset[traj_idx]
            task = traj["task"]
            if is_roboarena:
                partial_success_val = traj.get("partial_success")
                if partial_success_val is not None:
                    # Round so near-identical scores fall into one group.
                    key = round(float(partial_success_val), 2)
                    task_to_key_to_trajs[task][key].append(traj_idx)
            else:
                task_to_key_to_trajs[task][traj["quality_label"]].append(traj_idx)

        # Keep only tasks that have more than one grouping value to rank against.
        tasks_with_multiple_values = {
            task: key_to_trajs for task, key_to_trajs in task_to_key_to_trajs.items() if len(key_to_trajs) > 1
        }

        dataset_type_str = "partial_success values" if is_roboarena else "quality labels"
        logger.info(f"Found {len(tasks_with_multiple_values)} tasks with multiple {dataset_type_str}")

        # Optionally limit the number of tasks (sorted first for determinism,
        # then shuffled with the seeded local RNG).
        if self.max_tasks is not None and self.max_tasks > 0:
            tasks_list = sorted(tasks_with_multiple_values.items())
            self._local_random.shuffle(tasks_list)
            tasks_with_multiple_values = dict(tasks_list[: self.max_tasks])
            logger.info(f"Limited to {len(tasks_with_multiple_values)} tasks (max_tasks={self.max_tasks})")

        sample_indices = []
        all_sampled_traj_indices = []
        # Sort tasks to ensure deterministic processing order.
        for task, key_to_trajs in sorted(tasks_with_multiple_values.items()):
            if is_roboarena:
                sampled_traj_indices = self._sample_roboarena_task(key_to_trajs)
            else:
                sampled_traj_indices = self._sample_quality_task(key_to_trajs)
            for traj_idx in sampled_traj_indices:
                traj = self.dataset[traj_idx]
                sample_indices.extend(self._generate_indices_for_trajectory(traj_idx, traj))
                all_sampled_traj_indices.append(traj_idx)

        logger.info(f"Sampled {len(sample_indices)} samples across {len(tasks_with_multiple_values)} tasks")
        logger.info(f"Sampled trajectory indices: {all_sampled_traj_indices}")

        return sample_indices

    def _sample_roboarena_task(self, key_to_trajs: Dict[Any, List[int]]) -> List[int]:
        """Circularly sample trajectory indices across partial_success groups for one task."""
        limit = self.num_partial_successes
        if limit is None:
            # Fix: the original compared ``len(sampled) >= None`` which raises
            # TypeError. An unset limit now means "take every trajectory".
            limit = sum(len(trajs) for trajs in key_to_trajs.values())

        # Build per-group index lists, sorted for deterministic sampling.
        available_lists = []
        for partial_success in sorted(key_to_trajs.keys()):
            traj_indices = sorted(key_to_trajs[partial_success])
            if traj_indices:
                available_lists.append(traj_indices)

        # Cycle through the groups, drawing one trajectory at a time, until the
        # limit is reached or every group is exhausted.
        sampled_traj_indices: List[int] = []
        for available_indices in cycle(available_lists):
            if len(sampled_traj_indices) >= limit:
                break
            if not available_indices:
                if all(not lst for lst in available_lists):
                    break
                continue
            sampled_idx = self._local_random.choice(available_indices)
            sampled_traj_indices.append(sampled_idx)
            # Remove so the same trajectory is not drawn twice.
            available_indices.remove(sampled_idx)
        return sampled_traj_indices

    def _sample_quality_task(self, key_to_trajs: Dict[Any, List[int]]) -> List[int]:
        """Sample up to N trajectory indices per quality label for one task."""
        sampled_traj_indices: List[int] = []
        # Sort quality labels and indices for deterministic sampling order.
        for quality in sorted(key_to_trajs.keys()):
            traj_indices = sorted(key_to_trajs[quality])
            num_to_sample = min(self.num_examples_per_quality_pr, len(traj_indices))
            sampled_traj_indices.extend(self._local_random.sample(traj_indices, num_to_sample))
        return sampled_traj_indices

    def _generate_indices_for_trajectory(self, traj_idx: int, traj: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate sample indices for a single trajectory.

        Args:
            traj_idx: Index of the trajectory in the dataset.
            traj: Trajectory dictionary.

        Returns:
            List of sample index dictionaries (one per frame prefix when
            ``use_frame_steps`` is True, otherwise a single whole-trajectory
            entry).
        """
        num_frames = traj["num_frames"]
        indices = []

        if self.use_frame_steps:
            # Growing prefixes: 0:frame_step, 0:2*frame_step, ...
            for end_idx in range(self.frame_step, num_frames + 1, self.frame_step):
                frame_indices = list(range(end_idx))
                indices.append({
                    "traj_idx": traj_idx,
                    "frame_indices": frame_indices,
                    "num_frames": num_frames,
                    "video_path": traj["frames"],
                    "id": traj["id"],
                    "use_frame_steps": True,
                })
        else:
            # One sample covering the whole trajectory.
            indices.append({
                "traj_idx": traj_idx,
                "video_path": traj["frames"],
                "id": traj["id"],
                "use_frame_steps": False,
            })

        return indices

    def _generate_sample_from_indices(self, sample_idx_info: dict) -> ProgressSample:
        """Generate a single progress sample from a stored index entry."""
        traj_idx = sample_idx_info["traj_idx"]
        use_frame_steps = sample_idx_info.get("use_frame_steps", True)

        traj = self.dataset[traj_idx]

        if use_frame_steps:
            # Frame-steps mode: build a prefix subsequence of the trajectory.
            frame_indices = sample_idx_info["frame_indices"]

            metadata = {
                "quality_label": traj["quality_label"],
                "data_source": traj["data_source"],
                "task": traj["task"],
                "id": traj["id"],
                "video_path": sample_idx_info["video_path"],
                "frame_step": frame_indices[-1] if frame_indices else 0,
            }

            trajectory = self._get_traj_from_data(
                traj=traj,
                frame_indices=frame_indices,
                metadata=metadata,
            )
        else:
            # Whole-trajectory mode.
            metadata = {
                "quality_label": traj["quality_label"],
                "data_source": traj["data_source"],
                "task": traj["task"],
                "id": traj["id"],
                "video_path": sample_idx_info["video_path"],
            }

            trajectory = self._get_traj_from_data(
                traj=traj,
                metadata=metadata,
            )

        sample = ProgressSample(trajectory=trajectory)
        return sample

    def __len__(self):
        """Number of precomputed sample-index entries."""
        return len(self.sample_indices)

    def __getitem__(self, idx):
        """Materialize the sample at position *idx* on demand."""
        return self._generate_sample_from_indices(self.sample_indices[idx])
samplers/eval/quality_preference.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any
2
+
3
+ from itertools import combinations
4
+ from tqdm import tqdm
5
+
6
+ from rfm.data.samplers.eval.base_pref import BaseQualityPreferenceSampler
7
+ from rfm.utils.distributed import rank_0_print
8
+
9
+
10
class QualityPreferenceSampler(BaseQualityPreferenceSampler):
    """Dataset that generates preference samples by pairing trajectories with different quality labels or partial_success values for the same task.

    For non-RoboArena: Pairs trajectories with different quality labels (failure, suboptimal, successful).
    For RoboArena: Pairs trajectories with different partial_success values (higher partial_success = chosen).
    """

    # Ranking used to decide which quality label is "chosen" in a pair.
    # Unknown labels map to 0 and pairs with equal rank are skipped.
    _QUALITY_ORDER = {"failure": 1, "suboptimal": 2, "successful": 3}

    def __init__(
        self,
        comparisons_per_task=None,
        max_comparisons=None,
        **kwargs,
    ):
        """Initialize the sampler and precompute all preference-pair indices.

        Args:
            comparisons_per_task: Optional cap on pairs per task; pairs are
                uniformly subsampled with the seeded local RNG.
            max_comparisons: Optional cap on the total number of pairs.
            **kwargs: Forwarded to :class:`BaseQualityPreferenceSampler`.
        """
        super().__init__(**kwargs)

        # Set data_gen_strategy for this sampler
        self.data_gen_strategy = "quality_preference"
        self.comparisons_per_task = comparisons_per_task
        self.max_comparisons = max_comparisons

        # Generate all possible sample indices upfront (not the actual samples)
        self.sample_indices = self._generate_all_sample_indices()
        rank_0_print(f"Generated {len(self.sample_indices)} quality preference sample indices", verbose=self.verbose)

    def _generate_all_sample_indices(self) -> List[Dict[str, Any]]:
        """Generate all possible quality preference sample indices (not the actual samples).

        For non-RoboArena: Groups by task and quality_label, pairs trajectories with different quality labels.
        For RoboArena: Groups by task and partial_success values, pairs trajectories with different partial_success.
        """
        # Check if this is RoboArena (has partial_success)
        is_roboarena = False
        if self.robot_trajectories:
            first_traj = self.dataset[self.robot_trajectories[0]]
            is_roboarena = first_traj.get("partial_success") is not None

        rank_0_print(
            f"Generating quality preference samples for {len(self.robot_trajectories)} trajectories "
            f"({'RoboArena (partial_success)' if is_roboarena else 'non-RoboArena (quality_label)'})",
            verbose=self.verbose,
        )

        if is_roboarena:
            sample_indices = self._roboarena_sample_indices()
        else:
            sample_indices = self._quality_label_sample_indices()

        # Apply max_comparisons limit if set (sample uniformly across all comparisons)
        original_count = len(sample_indices)
        if self.max_comparisons is not None and original_count > self.max_comparisons:
            sample_indices = self._local_random.sample(sample_indices, self.max_comparisons)
            rank_0_print(
                f"Limited total comparisons to {self.max_comparisons} (from {original_count} total comparisons)",
                verbose=self.verbose,
            )

        return sample_indices

    def _roboarena_sample_indices(self) -> List[Dict[str, Any]]:
        """Build pairs for RoboArena data: group by task and rounded partial_success."""
        task_to_partial_trajs: Dict[str, Dict[float, List[int]]] = {}

        for traj_idx in self.robot_trajectories:
            traj = self.dataset[traj_idx]
            task = traj["task"]
            partial_success_val = traj.get("partial_success")

            if partial_success_val is None:
                rank_0_print(
                    f"Warning: Trajectory {traj_idx} (task: {task}) missing partial_success, skipping",
                    verbose=self.verbose,
                )
                continue

            # Round partial_success to 2 decimals so near-identical scores group together.
            partial_success = round(float(partial_success_val), 2)
            task_to_partial_trajs.setdefault(task, {}).setdefault(partial_success, []).append(traj_idx)

        sample_indices: List[Dict[str, Any]] = []
        for task in tqdm(task_to_partial_trajs, desc="Generating RoboArena quality preference samples"):
            task_pairs = self._partial_success_pairs(task, task_to_partial_trajs[task])
            sample_indices.extend(self._limit_task_pairs(task_pairs))
        return sample_indices

    def _partial_success_pairs(self, task: str, partial_groups: Dict[float, List[int]]) -> List[Dict[str, Any]]:
        """All chosen/rejected pairs for one task across distinct partial_success groups."""
        partial_values = list(partial_groups.keys())

        # Need at least 2 different partial_success values to compare.
        if len(partial_values) < 2:
            return []

        task_pairs: List[Dict[str, Any]] = []
        for partial1, partial2 in combinations(partial_values, 2):
            # Distinct dict keys guarantee partial1 != partial2; the higher
            # partial_success side is the chosen trajectory.
            if partial1 > partial2:
                chosen_partial, rejected_partial = partial1, partial2
            else:
                chosen_partial, rejected_partial = partial2, partial1
            chosen_indices = partial_groups[chosen_partial]
            rejected_indices = partial_groups[rejected_partial]

            if not chosen_indices or not rejected_indices:
                continue

            # Create all possible cross pairs for this value combination.
            for chosen_idx in chosen_indices:
                for rejected_idx in rejected_indices:
                    task_pairs.append({
                        "chosen_traj_idx": chosen_idx,
                        "rejected_traj_idx": rejected_idx,
                        "task": task,
                        "chosen_partial_success": chosen_partial,
                        "rejected_partial_success": rejected_partial,
                    })
        return task_pairs

    def _quality_label_sample_indices(self) -> List[Dict[str, Any]]:
        """Build pairs for non-RoboArena data: group by task and quality_label."""
        task_to_quality_trajs: Dict[str, Dict[str, List[int]]] = {}

        for traj_idx in self.robot_trajectories:
            traj = self.dataset[traj_idx]
            task = traj["task"]
            quality_label = traj["quality_label"]
            task_to_quality_trajs.setdefault(task, {}).setdefault(quality_label, []).append(traj_idx)

        sample_indices: List[Dict[str, Any]] = []
        for task in tqdm(task_to_quality_trajs, desc="Generating quality preference samples"):
            task_pairs = self._quality_pairs(task, task_to_quality_trajs[task])
            sample_indices.extend(self._limit_task_pairs(task_pairs))
        return sample_indices

    def _quality_pairs(self, task: str, quality_groups: Dict[str, List[int]]) -> List[Dict[str, Any]]:
        """All chosen/rejected pairs for one task across strictly-ordered quality labels."""
        quality_labels = list(quality_groups.keys())

        # Need at least 2 different quality labels to compare.
        if len(quality_labels) < 2:
            return []

        task_pairs: List[Dict[str, Any]] = []
        for quality1, quality2 in combinations(quality_labels, 2):
            trajs1 = quality_groups[quality1]
            trajs2 = quality_groups[quality2]

            if not trajs1 or not trajs2:
                continue

            order1 = self._QUALITY_ORDER.get(quality1, 0)
            order2 = self._QUALITY_ORDER.get(quality2, 0)

            # Only pair when one quality is strictly better than the other.
            if order1 > order2:
                chosen_quality, rejected_quality = quality1, quality2
                chosen_indices, rejected_indices = trajs1, trajs2
            elif order2 > order1:
                chosen_quality, rejected_quality = quality2, quality1
                chosen_indices, rejected_indices = trajs2, trajs1
            else:
                # Equal (or both unknown) ranks cannot be compared reliably.
                continue

            # Create all possible cross pairs for this quality combination.
            for chosen_idx in chosen_indices:
                for rejected_idx in rejected_indices:
                    task_pairs.append({
                        "chosen_traj_idx": chosen_idx,
                        "rejected_traj_idx": rejected_idx,
                        "task": task,
                        "chosen_quality": chosen_quality,
                        "rejected_quality": rejected_quality,
                    })
        return task_pairs

    def _limit_task_pairs(self, task_pairs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Apply the comparisons_per_task limit, uniformly subsampling if needed."""
        if self.comparisons_per_task is not None and len(task_pairs) > self.comparisons_per_task:
            return self._local_random.sample(task_pairs, self.comparisons_per_task)
        return task_pairs
samplers/eval/reward_alignment.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Data generator for reward alignment evaluation.
4
+
5
+ This generator creates subsequence samples from trajectories for progress prediction evaluation.
6
+ For each trajectory, it creates multiple subsequences (0:2, 0:4, 0:6, etc.) and formats them
7
+ as ProgressSample objects that can be evaluated by the model.
8
+ """
9
+
10
+ from typing import Dict, List, Any
11
+
12
+ import torch
13
+ from tqdm import tqdm
14
+
15
+ from rfm.data.dataset_types import ProgressSample, Trajectory
16
+ from rfm.data.samplers.base import RFMBaseSampler
17
+ from rfm.utils.distributed import rank_0_print
18
+
19
+
20
class RewardAlignmentSampler(RFMBaseSampler):
    """Emits per-prefix progress samples for reward-alignment evaluation.

    Each trajectory is expanded into growing frame prefixes
    (0:frame_step, 0:2*frame_step, ...) when ``use_frame_steps`` is True,
    or into a single whole-trajectory sample otherwise. Each entry is
    wrapped as a ProgressSample for downstream evaluation.
    """

    def __init__(
        self,
        max_trajectories: int | None = None,
        frame_step: int = 1,
        use_frame_steps: bool = True,
        **kwargs,
    ):
        """Set up the sampler and precompute every sample index entry.

        Args:
            max_trajectories: Optional cap on how many trajectories to use
                (uniformly sampled via the seeded local RNG).
            frame_step: Stride between successive prefix lengths.
            use_frame_steps: Prefix mode (True) vs. whole-trajectory mode (False).
            **kwargs: Forwarded to RFMBaseSampler.
        """
        super().__init__(**kwargs)

        self.max_trajectories = max_trajectories
        self.frame_step = frame_step
        self.use_frame_steps = use_frame_steps
        self.sample_indices = self._generate_all_sample_indices()

        total = len(self.robot_trajectories)
        used = min(total, self.max_trajectories) if self.max_trajectories else total
        rank_0_print(
            f"Generated {len(self.sample_indices)} reward alignment sample indices from {used} trajectories",
            verbose=self.verbose,
        )

    def _generate_all_sample_indices(self) -> List[Dict[str, Any]]:
        """Enumerate every subsequence sample index (no samples are built yet)."""
        # Optionally subsample the trajectory pool.
        pool = self.robot_trajectories
        if self.max_trajectories is not None and self.max_trajectories < len(pool):
            pool = self._local_random.sample(pool, self.max_trajectories)

        rank_0_print(
            f"Generating subsequence samples for {len(pool)} trajectories", verbose=self.verbose
        )

        collected: List[Dict[str, Any]] = []
        for t_idx in pool:
            collected.extend(self._generate_indices_for_trajectory(t_idx, self.dataset[t_idx]))
        return collected

    def _generate_indices_for_trajectory(self, traj_idx: int, traj: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Build the index entries for one trajectory.

        Args:
            traj_idx: Position of the trajectory in the dataset.
            traj: The trajectory record itself.

        Returns:
            One entry per frame prefix in prefix mode, or a single
            whole-trajectory entry otherwise.
        """
        total_frames = traj["num_frames"]

        if not self.use_frame_steps:
            # Single entry spanning the entire trajectory.
            return [dict(
                traj_idx=traj_idx,
                video_path=traj["frames"],
                id=traj["id"],
                use_frame_steps=False,
            )]

        # One entry per growing prefix: 0:frame_step, 0:2*frame_step, ...
        return [
            dict(
                traj_idx=traj_idx,
                frame_indices=list(range(stop)),
                num_frames=total_frames,
                video_path=traj["frames"],
                id=traj["id"],
                use_frame_steps=True,
            )
            for stop in range(self.frame_step, total_frames + 1, self.frame_step)
        ]

    def _generate_sample_from_indices(self, sample_idx_info: dict) -> ProgressSample:
        """Materialize one ProgressSample from a stored index entry."""
        traj = self.dataset[sample_idx_info["traj_idx"]]
        prefix_mode = sample_idx_info.get("use_frame_steps", True)

        if prefix_mode:
            # Prefix mode: slice the trajectory down to the stored frame prefix.
            frame_indices = sample_idx_info["frame_indices"]
            meta = {
                "data_gen_strategy": "reward_alignment",
                "id": traj["id"],
                "video_path": sample_idx_info["video_path"],
                "frame_step": frame_indices[-1] if frame_indices else 0,
                "num_frames": sample_idx_info["num_frames"],
            }
            trajectory = self._get_traj_from_data(
                traj=traj,
                frame_indices=frame_indices,
                metadata=meta,
            )
        else:
            # Whole-trajectory mode: no frame slicing.
            meta = {
                "data_gen_strategy": "reward_alignment",
                "id": traj["id"],
                "video_path": sample_idx_info["video_path"],
            }
            trajectory = self._get_traj_from_data(
                traj=traj,
                metadata=meta,
            )

        return ProgressSample(trajectory=trajectory, sample_type="progress")

    def __len__(self):
        """Number of precomputed sample-index entries."""
        return len(self.sample_indices)

    def __getitem__(self, idx):
        """Materialize the sample at position *idx* on demand."""
        return self._generate_sample_from_indices(self.sample_indices[idx])