Anthony Liang committed on
Commit
679ca41
·
1 Parent(s): a326636

remove similarity

Browse files
Files changed (3) hide show
  1. app.py +16 -63
  2. dataset_types.py +2 -15
  3. eval_utils.py +4 -18
app.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
- Gradio app for RFM (Reward Foundation Model) inference visualization.
4
- Supports single video (progress/success) and dual video (preference/similarity) predictions.
5
  Uses eval server for inference instead of loading models locally.
6
  """
7
 
@@ -25,7 +25,7 @@ import numpy as np
25
  import requests
26
  from typing import Any, List, Optional, Tuple
27
 
28
- from dataset_types import Trajectory, ProgressSample, PreferenceSample, SimilaritySample
29
  from eval_utils import build_payload, post_batch_npy
30
  from eval_viz_utils import create_combined_progress_success_plot, extract_frames
31
  from datasets import load_dataset as load_dataset_hf, get_dataset_config_names
@@ -225,7 +225,6 @@ def format_model_info(model_info: dict) -> str:
225
  lines.append(f"- **Model Type:** `{model_cfg.get('model_type', 'N/A')}`\n")
226
  lines.append(f"- **Train Progress Head:** {model_cfg.get('train_progress_head', False)}\n")
227
  lines.append(f"- **Train Preference Head:** {model_cfg.get('train_preference_head', False)}\n")
228
- lines.append(f"- **Train Similarity Head:** {model_cfg.get('train_similarity_head', False)}\n")
229
  lines.append(f"- **Train Success Head:** {model_cfg.get('train_success_head', False)}\n")
230
  lines.append(f"- **Use PEFT:** {model_cfg.get('use_peft', False)}\n")
231
  lines.append(f"- **Use Unsloth:** {model_cfg.get('use_unsloth', False)}\n")
@@ -259,8 +258,8 @@ def format_model_info(model_info: dict) -> str:
259
  return "".join(lines)
260
 
261
 
262
- def load_rfm_dataset(dataset_name, config_name):
263
- """Load the RFM dataset from HuggingFace Hub."""
264
  try:
265
  if not dataset_name or not config_name:
266
  return None, "Please provide both dataset name and configuration"
@@ -302,7 +301,7 @@ def get_trajectory_video_path(dataset, index, dataset_name):
302
  if dataset_name:
303
  video_path = f"https://huggingface.co/datasets/{dataset_name}/resolve/main/{frames_data}"
304
  else:
305
- video_path = f"https://huggingface.co/datasets/aliangdw/rfm/resolve/main/{frames_data}"
306
 
307
  task = item.get("task", "Complete the task")
308
  quality_label = item.get("quality_label", None)
@@ -432,7 +431,7 @@ def process_two_videos(
432
  server_url: str = "",
433
  fps: float = 1.0,
434
  ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
435
- """Process two videos for preference, similarity, or progress prediction using eval server."""
436
  # Get server URL from state if not provided
437
  if not server_url:
438
  server_url = _server_state.get("server_url")
@@ -515,8 +514,6 @@ def process_two_videos(
515
 
516
  elif prediction_type == "progress":
517
  # Create ProgressSamples for both videos
518
- from dataset_types import ProgressSample
519
-
520
  progress_sample_a = ProgressSample(
521
  trajectory=trajectory_a,
522
  data_gen_strategy="demo",
@@ -554,45 +551,6 @@ def process_two_videos(
554
  else:
555
  result_text += "Could not extract progress predictions from server response.\n"
556
 
557
- elif prediction_type == "similarity":
558
- # For similarity inference, we have two videos:
559
- # - Video A as reference trajectory
560
- # - Video B as similar trajectory
561
- # diff_trajectory is None in inference mode (only need similarity between ref and sim)
562
-
563
- # Create SimilaritySample with Video A as ref and Video B as sim
564
- similarity_sample = SimilaritySample(
565
- ref_trajectory=trajectory_a,
566
- sim_trajectory=trajectory_b,
567
- diff_trajectory=None, # None in inference mode
568
- data_gen_strategy="demo",
569
- )
570
-
571
- # Build payload and send to server
572
- files, sample_data = build_payload([similarity_sample])
573
- response = post_batch_npy(server_url, files, sample_data, timeout_s=120.0)
574
-
575
- # Process response - we only care about sim_score_ref_sim (similarity between Video A and Video B)
576
- outputs_similarity = response.get("outputs_similarity", {})
577
- sim_score_ref_sim = outputs_similarity.get("sim_score_ref_sim", [])
578
-
579
- result_text = f"**Similarity Prediction:**\n"
580
- if sim_score_ref_sim and len(sim_score_ref_sim) > 0:
581
- sim_score = sim_score_ref_sim[0]
582
- if sim_score is not None:
583
- result_text += f"- Similarity score (Video A vs Video B): {sim_score:.3f}\n"
584
- # Interpret similarity score (higher = more similar)
585
- if sim_score > 0.7:
586
- result_text += f"- Interpretation: High similarity - videos are very similar\n"
587
- elif sim_score > 0.4:
588
- result_text += f"- Interpretation: Moderate similarity - videos share some similarities\n"
589
- else:
590
- result_text += f"- Interpretation: Low similarity - videos are quite different\n"
591
- else:
592
- result_text += "Could not extract similarity score from server response.\n"
593
- else:
594
- result_text += "Could not extract similarity prediction from server response.\n"
595
-
596
  # Return result text and both video paths
597
  return result_text, video_a_path, video_b_path
598
 
@@ -603,15 +561,15 @@ def process_two_videos(
603
  # Create Gradio interface
604
  try:
605
  # Try with theme (Gradio 4.0+)
606
- demo = gr.Blocks(title="RFM Evaluation Server", theme=gr.themes.Soft())
607
  except TypeError:
608
  # Fallback for older Gradio versions without theme support
609
- demo = gr.Blocks(title="RFM Evaluation Server")
610
 
611
  with demo:
612
  gr.Markdown(
613
  """
614
- # RFM (Reward Foundation Model) Evaluation Server
615
  """
616
  )
617
 
@@ -822,7 +780,7 @@ with demo:
822
 
823
  def load_dataset_single(dataset_name, config_name):
824
  """Load dataset and update slider."""
825
- dataset, status = load_rfm_dataset(dataset_name, config_name)
826
  if dataset is not None:
827
  max_index = len(dataset) - 1
828
  return (
@@ -1021,8 +979,8 @@ with demo:
1021
  api_name="process_single_video",
1022
  )
1023
 
1024
- with gr.Tab("Preference/Similarity Analysis"):
1025
- gr.Markdown("### Preference & Similarity Prediction")
1026
  with gr.Row():
1027
  with gr.Column():
1028
  video_a_input = gr.Video(label="Video A", height=250)
@@ -1033,7 +991,7 @@ with demo:
1033
  value="Complete the task",
1034
  )
1035
  prediction_type = gr.Radio(
1036
- choices=["preference", "similarity", "progress"],
1037
  value="preference",
1038
  label="Prediction Type",
1039
  )
@@ -1129,7 +1087,7 @@ with demo:
1129
 
1130
  def load_dataset_a(dataset_name, config_name):
1131
  """Load dataset A and update slider."""
1132
- dataset, status = load_rfm_dataset(dataset_name, config_name)
1133
  if dataset is not None:
1134
  max_index = len(dataset) - 1
1135
  return (
@@ -1274,7 +1232,7 @@ with demo:
1274
 
1275
  def load_dataset_b(dataset_name, config_name):
1276
  """Load dataset B and update slider."""
1277
- dataset, status = load_rfm_dataset(dataset_name, config_name)
1278
  if dataset is not None:
1279
  max_index = len(dataset) - 1
1280
  return (
@@ -1509,11 +1467,6 @@ with demo:
1509
 
1510
  def main():
1511
  """Launch the Gradio app."""
1512
- import sys
1513
-
1514
- # Check if reload mode is requested
1515
- watch_files = os.getenv("GRADIO_WATCH", "0") == "1" or "--reload" in sys.argv
1516
-
1517
  demo.launch(
1518
  server_name="0.0.0.0",
1519
  server_port=7860,
 
1
  #!/usr/bin/env python3
2
  """
3
+ Gradio app for RBM (Reward Foundation Model) inference visualization.
4
+ Supports single video (progress/success) and dual video (preference/progress) predictions.
5
  Uses eval server for inference instead of loading models locally.
6
  """
7
 
 
25
  import requests
26
  from typing import Any, List, Optional, Tuple
27
 
28
+ from dataset_types import Trajectory, ProgressSample, PreferenceSample
29
  from eval_utils import build_payload, post_batch_npy
30
  from eval_viz_utils import create_combined_progress_success_plot, extract_frames
31
  from datasets import load_dataset as load_dataset_hf, get_dataset_config_names
 
225
  lines.append(f"- **Model Type:** `{model_cfg.get('model_type', 'N/A')}`\n")
226
  lines.append(f"- **Train Progress Head:** {model_cfg.get('train_progress_head', False)}\n")
227
  lines.append(f"- **Train Preference Head:** {model_cfg.get('train_preference_head', False)}\n")
 
228
  lines.append(f"- **Train Success Head:** {model_cfg.get('train_success_head', False)}\n")
229
  lines.append(f"- **Use PEFT:** {model_cfg.get('use_peft', False)}\n")
230
  lines.append(f"- **Use Unsloth:** {model_cfg.get('use_unsloth', False)}\n")
 
258
  return "".join(lines)
259
 
260
 
261
+ def load_rbm_dataset(dataset_name, config_name):
262
+ """Load an RBM-format dataset from HuggingFace Hub."""
263
  try:
264
  if not dataset_name or not config_name:
265
  return None, "Please provide both dataset name and configuration"
 
301
  if dataset_name:
302
  video_path = f"https://huggingface.co/datasets/{dataset_name}/resolve/main/{frames_data}"
303
  else:
304
+ video_path = f"https://huggingface.co/datasets/rewardfm/rbm-1m/resolve/main/{frames_data}"
305
 
306
  task = item.get("task", "Complete the task")
307
  quality_label = item.get("quality_label", None)
 
431
  server_url: str = "",
432
  fps: float = 1.0,
433
  ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
434
+ """Process two videos for preference or progress prediction using eval server."""
435
  # Get server URL from state if not provided
436
  if not server_url:
437
  server_url = _server_state.get("server_url")
 
514
 
515
  elif prediction_type == "progress":
516
  # Create ProgressSamples for both videos
 
 
517
  progress_sample_a = ProgressSample(
518
  trajectory=trajectory_a,
519
  data_gen_strategy="demo",
 
551
  else:
552
  result_text += "Could not extract progress predictions from server response.\n"
553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
  # Return result text and both video paths
555
  return result_text, video_a_path, video_b_path
556
 
 
561
  # Create Gradio interface
562
  try:
563
  # Try with theme (Gradio 4.0+)
564
+ demo = gr.Blocks(title="RBM Evaluation Server", theme=gr.themes.Soft())
565
  except TypeError:
566
  # Fallback for older Gradio versions without theme support
567
+ demo = gr.Blocks(title="RBM Evaluation Server")
568
 
569
  with demo:
570
  gr.Markdown(
571
  """
572
+ # RBM (Reward Foundation Model) Evaluation Server
573
  """
574
  )
575
 
 
780
 
781
  def load_dataset_single(dataset_name, config_name):
782
  """Load dataset and update slider."""
783
+ dataset, status = load_rbm_dataset(dataset_name, config_name)
784
  if dataset is not None:
785
  max_index = len(dataset) - 1
786
  return (
 
979
  api_name="process_single_video",
980
  )
981
 
982
+ with gr.Tab("Preference Analysis"):
983
+ gr.Markdown("### Preference & Progress Prediction")
984
  with gr.Row():
985
  with gr.Column():
986
  video_a_input = gr.Video(label="Video A", height=250)
 
991
  value="Complete the task",
992
  )
993
  prediction_type = gr.Radio(
994
+ choices=["preference", "progress"],
995
  value="preference",
996
  label="Prediction Type",
997
  )
 
1087
 
1088
  def load_dataset_a(dataset_name, config_name):
1089
  """Load dataset A and update slider."""
1090
+ dataset, status = load_rbm_dataset(dataset_name, config_name)
1091
  if dataset is not None:
1092
  max_index = len(dataset) - 1
1093
  return (
 
1232
 
1233
  def load_dataset_b(dataset_name, config_name):
1234
  """Load dataset B and update slider."""
1235
+ dataset, status = load_rbm_dataset(dataset_name, config_name)
1236
  if dataset is not None:
1237
  max_index = len(dataset) - 1
1238
  return (
 
1467
 
1468
  def main():
1469
  """Launch the Gradio app."""
 
 
 
 
 
1470
  demo.launch(
1471
  server_name="0.0.0.0",
1472
  server_port=7860,
dataset_types.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  """
3
- Dataclasses for RFM model dataset trajectory structures.
4
  Defines the standard format for HuggingFace dataset trajectories.
5
  """
6
 
@@ -62,17 +62,4 @@ class PreferenceSample(BaseModel):
62
  resample_attempts: int = 1
63
 
64
 
65
- class SimilaritySample(BaseModel):
66
- """Sample structure for similarity scoring: traj_sim and traj_diff ranked against o^ref."""
67
-
68
- # Trajectories
69
- ref_trajectory: Trajectory # o^ref
70
- sim_trajectory: Trajectory # Similar trajectory
71
- diff_trajectory: Optional[Trajectory] = None # Different trajectory (optional in inference mode)
72
-
73
- sample_type: str = "similarity"
74
- data_gen_strategy: Optional[str] = None
75
- resample_attempts: int = 1
76
-
77
-
78
- SampleType = Union[PreferenceSample, SimilaritySample, ProgressSample]
 
1
  #!/usr/bin/env python3
2
  """
3
+ Dataclasses for RBM model dataset trajectory structures.
4
  Defines the standard format for HuggingFace dataset trajectories.
5
  """
6
 
 
62
  resample_attempts: int = 1
63
 
64
 
65
+ SampleType = Union[PreferenceSample, ProgressSample]
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_utils.py CHANGED
@@ -1,7 +1,6 @@
1
  #!/usr/bin/env python3
2
  from __future__ import annotations
3
 
4
- import re
5
  import torch
6
  import io
7
  import json
@@ -15,7 +14,7 @@ import numpy as np
15
  import requests
16
  import torch
17
 
18
- from dataset_types import PreferenceSample, SimilaritySample, ProgressSample, Trajectory
19
 
20
 
21
  def pad_trajectory_to_max_frames_np(
@@ -123,13 +122,6 @@ def linspace_subsample_frames(
123
  return subsampled_frames, indices
124
 
125
 
126
- def extract_answer_from_text(text: str) -> str:
127
- """Extract answer from text using <ans> tags."""
128
- m = re.search(r"<ans>(.*?)</ans>", text, re.DOTALL)
129
- ans = m.group(1).strip() if m else ""
130
- return ans
131
-
132
-
133
  def raw_dict_to_sample(
134
  raw_data: Union[Tuple[Dict[str, Any], Dict[str, Any]], Dict[str, Any]],
135
  max_frames: int = 16,
@@ -216,7 +208,7 @@ def raw_dict_to_sample(
216
 
217
 
218
  def build_payload(
219
- samples: list[PreferenceSample | SimilaritySample | ProgressSample],
220
  ) -> tuple[dict[str, Any], list[dict[str, Any]]]:
221
  """Build a payload with numpy array handling.
222
 
@@ -239,9 +231,6 @@ def build_payload(
239
  for key in [
240
  "chosen_trajectory",
241
  "rejected_trajectory",
242
- "reference_trajectory",
243
- "traj_sim_trajectory",
244
- "traj_diff_trajectory",
245
  "trajectory",
246
  ]:
247
  if key in processed_sample and isinstance(processed_sample[key], dict):
@@ -287,7 +276,7 @@ def post_batch_npy(
287
  extra_form_data: Optional[dict[str, Any]] = None,
288
  ) -> dict[str, Any]:
289
  """POST batch using .npy format for numpy arrays.
290
-
291
  Args:
292
  url: Server URL
293
  files: Dict of numpy arrays converted to .npy format
@@ -297,7 +286,7 @@ def post_batch_npy(
297
  """
298
  # Convert sample_data to form data
299
  data = {f"sample_{i}": json.dumps(sample) for i, sample in enumerate(sample_data)}
300
-
301
  # Add extra form data if provided
302
  if extra_form_data:
303
  for key, value in extra_form_data.items():
@@ -400,9 +389,6 @@ def reconstruct_payload_from_npy(
400
  trajectory_keys = [
401
  "chosen_trajectory",
402
  "rejected_trajectory",
403
- "reference_trajectory",
404
- "traj_sim_trajectory",
405
- "traj_diff_trajectory",
406
  "trajectory",
407
  ]
408
 
 
1
  #!/usr/bin/env python3
2
  from __future__ import annotations
3
 
 
4
  import torch
5
  import io
6
  import json
 
14
  import requests
15
  import torch
16
 
17
+ from dataset_types import PreferenceSample, ProgressSample, Trajectory
18
 
19
 
20
  def pad_trajectory_to_max_frames_np(
 
122
  return subsampled_frames, indices
123
 
124
 
 
 
 
 
 
 
 
125
  def raw_dict_to_sample(
126
  raw_data: Union[Tuple[Dict[str, Any], Dict[str, Any]], Dict[str, Any]],
127
  max_frames: int = 16,
 
208
 
209
 
210
  def build_payload(
211
+ samples: list[PreferenceSample | ProgressSample],
212
  ) -> tuple[dict[str, Any], list[dict[str, Any]]]:
213
  """Build a payload with numpy array handling.
214
 
 
231
  for key in [
232
  "chosen_trajectory",
233
  "rejected_trajectory",
 
 
 
234
  "trajectory",
235
  ]:
236
  if key in processed_sample and isinstance(processed_sample[key], dict):
 
276
  extra_form_data: Optional[dict[str, Any]] = None,
277
  ) -> dict[str, Any]:
278
  """POST batch using .npy format for numpy arrays.
279
+
280
  Args:
281
  url: Server URL
282
  files: Dict of numpy arrays converted to .npy format
 
286
  """
287
  # Convert sample_data to form data
288
  data = {f"sample_{i}": json.dumps(sample) for i, sample in enumerate(sample_data)}
289
+
290
  # Add extra form data if provided
291
  if extra_form_data:
292
  for key, value in extra_form_data.items():
 
389
  trajectory_keys = [
390
  "chosen_trajectory",
391
  "rejected_trajectory",
 
 
 
392
  "trajectory",
393
  ]
394