Anthony Liang committed on
Commit
d8adb0b
·
1 Parent(s): c66a872
Files changed (1) hide show
  1. app.py +752 -742
app.py CHANGED
@@ -609,40 +609,14 @@ with demo:
609
  gr.Markdown(
610
  """
611
  # RFM (Reward Foundation Model) Evaluation Server
612
-
613
- Select a model from the dropdown below. The app will automatically discover available models.
614
  """
615
  )
616
 
617
- # Model selector at the top
618
- with gr.Row():
619
- with gr.Column(scale=4):
620
- base_url_input = gr.Textbox(
621
- label="Base Server URL",
622
- placeholder="http://40.119.56.66",
623
- value="http://40.119.56.66",
624
- interactive=True,
625
- )
626
- model_dropdown = gr.Dropdown(
627
- label="Select Model",
628
- choices=[],
629
- value=None,
630
- interactive=True,
631
- info="Click 'Discover Models' to find available models on ports 8000-8010",
632
- )
633
- with gr.Column(scale=1):
634
- discover_btn = gr.Button("🔍 Discover Models", variant="primary", size="lg")
635
-
636
- with gr.Row():
637
- server_status = gr.Markdown("Click 'Discover Models' to find available models", visible=True)
638
-
639
- with gr.Accordion("📋 Model Information", open=False) as model_info_accordion:
640
- model_info_display = gr.Markdown("", visible=True)
641
-
642
- # Hidden state to store server URL and model mapping
643
  server_url_state = gr.State(value=None)
644
  model_url_mapping_state = gr.State(value={}) # Maps model_name -> server_url
645
 
 
646
  def discover_and_select_models(base_url: str):
647
  """Discover models and update dropdown."""
648
  if not base_url:
@@ -723,757 +697,793 @@ with demo:
723
  server_url,
724
  )
725
 
726
- discover_btn.click(
727
- fn=discover_and_select_models,
728
- inputs=[base_url_input],
729
- outputs=[model_dropdown, server_status, model_info_display, server_url_state, model_url_mapping_state],
730
- )
731
-
732
- model_dropdown.change(
733
- fn=on_model_selected,
734
- inputs=[model_dropdown, model_url_mapping_state],
735
- outputs=[server_status, model_info_display, server_url_state],
736
- )
737
-
738
- with gr.Tab("Progress Prediction"):
739
- gr.Markdown("### Progress & Success Prediction")
740
- gr.Markdown("Upload a video or select one from a dataset to get progress predictions.")
741
-
742
- with gr.Row():
743
- with gr.Column():
744
- with gr.Accordion("📁 Select from Dataset", open=False):
745
- dataset_name_single = gr.Dropdown(
746
- choices=PREDEFINED_DATASETS,
747
- value="jesbu1/oxe_rfm",
748
- label="Dataset Name",
749
- allow_custom_value=True,
750
- )
751
- config_name_single = gr.Dropdown(
752
- choices=[], value="", label="Configuration Name", allow_custom_value=True
753
- )
754
- with gr.Row():
755
- refresh_configs_btn = gr.Button("🔄 Refresh Configs", variant="secondary", size="sm")
756
- load_dataset_btn = gr.Button("Load Dataset", variant="secondary", size="sm")
757
-
758
- dataset_status_single = gr.Markdown("", visible=False)
759
- with gr.Row():
760
- prev_traj_btn = gr.Button("⬅️ Prev", variant="secondary", size="sm")
761
- trajectory_slider = gr.Slider(
762
- minimum=0, maximum=0, step=1, value=0, label="Trajectory Index", interactive=True
763
- )
764
- next_traj_btn = gr.Button("Next ➡️", variant="secondary", size="sm")
765
- trajectory_metadata = gr.Markdown("", visible=False)
766
- use_dataset_video_btn = gr.Button("Use Selected Video", variant="secondary")
767
-
768
- gr.Markdown("---")
769
- gr.Markdown("**OR**")
770
- gr.Markdown("---")
771
-
772
- single_video_input = gr.Video(label="Upload Video", height=300)
773
- task_text_input = gr.Textbox(
774
- label="Task Description",
775
- placeholder="Describe the task (e.g., 'Pick up the red block')",
776
- value="Complete the task",
777
- )
778
- fps_input_single = gr.Slider(
779
- label="FPS (Frames Per Second)",
780
- minimum=0.1,
781
- maximum=10.0,
782
- value=1.0,
783
- step=0.1,
784
- info="Frames per second to extract from video (higher = more frames)",
785
- )
786
- analyze_single_btn = gr.Button("Analyze Video", variant="primary")
787
-
788
- with gr.Column():
789
- progress_plot = gr.Image(label="Progress & Success Prediction", height=400)
790
- info_output = gr.Markdown("")
791
-
792
- # State variables for dataset
793
- current_dataset_single = gr.State(None)
794
-
795
- def update_config_choices_single(dataset_name):
796
- """Update config choices when dataset changes."""
797
- if not dataset_name:
798
- return gr.update(choices=[], value="")
799
- try:
800
- configs = get_available_configs(dataset_name)
801
- if configs:
802
- return gr.update(choices=configs, value=configs[0])
803
- else:
804
- return gr.update(choices=[], value="")
805
- except Exception as e:
806
- logger.warning(f"Could not fetch configs: {e}")
807
- return gr.update(choices=[], value="")
808
-
809
- def load_dataset_single(dataset_name, config_name):
810
- """Load dataset and update slider."""
811
- dataset, status = load_rfm_dataset(dataset_name, config_name)
812
- if dataset is not None:
813
- max_index = len(dataset) - 1
814
- return (
815
- dataset,
816
- gr.update(value=status, visible=True),
817
- gr.update(
818
- maximum=max_index, value=0, interactive=True, label=f"Trajectory Index (0 to {max_index})"
819
- ),
820
- )
821
- else:
822
- return None, gr.update(value=status, visible=True), gr.update(maximum=0, value=0, interactive=False)
823
-
824
- def use_dataset_video(dataset, index, dataset_name):
825
- """Load video from dataset and update inputs."""
826
- if dataset is None:
827
- return (
828
- None,
829
- "Complete the task",
830
- gr.update(value="No dataset loaded", visible=True),
831
- gr.update(visible=False),
832
- )
833
-
834
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
835
- if video_path:
836
- # Build metadata text
837
- metadata_lines = []
838
- if quality_label:
839
- metadata_lines.append(f"**Quality Label:** {quality_label}")
840
- if partial_success is not None:
841
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
842
-
843
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
844
- status_text = f"✅ Loaded trajectory {index} from dataset"
845
- if metadata_text:
846
- status_text += f"\n\n{metadata_text}"
847
-
848
- return (
849
- video_path,
850
- task,
851
- gr.update(value=status_text, visible=True),
852
- gr.update(value=metadata_text, visible=bool(metadata_text)),
853
- )
854
- else:
855
- return (
856
- None,
857
- "Complete the task",
858
- gr.update(value="❌ Error loading trajectory", visible=True),
859
- gr.update(visible=False),
860
- )
861
-
862
- def next_trajectory(dataset, current_idx, dataset_name):
863
- """Go to next trajectory."""
864
- if dataset is None:
865
- return 0, None, "Complete the task", gr.update(visible=False), gr.update(visible=False)
866
- next_idx = min(current_idx + 1, len(dataset) - 1)
867
- video_path, task, quality_label, partial_success = get_trajectory_video_path(
868
- dataset, next_idx, dataset_name
869
  )
870
-
871
- if video_path:
872
- # Build metadata text
873
- metadata_lines = []
874
- if quality_label:
875
- metadata_lines.append(f"**Quality Label:** {quality_label}")
876
- if partial_success is not None:
877
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
878
-
879
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
880
- return (
881
- next_idx,
882
- video_path,
883
- task,
884
- gr.update(value=metadata_text, visible=bool(metadata_text)),
885
- gr.update(value=f"✅ Trajectory {next_idx}/{len(dataset) - 1}", visible=True),
886
- )
887
- else:
888
- return current_idx, None, "Complete the task", gr.update(visible=False), gr.update(visible=False)
889
-
890
- def prev_trajectory(dataset, current_idx, dataset_name):
891
- """Go to previous trajectory."""
892
- if dataset is None:
893
- return 0, None, "Complete the task", gr.update(visible=False), gr.update(visible=False)
894
- prev_idx = max(current_idx - 1, 0)
895
- video_path, task, quality_label, partial_success = get_trajectory_video_path(
896
- dataset, prev_idx, dataset_name
897
  )
898
 
899
- if video_path:
900
- # Build metadata text
901
- metadata_lines = []
902
- if quality_label:
903
- metadata_lines.append(f"**Quality Label:** {quality_label}")
904
- if partial_success is not None:
905
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
906
-
907
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
908
- return (
909
- prev_idx,
910
- video_path,
911
- task,
912
- gr.update(value=metadata_text, visible=bool(metadata_text)),
913
- gr.update(value=f"✅ Trajectory {prev_idx}/{len(dataset) - 1}", visible=True),
914
- )
915
- else:
916
- return current_idx, None, "Complete the task", gr.update(visible=False), gr.update(visible=False)
917
-
918
- def update_trajectory_on_slider_change(dataset, index, dataset_name):
919
- """Update trajectory metadata when slider changes."""
920
- if dataset is None:
921
- return gr.update(visible=False), gr.update(visible=False)
922
-
923
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
924
- if video_path:
925
- # Build metadata text
926
- metadata_lines = []
927
- if quality_label:
928
- metadata_lines.append(f"**Quality Label:** {quality_label}")
929
- if partial_success is not None:
930
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
931
-
932
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
933
- return (
934
- gr.update(value=metadata_text, visible=bool(metadata_text)),
935
- gr.update(value=f"Trajectory {index}/{len(dataset) - 1}", visible=True),
936
- )
937
- else:
938
- return gr.update(visible=False), gr.update(visible=False)
939
-
940
- # Dataset selection handlers
941
- dataset_name_single.change(
942
- fn=update_config_choices_single, inputs=[dataset_name_single], outputs=[config_name_single]
943
- )
944
 
945
- refresh_configs_btn.click(
946
- fn=update_config_choices_single, inputs=[dataset_name_single], outputs=[config_name_single]
947
- )
 
 
 
948
 
949
- load_dataset_btn.click(
950
- fn=load_dataset_single,
951
- inputs=[dataset_name_single, config_name_single],
952
- outputs=[current_dataset_single, dataset_status_single, trajectory_slider],
953
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
954
 
955
- use_dataset_video_btn.click(
956
- fn=use_dataset_video,
957
- inputs=[current_dataset_single, trajectory_slider, dataset_name_single],
958
- outputs=[single_video_input, task_text_input, dataset_status_single, trajectory_metadata],
959
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
960
 
961
- # Navigation buttons
962
- next_traj_btn.click(
963
- fn=next_trajectory,
964
- inputs=[current_dataset_single, trajectory_slider, dataset_name_single],
965
- outputs=[
966
- trajectory_slider,
967
- single_video_input,
968
- task_text_input,
969
- trajectory_metadata,
970
- dataset_status_single,
971
- ],
972
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
973
 
974
- prev_traj_btn.click(
975
- fn=prev_trajectory,
976
- inputs=[current_dataset_single, trajectory_slider, dataset_name_single],
977
- outputs=[
978
- trajectory_slider,
979
- single_video_input,
980
- task_text_input,
981
- trajectory_metadata,
982
- dataset_status_single,
983
- ],
984
- )
985
 
986
- # Update metadata when slider changes
987
- trajectory_slider.change(
988
- fn=update_trajectory_on_slider_change,
989
- inputs=[current_dataset_single, trajectory_slider, dataset_name_single],
990
- outputs=[trajectory_metadata, dataset_status_single],
991
- )
992
 
993
- analyze_single_btn.click(
994
- fn=process_single_video,
995
- inputs=[single_video_input, task_text_input, server_url_state, fps_input_single],
996
- outputs=[progress_plot, info_output],
997
- api_name="process_single_video",
998
- )
999
 
1000
- with gr.Tab("Preference/Similarity Analysis"):
1001
- gr.Markdown("### Preference & Similarity Prediction")
1002
- with gr.Row():
1003
- with gr.Column():
1004
- with gr.Accordion("📁 Video A - Select from Dataset", open=False):
1005
- dataset_name_a = gr.Dropdown(
1006
- choices=PREDEFINED_DATASETS,
1007
- value="jesbu1/oxe_rfm",
1008
- label="Dataset Name",
1009
- allow_custom_value=True,
 
1010
  )
1011
- config_name_a = gr.Dropdown(
1012
- choices=[], value="", label="Configuration Name", allow_custom_value=True
 
 
 
 
 
 
 
 
 
1013
  )
1014
- with gr.Row():
1015
- refresh_configs_btn_a = gr.Button("🔄 Refresh Configs", variant="secondary", size="sm")
1016
- load_dataset_btn_a = gr.Button("Load Dataset", variant="secondary", size="sm")
1017
 
1018
- dataset_status_a = gr.Markdown("", visible=False)
1019
- with gr.Row():
1020
- prev_traj_btn_a = gr.Button("⬅️ Prev", variant="secondary", size="sm")
1021
- trajectory_slider_a = gr.Slider(
1022
- minimum=0, maximum=0, step=1, value=0, label="Trajectory Index", interactive=True
1023
- )
1024
- next_traj_btn_a = gr.Button("Next ➡️", variant="secondary", size="sm")
1025
- trajectory_metadata_a = gr.Markdown("", visible=False)
1026
- use_dataset_video_btn_a = gr.Button("Use Selected Video for A", variant="secondary")
1027
-
1028
- with gr.Accordion("📁 Video B - Select from Dataset", open=False):
1029
- dataset_name_b = gr.Dropdown(
1030
- choices=PREDEFINED_DATASETS,
1031
- value="jesbu1/oxe_rfm",
1032
- label="Dataset Name",
1033
- allow_custom_value=True,
1034
  )
1035
- config_name_b = gr.Dropdown(
1036
- choices=[], value="", label="Configuration Name", allow_custom_value=True
 
 
 
 
1037
  )
1038
- with gr.Row():
1039
- refresh_configs_btn_b = gr.Button("🔄 Refresh Configs", variant="secondary", size="sm")
1040
- load_dataset_btn_b = gr.Button("Load Dataset", variant="secondary", size="sm")
1041
 
1042
- dataset_status_b = gr.Markdown("", visible=False)
 
1043
  with gr.Row():
1044
- prev_traj_btn_b = gr.Button("⬅️ Prev", variant="secondary", size="sm")
1045
- trajectory_slider_b = gr.Slider(
1046
- minimum=0, maximum=0, step=1, value=0, label="Trajectory Index", interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1047
  )
1048
- next_traj_btn_b = gr.Button("Next ➡️", variant="secondary", size="sm")
1049
- trajectory_metadata_b = gr.Markdown("", visible=False)
1050
- use_dataset_video_btn_b = gr.Button("Use Selected Video for B", variant="secondary")
1051
-
1052
- gr.Markdown("---")
1053
- gr.Markdown("**OR Upload Videos Directly**")
1054
- gr.Markdown("---")
1055
-
1056
- video_a_input = gr.Video(label="Video A", height=250)
1057
- video_b_input = gr.Video(label="Video B", height=250)
1058
- task_text_dual = gr.Textbox(
1059
- label="Task Description",
1060
- placeholder="Describe the task",
1061
- value="Complete the task",
1062
- )
1063
- prediction_type = gr.Radio(
1064
- choices=["preference", "similarity", "progress"],
1065
- value="preference",
1066
- label="Prediction Type",
1067
- )
1068
- fps_input_dual = gr.Slider(
1069
- label="FPS (Frames Per Second)",
1070
- minimum=0.1,
1071
- maximum=10.0,
1072
- value=1.0,
1073
- step=0.1,
1074
- info="Frames per second to extract from videos (higher = more frames)",
1075
- )
1076
- analyze_dual_btn = gr.Button("Compare Videos", variant="primary")
1077
-
1078
- with gr.Column():
1079
- # Videos displayed side by side
1080
- with gr.Row():
1081
- video_a_display = gr.Video(label="Video A", height=400)
1082
- video_b_display = gr.Video(label="Video B", height=400)
1083
-
1084
- # Result text at the bottom
1085
- result_text = gr.Markdown("")
1086
-
1087
- # State variables for datasets
1088
- current_dataset_a = gr.State(None)
1089
- current_dataset_b = gr.State(None)
1090
-
1091
- # Helper functions for Video A
1092
- def update_config_choices_a(dataset_name):
1093
- """Update config choices for Video A when dataset changes."""
1094
- if not dataset_name:
1095
- return gr.update(choices=[], value="")
1096
- try:
1097
- configs = get_available_configs(dataset_name)
1098
- if configs:
1099
- return gr.update(choices=configs, value=configs[0])
1100
- else:
1101
- return gr.update(choices=[], value="")
1102
- except Exception as e:
1103
- logger.warning(f"Could not fetch configs: {e}")
1104
- return gr.update(choices=[], value="")
1105
-
1106
- def load_dataset_a(dataset_name, config_name):
1107
- """Load dataset A and update slider."""
1108
- dataset, status = load_rfm_dataset(dataset_name, config_name)
1109
- if dataset is not None:
1110
- max_index = len(dataset) - 1
1111
- return (
1112
- dataset,
1113
- gr.update(value=status, visible=True),
1114
- gr.update(
1115
- maximum=max_index, value=0, interactive=True, label=f"Trajectory Index (0 to {max_index})"
1116
- ),
1117
- )
1118
- else:
1119
- return None, gr.update(value=status, visible=True), gr.update(maximum=0, value=0, interactive=False)
1120
-
1121
- def use_dataset_video_a(dataset, index, dataset_name):
1122
- """Load video A from dataset and update input."""
1123
- if dataset is None:
1124
- return (
1125
- None,
1126
- gr.update(value="No dataset loaded", visible=True),
1127
- gr.update(visible=False),
1128
- )
1129
-
1130
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
1131
- if video_path:
1132
- # Build metadata text
1133
- metadata_lines = []
1134
- if quality_label:
1135
- metadata_lines.append(f"**Quality Label:** {quality_label}")
1136
- if partial_success is not None:
1137
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1138
-
1139
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1140
- status_text = f"✅ Loaded trajectory {index} from dataset for Video A"
1141
- if metadata_text:
1142
- status_text += f"\n\n{metadata_text}"
1143
-
1144
- return (
1145
- video_path,
1146
- gr.update(value=status_text, visible=True),
1147
- gr.update(value=metadata_text, visible=bool(metadata_text)),
1148
- )
1149
- else:
1150
- return (
1151
- None,
1152
- gr.update(value="❌ Error loading trajectory", visible=True),
1153
- gr.update(visible=False),
1154
- )
1155
-
1156
- def next_trajectory_a(dataset, current_idx, dataset_name):
1157
- """Go to next trajectory for Video A."""
1158
- if dataset is None:
1159
- return 0, None, gr.update(visible=False), gr.update(visible=False)
1160
- next_idx = min(current_idx + 1, len(dataset) - 1)
1161
- video_path, task, quality_label, partial_success = get_trajectory_video_path(
1162
- dataset, next_idx, dataset_name
1163
- )
1164
-
1165
- if video_path:
1166
- # Build metadata text
1167
- metadata_lines = []
1168
- if quality_label:
1169
- metadata_lines.append(f"**Quality Label:** {quality_label}")
1170
- if partial_success is not None:
1171
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1172
-
1173
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1174
- return (
1175
- next_idx,
1176
- video_path,
1177
- gr.update(value=metadata_text, visible=bool(metadata_text)),
1178
- gr.update(value=f"✅ Trajectory {next_idx}/{len(dataset) - 1}", visible=True),
1179
- )
1180
- else:
1181
- return current_idx, None, gr.update(visible=False), gr.update(visible=False)
1182
-
1183
- def prev_trajectory_a(dataset, current_idx, dataset_name):
1184
- """Go to previous trajectory for Video A."""
1185
- if dataset is None:
1186
- return 0, None, gr.update(visible=False), gr.update(visible=False)
1187
- prev_idx = max(current_idx - 1, 0)
1188
- video_path, task, quality_label, partial_success = get_trajectory_video_path(
1189
- dataset, prev_idx, dataset_name
1190
- )
1191
 
1192
- if video_path:
1193
- # Build metadata text
1194
- metadata_lines = []
1195
- if quality_label:
1196
- metadata_lines.append(f"**Quality Label:** {quality_label}")
1197
- if partial_success is not None:
1198
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1199
-
1200
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1201
- return (
1202
- prev_idx,
1203
- video_path,
1204
- gr.update(value=metadata_text, visible=bool(metadata_text)),
1205
- gr.update(value=f"✅ Trajectory {prev_idx}/{len(dataset) - 1}", visible=True),
1206
- )
1207
- else:
1208
- return current_idx, None, gr.update(visible=False), gr.update(visible=False)
1209
-
1210
- def update_trajectory_on_slider_change_a(dataset, index, dataset_name):
1211
- """Update trajectory metadata when slider changes for Video A."""
1212
- if dataset is None:
1213
- return gr.update(visible=False), gr.update(visible=False)
1214
-
1215
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
1216
- if video_path:
1217
- # Build metadata text
1218
- metadata_lines = []
1219
- if quality_label:
1220
- metadata_lines.append(f"**Quality Label:** {quality_label}")
1221
- if partial_success is not None:
1222
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1223
-
1224
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1225
- return (
1226
- gr.update(value=metadata_text, visible=bool(metadata_text)),
1227
- gr.update(value=f"Trajectory {index}/{len(dataset) - 1}", visible=True),
1228
- )
1229
- else:
1230
- return gr.update(visible=False), gr.update(visible=False)
1231
-
1232
- # Helper functions for Video B (same as Video A)
1233
- def update_config_choices_b(dataset_name):
1234
- """Update config choices for Video B when dataset changes."""
1235
- if not dataset_name:
1236
- return gr.update(choices=[], value="")
1237
- try:
1238
- configs = get_available_configs(dataset_name)
1239
- if configs:
1240
- return gr.update(choices=configs, value=configs[0])
1241
- else:
1242
- return gr.update(choices=[], value="")
1243
- except Exception as e:
1244
- logger.warning(f"Could not fetch configs: {e}")
1245
- return gr.update(choices=[], value="")
1246
-
1247
- def load_dataset_b(dataset_name, config_name):
1248
- """Load dataset B and update slider."""
1249
- dataset, status = load_rfm_dataset(dataset_name, config_name)
1250
- if dataset is not None:
1251
- max_index = len(dataset) - 1
1252
- return (
1253
- dataset,
1254
- gr.update(value=status, visible=True),
1255
- gr.update(
1256
- maximum=max_index, value=0, interactive=True, label=f"Trajectory Index (0 to {max_index})"
1257
- ),
1258
- )
1259
- else:
1260
- return None, gr.update(value=status, visible=True), gr.update(maximum=0, value=0, interactive=False)
1261
-
1262
- def use_dataset_video_b(dataset, index, dataset_name):
1263
- """Load video B from dataset and update input."""
1264
- if dataset is None:
1265
- return (
1266
- None,
1267
- gr.update(value="No dataset loaded", visible=True),
1268
- gr.update(visible=False),
1269
- )
1270
-
1271
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
1272
- if video_path:
1273
- # Build metadata text
1274
- metadata_lines = []
1275
- if quality_label:
1276
- metadata_lines.append(f"**Quality Label:** {quality_label}")
1277
- if partial_success is not None:
1278
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1279
-
1280
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1281
- status_text = f"✅ Loaded trajectory {index} from dataset for Video B"
1282
- if metadata_text:
1283
- status_text += f"\n\n{metadata_text}"
1284
-
1285
- return (
1286
- video_path,
1287
- gr.update(value=status_text, visible=True),
1288
- gr.update(value=metadata_text, visible=bool(metadata_text)),
1289
- )
1290
- else:
1291
- return (
1292
- None,
1293
- gr.update(value="❌ Error loading trajectory", visible=True),
1294
- gr.update(visible=False),
1295
- )
1296
-
1297
- def next_trajectory_b(dataset, current_idx, dataset_name):
1298
- """Go to next trajectory for Video B."""
1299
- if dataset is None:
1300
- return 0, None, gr.update(visible=False), gr.update(visible=False)
1301
- next_idx = min(current_idx + 1, len(dataset) - 1)
1302
- video_path, task, quality_label, partial_success = get_trajectory_video_path(
1303
- dataset, next_idx, dataset_name
1304
- )
1305
 
1306
- if video_path:
1307
- # Build metadata text
1308
- metadata_lines = []
1309
- if quality_label:
1310
- metadata_lines.append(f"**Quality Label:** {quality_label}")
1311
- if partial_success is not None:
1312
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1313
-
1314
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1315
- return (
1316
- next_idx,
1317
- video_path,
1318
- gr.update(value=metadata_text, visible=bool(metadata_text)),
1319
- gr.update(value=f"✅ Trajectory {next_idx}/{len(dataset) - 1}", visible=True),
1320
- )
1321
- else:
1322
- return current_idx, None, gr.update(visible=False), gr.update(visible=False)
1323
-
1324
- def prev_trajectory_b(dataset, current_idx, dataset_name):
1325
- """Go to previous trajectory for Video B."""
1326
- if dataset is None:
1327
- return 0, None, gr.update(visible=False), gr.update(visible=False)
1328
- prev_idx = max(current_idx - 1, 0)
1329
- video_path, task, quality_label, partial_success = get_trajectory_video_path(
1330
- dataset, prev_idx, dataset_name
1331
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1332
 
1333
- if video_path:
1334
- # Build metadata text
1335
- metadata_lines = []
1336
- if quality_label:
1337
- metadata_lines.append(f"**Quality Label:** {quality_label}")
1338
- if partial_success is not None:
1339
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1340
-
1341
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1342
- return (
1343
- prev_idx,
1344
- video_path,
1345
- gr.update(value=metadata_text, visible=bool(metadata_text)),
1346
- gr.update(value=f"✅ Trajectory {prev_idx}/{len(dataset) - 1}", visible=True),
1347
- )
1348
- else:
1349
- return current_idx, None, gr.update(visible=False), gr.update(visible=False)
1350
-
1351
- def update_trajectory_on_slider_change_b(dataset, index, dataset_name):
1352
- """Update trajectory metadata when slider changes for Video B."""
1353
- if dataset is None:
1354
- return gr.update(visible=False), gr.update(visible=False)
1355
-
1356
- video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
1357
- if video_path:
1358
- # Build metadata text
1359
- metadata_lines = []
1360
- if quality_label:
1361
- metadata_lines.append(f"**Quality Label:** {quality_label}")
1362
- if partial_success is not None:
1363
- metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1364
-
1365
- metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1366
- return (
1367
- gr.update(value=metadata_text, visible=bool(metadata_text)),
1368
- gr.update(value=f"Trajectory {index}/{len(dataset) - 1}", visible=True),
1369
- )
1370
- else:
1371
- return gr.update(visible=False), gr.update(visible=False)
1372
 
1373
- # Video A dataset selection handlers
1374
- dataset_name_a.change(
1375
- fn=update_config_choices_a, inputs=[dataset_name_a], outputs=[config_name_a]
1376
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1377
 
1378
- refresh_configs_btn_a.click(
1379
- fn=update_config_choices_a, inputs=[dataset_name_a], outputs=[config_name_a]
1380
- )
1381
 
1382
- load_dataset_btn_a.click(
1383
- fn=load_dataset_a,
1384
- inputs=[dataset_name_a, config_name_a],
1385
- outputs=[current_dataset_a, dataset_status_a, trajectory_slider_a],
1386
- )
1387
 
1388
- use_dataset_video_btn_a.click(
1389
- fn=use_dataset_video_a,
1390
- inputs=[current_dataset_a, trajectory_slider_a, dataset_name_a],
1391
- outputs=[video_a_input, dataset_status_a, trajectory_metadata_a],
1392
- )
1393
 
1394
- next_traj_btn_a.click(
1395
- fn=next_trajectory_a,
1396
- inputs=[current_dataset_a, trajectory_slider_a, dataset_name_a],
1397
- outputs=[
1398
- trajectory_slider_a,
1399
- video_a_input,
1400
- trajectory_metadata_a,
1401
- dataset_status_a,
1402
- ],
1403
- )
1404
 
1405
- prev_traj_btn_a.click(
1406
- fn=prev_trajectory_a,
1407
- inputs=[current_dataset_a, trajectory_slider_a, dataset_name_a],
1408
- outputs=[
1409
- trajectory_slider_a,
1410
- video_a_input,
1411
- trajectory_metadata_a,
1412
- dataset_status_a,
1413
- ],
1414
- )
1415
 
1416
- trajectory_slider_a.change(
1417
- fn=update_trajectory_on_slider_change_a,
1418
- inputs=[current_dataset_a, trajectory_slider_a, dataset_name_a],
1419
- outputs=[trajectory_metadata_a, dataset_status_a],
1420
- )
1421
 
1422
- # Video B dataset selection handlers
1423
- dataset_name_b.change(
1424
- fn=update_config_choices_b, inputs=[dataset_name_b], outputs=[config_name_b]
1425
- )
1426
 
1427
- refresh_configs_btn_b.click(
1428
- fn=update_config_choices_b, inputs=[dataset_name_b], outputs=[config_name_b]
1429
- )
1430
 
1431
- load_dataset_btn_b.click(
1432
- fn=load_dataset_b,
1433
- inputs=[dataset_name_b, config_name_b],
1434
- outputs=[current_dataset_b, dataset_status_b, trajectory_slider_b],
1435
- )
1436
 
1437
- use_dataset_video_btn_b.click(
1438
- fn=use_dataset_video_b,
1439
- inputs=[current_dataset_b, trajectory_slider_b, dataset_name_b],
1440
- outputs=[video_b_input, dataset_status_b, trajectory_metadata_b],
1441
- )
1442
 
1443
- next_traj_btn_b.click(
1444
- fn=next_trajectory_b,
1445
- inputs=[current_dataset_b, trajectory_slider_b, dataset_name_b],
1446
- outputs=[
1447
- trajectory_slider_b,
1448
- video_b_input,
1449
- trajectory_metadata_b,
1450
- dataset_status_b,
1451
- ],
1452
- )
1453
 
1454
- prev_traj_btn_b.click(
1455
- fn=prev_trajectory_b,
1456
- inputs=[current_dataset_b, trajectory_slider_b, dataset_name_b],
1457
- outputs=[
1458
- trajectory_slider_b,
1459
- video_b_input,
1460
- trajectory_metadata_b,
1461
- dataset_status_b,
1462
- ],
1463
- )
1464
 
1465
- trajectory_slider_b.change(
1466
- fn=update_trajectory_on_slider_change_b,
1467
- inputs=[current_dataset_b, trajectory_slider_b, dataset_name_b],
1468
- outputs=[trajectory_metadata_b, dataset_status_b],
1469
- )
1470
 
1471
- analyze_dual_btn.click(
1472
- fn=process_two_videos,
1473
- inputs=[video_a_input, video_b_input, task_text_dual, prediction_type, server_url_state, fps_input_dual],
1474
- outputs=[result_text, video_a_display, video_b_display],
1475
- api_name="process_two_videos",
1476
- )
1477
 
1478
 
1479
  def main():
 
609
  gr.Markdown(
610
  """
611
  # RFM (Reward Foundation Model) Evaluation Server
 
 
612
  """
613
  )
614
 
615
+ # Hidden state to store server URL and model mapping (define before use)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
616
  server_url_state = gr.State(value=None)
617
  model_url_mapping_state = gr.State(value={}) # Maps model_name -> server_url
618
 
619
+ # Function definitions for event handlers
620
  def discover_and_select_models(base_url: str):
621
  """Discover models and update dropdown."""
622
  if not base_url:
 
697
  server_url,
698
  )
699
 
700
+ # Main layout with sidebar and content area
701
+ with gr.Row():
702
+ # Sidebar for model selection and info
703
+ with gr.Column(scale=1, min_width=300):
704
+ gr.Markdown("### 🔧 Model Configuration")
705
+
706
+ base_url_input = gr.Textbox(
707
+ label="Base Server URL",
708
+ placeholder="http://40.119.56.66",
709
+ value="http://40.119.56.66",
710
+ interactive=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  )
712
+
713
+ discover_btn = gr.Button("🔍 Discover Models", variant="primary", size="lg")
714
+
715
+ model_dropdown = gr.Dropdown(
716
+ label="Select Model",
717
+ choices=[],
718
+ value=None,
719
+ interactive=True,
720
+ info="Models will be discovered on ports 8000-8010",
721
+ )
722
+
723
+ server_status = gr.Markdown(
724
+ "Click 'Discover Models' to find available models",
725
+ visible=True,
726
+ )
727
+
728
+ gr.Markdown("---")
729
+ gr.Markdown("### 📋 Model Information")
730
+ model_info_display = gr.Markdown("", visible=True)
731
+
732
+ # Event handlers for sidebar
733
+ discover_btn.click(
734
+ fn=discover_and_select_models,
735
+ inputs=[base_url_input],
736
+ outputs=[model_dropdown, server_status, model_info_display, server_url_state, model_url_mapping_state],
 
 
737
  )
738
 
739
+ model_dropdown.change(
740
+ fn=on_model_selected,
741
+ inputs=[model_dropdown, model_url_mapping_state],
742
+ outputs=[server_status, model_info_display, server_url_state],
743
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
744
 
745
+ # Main content area with tabs
746
+ with gr.Column(scale=4):
747
+ with gr.Tabs():
748
+ with gr.Tab("Progress Prediction"):
749
+ gr.Markdown("### Progress & Success Prediction")
750
+ gr.Markdown("Upload a video or select one from a dataset to get progress predictions.")
751
 
752
+ with gr.Row():
753
+ with gr.Column():
754
+ single_video_input = gr.Video(label="Upload Video", height=300)
755
+ task_text_input = gr.Textbox(
756
+ label="Task Description",
757
+ placeholder="Describe the task (e.g., 'Pick up the red block')",
758
+ value="Complete the task",
759
+ )
760
+ fps_input_single = gr.Slider(
761
+ label="FPS (Frames Per Second)",
762
+ minimum=0.1,
763
+ maximum=10.0,
764
+ value=1.0,
765
+ step=0.1,
766
+ info="Frames per second to extract from video (higher = more frames)",
767
+ )
768
+ analyze_single_btn = gr.Button("Analyze Video", variant="primary")
769
+
770
+ gr.Markdown("---")
771
+ gr.Markdown("**OR Select from Dataset**")
772
+ gr.Markdown("---")
773
+
774
+ with gr.Accordion("📁 Select from Dataset", open=False):
775
+ dataset_name_single = gr.Dropdown(
776
+ choices=PREDEFINED_DATASETS,
777
+ value="jesbu1/oxe_rfm",
778
+ label="Dataset Name",
779
+ allow_custom_value=True,
780
+ )
781
+ config_name_single = gr.Dropdown(
782
+ choices=[], value="", label="Configuration Name", allow_custom_value=True
783
+ )
784
+ with gr.Row():
785
+ refresh_configs_btn = gr.Button("🔄 Refresh Configs", variant="secondary", size="sm")
786
+ load_dataset_btn = gr.Button("Load Dataset", variant="secondary", size="sm")
787
+
788
+ dataset_status_single = gr.Markdown("", visible=False)
789
+ with gr.Row():
790
+ prev_traj_btn = gr.Button("⬅️ Prev", variant="secondary", size="sm")
791
+ trajectory_slider = gr.Slider(
792
+ minimum=0, maximum=0, step=1, value=0, label="Trajectory Index", interactive=True
793
+ )
794
+ next_traj_btn = gr.Button("Next ➡️", variant="secondary", size="sm")
795
+ trajectory_metadata = gr.Markdown("", visible=False)
796
+ use_dataset_video_btn = gr.Button("Use Selected Video", variant="secondary")
797
+
798
+ with gr.Column():
799
+ progress_plot = gr.Image(label="Progress & Success Prediction", height=400)
800
+ info_output = gr.Markdown("")
801
+
802
+ # State variables for dataset
803
+ current_dataset_single = gr.State(None)
804
+
805
+ def update_config_choices_single(dataset_name):
806
+ """Update config choices when dataset changes."""
807
+ if not dataset_name:
808
+ return gr.update(choices=[], value="")
809
+ try:
810
+ configs = get_available_configs(dataset_name)
811
+ if configs:
812
+ return gr.update(choices=configs, value=configs[0])
813
+ else:
814
+ return gr.update(choices=[], value="")
815
+ except Exception as e:
816
+ logger.warning(f"Could not fetch configs: {e}")
817
+ return gr.update(choices=[], value="")
818
+
819
+ def load_dataset_single(dataset_name, config_name):
820
+ """Load dataset and update slider."""
821
+ dataset, status = load_rfm_dataset(dataset_name, config_name)
822
+ if dataset is not None:
823
+ max_index = len(dataset) - 1
824
+ return (
825
+ dataset,
826
+ gr.update(value=status, visible=True),
827
+ gr.update(
828
+ maximum=max_index, value=0, interactive=True, label=f"Trajectory Index (0 to {max_index})"
829
+ ),
830
+ )
831
+ else:
832
+ return None, gr.update(value=status, visible=True), gr.update(maximum=0, value=0, interactive=False)
833
+
834
+ def use_dataset_video(dataset, index, dataset_name):
835
+ """Load video from dataset and update inputs."""
836
+ if dataset is None:
837
+ return (
838
+ None,
839
+ "Complete the task",
840
+ gr.update(value="No dataset loaded", visible=True),
841
+ gr.update(visible=False),
842
+ )
843
+
844
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
845
+ if video_path:
846
+ # Build metadata text
847
+ metadata_lines = []
848
+ if quality_label:
849
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
850
+ if partial_success is not None:
851
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
852
+
853
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
854
+ status_text = f"✅ Loaded trajectory {index} from dataset"
855
+ if metadata_text:
856
+ status_text += f"\n\n{metadata_text}"
857
+
858
+ return (
859
+ video_path,
860
+ task,
861
+ gr.update(value=status_text, visible=True),
862
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
863
+ )
864
+ else:
865
+ return (
866
+ None,
867
+ "Complete the task",
868
+ gr.update(value="❌ Error loading trajectory", visible=True),
869
+ gr.update(visible=False),
870
+ )
871
+
872
+ def next_trajectory(dataset, current_idx, dataset_name):
873
+ """Go to next trajectory."""
874
+ if dataset is None:
875
+ return 0, None, "Complete the task", gr.update(visible=False), gr.update(visible=False)
876
+ next_idx = min(current_idx + 1, len(dataset) - 1)
877
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
878
+ dataset, next_idx, dataset_name
879
+ )
880
 
881
+ if video_path:
882
+ # Build metadata text
883
+ metadata_lines = []
884
+ if quality_label:
885
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
886
+ if partial_success is not None:
887
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
888
+
889
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
890
+ return (
891
+ next_idx,
892
+ video_path,
893
+ task,
894
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
895
+ gr.update(value=f"✅ Trajectory {next_idx}/{len(dataset) - 1}", visible=True),
896
+ )
897
+ else:
898
+ return current_idx, None, "Complete the task", gr.update(visible=False), gr.update(visible=False)
899
+
900
+ def prev_trajectory(dataset, current_idx, dataset_name):
901
+ """Go to previous trajectory."""
902
+ if dataset is None:
903
+ return 0, None, "Complete the task", gr.update(visible=False), gr.update(visible=False)
904
+ prev_idx = max(current_idx - 1, 0)
905
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
906
+ dataset, prev_idx, dataset_name
907
+ )
908
 
909
+ if video_path:
910
+ # Build metadata text
911
+ metadata_lines = []
912
+ if quality_label:
913
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
914
+ if partial_success is not None:
915
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
916
+
917
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
918
+ return (
919
+ prev_idx,
920
+ video_path,
921
+ task,
922
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
923
+ gr.update(value=f"✅ Trajectory {prev_idx}/{len(dataset) - 1}", visible=True),
924
+ )
925
+ else:
926
+ return current_idx, None, "Complete the task", gr.update(visible=False), gr.update(visible=False)
927
+
928
+ def update_trajectory_on_slider_change(dataset, index, dataset_name):
929
+ """Update trajectory metadata when slider changes."""
930
+ if dataset is None:
931
+ return gr.update(visible=False), gr.update(visible=False)
932
+
933
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
934
+ if video_path:
935
+ # Build metadata text
936
+ metadata_lines = []
937
+ if quality_label:
938
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
939
+ if partial_success is not None:
940
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
941
+
942
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
943
+ return (
944
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
945
+ gr.update(value=f"Trajectory {index}/{len(dataset) - 1}", visible=True),
946
+ )
947
+ else:
948
+ return gr.update(visible=False), gr.update(visible=False)
949
+
950
+ # Dataset selection handlers
951
+ dataset_name_single.change(
952
+ fn=update_config_choices_single, inputs=[dataset_name_single], outputs=[config_name_single]
953
+ )
954
 
955
+ refresh_configs_btn.click(
956
+ fn=update_config_choices_single, inputs=[dataset_name_single], outputs=[config_name_single]
957
+ )
 
 
 
 
 
 
 
 
958
 
959
+ load_dataset_btn.click(
960
+ fn=load_dataset_single,
961
+ inputs=[dataset_name_single, config_name_single],
962
+ outputs=[current_dataset_single, dataset_status_single, trajectory_slider],
963
+ )
 
964
 
965
+ use_dataset_video_btn.click(
966
+ fn=use_dataset_video,
967
+ inputs=[current_dataset_single, trajectory_slider, dataset_name_single],
968
+ outputs=[single_video_input, task_text_input, dataset_status_single, trajectory_metadata],
969
+ )
 
970
 
971
+ # Navigation buttons
972
+ next_traj_btn.click(
973
+ fn=next_trajectory,
974
+ inputs=[current_dataset_single, trajectory_slider, dataset_name_single],
975
+ outputs=[
976
+ trajectory_slider,
977
+ single_video_input,
978
+ task_text_input,
979
+ trajectory_metadata,
980
+ dataset_status_single,
981
+ ],
982
  )
983
+
984
+ prev_traj_btn.click(
985
+ fn=prev_trajectory,
986
+ inputs=[current_dataset_single, trajectory_slider, dataset_name_single],
987
+ outputs=[
988
+ trajectory_slider,
989
+ single_video_input,
990
+ task_text_input,
991
+ trajectory_metadata,
992
+ dataset_status_single,
993
+ ],
994
  )
 
 
 
995
 
996
+ # Update metadata when slider changes
997
+ trajectory_slider.change(
998
+ fn=update_trajectory_on_slider_change,
999
+ inputs=[current_dataset_single, trajectory_slider, dataset_name_single],
1000
+ outputs=[trajectory_metadata, dataset_status_single],
 
 
 
 
 
 
 
 
 
 
 
1001
  )
1002
+
1003
+ analyze_single_btn.click(
1004
+ fn=process_single_video,
1005
+ inputs=[single_video_input, task_text_input, server_url_state, fps_input_single],
1006
+ outputs=[progress_plot, info_output],
1007
+ api_name="process_single_video",
1008
  )
 
 
 
1009
 
1010
+ with gr.Tab("Preference/Similarity Analysis"):
1011
+ gr.Markdown("### Preference & Similarity Prediction")
1012
  with gr.Row():
1013
+ with gr.Column():
1014
+ video_a_input = gr.Video(label="Video A", height=250)
1015
+ video_b_input = gr.Video(label="Video B", height=250)
1016
+ task_text_dual = gr.Textbox(
1017
+ label="Task Description",
1018
+ placeholder="Describe the task",
1019
+ value="Complete the task",
1020
+ )
1021
+ prediction_type = gr.Radio(
1022
+ choices=["preference", "similarity", "progress"],
1023
+ value="preference",
1024
+ label="Prediction Type",
1025
+ )
1026
+ fps_input_dual = gr.Slider(
1027
+ label="FPS (Frames Per Second)",
1028
+ minimum=0.1,
1029
+ maximum=10.0,
1030
+ value=1.0,
1031
+ step=0.1,
1032
+ info="Frames per second to extract from videos (higher = more frames)",
1033
+ )
1034
+ analyze_dual_btn = gr.Button("Compare Videos", variant="primary")
1035
+
1036
+ gr.Markdown("---")
1037
+ gr.Markdown("**OR Select from Dataset**")
1038
+ gr.Markdown("---")
1039
+
1040
+ with gr.Accordion("📁 Video A - Select from Dataset", open=False):
1041
+ dataset_name_a = gr.Dropdown(
1042
+ choices=PREDEFINED_DATASETS,
1043
+ value="jesbu1/oxe_rfm",
1044
+ label="Dataset Name",
1045
+ allow_custom_value=True,
1046
+ )
1047
+ config_name_a = gr.Dropdown(
1048
+ choices=[], value="", label="Configuration Name", allow_custom_value=True
1049
+ )
1050
+ with gr.Row():
1051
+ refresh_configs_btn_a = gr.Button("🔄 Refresh Configs", variant="secondary", size="sm")
1052
+ load_dataset_btn_a = gr.Button("Load Dataset", variant="secondary", size="sm")
1053
+
1054
+ dataset_status_a = gr.Markdown("", visible=False)
1055
+ with gr.Row():
1056
+ prev_traj_btn_a = gr.Button("⬅️ Prev", variant="secondary", size="sm")
1057
+ trajectory_slider_a = gr.Slider(
1058
+ minimum=0, maximum=0, step=1, value=0, label="Trajectory Index", interactive=True
1059
+ )
1060
+ next_traj_btn_a = gr.Button("Next ➡️", variant="secondary", size="sm")
1061
+ trajectory_metadata_a = gr.Markdown("", visible=False)
1062
+ use_dataset_video_btn_a = gr.Button("Use Selected Video for A", variant="secondary")
1063
+
1064
+ with gr.Accordion("📁 Video B - Select from Dataset", open=False):
1065
+ dataset_name_b = gr.Dropdown(
1066
+ choices=PREDEFINED_DATASETS,
1067
+ value="jesbu1/oxe_rfm",
1068
+ label="Dataset Name",
1069
+ allow_custom_value=True,
1070
+ )
1071
+ config_name_b = gr.Dropdown(
1072
+ choices=[], value="", label="Configuration Name", allow_custom_value=True
1073
+ )
1074
+ with gr.Row():
1075
+ refresh_configs_btn_b = gr.Button("🔄 Refresh Configs", variant="secondary", size="sm")
1076
+ load_dataset_btn_b = gr.Button("Load Dataset", variant="secondary", size="sm")
1077
+
1078
+ dataset_status_b = gr.Markdown("", visible=False)
1079
+ with gr.Row():
1080
+ prev_traj_btn_b = gr.Button("⬅️ Prev", variant="secondary", size="sm")
1081
+ trajectory_slider_b = gr.Slider(
1082
+ minimum=0, maximum=0, step=1, value=0, label="Trajectory Index", interactive=True
1083
+ )
1084
+ next_traj_btn_b = gr.Button("Next ➡️", variant="secondary", size="sm")
1085
+ trajectory_metadata_b = gr.Markdown("", visible=False)
1086
+ use_dataset_video_btn_b = gr.Button("Use Selected Video for B", variant="secondary")
1087
+
1088
+ with gr.Column():
1089
+ # Videos displayed side by side
1090
+ with gr.Row():
1091
+ video_a_display = gr.Video(label="Video A", height=400)
1092
+ video_b_display = gr.Video(label="Video B", height=400)
1093
+
1094
+ # Result text at the bottom
1095
+ result_text = gr.Markdown("")
1096
+
1097
+ # State variables for datasets
1098
+ current_dataset_a = gr.State(None)
1099
+ current_dataset_b = gr.State(None)
1100
+
1101
+ # Helper functions for Video A
1102
+ def update_config_choices_a(dataset_name):
1103
+ """Update config choices for Video A when dataset changes."""
1104
+ if not dataset_name:
1105
+ return gr.update(choices=[], value="")
1106
+ try:
1107
+ configs = get_available_configs(dataset_name)
1108
+ if configs:
1109
+ return gr.update(choices=configs, value=configs[0])
1110
+ else:
1111
+ return gr.update(choices=[], value="")
1112
+ except Exception as e:
1113
+ logger.warning(f"Could not fetch configs: {e}")
1114
+ return gr.update(choices=[], value="")
1115
+
1116
+ def load_dataset_a(dataset_name, config_name):
1117
+ """Load dataset A and update slider."""
1118
+ dataset, status = load_rfm_dataset(dataset_name, config_name)
1119
+ if dataset is not None:
1120
+ max_index = len(dataset) - 1
1121
+ return (
1122
+ dataset,
1123
+ gr.update(value=status, visible=True),
1124
+ gr.update(
1125
+ maximum=max_index, value=0, interactive=True, label=f"Trajectory Index (0 to {max_index})"
1126
+ ),
1127
+ )
1128
+ else:
1129
+ return None, gr.update(value=status, visible=True), gr.update(maximum=0, value=0, interactive=False)
1130
+
1131
+ def use_dataset_video_a(dataset, index, dataset_name):
1132
+ """Load video A from dataset and update input."""
1133
+ if dataset is None:
1134
+ return (
1135
+ None,
1136
+ gr.update(value="No dataset loaded", visible=True),
1137
+ gr.update(visible=False),
1138
+ )
1139
+
1140
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
1141
+ if video_path:
1142
+ # Build metadata text
1143
+ metadata_lines = []
1144
+ if quality_label:
1145
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
1146
+ if partial_success is not None:
1147
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1148
+
1149
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1150
+ status_text = f"✅ Loaded trajectory {index} from dataset for Video A"
1151
+ if metadata_text:
1152
+ status_text += f"\n\n{metadata_text}"
1153
+
1154
+ return (
1155
+ video_path,
1156
+ gr.update(value=status_text, visible=True),
1157
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
1158
+ )
1159
+ else:
1160
+ return (
1161
+ None,
1162
+ gr.update(value="❌ Error loading trajectory", visible=True),
1163
+ gr.update(visible=False),
1164
+ )
1165
+
1166
+ def next_trajectory_a(dataset, current_idx, dataset_name):
1167
+ """Go to next trajectory for Video A."""
1168
+ if dataset is None:
1169
+ return 0, None, gr.update(visible=False), gr.update(visible=False)
1170
+ next_idx = min(current_idx + 1, len(dataset) - 1)
1171
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
1172
+ dataset, next_idx, dataset_name
1173
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1174
 
1175
+ if video_path:
1176
+ # Build metadata text
1177
+ metadata_lines = []
1178
+ if quality_label:
1179
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
1180
+ if partial_success is not None:
1181
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1182
+
1183
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1184
+ return (
1185
+ next_idx,
1186
+ video_path,
1187
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
1188
+ gr.update(value=f"✅ Trajectory {next_idx}/{len(dataset) - 1}", visible=True),
1189
+ )
1190
+ else:
1191
+ return current_idx, None, gr.update(visible=False), gr.update(visible=False)
1192
+
1193
+ def prev_trajectory_a(dataset, current_idx, dataset_name):
1194
+ """Go to previous trajectory for Video A."""
1195
+ if dataset is None:
1196
+ return 0, None, gr.update(visible=False), gr.update(visible=False)
1197
+ prev_idx = max(current_idx - 1, 0)
1198
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
1199
+ dataset, prev_idx, dataset_name
1200
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1201
 
1202
+ if video_path:
1203
+ # Build metadata text
1204
+ metadata_lines = []
1205
+ if quality_label:
1206
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
1207
+ if partial_success is not None:
1208
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1209
+
1210
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1211
+ return (
1212
+ prev_idx,
1213
+ video_path,
1214
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
1215
+ gr.update(value=f"✅ Trajectory {prev_idx}/{len(dataset) - 1}", visible=True),
1216
+ )
1217
+ else:
1218
+ return current_idx, None, gr.update(visible=False), gr.update(visible=False)
1219
+
1220
+ def update_trajectory_on_slider_change_a(dataset, index, dataset_name):
1221
+ """Update trajectory metadata when slider changes for Video A."""
1222
+ if dataset is None:
1223
+ return gr.update(visible=False), gr.update(visible=False)
1224
+
1225
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
1226
+ if video_path:
1227
+ # Build metadata text
1228
+ metadata_lines = []
1229
+ if quality_label:
1230
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
1231
+ if partial_success is not None:
1232
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1233
+
1234
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1235
+ return (
1236
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
1237
+ gr.update(value=f"Trajectory {index}/{len(dataset) - 1}", visible=True),
1238
+ )
1239
+ else:
1240
+ return gr.update(visible=False), gr.update(visible=False)
1241
+
1242
+ # Helper functions for Video B (same as Video A)
1243
+ def update_config_choices_b(dataset_name):
1244
+ """Update config choices for Video B when dataset changes."""
1245
+ if not dataset_name:
1246
+ return gr.update(choices=[], value="")
1247
+ try:
1248
+ configs = get_available_configs(dataset_name)
1249
+ if configs:
1250
+ return gr.update(choices=configs, value=configs[0])
1251
+ else:
1252
+ return gr.update(choices=[], value="")
1253
+ except Exception as e:
1254
+ logger.warning(f"Could not fetch configs: {e}")
1255
+ return gr.update(choices=[], value="")
1256
+
1257
+ def load_dataset_b(dataset_name, config_name):
1258
+ """Load dataset B and update slider."""
1259
+ dataset, status = load_rfm_dataset(dataset_name, config_name)
1260
+ if dataset is not None:
1261
+ max_index = len(dataset) - 1
1262
+ return (
1263
+ dataset,
1264
+ gr.update(value=status, visible=True),
1265
+ gr.update(
1266
+ maximum=max_index, value=0, interactive=True, label=f"Trajectory Index (0 to {max_index})"
1267
+ ),
1268
+ )
1269
+ else:
1270
+ return None, gr.update(value=status, visible=True), gr.update(maximum=0, value=0, interactive=False)
1271
+
1272
+ def use_dataset_video_b(dataset, index, dataset_name):
1273
+ """Load video B from dataset and update input."""
1274
+ if dataset is None:
1275
+ return (
1276
+ None,
1277
+ gr.update(value="No dataset loaded", visible=True),
1278
+ gr.update(visible=False),
1279
+ )
1280
+
1281
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
1282
+ if video_path:
1283
+ # Build metadata text
1284
+ metadata_lines = []
1285
+ if quality_label:
1286
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
1287
+ if partial_success is not None:
1288
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1289
+
1290
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1291
+ status_text = f"✅ Loaded trajectory {index} from dataset for Video B"
1292
+ if metadata_text:
1293
+ status_text += f"\n\n{metadata_text}"
1294
+
1295
+ return (
1296
+ video_path,
1297
+ gr.update(value=status_text, visible=True),
1298
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
1299
+ )
1300
+ else:
1301
+ return (
1302
+ None,
1303
+ gr.update(value="❌ Error loading trajectory", visible=True),
1304
+ gr.update(visible=False),
1305
+ )
1306
+
1307
+ def next_trajectory_b(dataset, current_idx, dataset_name):
1308
+ """Go to next trajectory for Video B."""
1309
+ if dataset is None:
1310
+ return 0, None, gr.update(visible=False), gr.update(visible=False)
1311
+ next_idx = min(current_idx + 1, len(dataset) - 1)
1312
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
1313
+ dataset, next_idx, dataset_name
1314
+ )
1315
 
1316
+ if video_path:
1317
+ # Build metadata text
1318
+ metadata_lines = []
1319
+ if quality_label:
1320
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
1321
+ if partial_success is not None:
1322
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1323
+
1324
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1325
+ return (
1326
+ next_idx,
1327
+ video_path,
1328
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
1329
+ gr.update(value=f"✅ Trajectory {next_idx}/{len(dataset) - 1}", visible=True),
1330
+ )
1331
+ else:
1332
+ return current_idx, None, gr.update(visible=False), gr.update(visible=False)
1333
+
1334
+ def prev_trajectory_b(dataset, current_idx, dataset_name):
1335
+ """Go to previous trajectory for Video B."""
1336
+ if dataset is None:
1337
+ return 0, None, gr.update(visible=False), gr.update(visible=False)
1338
+ prev_idx = max(current_idx - 1, 0)
1339
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(
1340
+ dataset, prev_idx, dataset_name
1341
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
1342
 
1343
+ if video_path:
1344
+ # Build metadata text
1345
+ metadata_lines = []
1346
+ if quality_label:
1347
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
1348
+ if partial_success is not None:
1349
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1350
+
1351
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1352
+ return (
1353
+ prev_idx,
1354
+ video_path,
1355
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
1356
+ gr.update(value=f"✅ Trajectory {prev_idx}/{len(dataset) - 1}", visible=True),
1357
+ )
1358
+ else:
1359
+ return current_idx, None, gr.update(visible=False), gr.update(visible=False)
1360
+
1361
+ def update_trajectory_on_slider_change_b(dataset, index, dataset_name):
1362
+ """Update trajectory metadata when slider changes for Video B."""
1363
+ if dataset is None:
1364
+ return gr.update(visible=False), gr.update(visible=False)
1365
+
1366
+ video_path, task, quality_label, partial_success = get_trajectory_video_path(dataset, index, dataset_name)
1367
+ if video_path:
1368
+ # Build metadata text
1369
+ metadata_lines = []
1370
+ if quality_label:
1371
+ metadata_lines.append(f"**Quality Label:** {quality_label}")
1372
+ if partial_success is not None:
1373
+ metadata_lines.append(f"**Partial Success:** {partial_success:.3f}")
1374
+
1375
+ metadata_text = "\n".join(metadata_lines) if metadata_lines else ""
1376
+ return (
1377
+ gr.update(value=metadata_text, visible=bool(metadata_text)),
1378
+ gr.update(value=f"Trajectory {index}/{len(dataset) - 1}", visible=True),
1379
+ )
1380
+ else:
1381
+ return gr.update(visible=False), gr.update(visible=False)
1382
+
1383
+ # Video A dataset selection handlers
1384
+ dataset_name_a.change(
1385
+ fn=update_config_choices_a, inputs=[dataset_name_a], outputs=[config_name_a]
1386
+ )
1387
 
1388
+ refresh_configs_btn_a.click(
1389
+ fn=update_config_choices_a, inputs=[dataset_name_a], outputs=[config_name_a]
1390
+ )
1391
 
1392
+ load_dataset_btn_a.click(
1393
+ fn=load_dataset_a,
1394
+ inputs=[dataset_name_a, config_name_a],
1395
+ outputs=[current_dataset_a, dataset_status_a, trajectory_slider_a],
1396
+ )
1397
 
1398
+ use_dataset_video_btn_a.click(
1399
+ fn=use_dataset_video_a,
1400
+ inputs=[current_dataset_a, trajectory_slider_a, dataset_name_a],
1401
+ outputs=[video_a_input, dataset_status_a, trajectory_metadata_a],
1402
+ )
1403
 
1404
+ next_traj_btn_a.click(
1405
+ fn=next_trajectory_a,
1406
+ inputs=[current_dataset_a, trajectory_slider_a, dataset_name_a],
1407
+ outputs=[
1408
+ trajectory_slider_a,
1409
+ video_a_input,
1410
+ trajectory_metadata_a,
1411
+ dataset_status_a,
1412
+ ],
1413
+ )
1414
 
1415
+ prev_traj_btn_a.click(
1416
+ fn=prev_trajectory_a,
1417
+ inputs=[current_dataset_a, trajectory_slider_a, dataset_name_a],
1418
+ outputs=[
1419
+ trajectory_slider_a,
1420
+ video_a_input,
1421
+ trajectory_metadata_a,
1422
+ dataset_status_a,
1423
+ ],
1424
+ )
1425
 
1426
+ trajectory_slider_a.change(
1427
+ fn=update_trajectory_on_slider_change_a,
1428
+ inputs=[current_dataset_a, trajectory_slider_a, dataset_name_a],
1429
+ outputs=[trajectory_metadata_a, dataset_status_a],
1430
+ )
1431
 
1432
+ # Video B dataset selection handlers
1433
+ dataset_name_b.change(
1434
+ fn=update_config_choices_b, inputs=[dataset_name_b], outputs=[config_name_b]
1435
+ )
1436
 
1437
+ refresh_configs_btn_b.click(
1438
+ fn=update_config_choices_b, inputs=[dataset_name_b], outputs=[config_name_b]
1439
+ )
1440
 
1441
+ load_dataset_btn_b.click(
1442
+ fn=load_dataset_b,
1443
+ inputs=[dataset_name_b, config_name_b],
1444
+ outputs=[current_dataset_b, dataset_status_b, trajectory_slider_b],
1445
+ )
1446
 
1447
+ use_dataset_video_btn_b.click(
1448
+ fn=use_dataset_video_b,
1449
+ inputs=[current_dataset_b, trajectory_slider_b, dataset_name_b],
1450
+ outputs=[video_b_input, dataset_status_b, trajectory_metadata_b],
1451
+ )
1452
 
1453
+ next_traj_btn_b.click(
1454
+ fn=next_trajectory_b,
1455
+ inputs=[current_dataset_b, trajectory_slider_b, dataset_name_b],
1456
+ outputs=[
1457
+ trajectory_slider_b,
1458
+ video_b_input,
1459
+ trajectory_metadata_b,
1460
+ dataset_status_b,
1461
+ ],
1462
+ )
1463
 
1464
+ prev_traj_btn_b.click(
1465
+ fn=prev_trajectory_b,
1466
+ inputs=[current_dataset_b, trajectory_slider_b, dataset_name_b],
1467
+ outputs=[
1468
+ trajectory_slider_b,
1469
+ video_b_input,
1470
+ trajectory_metadata_b,
1471
+ dataset_status_b,
1472
+ ],
1473
+ )
1474
 
1475
+ trajectory_slider_b.change(
1476
+ fn=update_trajectory_on_slider_change_b,
1477
+ inputs=[current_dataset_b, trajectory_slider_b, dataset_name_b],
1478
+ outputs=[trajectory_metadata_b, dataset_status_b],
1479
+ )
1480
 
1481
+ analyze_dual_btn.click(
1482
+ fn=process_two_videos,
1483
+ inputs=[video_a_input, video_b_input, task_text_dual, prediction_type, server_url_state, fps_input_dual],
1484
+ outputs=[result_text, video_a_display, video_b_display],
1485
+ api_name="process_two_videos",
1486
+ )
1487
 
1488
 
1489
  def main():