Spaces:
Sleeping
Sleeping
| """ | |
| Video Inference Demo App | |
| A Gradio demo to visualize video inference results from multiple models. | |
| Shows the corresponding video alongside inference data including prompt, | |
| expected answer, model response, and judgment. | |
| """ | |
| import gradio as gr | |
| import json | |
| from pathlib import Path | |
| from typing import Dict, Any | |
# Anchor every data directory to the folder that contains this file, so the
# demo only depends on paths relative to itself (Hugging Face Spaces layout).
BASE_DIR = Path(__file__).parent.resolve()
OUTPUTS_DIR = BASE_DIR.joinpath("outputs")  # one sub-folder per experiment run
VIDEOS_DIR = BASE_DIR.joinpath("videos")  # video files named after their ID
def get_available_experiments():
    """Return the sorted names of experiments that can actually be loaded.

    An experiment ``NAME`` is stored as two sibling folders inside
    ``OUTPUTS_DIR``: ``NAME-wo-prepost`` and ``NAME-with-prepost``, each
    holding an ``inference.json``. A name is reported only when *both*
    folders are present, because the loaders require both variants.

    Folders whose name contains ``"must"`` are ablation runs and are ignored.

    Returns:
        A sorted list of experiment base names; empty when ``OUTPUTS_DIR``
        does not exist or holds no complete experiment.
    """
    if not OUTPUTS_DIR.exists():
        return []

    variant_suffixes = ("-wo-prepost", "-with-prepost")
    variants_seen = {}
    for entry in OUTPUTS_DIR.iterdir():
        # Ignore folders with "must" as it is an ablation.
        if not entry.is_dir() or "must" in entry.name:
            continue
        if not (entry / "inference.json").exists():
            continue
        for suffix in variant_suffixes:
            if not entry.name.endswith(suffix):
                continue
            # Strip the known suffix instead of blindly dropping the last two
            # hyphen-separated parts, which mangled unrelated folder names.
            base_name = entry.name[: -len(suffix)]
            if base_name:
                variants_seen.setdefault(base_name, set()).add(suffix)
            break

    return sorted(
        name
        for name, seen in variants_seen.items()
        if len(seen) == len(variant_suffixes)
    )
def load_inference_data(experiment_name):
    """Load the per-video inference results for both variants of an experiment.

    Reads ``inference.json`` from ``<experiment_name>-wo-prepost`` and
    ``<experiment_name>-with-prepost`` under ``OUTPUTS_DIR``.

    Args:
        experiment_name: Experiment base name, without the variant suffix.

    Returns:
        A dict with keys ``"without_prepost"`` and ``"with_prepost"`` mapping
        to the parsed JSON content of the respective file.

    Raises:
        FileNotFoundError: If either variant's ``inference.json`` is missing.
    """
    variant_paths = {
        "without_prepost": OUTPUTS_DIR / (experiment_name + "-wo-prepost") / "inference.json",
        "with_prepost": OUTPUTS_DIR / (experiment_name + "-with-prepost") / "inference.json",
    }
    if not all(path.exists() for path in variant_paths.values()):
        raise FileNotFoundError(f"Inference data not found for experiment {experiment_name} without or with prepost")

    loaded = {}
    for variant, path in variant_paths.items():
        # JSON is UTF-8 by specification; never rely on the platform's
        # default text encoding when reading it.
        with open(path, "r", encoding="utf-8") as f:
            loaded[variant] = json.load(f)
    return loaded
def load_evals_data(experiment_name):
    """Load the aggregate metrics for both variants of an experiment.

    Reads ``metrics.json`` from ``<experiment_name>-wo-prepost`` and
    ``<experiment_name>-with-prepost`` under ``OUTPUTS_DIR``.

    Args:
        experiment_name: Experiment base name, without the variant suffix.

    Returns:
        A dict with keys ``"without_prepost"`` and ``"with_prepost"`` mapping
        to the parsed JSON content of the respective file.

    Raises:
        FileNotFoundError: If either variant's ``metrics.json`` is missing.
    """
    variant_paths = {
        "without_prepost": OUTPUTS_DIR / (experiment_name + "-wo-prepost") / "metrics.json",
        "with_prepost": OUTPUTS_DIR / (experiment_name + "-with-prepost") / "metrics.json",
    }
    if not all(path.exists() for path in variant_paths.values()):
        raise FileNotFoundError(f"Metrics data not found for experiment {experiment_name} without or with prepost")

    loaded = {}
    for variant, path in variant_paths.items():
        # JSON is UTF-8 by specification; never rely on the platform's
        # default text encoding when reading it.
        with open(path, "r", encoding="utf-8") as f:
            loaded[variant] = json.load(f)
    return loaded
def get_video_path(video_id):
    """Locate the video file for *video_id* inside ``VIDEOS_DIR``.

    ``<video_id>.mp4`` is preferred; ``<video_id>.webm`` is the fallback.

    Returns:
        The path of the first existing candidate as a string, or ``None``
        when neither file exists.
    """
    for extension in ("mp4", "webm"):
        candidate = VIDEOS_DIR / f"{video_id}.{extension}"
        if candidate.exists():
            return str(candidate)
    return None
def format_prompt_for_display(prompt):
    """Return *prompt* with every literal backslash-n pair turned into a real line break."""
    # Split on the two-character escape sequence and re-join with newlines.
    return "\n".join(prompt.split("\\n"))
def _nearest_index(indices, target, direction):
    """Return the element of sorted *indices* closest to *target* on one side.

    Args:
        indices: Ascending sequence of ints (a ``list`` or a ``range``).
        target: The index to search around.
        direction: ``'high'`` selects the smallest element ``>= target``;
            any other value selects the largest element ``<= target``.

    Returns:
        The matching element, or ``None`` when no element lies on that side.
    """
    from bisect import bisect_left, bisect_right

    if direction == "high":
        position = bisect_left(indices, target)
        return indices[position] if position < len(indices) else None
    position = bisect_right(indices, target)
    return indices[position - 1] if position > 0 else None


def _format_metric_line(metrics: Dict[str, Any]) -> str:
    """Render one variant's metrics as ``" Label: 0.00 | Label: 0.00 "``.

    Known metric keys get a human-readable label; unknown keys are shown
    under their raw name and non-numeric values are shown as-is, so an extra
    field in the metrics file cannot break the display.
    """
    labels = {
        "accuracy": "Accuracy",
        "invalid_rate": "Invalid",
        "f1": "F1 Score",
        "precision": "Precision",
        "recall": "Recall",
        "robust_accuracy": "Robust Accuracy",
    }
    parts = []
    for key, value in metrics.items():
        label = labels.get(key, key)
        if isinstance(value, (int, float)):
            parts.append(" {}: {:.2f} ".format(label, value))
        else:
            parts.append(" {}: {} ".format(label, value))
    return "|".join(parts)


def create_demo():
    """Create and return the Gradio demo interface.

    The interface lets the user pick an experiment, step through its entries
    (optionally restricted to correct/wrong judgments of the run without
    pre-post conditions), jump to an entry by video ID, and compare the
    responses and judgments of the runs without and with pre-post conditions.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).

    Raises:
        ValueError: If no experiment outputs are available.
    """
    # Load initial data
    experiments = get_available_experiments()
    if not experiments:
        raise ValueError("No experiment outputs found in the outputs directory!")

    # Per-experiment caches so each JSON file is parsed at most once.
    data_cache: Dict[str, Dict[str, Any]] = {}
    metrics_cache: Dict[str, Dict[str, Any]] = {}

    filter_type_to_judgment = {
        "Correct Only": "correct",
        "Wrong Only": "wrong",
    }
    # NOTE(review): the glyph prefixes in the UI strings of this function look
    # like mis-encoded emoji; they are kept verbatim here - confirm them
    # against the original file before changing.
    judgment_to_style = {
        "correct": "β Correct",
        "wrong": "β Wrong",
        "invalid": "β οΈ Invalid"
    }

    def load_experiment_data(experiment_name):
        """Return the (cached) inference data of both variants."""
        if experiment_name not in data_cache:
            data_cache[experiment_name] = load_inference_data(experiment_name)
        return data_cache[experiment_name]

    def load_metrics_data(experiment_name):
        """Return the (cached) metrics of both variants."""
        if experiment_name not in metrics_cache:
            metrics_cache[experiment_name] = load_evals_data(experiment_name)
        return metrics_cache[experiment_name]

    def get_entry_count(experiment_name):
        """Return the number of entries in the experiment."""
        data = load_experiment_data(experiment_name)
        # Since both experiment results have the same data in the same order,
        # we can return the length of either.
        return len(data["without_prepost"])

    def find_index_by_video_id(experiment_name, video_id):
        """Find the index of an entry by its video ID, or None if absent."""
        data = load_experiment_data(experiment_name)
        wanted = str(video_id).strip()
        for i, entry in enumerate(data["without_prepost"]):
            if str(entry["id"]) == wanted:
                return i
        return None

    def format_metrics(metrics: Dict[str, Dict[str, Any]]):
        """Format the metrics of both variants as a two-line summary."""
        without_prepost_metrics = _format_metric_line(metrics["without_prepost"])
        with_prepost_metrics = _format_metric_line(metrics["with_prepost"])
        return f"\nWithout Pre-Post: {without_prepost_metrics}\nWith Pre-Post: {with_prepost_metrics}"

    def get_entry(experiment_name, index):
        """Collect the display values for the entry at *index*.

        Returns a 9-tuple: video path, video ID, prompt, expected answer,
        response without / with pre-post, raw judgment without / with
        pre-post, and the stats text. When *index* is ``None`` or out of
        range, a placeholder tuple with "No entry found" is returned.
        """
        data = load_experiment_data(experiment_name)
        entries_without = data["without_prepost"]
        if index is None or index < 0 or index >= len(entries_without):
            return None, "", "", "", "", "", "", "", "No entry found"

        without_prepost_entry = entries_without[index]
        # Assumes both variants list the same videos in the same order.
        with_prepost_entry = data["with_prepost"][index]

        video_id = without_prepost_entry["id"]
        # The displayed prompt is the one that includes pre-post conditions.
        prompt = format_prompt_for_display(with_prepost_entry["prompt"])

        formatted_metrics = format_metrics(load_metrics_data(experiment_name))
        stats = f"[Entry {index + 1} of {len(entries_without)}]\n{formatted_metrics}"

        return (
            get_video_path(video_id),
            video_id,
            prompt,
            without_prepost_entry["answer"],
            without_prepost_entry["response"],
            with_prepost_entry["response"],
            without_prepost_entry["judgment"],
            with_prepost_entry["judgment"],
            stats,
        )

    def style_judgment(judgment):
        """Map a raw judgment to its display text; unknown values pass through."""
        return judgment_to_style.get(judgment, judgment)

    def render_entry(experiment_name, index):
        """Build the 11-value tuple every event handler returns.

        Order matches ``[video_id_input, current_index_state] + outputs``.
        """
        (video, vid_id, prompt, answer,
         without_prepost_response, with_prepost_response,
         without_prepost_judgment, with_prepost_judgment, stats) = get_entry(experiment_name, index)
        return (
            vid_id,  # video_id_input value
            index,  # current_index state
            video,
            vid_id,
            prompt,
            answer,
            without_prepost_response,
            with_prepost_response,
            style_judgment(without_prepost_judgment),
            style_judgment(with_prepost_judgment),
            stats,
        )

    def on_experiment_change(experiment_name):
        """Show the first entry of the newly selected experiment."""
        row = render_entry(experiment_name, 0)
        # The search box always receives the ID as text ("" when there is no entry).
        return (str(row[0]),) + row[1:]

    def on_video_id_submit(experiment_name, video_id_str, current_index):
        """Handle video ID input - find and load the matching entry."""
        index = find_index_by_video_id(experiment_name, video_id_str)
        if index is not None:
            return render_entry(experiment_name, index)
        # Video ID not found, stay at current position and say so in the stats.
        row = render_entry(experiment_name, int(current_index))
        return row[:-1] + (f"β οΈ Video ID '{video_id_str}' not found. {row[-1]}",)

    def indices_by_filter_type(experiment_name, filter_type):
        """Return the ascending entry indices that satisfy *filter_type*.

        Assumption: since the experiment without pre-post conditions has more
        mistakes, the filter is applied to the without-pre-post judgments.
        """
        entries = load_experiment_data(experiment_name)["without_prepost"]
        if filter_type == "All":
            return range(len(entries))
        judgment = filter_type_to_judgment[filter_type]
        return [i for i, e in enumerate(entries) if e.get("judgment") == judgment]

    def next_entry(experiment_name, current_index, filter_type="All"):
        """Move to the next entry that satisfies the filter, if there is one."""
        current = int(current_index)
        candidate = min(current + 1, get_entry_count(experiment_name) - 1)
        # Nearest filtered index >= candidate.
        new_index = _nearest_index(indices_by_filter_type(experiment_name, filter_type), candidate, "high")
        if new_index is None:
            # Nothing further ahead matches the filter: stay where we are.
            new_index = current
        return render_entry(experiment_name, new_index)

    def prev_entry(experiment_name, current_index, filter_type="All"):
        """Move to the previous entry that satisfies the filter, if there is one."""
        current = int(current_index)
        candidate = max(current - 1, 0)
        # Nearest filtered index <= candidate.
        new_index = _nearest_index(indices_by_filter_type(experiment_name, filter_type), candidate, "low")
        if new_index is None:
            # Nothing further back matches the filter: stay where we are.
            new_index = current
        return render_entry(experiment_name, new_index)

    def filter_by_judgment(experiment_name, filter_type, current_index):
        """Jump to the entry closest to the current one that satisfies the filter."""
        matching = indices_by_filter_type(experiment_name, filter_type)
        if not matching:
            # One value per wired output (11 in total).
            return (
                "", current_index, None, "", "No entries match filter",
                "", "", "", "", "", "No matching entries",
            )
        current = int(current_index)
        # Find closest matching index
        new_index = min(matching, key=lambda i: abs(i - current))
        row = render_entry(experiment_name, new_index)
        return row[:-1] + (f"{row[-1]}\nShowing: {filter_type} ({len(matching)} entries)",)

    custom_css = """
    .judgment-correct {
        color: #22c55e !important;
        font-weight: bold !important;
        font-size: 1.2em !important;
    }
    .judgment-wrong {
        color: #ef4444 !important;
        font-weight: bold !important;
        font-size: 1.2em !important;
    }
    .stats-bar textarea {
        background: linear-gradient(90deg, #3b82f6 0%, #8b5cf6 100%) !important;
        color: white !important;
        padding: 10px 15px !important;
        border-radius: 8px !important;
        font-weight: 500 !important;
        white-space: pre-wrap !important;
        line-height: 1.5 !important;
        border: none !important;
        resize: none !important;
    }
    .stats-bar {
        background: transparent !important;
        border: none !important;
    }
    .prompt-box {
        font-family: monospace;
        white-space: pre-wrap;
    }
    /* Ensure textboxes wrap text properly and don't overflow */
    .wrap textarea {
        overflow-wrap: break-word !important;
        word-wrap: break-word !important;
    }
    /* Make columns more compact */
    .compact-row > .gr-column {
        min-width: 180px !important;
    }
    """

    # Build the interface
    with gr.Blocks(
        title="Video Inference Viewer",
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="slate",
        ),
        css=custom_css
    ) as demo:
        gr.Markdown(
            """
            # π¬ Video Inference Viewer
            Explore model predictions on video understanding tasks. Select an experiment, browse through entries,
            and compare expected answers with model responses.
            """
        )
        with gr.Row():
            experiment_dropdown = gr.Dropdown(
                choices=experiments,
                value=experiments[0] if experiments else None,
                label="π€ Select Experiment",
                scale=3
            )
            filter_dropdown = gr.Dropdown(
                choices=["All", "Correct Only", "Wrong Only"],
                value="All",
                label="π Filter Results (without Pre-Post)",
                scale=2
            )
        stats_display = gr.Textbox(
            show_label=False,
            interactive=False,
            lines=3,
            max_lines=5,
            elem_classes=["stats-bar"]
        )
        # State to track current index (since we're using video ID for display)
        current_index_state = gr.State(value=0)
        with gr.Row():
            prev_btn = gr.Button("β¬ οΈ Previous", scale=1)
            video_id_input = gr.Textbox(
                value="",
                label="π Video ID (press Enter to search)",
                placeholder="Enter video ID...",
                scale=4
            )
            next_btn = gr.Button("Next β‘οΈ", scale=1)
        with gr.Row(equal_height=True):
            # Left column: Video
            with gr.Column(scale=1):
                video_display = gr.Video(
                    label="πΉ Video",
                    autoplay=True,
                    loop=True,
                    height=400
                )
                video_id_display = gr.Textbox(
                    label="Video ID",
                    interactive=False
                )
            # Right column: Text info
            with gr.Column(scale=1):
                with gr.Accordion("π Prompt with Pre-Post Conditions", open=True):
                    prompt_display = gr.Textbox(
                        show_label=False,
                        interactive=False,
                        lines=8,
                        elem_classes=["prompt-box"]
                    )
                with gr.Row():
                    with gr.Column(scale=1, min_width=200):
                        answer_display = gr.Textbox(
                            label="β Expected Answer",
                            interactive=False,
                            lines=1
                        )
                    with gr.Column(scale=1, min_width=200):
                        without_prepost_response_display = gr.Textbox(
                            label="π€ Response without Pre-Post",
                            interactive=False,
                            lines=1
                        )
                    with gr.Column(scale=1, min_width=200):
                        with_prepost_response_display = gr.Textbox(
                            label="π€ Response with Pre-Post",
                            interactive=False,
                            lines=1
                        )
                with gr.Row():
                    with gr.Column():
                        without_prepost_judgment_display = gr.Textbox(
                            label="π€ Judgment without Pre-Post",
                            interactive=False,
                            lines=1
                        )
                    with gr.Column():
                        with_prepost_judgment_display = gr.Textbox(
                            label="π€ Judgment with Pre-Post",
                            interactive=False,
                            lines=1
                        )

        # Event handlers: every handler returns one value per component below,
        # in exactly this order.
        outputs = [video_display, video_id_display, prompt_display,
                   answer_display, without_prepost_response_display, with_prepost_response_display,
                   without_prepost_judgment_display, with_prepost_judgment_display, stats_display]
        experiment_dropdown.change(
            fn=on_experiment_change,
            inputs=[experiment_dropdown],
            outputs=[video_id_input, current_index_state] + outputs
        )
        video_id_input.submit(
            fn=on_video_id_submit,
            inputs=[experiment_dropdown, video_id_input, current_index_state],
            outputs=[video_id_input, current_index_state] + outputs
        )
        next_btn.click(
            fn=next_entry,
            inputs=[experiment_dropdown, current_index_state, filter_dropdown],
            outputs=[video_id_input, current_index_state] + outputs
        )
        prev_btn.click(
            fn=prev_entry,
            inputs=[experiment_dropdown, current_index_state, filter_dropdown],
            outputs=[video_id_input, current_index_state] + outputs
        )
        filter_dropdown.change(
            fn=filter_by_judgment,
            inputs=[experiment_dropdown, filter_dropdown, current_index_state],
            outputs=[video_id_input, current_index_state] + outputs
        )
        # Load initial data
        demo.load(
            fn=on_experiment_change,
            inputs=[experiment_dropdown],
            outputs=[video_id_input, current_index_state] + outputs
        )
    return demo
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it on all network
    # interfaces at port 7860 with share=True passed to launch().
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo = create_demo()
    demo.launch(**launch_options)