jena-shreyas's picture
Remove Invalid Only from top right dropdown for now
fb7230b
"""
Video Inference Demo App
A Gradio demo to visualize video inference results from multiple models.
Shows the corresponding video alongside inference data including prompt,
expected answer, model response, and judgment.
"""
import json
from bisect import bisect_left, bisect_right
from pathlib import Path
from typing import Any, Dict, List, Optional

import gradio as gr
# Every location is anchored at the directory holding this file, so the demo
# keeps working when deployed to Huggingface Spaces.
BASE_DIR = Path(__file__).parent.resolve()
OUTPUTS_DIR = BASE_DIR.joinpath("outputs")  # per-experiment result folders
VIDEOS_DIR = BASE_DIR.joinpath("videos")  # video files named by ID
def get_available_experiments(outputs_dir: Optional[Path] = None) -> List[str]:
    """Return the sorted names of experiments that can actually be loaded.

    An experiment ``NAME`` is stored as two sibling folders,
    ``NAME-wo-prepost`` and ``NAME-with-prepost``, each holding an
    ``inference.json``. Only names for which BOTH folders are present are
    returned, because the loaders raise ``FileNotFoundError`` when either one
    is missing. Folders whose name contains ``"must"`` are ablation runs and
    are ignored.

    Args:
        outputs_dir: Directory to scan. Defaults to the module-level
            ``OUTPUTS_DIR``.

    Returns:
        Sorted list of experiment base names; empty when the directory does
        not exist.
    """
    root = OUTPUTS_DIR if outputs_dir is None else Path(outputs_dir)
    if not root.is_dir():
        return []

    variant_suffixes = ("-wo-prepost", "-with-prepost")
    variants_seen: Dict[str, set] = {}
    for folder in root.iterdir():
        # Ignore folders with "must" as it is an ablation.
        if "must" in folder.name:
            continue
        # is_file() on the child also guarantees that ``folder`` is a directory.
        if not (folder / "inference.json").is_file():
            continue
        for suffix in variant_suffixes:
            # Strip the suffix explicitly instead of dropping the last two
            # dash-separated tokens, which mangled unrelated folder names.
            if folder.name.endswith(suffix) and len(folder.name) > len(suffix):
                base_name = folder.name[: -len(suffix)]
                variants_seen.setdefault(base_name, set()).add(suffix)
                break

    return sorted(
        name
        for name, seen in variants_seen.items()
        if len(seen) == len(variant_suffixes)
    )
def _load_prepost_json(experiment_name, filename, description, outputs_dir=None):
    """Load ``filename`` from both variant folders of an experiment.

    Args:
        experiment_name: Base name; ``-wo-prepost`` / ``-with-prepost`` are
            appended to find the two folders.
        filename: JSON file to read inside each folder.
        description: Leading word of the error message ("Inference", "Metrics").
        outputs_dir: Directory holding the folders. Defaults to ``OUTPUTS_DIR``.

    Returns:
        ``{"without_prepost": <parsed JSON>, "with_prepost": <parsed JSON>}``.

    Raises:
        FileNotFoundError: If either of the two files does not exist.
    """
    root = OUTPUTS_DIR if outputs_dir is None else Path(outputs_dir)
    variant_paths = {
        "without_prepost": root / f"{experiment_name}-wo-prepost" / filename,
        "with_prepost": root / f"{experiment_name}-with-prepost" / filename,
    }
    if not all(path.exists() for path in variant_paths.values()):
        raise FileNotFoundError(
            f"{description} data not found for experiment {experiment_name} without or with prepost"
        )

    loaded = {}
    for variant, path in variant_paths.items():
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(path, "r", encoding="utf-8") as f:
            loaded[variant] = json.load(f)
    return loaded


def load_inference_data(experiment_name, outputs_dir=None):
    """Load inference data for a specific experiment.

    Args:
        experiment_name: Experiment base name (without the prepost suffix).
        outputs_dir: Optional override of the outputs directory.

    Returns:
        Dict with keys ``"without_prepost"`` and ``"with_prepost"`` holding the
        parsed ``inference.json`` of each variant.

    Raises:
        FileNotFoundError: If either ``inference.json`` is missing.
    """
    return _load_prepost_json(experiment_name, "inference.json", "Inference", outputs_dir)


def load_evals_data(experiment_name, outputs_dir=None):
    """Load metrics data for a specific experiment.

    Args:
        experiment_name: Experiment base name (without the prepost suffix).
        outputs_dir: Optional override of the outputs directory.

    Returns:
        Dict with keys ``"without_prepost"`` and ``"with_prepost"`` holding the
        parsed ``metrics.json`` of each variant.

    Raises:
        FileNotFoundError: If either ``metrics.json`` is missing.
    """
    return _load_prepost_json(experiment_name, "metrics.json", "Metrics", outputs_dir)
def get_video_path(video_id, videos_dir: Optional[Path] = None, extensions=(".mp4", ".webm")) -> Optional[str]:
    """Get the path to a video file by ID.

    Args:
        video_id: Identifier used as the file stem.
        videos_dir: Directory to search. Defaults to the module-level
            ``VIDEOS_DIR``.
        extensions: Suffixes to try, in priority order.

    Returns:
        The path (as ``str``) of the first existing ``<video_id><extension>``
        file, or ``None`` when no candidate exists.
    """
    root = VIDEOS_DIR if videos_dir is None else Path(videos_dir)
    for extension in extensions:
        candidate = root / f"{video_id}{extension}"
        # is_file() rather than exists(): a directory is not a playable video.
        if candidate.is_file():
            return str(candidate)
    return None
def format_prompt_for_display(prompt: str) -> str:
    """Return ``prompt`` with every literal backslash-n turned into a real line break."""
    escaped_newline = "\\n"
    pieces = prompt.split(escaped_newline)
    return "\n".join(pieces)
def create_demo():
    """Create and return the Gradio demo interface.

    For one experiment at a time the UI shows a video next to its prompt, the
    expected answer, and the response and judgment of the runs with and
    without pre-post conditions. Entries can be browsed with Previous/Next,
    looked up by video ID, and filtered by the judgment of the run without
    pre-post conditions.

    Returns:
        The ``gr.Blocks`` app with all event handlers wired up.

    Raises:
        ValueError: If no experiment outputs are found.
    """
    experiments = get_available_experiments()
    if not experiments:
        raise ValueError("No experiment outputs found in the outputs directory!")

    # Per-experiment caches so each JSON file is parsed at most once.
    data_cache: Dict[str, Dict[str, Any]] = {}
    metrics_cache: Dict[str, Dict[str, Any]] = {}

    filter_type_to_judgment = {
        "Correct Only": "correct",
        "Wrong Only": "wrong",
    }
    judgment_to_style = {
        "correct": "✅ Correct",
        "wrong": "❌ Wrong",
        "invalid": "⚠️ Invalid",
    }

    def load_experiment_data(experiment_name):
        """Return the (cached) inference data of ``experiment_name``."""
        if experiment_name not in data_cache:
            data_cache[experiment_name] = load_inference_data(experiment_name)
        return data_cache[experiment_name]

    def load_metrics_data(experiment_name):
        """Return the (cached) metrics of ``experiment_name``."""
        if experiment_name not in metrics_cache:
            metrics_cache[experiment_name] = load_evals_data(experiment_name)
        return metrics_cache[experiment_name]

    def get_entry_count(experiment_name):
        """Return the number of entries of the run without pre-post conditions."""
        data = load_experiment_data(experiment_name)
        # Both runs are expected to hold the same entries in the same order,
        # so the length of either one is used.
        return len(data["without_prepost"])

    def find_index_by_video_id(experiment_name, video_id):
        """Find the index of an entry by its video ID, or ``None`` if absent."""
        data = load_experiment_data(experiment_name)
        wanted = str(video_id).strip()
        return next(
            (
                position
                for position, entry in enumerate(data["without_prepost"])
                if str(entry["id"]) == wanted
            ),
            None,
        )

    def format_metrics(metrics: Dict[str, Dict[str, float]]):
        """Render the metrics of both runs as two ``|``-separated lines."""
        metrics2keys = {
            "accuracy": "Accuracy",
            "invalid_rate": "Invalid",
            "f1": "F1 Score",
            "precision": "Precision",
            "recall": "Recall",
            "robust_accuracy": "Robust Accuracy",
        }

        def format_value(value):
            # Non-numeric values are shown verbatim instead of breaking ':.2f'.
            if isinstance(value, (int, float)):
                return "{:.2f}".format(value)
            return str(value)

        def format_per_prepost_type(values: Dict[str, float]):
            # Metric names without a display label fall back to the raw key.
            return "|".join(
                " {}: {} ".format(metrics2keys.get(key, key), format_value(value))
                for key, value in values.items()
            )

        without_prepost_metrics = format_per_prepost_type(metrics["without_prepost"])
        with_prepost_metrics = format_per_prepost_type(metrics["with_prepost"])
        return f"\nWithout Pre-Post: {without_prepost_metrics}\nWith Pre-Post: {with_prepost_metrics}"

    def get_entry(experiment_name, index):
        """Return the nine display values of entry ``index``.

        Order: video path, video ID, prompt, expected answer, response without
        pre-post, response with pre-post, judgment without pre-post, judgment
        with pre-post, stats text. A ``None`` or out-of-range index yields
        placeholder values ending in "No entry found".
        """
        data = load_experiment_data(experiment_name)
        without_entries = data["without_prepost"] if data else []
        with_entries = data["with_prepost"] if data else []
        # Bound by the shorter list so that a length mismatch between the two
        # runs cannot raise IndexError.
        usable = min(len(without_entries), len(with_entries))
        if index is None or index < 0 or index >= usable:
            return None, "", "", "", "", "", "", "", "No entry found"

        without_prepost_entry = without_entries[index]
        with_prepost_entry = with_entries[index]
        video_id = without_prepost_entry["id"]

        # Metrics describe the whole experiment, not the single entry.
        formatted_metrics = format_metrics(load_metrics_data(experiment_name))
        total = get_entry_count(experiment_name)
        stats = f"[Entry {index + 1} of {total}]\n{formatted_metrics}"

        return (
            get_video_path(video_id),
            video_id,
            # The prompt shown is the one that includes the pre-post conditions.
            format_prompt_for_display(with_prepost_entry["prompt"]),
            without_prepost_entry["answer"],
            without_prepost_entry["response"],
            with_prepost_entry["response"],
            without_prepost_entry["judgment"],
            with_prepost_entry["judgment"],
            stats,
        )

    def render_entry(experiment_name, index, stats_prefix="", stats_suffix=""):
        """Return the 11 values every event handler outputs for ``index``.

        The first two values feed the video-ID textbox and the index state;
        the remaining nine match ``outputs`` below. Judgments are replaced by
        their display label; unknown or placeholder judgments pass through
        unchanged instead of raising ``KeyError``.
        """
        (
            video,
            vid_id,
            prompt,
            answer,
            without_prepost_response,
            with_prepost_response,
            without_prepost_judgment,
            with_prepost_judgment,
            stats,
        ) = get_entry(experiment_name, index)
        return (
            str(vid_id),  # video_id_input value
            index,  # current_index state
            video,
            vid_id,
            prompt,
            answer,
            without_prepost_response,
            with_prepost_response,
            judgment_to_style.get(without_prepost_judgment, without_prepost_judgment),
            judgment_to_style.get(with_prepost_judgment, with_prepost_judgment),
            f"{stats_prefix}{stats}{stats_suffix}",
        )

    def on_experiment_change(experiment_name):
        """Show the first entry of the newly selected experiment."""
        return render_entry(experiment_name, 0)

    def on_video_id_submit(experiment_name, video_id_str, current_index):
        """Handle video ID input - find and load the matching entry."""
        index = find_index_by_video_id(experiment_name, video_id_str)
        if index is None:
            # Video ID not found, stay at current position and say so.
            return render_entry(
                experiment_name,
                int(current_index),
                stats_prefix=f"⚠️ Video ID '{video_id_str}' not found. ",
            )
        return render_entry(experiment_name, index)

    def indices_by_filter_type(experiment_name, filter_type):
        """Return the ascending entry indices matching ``filter_type``.

        Assumption: since the experiment without pre-post conditions has more
        mistakes, the filter is applied to the run without pre-post conditions.
        """
        data = load_experiment_data(experiment_name)
        if filter_type == "All":
            return range(len(data["without_prepost"]))
        judgment = filter_type_to_judgment[filter_type]
        return [
            position
            for position, entry in enumerate(data["without_prepost"])
            if entry.get("judgment") == judgment
        ]

    def nearest_filtered_index(indices, index, direction):
        """Return the neighbour of ``index`` inside the sorted ``indices``.

        ``direction == 'high'`` gives the smallest element >= ``index``; any
        other value gives the largest element <= ``index``. ``None`` is
        returned when no such element exists.
        """
        if direction == 'high':
            position = bisect_left(indices, index)
            return indices[position] if position < len(indices) else None
        position = bisect_right(indices, index)
        return indices[position - 1] if position > 0 else None

    def next_entry(experiment_name, current_index, filter_type="All"):
        """Move to the next entry that matches the active filter."""
        current = int(current_index)
        count = get_entry_count(experiment_name)
        incremented_index = min(current + 1, count - 1)
        # Nearest matching index >= the incremented index.
        filtered_indices = indices_by_filter_type(experiment_name, filter_type)
        new_index = nearest_filtered_index(filtered_indices, incremented_index, 'high')
        if new_index is None:
            # No later match: stay where we are instead of crashing.
            new_index = current
        return render_entry(experiment_name, new_index)

    def prev_entry(experiment_name, current_index, filter_type="All"):
        """Move to the previous entry that matches the active filter."""
        current = int(current_index)
        decremented_index = max(current - 1, 0)
        # Nearest matching index <= the decremented index.
        filtered_indices = indices_by_filter_type(experiment_name, filter_type)
        new_index = nearest_filtered_index(filtered_indices, decremented_index, 'low')
        if new_index is None:
            # No earlier match: stay where we are instead of crashing.
            new_index = current
        return render_entry(experiment_name, new_index)

    def filter_by_judgment(experiment_name, filter_type, current_index):
        """Jump to the matching entry closest to the current one."""
        indices = indices_by_filter_type(experiment_name, filter_type)
        if not indices:
            # One value per output component: 2 navigation values + 9 displays.
            return (
                "",
                current_index,
                None,
                "",
                "No entries match filter",
                "",
                "",
                "",
                "",
                "",
                "No matching entries",
            )
        # Find closest matching index
        current = int(current_index)
        new_index = min(indices, key=lambda candidate: abs(candidate - current))
        return render_entry(
            experiment_name,
            new_index,
            stats_suffix=f"\nShowing: {filter_type} ({len(indices)} entries)",
        )

    # Build the interface
    with gr.Blocks(
        title="Video Inference Viewer",
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="slate",
        ),
        css="""
        .judgment-correct {
            color: #22c55e !important;
            font-weight: bold !important;
            font-size: 1.2em !important;
        }
        .judgment-wrong {
            color: #ef4444 !important;
            font-weight: bold !important;
            font-size: 1.2em !important;
        }
        .stats-bar textarea {
            background: linear-gradient(90deg, #3b82f6 0%, #8b5cf6 100%) !important;
            color: white !important;
            padding: 10px 15px !important;
            border-radius: 8px !important;
            font-weight: 500 !important;
            white-space: pre-wrap !important;
            line-height: 1.5 !important;
            border: none !important;
            resize: none !important;
        }
        .stats-bar {
            background: transparent !important;
            border: none !important;
        }
        .prompt-box {
            font-family: monospace;
            white-space: pre-wrap;
        }
        /* Ensure textboxes wrap text properly and don't overflow */
        .wrap textarea {
            overflow-wrap: break-word !important;
            word-wrap: break-word !important;
        }
        /* Make columns more compact */
        .compact-row > .gr-column {
            min-width: 180px !important;
        }
        """
    ) as demo:
        gr.Markdown(
            """
            # 🎬 Video Inference Viewer
            Explore model predictions on video understanding tasks. Select an experiment, browse through entries,
            and compare expected answers with model responses.
            """
        )

        with gr.Row():
            experiment_dropdown = gr.Dropdown(
                choices=experiments,
                value=experiments[0],
                label="🤖 Select Experiment",
                scale=3
            )
            filter_dropdown = gr.Dropdown(
                choices=["All", "Correct Only", "Wrong Only"],
                value="All",
                label="🔍 Filter Results (without Pre-Post)",
                scale=2
            )

        stats_display = gr.Textbox(
            show_label=False,
            interactive=False,
            lines=3,
            max_lines=5,
            elem_classes=["stats-bar"]
        )

        # State to track current index (since we're using video ID for display)
        current_index_state = gr.State(value=0)

        with gr.Row():
            prev_btn = gr.Button("⬅️ Previous", scale=1)
            video_id_input = gr.Textbox(
                value="",
                label="🔎 Video ID (press Enter to search)",
                placeholder="Enter video ID...",
                scale=4
            )
            next_btn = gr.Button("Next ➡️", scale=1)

        with gr.Row(equal_height=True):
            # Left column: Video
            with gr.Column(scale=1):
                video_display = gr.Video(
                    label="📹 Video",
                    autoplay=True,
                    loop=True,
                    height=400
                )
                video_id_display = gr.Textbox(
                    label="Video ID",
                    interactive=False
                )

            # Right column: Text info
            with gr.Column(scale=1):
                with gr.Accordion("📝 Prompt with Pre-Post Conditions", open=True):
                    prompt_display = gr.Textbox(
                        show_label=False,
                        interactive=False,
                        lines=8,
                        elem_classes=["prompt-box"]
                    )

                with gr.Row():
                    with gr.Column(scale=1, min_width=200):
                        answer_display = gr.Textbox(
                            label="✅ Expected Answer",
                            interactive=False,
                            lines=1
                        )
                    with gr.Column(scale=1, min_width=200):
                        without_prepost_response_display = gr.Textbox(
                            label="🤖 Response without Pre-Post",
                            interactive=False,
                            lines=1
                        )
                    with gr.Column(scale=1, min_width=200):
                        with_prepost_response_display = gr.Textbox(
                            label="🤖 Response with Pre-Post",
                            interactive=False,
                            lines=1
                        )

                with gr.Row():
                    with gr.Column():
                        without_prepost_judgment_display = gr.Textbox(
                            label="🤖 Judgment without Pre-Post",
                            interactive=False,
                            lines=1
                        )
                    with gr.Column():
                        with_prepost_judgment_display = gr.Textbox(
                            label="🤖 Judgment with Pre-Post",
                            interactive=False,
                            lines=1
                        )

        # Event handlers
        outputs = [
            video_display,
            video_id_display,
            prompt_display,
            answer_display,
            without_prepost_response_display,
            with_prepost_response_display,
            without_prepost_judgment_display,
            with_prepost_judgment_display,
            stats_display,
        ]
        # Every handler returns the video-ID textbox value and the index state
        # first, followed by the nine display values.
        handler_outputs = [video_id_input, current_index_state] + outputs

        experiment_dropdown.change(
            fn=on_experiment_change,
            inputs=[experiment_dropdown],
            outputs=handler_outputs
        )
        video_id_input.submit(
            fn=on_video_id_submit,
            inputs=[experiment_dropdown, video_id_input, current_index_state],
            outputs=handler_outputs
        )
        next_btn.click(
            fn=next_entry,
            inputs=[experiment_dropdown, current_index_state, filter_dropdown],
            outputs=handler_outputs
        )
        prev_btn.click(
            fn=prev_entry,
            inputs=[experiment_dropdown, current_index_state, filter_dropdown],
            outputs=handler_outputs
        )
        filter_dropdown.change(
            fn=filter_by_judgment,
            inputs=[experiment_dropdown, filter_dropdown, current_index_state],
            outputs=handler_outputs
        )

        # Load initial data
        demo.load(
            fn=on_experiment_change,
            inputs=[experiment_dropdown],
            outputs=handler_outputs
        )

    return demo
if __name__ == "__main__":
    # Listen on every interface at port 7860 and request a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo = create_demo()
    demo.launch(**launch_options)