Spaces:

jena-shreyas
/

Video-Pre-Post-Demo

Sleeping

App Files Files Community

Video-Pre-Post-Demo / app.py

jena-shreyas

Remove Invalid Only from top right dropdown for now

fb7230b 3 months ago

raw

history blame contribute delete

21.7 kB

	"""
	Video Inference Demo App

	A Gradio demo to visualize video inference results from multiple models.
	Shows the corresponding video alongside inference data including prompt,
	expected answer, model response, and judgment.
	"""

	import bisect
	import json
	from pathlib import Path
	from typing import Any, Dict, List, Optional

	import gradio as gr

	# Directory layout: every location is derived from the folder containing this
	# file (kept relative to the demo directory for Hugging Face Spaces).
	BASE_DIR = Path(__file__).parent.resolve()
	OUTPUTS_DIR = BASE_DIR.joinpath("outputs")  # per-experiment result folders
	VIDEOS_DIR = BASE_DIR.joinpath("videos")  # video files looked up by ID


	def get_available_experiments(outputs_dir: Optional[Path] = None) -> List[str]:
	    """Get the sorted list of experiment names that can be displayed.

	    An experiment is stored as two sibling directories,
	    ``<name>-wo-prepost`` and ``<name>-with-prepost``, each containing an
	    ``inference.json`` file. A name is reported only when both directories
	    are present, because the loaders need both variants. Directories whose
	    name contains "must" are ignored (ablation runs), as are directories
	    that do not end in one of the two known suffixes.

	    Args:
	        outputs_dir: Directory to scan. Defaults to the module-level
	            ``OUTPUTS_DIR``.

	    Returns:
	        Experiment names (without the variant suffix) in ascending order;
	        an empty list when the directory does not exist.
	    """
	    base_dir = OUTPUTS_DIR if outputs_dir is None else Path(outputs_dir)
	    if not base_dir.is_dir():
	        return []

	    variant_suffixes = ("-wo-prepost", "-with-prepost")
	    # Maps experiment name -> set of variant suffixes found on disk.
	    variants_found: Dict[str, set] = {}
	    for entry in base_dir.iterdir():
	        # Ignore folders with "must" as it is an ablation
	        if "must" in entry.name or not entry.is_dir():
	            continue
	        if not (entry / "inference.json").exists():
	            continue
	        for suffix in variant_suffixes:
	            # Require a non-empty experiment name in front of the suffix.
	            if entry.name.endswith(suffix) and len(entry.name) > len(suffix):
	                experiment = entry.name[: -len(suffix)]
	                variants_found.setdefault(experiment, set()).add(suffix)
	                break

	    return sorted(
	        experiment
	        for experiment, found in variants_found.items()
	        if len(found) == len(variant_suffixes)
	    )


	def _load_prepost_pair(
	    experiment_name: str,
	    filename: str,
	    label: str,
	    outputs_dir: Optional[Path] = None,
	) -> Dict[str, Any]:
	    """Load ``filename`` from both variant directories of an experiment.

	    Args:
	        experiment_name: Experiment name without the variant suffix.
	        filename: JSON file to read from each variant directory.
	        label: Human-readable name of the data, used in the error message.
	        outputs_dir: Directory holding the variant folders. Defaults to the
	            module-level ``OUTPUTS_DIR``.

	    Returns:
	        ``{"without_prepost": ..., "with_prepost": ...}`` with the parsed
	        JSON of the ``-wo-prepost`` and ``-with-prepost`` folders.

	    Raises:
	        FileNotFoundError: If either file is missing; the message lists the
	            missing path(s).
	    """
	    base_dir = OUTPUTS_DIR if outputs_dir is None else Path(outputs_dir)
	    variant_paths = {
	        "without_prepost": base_dir / f"{experiment_name}-wo-prepost" / filename,
	        "with_prepost": base_dir / f"{experiment_name}-with-prepost" / filename,
	    }

	    missing = [str(path) for path in variant_paths.values() if not path.is_file()]
	    if missing:
	        raise FileNotFoundError(
	            f"{label} data not found for experiment {experiment_name} "
	            f"without or with prepost (missing: {', '.join(missing)})"
	        )

	    loaded = {}
	    for variant, path in variant_paths.items():
	        # JSON is UTF-8 by specification; do not depend on the locale default.
	        with open(path, "r", encoding="utf-8") as f:
	            loaded[variant] = json.load(f)
	    return loaded


	def load_inference_data(
	    experiment_name: str, outputs_dir: Optional[Path] = None
	) -> Dict[str, Any]:
	    """Load inference data for a specific experiment.

	    Reads ``inference.json`` from the ``-wo-prepost`` and ``-with-prepost``
	    folders of ``experiment_name``.

	    Args:
	        experiment_name: Experiment name without the variant suffix.
	        outputs_dir: Directory holding the variant folders. Defaults to the
	            module-level ``OUTPUTS_DIR``.

	    Returns:
	        Dict with keys ``"without_prepost"`` and ``"with_prepost"`` holding
	        the parsed JSON of each variant.

	    Raises:
	        FileNotFoundError: If either ``inference.json`` is missing.
	    """
	    return _load_prepost_pair(
	        experiment_name, "inference.json", "Inference", outputs_dir
	    )


	def load_evals_data(
	    experiment_name: str, outputs_dir: Optional[Path] = None
	) -> Dict[str, Any]:
	    """Load metrics data for a specific experiment.

	    Reads ``metrics.json`` from the ``-wo-prepost`` and ``-with-prepost``
	    folders of ``experiment_name``.

	    Args:
	        experiment_name: Experiment name without the variant suffix.
	        outputs_dir: Directory holding the variant folders. Defaults to the
	            module-level ``OUTPUTS_DIR``.

	    Returns:
	        Dict with keys ``"without_prepost"`` and ``"with_prepost"`` holding
	        the parsed JSON of each variant.

	    Raises:
	        FileNotFoundError: If either ``metrics.json`` is missing.
	    """
	    return _load_prepost_pair(experiment_name, "metrics.json", "Metrics", outputs_dir)


	def get_video_path(
	    video_id,
	    videos_dir: Optional[Path] = None,
	    extensions=(".mp4", ".webm"),
	) -> Optional[str]:
	    """Get the path to a video file by ID.

	    Args:
	        video_id: Identifier used as the file stem; it is formatted with
	            ``str()`` rules, so ints and strings both work.
	        videos_dir: Directory to search. Defaults to the module-level
	            ``VIDEOS_DIR``.
	        extensions: File extensions to try, in priority order.

	    Returns:
	        The path of the first existing ``<video_id><extension>`` file as a
	        string, or ``None`` when no candidate is a regular file.
	    """
	    base_dir = VIDEOS_DIR if videos_dir is None else Path(videos_dir)
	    for extension in extensions:
	        candidate = base_dir / f"{video_id}{extension}"
	        # is_file() rather than exists(): a directory with a matching name
	        # is not a playable video.
	        if candidate.is_file():
	            return str(candidate)
	    return None


	def format_prompt_for_display(prompt):
	    """Make a stored prompt readable by expanding its escaped line breaks.

	    Every literal two-character sequence "backslash, n" in ``prompt`` becomes
	    a real newline; all other text is returned untouched.
	    """
	    # Cut the text at each escaped newline, then stitch the pieces back
	    # together with genuine line breaks.
	    pieces = prompt.split("\\n")
	    return "\n".join(pieces)


	def create_demo():
	    """Create and return the Gradio demo interface.

	    The interface shows one inference entry at a time: the video, the prompt
	    (with pre-post conditions), the expected answer, and the response and
	    judgment of the model both without and with pre-post conditions. Entries
	    can be browsed with Previous/Next, looked up by video ID, and filtered by
	    the judgment of the without-pre-post run.

	    Every event handler returns the same 11-value tuple, in the order of
	    ``[video_id_input, current_index_state] + outputs`` defined below.

	    Returns:
	        The configured ``gr.Blocks`` application (not yet launched).

	    Raises:
	        ValueError: If no experiment is available.
	    """

	    # Load initial data
	    experiments = get_available_experiments()
	    if not experiments:
	        raise ValueError("No experiment outputs found in the outputs directory!")

	    # Per-experiment caches so each JSON file is parsed at most once.
	    data_cache: Dict[str, Dict[str, Any]] = {}
	    metrics_cache: Dict[str, Dict[str, Any]] = {}
	    filter_type_to_judgment = {
	        "Correct Only": "correct",
	        "Wrong Only": "wrong",
	    }
	    judgment_to_style = {
	        "correct": "✅ Correct",
	        "wrong": "❌ Wrong",
	        "invalid": "⚠️ Invalid",
	    }
	    metric_labels = {
	        "accuracy": "Accuracy",
	        "invalid_rate": "Invalid",
	        "f1": "F1 Score",
	        "precision": "Precision",
	        "recall": "Recall",
	        "robust_accuracy": "Robust Accuracy",
	    }

	    def load_experiment_data(experiment_name):
	        """Return the cached inference data of both variants."""
	        if experiment_name not in data_cache:
	            data_cache[experiment_name] = load_inference_data(experiment_name)
	        return data_cache[experiment_name]

	    def load_metrics_data(experiment_name):
	        """Return the cached metrics of both variants."""
	        if experiment_name not in metrics_cache:
	            metrics_cache[experiment_name] = load_evals_data(experiment_name)
	        return metrics_cache[experiment_name]

	    def get_entry_count(experiment_name):
	        """Return the number of entries in the experiment."""
	        # Both variants are expected to hold the same entries in the same
	        # order, so the without-pre-post list is the reference.
	        return len(load_experiment_data(experiment_name)["without_prepost"])

	    def find_index_by_video_id(experiment_name, video_id) -> Optional[int]:
	        """Find the index of an entry by its video ID (``None`` if absent)."""
	        wanted = str(video_id).strip()
	        entries = load_experiment_data(experiment_name)["without_prepost"]
	        for position, entry in enumerate(entries):
	            if str(entry["id"]) == wanted:
	                return position
	        return None

	    def format_metrics(metrics):
	        """Render the metrics of both variants as two labelled lines."""

	        def format_variant(values):
	            # Unknown metric names fall back to the raw key instead of
	            # raising KeyError.
	            return "|".join(
	                " {}: {:.2f} ".format(metric_labels.get(key, key), value)
	                for key, value in values.items()
	            )

	        without_prepost_metrics = format_variant(metrics["without_prepost"])
	        with_prepost_metrics = format_variant(metrics["with_prepost"])
	        return (
	            f"\nWithout Pre-Post: {without_prepost_metrics}"
	            f"\nWith Pre-Post: {with_prepost_metrics}"
	        )

	    def get_entry(experiment_name, index):
	        """Return the display values of one entry.

	        Returns a 9-tuple: video path, video ID, prompt, expected answer,
	        response without / with pre-post, raw judgment without / with
	        pre-post, stats text. For a missing or out-of-range ``index`` the
	        tuple is blank and the stats text is "No entry found".
	        """
	        data = load_experiment_data(experiment_name)
	        total = len(data["without_prepost"])
	        if index is None or not 0 <= index < total:
	            return None, "", "", "", "", "", "", "", "No entry found"

	        without_prepost_entry = data["without_prepost"][index]
	        with_prepost_entry = data["with_prepost"][index]
	        video_id = without_prepost_entry["id"]
	        # The prompt is taken from the run with pre-post conditions.
	        prompt = format_prompt_for_display(with_prepost_entry["prompt"])
	        # Metrics describe the whole experiment, for both variants.
	        formatted_metrics = format_metrics(load_metrics_data(experiment_name))
	        stats = f"[Entry {index + 1} of {total}]\n{formatted_metrics}"

	        return (
	            get_video_path(video_id),
	            video_id,
	            prompt,
	            without_prepost_entry["answer"],
	            without_prepost_entry["response"],
	            with_prepost_entry["response"],
	            without_prepost_entry["judgment"],
	            with_prepost_entry["judgment"],
	            stats,
	        )

	    def style_judgment(judgment):
	        """Decorate a raw judgment; unknown or blank values pass through."""
	        return judgment_to_style.get(judgment, judgment)

	    def entry_update(experiment_name, index, stats_prefix="", stats_suffix=""):
	        """Build the 11-value handler result that displays entry ``index``."""
	        (
	            video,
	            vid_id,
	            prompt,
	            answer,
	            without_prepost_response,
	            with_prepost_response,
	            without_prepost_judgment,
	            with_prepost_judgment,
	            stats,
	        ) = get_entry(experiment_name, index)
	        return (
	            str(vid_id),  # video_id_input value
	            index,  # current_index state
	            video,
	            vid_id,
	            prompt,
	            answer,
	            without_prepost_response,
	            with_prepost_response,
	            style_judgment(without_prepost_judgment),
	            style_judgment(with_prepost_judgment),
	            f"{stats_prefix}{stats}{stats_suffix}",
	        )

	    def on_experiment_change(experiment_name):
	        """Show the first entry of the newly selected experiment."""
	        return entry_update(experiment_name, 0)

	    def on_video_id_submit(experiment_name, video_id_str, current_index):
	        """Handle video ID input - find and load the matching entry."""
	        index = find_index_by_video_id(experiment_name, video_id_str)
	        if index is None:
	            # Video ID not found, stay at current position
	            return entry_update(
	                experiment_name,
	                int(current_index),
	                stats_prefix=f"⚠️ Video ID '{video_id_str}' not found. ",
	            )
	        return entry_update(experiment_name, index)

	    def indices_by_filter_type(experiment_name, filter_type) -> List[int]:
	        """Return the ascending entry indices that match ``filter_type``.

	        Assumption: since the experiment without pre-post conditions has
	        more mistakes, the filter is applied to the without-pre-post run.
	        """
	        entries = load_experiment_data(experiment_name)["without_prepost"]
	        if filter_type == "All":
	            return list(range(len(entries)))
	        judgment = filter_type_to_judgment[filter_type]
	        return [
	            position
	            for position, entry in enumerate(entries)
	            if entry.get("judgment") == judgment
	        ]

	    def nearest_index(indices, target, direction) -> Optional[int]:
	        """Pick the element of sorted ``indices`` closest to ``target``.

	        ``direction="high"`` gives the smallest element >= target,
	        ``direction="low"`` the largest element <= target; ``None`` when no
	        such element exists.
	        """
	        if direction == "high":
	            position = bisect.bisect_left(indices, target)
	            return indices[position] if position < len(indices) else None
	        position = bisect.bisect_right(indices, target)
	        return indices[position - 1] if position > 0 else None

	    def next_entry(experiment_name, current_index, filter_type="All"):
	        """Move to the next entry that matches the filter, if there is one."""
	        current = int(current_index)
	        last_index = get_entry_count(experiment_name) - 1
	        target = min(current + 1, last_index)
	        # Nearest matching index >= target.
	        filtered_indices = indices_by_filter_type(experiment_name, filter_type)
	        new_index = nearest_index(filtered_indices, target, "high")
	        if new_index is None:
	            # Nothing further matches the filter: stay where we are.
	            new_index = current
	        return entry_update(experiment_name, new_index)

	    def prev_entry(experiment_name, current_index, filter_type="All"):
	        """Move to the previous entry that matches the filter, if any."""
	        current = int(current_index)
	        target = max(current - 1, 0)
	        # Nearest matching index <= target.
	        filtered_indices = indices_by_filter_type(experiment_name, filter_type)
	        new_index = nearest_index(filtered_indices, target, "low")
	        if new_index is None:
	            # Nothing earlier matches the filter: stay where we are.
	            new_index = current
	        return entry_update(experiment_name, new_index)

	    def filter_by_judgment(experiment_name, filter_type, current_index):
	        """Jump to the matching entry closest to the current position."""
	        indices = indices_by_filter_type(experiment_name, filter_type)
	        if not indices:
	            # One value per output component (11 in total).
	            return (
	                "",
	                current_index,
	                None,
	                "",
	                "No entries match filter",
	                "",
	                "",
	                "",
	                "",
	                "",
	                "No matching entries",
	            )

	        # Find closest matching index
	        current = int(current_index)
	        new_index = min(indices, key=lambda position: abs(position - current))
	        return entry_update(
	            experiment_name,
	            new_index,
	            stats_suffix=f"\nShowing: {filter_type} ({len(indices)} entries)",
	        )

	    custom_css = """
	        .judgment-correct {
	            color: #22c55e !important;
	            font-weight: bold !important;
	            font-size: 1.2em !important;
	        }
	        .judgment-wrong {
	            color: #ef4444 !important;
	            font-weight: bold !important;
	            font-size: 1.2em !important;
	        }
	        .stats-bar textarea {
	            background: linear-gradient(90deg, #3b82f6 0%, #8b5cf6 100%) !important;
	            color: white !important;
	            padding: 10px 15px !important;
	            border-radius: 8px !important;
	            font-weight: 500 !important;
	            white-space: pre-wrap !important;
	            line-height: 1.5 !important;
	            border: none !important;
	            resize: none !important;
	        }
	        .stats-bar {
	            background: transparent !important;
	            border: none !important;
	        }
	        .prompt-box {
	            font-family: monospace;
	            white-space: pre-wrap;
	        }
	        /* Ensure textboxes wrap text properly and don't overflow */
	        .wrap textarea {
	            overflow-wrap: break-word !important;
	            word-wrap: break-word !important;
	        }
	        /* Make columns more compact */
	        .compact-row > .gr-column {
	            min-width: 180px !important;
	        }
	    """

	    # Build the interface
	    with gr.Blocks(
	        title="Video Inference Viewer",
	        theme=gr.themes.Soft(
	            primary_hue="blue",
	            secondary_hue="slate",
	        ),
	        css=custom_css,
	    ) as demo:
	        gr.Markdown(
	            """
	            # 🎬 Video Inference Viewer

	            Explore model predictions on video understanding tasks. Select an experiment, browse through entries,
	            and compare expected answers with model responses.
	            """
	        )

	        with gr.Row():
	            experiment_dropdown = gr.Dropdown(
	                choices=experiments,
	                value=experiments[0] if experiments else None,
	                label="🤖 Select Experiment",
	                scale=3,
	            )
	            filter_dropdown = gr.Dropdown(
	                choices=["All", "Correct Only", "Wrong Only"],
	                value="All",
	                label="🔍 Filter Results (without Pre-Post)",
	                scale=2,
	            )

	        stats_display = gr.Textbox(
	            show_label=False,
	            interactive=False,
	            lines=3,
	            max_lines=5,
	            elem_classes=["stats-bar"],
	        )

	        # State to track current index (since we're using video ID for display)
	        current_index_state = gr.State(value=0)

	        with gr.Row():
	            prev_btn = gr.Button("⬅️ Previous", scale=1)
	            video_id_input = gr.Textbox(
	                value="",
	                label="🔎 Video ID (press Enter to search)",
	                placeholder="Enter video ID...",
	                scale=4,
	            )
	            next_btn = gr.Button("Next ➡️", scale=1)

	        with gr.Row(equal_height=True):
	            # Left column: Video
	            with gr.Column(scale=1):
	                video_display = gr.Video(
	                    label="📹 Video",
	                    autoplay=True,
	                    loop=True,
	                    height=400,
	                )
	                video_id_display = gr.Textbox(
	                    label="Video ID",
	                    interactive=False,
	                )

	            # Right column: Text info
	            with gr.Column(scale=1):
	                with gr.Accordion("📝 Prompt with Pre-Post Conditions", open=True):
	                    prompt_display = gr.Textbox(
	                        show_label=False,
	                        interactive=False,
	                        lines=8,
	                        elem_classes=["prompt-box"],
	                    )

	                with gr.Row():
	                    with gr.Column(scale=1, min_width=200):
	                        answer_display = gr.Textbox(
	                            label="✅ Expected Answer",
	                            interactive=False,
	                            lines=1,
	                        )
	                    with gr.Column(scale=1, min_width=200):
	                        without_prepost_response_display = gr.Textbox(
	                            label="🤖 Response without Pre-Post",
	                            interactive=False,
	                            lines=1,
	                        )
	                    with gr.Column(scale=1, min_width=200):
	                        with_prepost_response_display = gr.Textbox(
	                            label="🤖 Response with Pre-Post",
	                            interactive=False,
	                            lines=1,
	                        )
	                with gr.Row():
	                    with gr.Column():
	                        without_prepost_judgment_display = gr.Textbox(
	                            label="🤖 Judgment without Pre-Post",
	                            interactive=False,
	                            lines=1,
	                        )
	                    with gr.Column():
	                        with_prepost_judgment_display = gr.Textbox(
	                            label="🤖 Judgment with Pre-Post",
	                            interactive=False,
	                            lines=1,
	                        )

	        # Event handlers: every handler fills the search box, the index
	        # state and the nine display widgets, in this order.
	        outputs = [
	            video_display,
	            video_id_display,
	            prompt_display,
	            answer_display,
	            without_prepost_response_display,
	            with_prepost_response_display,
	            without_prepost_judgment_display,
	            with_prepost_judgment_display,
	            stats_display,
	        ]
	        all_outputs = [video_id_input, current_index_state] + outputs

	        experiment_dropdown.change(
	            fn=on_experiment_change,
	            inputs=[experiment_dropdown],
	            outputs=all_outputs,
	        )

	        video_id_input.submit(
	            fn=on_video_id_submit,
	            inputs=[experiment_dropdown, video_id_input, current_index_state],
	            outputs=all_outputs,
	        )

	        next_btn.click(
	            fn=next_entry,
	            inputs=[experiment_dropdown, current_index_state, filter_dropdown],
	            outputs=all_outputs,
	        )

	        prev_btn.click(
	            fn=prev_entry,
	            inputs=[experiment_dropdown, current_index_state, filter_dropdown],
	            outputs=all_outputs,
	        )

	        filter_dropdown.change(
	            fn=filter_by_judgment,
	            inputs=[experiment_dropdown, filter_dropdown, current_index_state],
	            outputs=all_outputs,
	        )

	        # Load initial data
	        demo.load(
	            fn=on_experiment_change,
	            inputs=[experiment_dropdown],
	            outputs=all_outputs,
	        )

	    return demo


	if __name__ == "__main__":
	    # Script entry point: build the interface, then serve it on every
	    # network interface at port 7860 with Gradio's share option enabled.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	    }
	    demo = create_demo()
	    demo.launch(**launch_options)