Spaces:

hugging-apps
/

tasker-keyframe-extractor

Running on Zero

App Files Files Community

tasker-keyframe-extractor / app.py

multimodalart HF Staff

Upload folder using huggingface_hub

5c8e53c verified about 13 hours ago

Raw

History Blame Contribute Delete

22.8 kB

	import os

	# Expandable segments to avoid allocator fragmentation under memory spikes
	os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

	import spaces # MUST be before any torch/CUDA import

	import cv2
	import re
	import json
	import torch
	import numpy as np
	from PIL import Image
	from typing import List, Optional, Tuple
	import tempfile
	import gradio as gr
	from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

	MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

	# ── Load model at module scope (ZeroGPU rule 2) ──────────────────────────────
	# The processor and the model are created once at import time and shared by
	# every request; vlm_call() reads both as module globals.
	processor = AutoProcessor.from_pretrained(MODEL_ID)

	# Weights are loaded in bfloat16 with PyTorch SDPA attention, then moved to
	# the CUDA device in a separate step.
	model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	    MODEL_ID,
	    torch_dtype=torch.bfloat16,
	    attn_implementation="sdpa",
	)
	model = model.to("cuda")


	# ── VLM call helper ──────────────────────────────────────────────────────────

	def vlm_call(images: List[Image.Image], question: str, system_prompt: str = "You are a highly strict UI navigation assistant designed to output JSON.") -> str:
	    """Run one chat turn through the module-level VLM and return its reply.

	    Args:
	        images: Frames to show the model, in the order they should appear.
	        question: User text placed after the images.
	        system_prompt: Text of the system message.

	    Returns:
	        The decoded text of the newly generated tokens only (the prompt is
	        stripped), with special tokens removed.
	    """
	    # The user turn is every image followed by a single text entry.
	    user_content = [{"type": "image", "image": frame} for frame in images]
	    user_content.append({"type": "text", "text": question})

	    conversation = [
	        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
	        {"role": "user", "content": user_content},
	    ]

	    chat_text = processor.apply_chat_template(
	        conversation, tokenize=False, add_generation_prompt=True
	    )

	    # A batch of one: the images are wrapped in an outer list, or omitted
	    # entirely when there are none.
	    batch_images = [images] if images else None
	    model_inputs = processor(
	        text=[chat_text],
	        images=batch_images,
	        padding=True,
	        return_tensors="pt",
	    ).to("cuda")

	    # Greedy decoding without gradient tracking.
	    with torch.no_grad():
	        generated = model.generate(
	            **model_inputs, max_new_tokens=8192, do_sample=False, temperature=1.0
	        )

	    # generate() returns prompt + completion; keep only the completion.
	    prompt_len = model_inputs["input_ids"].shape[1]
	    completions = processor.batch_decode(
	        generated[:, prompt_len:], skip_special_tokens=True
	    )
	    return completions[0]


	def parse_json_response(text: str):
	    """Extract the first valid JSON object embedded in a text response.

	    The text is scanned for ``{`` characters and a JSON value is decoded
	    starting at each one until a decode succeeds. Unlike a greedy
	    ``{...}`` regex, this tolerates trailing prose that itself contains
	    braces (e.g. ``'{"confidence": 3} as in {example}'``) and leading text
	    such as Markdown code fences.

	    Args:
	        text: Raw model output. Non-string input is treated as "no JSON".

	    Returns:
	        The decoded JSON object (a ``dict``), or ``None`` when no position
	        in ``text`` starts a valid JSON object.
	    """
	    if not isinstance(text, str):
	        return None

	    decoder = json.JSONDecoder()
	    start = text.find("{")
	    while start != -1:
	        try:
	            # raw_decode parses one JSON value beginning at `start` and
	            # ignores whatever follows it.
	            obj, _ = decoder.raw_decode(text, start)
	        except ValueError:
	            # Not valid JSON from this brace; try the next one.
	            start = text.find("{", start + 1)
	        else:
	            return obj
	    return None


	# ── Video utilities ──────────────────────────────────────────────────────────

	def extract_frame(video_path: str, frame_idx: int) -> Optional[Image.Image]:
	    """Decode one frame of a video and return it as an RGB PIL image.

	    A capture is opened for this single read and released again before the
	    result is examined.

	    Args:
	        video_path: Path of the video file.
	        frame_idx: Frame position to seek to before reading.

	    Returns:
	        The frame as a PIL image, or ``None`` when the read fails.
	    """
	    reader = cv2.VideoCapture(video_path)
	    reader.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
	    ok, bgr = reader.read()
	    reader.release()

	    if ok:
	        # OpenCV returns BGR channel order; convert to RGB for PIL.
	        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
	        return Image.fromarray(rgb)
	    return None


	def compute_color_histogram(img: Image.Image) -> np.ndarray:
	    """Build a normalized joint histogram over the first three channels.

	    Each of channels 0, 1 and 2 is split into 50 bins over [0, 256), giving
	    a 50x50x50 histogram that is then normalized in place with
	    ``cv2.normalize``.
	    """
	    pixels = np.array(img)
	    channels = [0, 1, 2]
	    bins_per_channel = [50, 50, 50]
	    value_ranges = [0, 256, 0, 256, 0, 256]

	    histogram = cv2.calcHist([pixels], channels, None, bins_per_channel, value_ranges)
	    cv2.normalize(histogram, histogram)
	    return histogram


	def frame_similarity(hist1: np.ndarray, hist2: np.ndarray) -> float:
	    """Return the correlation score (``cv2.HISTCMP_CORREL``) of two histograms."""
	    correlation = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
	    return float(correlation)


	def is_frame_redundant(new_hist: np.ndarray, existing_hists: List[np.ndarray], threshold: float = 0.985) -> bool:
	    """Tell whether a frame's histogram matches any already-kept histogram.

	    Args:
	        new_hist: Histogram of the candidate frame.
	        existing_hists: Histograms to compare against.
	        threshold: Similarity at or above which the candidate counts as a
	            duplicate.

	    Returns:
	        ``True`` as soon as one comparison reaches ``threshold``; ``False``
	        otherwise, including when ``existing_hists`` is empty.
	    """
	    return any(
	        frame_similarity(new_hist, kept) >= threshold
	        for kept in existing_hists
	    )


	# ── TASKER core: A* tree search keyframe extraction ─────────────────────────

	class VideoSeg:
	    """A span of the video between two sampled frames (one search-tree node)."""

	    def __init__(self, start: int, end: int):
	        # Frame indices of the two boundary frames of the span.
	        self.start, self.end = start, end


	def find_visual_change_split_point(video_path: str, seg_start: int, seg_end: int) -> int:
	    """Pick the frame inside a segment where the picture changes the most.

	    Up to roughly ten frames (plus ``seg_end``) are sampled across the
	    segment, a colour histogram is computed for each, and the later frame of
	    the adjacent pair with the lowest histogram correlation is proposed as
	    the split point. The proposal is only accepted when it lies within the
	    central 15%-85% of the segment; otherwise the midpoint is used.

	    This is best-effort: any failure while reading the video also falls back
	    to the midpoint. The capture handle is released on every path.

	    Args:
	        video_path: Path of the video file.
	        seg_start: First frame index of the segment.
	        seg_end: Last frame index of the segment.

	    Returns:
	        A frame index to split the segment at.
	    """
	    midpoint = (seg_start + seg_end) // 2
	    seg_length = seg_end - seg_start
	    if seg_length <= 2:
	        return midpoint

	    cap = None
	    try:
	        cap = cv2.VideoCapture(video_path)
	        if not cap.isOpened():
	            return midpoint

	        num_samples = min(seg_length, 10)
	        step = max(1, seg_length // num_samples)
	        sample_indices = list(range(seg_start, seg_end, step))
	        if sample_indices[-1] != seg_end:
	            sample_indices.append(seg_end)

	        # Only the histograms are needed for the comparison, so decoded
	        # frames are not retained.
	        hists = {}
	        for idx in sample_indices:
	            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
	            ret, frame = cap.read()
	            if not ret:
	                continue
	            hist = cv2.calcHist([frame], [0, 1, 2], None, [50, 50, 50], [0, 256, 0, 256, 0, 256])
	            cv2.normalize(hist, hist)
	            hists[idx] = hist

	        if len(hists) < 2:
	            return midpoint

	        ordered = sorted(hists)
	        max_diff = -1.0
	        candidate = ordered[-1]
	        for idx_a, idx_b in zip(ordered, ordered[1:]):
	            diff = 1.0 - cv2.compareHist(hists[idx_a], hists[idx_b], cv2.HISTCMP_CORREL)
	            # Strict comparison: the earliest pair wins ties.
	            if diff > max_diff:
	                max_diff = diff
	                candidate = idx_b
	    except Exception:
	        # Deliberate best-effort: a decode/compare error must not abort the
	        # search, the midpoint is always a usable split.
	        return midpoint
	    finally:
	        if cap is not None:
	            cap.release()

	    # Reject split points too close to either boundary of the segment.
	    min_pos = seg_start + int(seg_length * 0.15)
	    max_pos = seg_start + int(seg_length * 0.85)
	    if candidate < min_pos or candidate > max_pos:
	        return midpoint
	    return candidate


	def a_star_select_segment(images: List[Image.Image], goal: str, segment_des: str) -> str:
	    """Ask the VLM for the single segment to expand under the A* strategy.

	    The prompt asks for one segment that is both likely to contain missing
	    goal-relevant actions and bounded by the most visually different frames.

	    Args:
	        images: The currently selected frames, in chronological order.
	        goal: The task description supplied by the user.
	        segment_des: One line per candidate segment.

	    Returns:
	        The raw text reply of the model (expected to contain JSON).
	    """
	    selection_prompt = f"""You are provided with sequential images sampled from a video.
	Each image is labeled with its frame index. The images are shown in chronological order.
	Goal: {goal}

	Candidate segments (gaps between current frames):
	{segment_des}

	(A* Strategy - Balance missing goal-relevant info and visual state changes)
	Identify ONE single candidate segment that BEST satisfies BOTH conditions simultaneously:
	1. GOAL PROXIMITY: The segment likely contains crucial missing actions that are necessary steps toward achieving the Goal.
	2. STATE CHANGE MAGNITUDE: The segment whose boundary frames show the MOST different visual states is more likely to contain important operations.

	Return JSON format: {{"frame_descriptions": [{{"segment_id": "1", "description": "Best A* candidate: missing goal step + visual state change"}}]}}
	"""
	    reply = vlm_call(images, selection_prompt)
	    return reply


	def qa_and_reflect(images: List[Image.Image], goal: str) -> Tuple[str, int]:
	    """Evaluate whether the current frames are sufficient for the goal.

	    Two VLM calls are made: the first describes the step-by-step actions
	    between consecutive frames, the second rates how continuous that
	    description is on a 1-3 scale.

	    Args:
	        images: The currently selected frames, in chronological order.
	        goal: The task description supplied by the user.

	    Returns:
	        ``(answer, confidence)`` where ``answer`` is the model's description
	        and ``confidence`` is an int in ``1..3``. A missing or unparsable
	        rating yields ``1`` (the "must expand" level), so a malformed reply
	        never aborts the search.
	    """
	    prompt_qa = f"Task Goal: {goal}\nLook at these sequential frames. Describe the EXACT step-by-step actions that happen transitioning from one frame to the next."
	    answer = vlm_call(images, prompt_qa, system_prompt="You are a helpful video analysis assistant.")

	    prompt_eval = f"""Task Goal: {goal}
	Your sequential analysis: {answer}

	Evaluate your confidence level strictly:
	1: Severe Jumps (There are completely missing screens or sudden state changes. MUST expand.)
	2: Minor Disconnects (The flow makes sense, but some intermediate actions are missing. Should expand.)
	3: Strong Continuity (The frames capture all important actions and transitions. No key step is skipped.)

	Output JSON exactly like this: {{"confidence": 3}}
	"""
	    conf_str = vlm_call(images, prompt_eval)
	    conf_json = parse_json_response(conf_str)
	    raw_confidence = conf_json.get("confidence", 1) if isinstance(conf_json, dict) else 1

	    try:
	        confidence = int(raw_confidence)
	    except (TypeError, ValueError, OverflowError):
	        # The model sometimes decorates the number ("3/3", "2 - minor");
	        # salvage the first integer, otherwise assume the lowest level.
	        found = re.search(r"\d+", str(raw_confidence))
	        confidence = int(found.group(0)) if found else 1

	    # Keep the rating on the documented 1-3 scale.
	    confidence = max(1, min(3, confidence))
	    return answer, confidence


	@spaces.GPU(duration=240)
	def extract_keyframes(video_path: str, goal: str, search_strategy: str = "a_star", max_frames: int = 10, min_frames: int = 6, min_steps: int = 3, conf_lower: int = 3, progress=gr.Progress()):
	    """
	    TASKER keyframe extraction: tree-search with VLM-guided segment selection.

	    Starting from a handful of uniformly sampled frames, the gaps between
	    neighbouring frames are treated as candidate segments. On every step the
	    VLM picks segment(s) to expand, each picked segment is split at its
	    point of largest visual change, and the new frame is kept unless its
	    colour histogram duplicates a frame that is already selected (in which
	    case the segment is frozen and never offered again). The search stops
	    when the frame budget is reached, nothing can be expanded any more, or
	    the VLM reports sufficient confidence.

	    Args:
	        video_path: Path to the input video.
	        goal: Task query describing what the user wants to see.
	        search_strategy: One of "a_star", "bfs", "gbfs", "dijkstra". Any
	            other value is handled like "a_star".
	        max_frames: Maximum number of keyframes to extract.
	        min_frames: Minimum number of frames before confidence check can
	            stop. Capped at ``max_frames``.
	        min_steps: Minimum expansion steps before confidence check can stop.
	        conf_lower: Confidence threshold (1-3) to stop searching.
	        progress: Gradio progress reporter.
	    Returns:
	        List of (PIL Image, caption) tuples for gallery display, plus a summary string.
	    """
	    read_error = "Error: Could not read video file. Please upload a valid video."
	    # An empty/None path (e.g. nothing uploaded) would otherwise fail inside
	    # OpenCV with an unhelpful exception.
	    if not video_path:
	        return [], read_error

	    # Normalise numeric settings in case the UI delivers floats, and keep the
	    # minimum inside the budget so the force-fill step cannot overshoot it.
	    max_frames = int(max_frames)
	    min_frames = min(int(min_frames), max_frames)
	    min_steps = int(min_steps)
	    conf_lower = int(conf_lower)

	    cap = cv2.VideoCapture(video_path)
	    try:
	        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
	        fps = cap.get(cv2.CAP_PROP_FPS)
	    finally:
	        cap.release()

	    if num_frames <= 0 or fps <= 0:
	        return [], read_error

	    dedup_threshold = 0.985

	    # Histogram cache for dedup: frame index -> histogram (None when the
	    # frame could not be decoded). Avoids re-decoding every selected frame
	    # on every step.
	    hist_cache = {}

	    def cached_hist(idx, img=None):
	        """Return the histogram of frame ``idx``, computing it on first use."""
	        if idx not in hist_cache:
	            if img is None:
	                img = extract_frame(video_path, idx)
	            hist_cache[idx] = None if img is None else compute_color_histogram(img)
	        return hist_cache[idx]

	    def known_hists(indices):
	        """Histograms of all decodable frames among ``indices``."""
	        return [h for h in (cached_hist(i) for i in indices) if h is not None]

	    # ── Initial uniform sampling ─────────────────────────────────────────────
	    sample_idx = _initial_sample_indices(num_frames)

	    progress(0.1, desc=f"Initial sampling: {len(sample_idx)} frames from {num_frames} total")

	    frozen_segments = set()
	    effective_step = 0
	    last_confidence = 0

	    max_total_attempts = max_frames + 10

	    for attempt in range(1, max_total_attempts + 1):
	        current_frames = len(sample_idx)
	        if current_frames >= max_frames:
	            break

	        # Extract current frames as images. The image number shown to the
	        # VLM is the position among frames that actually decoded.
	        images = []
	        frame_to_img_idx = {}
	        for idx in sample_idx:
	            img = extract_frame(video_path, idx)
	            if img is None:
	                continue
	            images.append(img)
	            frame_to_img_idx[idx] = len(images)
	            cached_hist(idx, img)

	        if not images:
	            break

	        progress(
	            0.1 + 0.6 * (attempt / max_total_attempts),
	            desc=f"Step {attempt}: {current_frames} frames, evaluating..."
	        )

	        # Confidence check (only once enough frames and steps accumulated;
	        # before that expansion is forced).
	        if current_frames >= min_frames and effective_step > min_steps:
	            _, last_confidence = qa_and_reflect(images, goal)
	            if last_confidence >= conf_lower:
	                break

	        # Segments are always the gaps between consecutive selected frames;
	        # segment ids are 1-based positions in this list.
	        video_segments = [VideoSeg(a, b) for a, b in zip(sample_idx, sample_idx[1:])]

	        # Only segments with at least one interior frame that are not frozen
	        # can produce a new keyframe.
	        candidate_ids = {
	            seg_id
	            for seg_id, seg in enumerate(video_segments, start=1)
	            if seg.end - seg.start > 1 and (seg.start, seg.end) not in frozen_segments
	        }
	        if not candidate_ids:
	            break

	        # Build segment descriptions
	        segment_des_lines = []
	        for seg_id, seg in enumerate(video_segments, start=1):
	            if seg_id not in candidate_ids:
	                continue
	            start_img = frame_to_img_idx.get(seg.start, "?")
	            end_img = frame_to_img_idx.get(seg.end, "?")
	            segment_des_lines.append(
	                f" Segment {seg_id}: frames {seg.start}-{seg.end} (Image #{start_img} -> Image #{end_img})"
	            )
	        segment_des_str = "\n".join(segment_des_lines)

	        # VLM segment selection (best-effort: on failure the fallback below
	        # still makes progress).
	        try:
	            response = _select_segments_with_vlm(images, goal, segment_des_str, search_strategy)
	            parsed = parse_json_response(response)
	        except Exception as e:
	            print(f"VLM call error at step {attempt}: {e}")
	            parsed = None

	        # Determine selected segment IDs
	        selected_seg_ids = _parse_selected_segment_ids(parsed, candidate_ids)

	        def seg_length(seg_id):
	            seg = video_segments[seg_id - 1]
	            return seg.end - seg.start

	        # Fallback: pick longest segment (lowest id wins ties).
	        if not selected_seg_ids:
	            selected_seg_ids = {max(candidate_ids, key=lambda sid: (seg_length(sid), -sid))}

	        # Quota limit: never split more segments than frames remain in the
	        # budget; prefer the longest ones.
	        remaining_quota = max_frames - len(sample_idx)
	        if len(selected_seg_ids) > remaining_quota:
	            by_length = sorted(selected_seg_ids, key=lambda sid: (-seg_length(sid), sid))
	            selected_seg_ids = set(by_length[:remaining_quota])

	        # Split selected segments: split frame -> segment it came from.
	        split_origin = {}
	        for seg_id in sorted(selected_seg_ids):
	            seg = video_segments[seg_id - 1]
	            sp = find_visual_change_split_point(video_path, seg.start, seg.end)
	            if seg.start < sp < seg.end:
	                split_origin[sp] = (seg.start, seg.end)

	        # Visual deduplication: a new frame is dropped (and its segment
	        # frozen) when it cannot be decoded or looks like a kept frame.
	        old_hists = known_hists(sample_idx)
	        accepted_frames = []
	        accepted_new_hists = []
	        for new_idx in sorted(split_origin):
	            new_hist = cached_hist(new_idx)
	            if new_hist is None or is_frame_redundant(
	                new_hist, old_hists + accepted_new_hists, threshold=dedup_threshold
	            ):
	                frozen_segments.add(split_origin[new_idx])
	                continue
	            accepted_frames.append(new_idx)
	            accepted_new_hists.append(new_hist)

	        if accepted_frames:
	            sample_idx = sorted(set(sample_idx) | set(accepted_frames))
	            effective_step += 1

	    progress(0.85, desc="Finalizing keyframes...")

	    # Force-fill if too few frames
	    if len(sample_idx) < min_frames and last_confidence < conf_lower:
	        max_force = min_frames + 5
	        for _ in range(max_force):
	            if len(sample_idx) >= min_frames:
	                break

	            # Largest non-frozen gap with at least one interior frame.
	            max_gap = 1
	            max_gap_idx = None
	            for i, (left, right) in enumerate(zip(sample_idx, sample_idx[1:])):
	                if (left, right) in frozen_segments:
	                    continue
	                if right - left > max_gap:
	                    max_gap = right - left
	                    max_gap_idx = i
	            if max_gap_idx is None:
	                break

	            left, right = sample_idx[max_gap_idx], sample_idx[max_gap_idx + 1]
	            sp = find_visual_change_split_point(video_path, left, right)
	            sp_hist = cached_hist(sp) if left < sp < right else None
	            if sp_hist is None:
	                break
	            if is_frame_redundant(sp_hist, known_hists(sample_idx), threshold=dedup_threshold):
	                frozen_segments.add((left, right))
	                continue
	            sample_idx.insert(max_gap_idx + 1, sp)

	    # Extract final keyframes
	    progress(0.95, desc="Extracting final keyframes...")

	    gallery = []
	    total_selected = len(sample_idx)
	    for i, idx in enumerate(sample_idx):
	        img = extract_frame(video_path, idx)
	        if img is None:
	            continue
	        timestamp = idx / fps
	        mins = int(timestamp // 60)
	        secs = int(timestamp % 60)
	        percent = (idx / max(1, num_frames - 1)) * 100
	        caption = f"Frame {i+1}/{total_selected} | idx={idx} | {mins:02d}:{secs:02d} | {percent:.1f}%"
	        gallery.append((img, caption))

	    summary = (
	        f"TASKER {search_strategy.upper()} extracted {len(gallery)} keyframes "
	        f"from {num_frames} total frames ({num_frames/fps:.1f}s video).\n\n"
	        f"Search stats: {effective_step} effective expansion steps, "
	        f"confidence={last_confidence}/3, "
	        f"target range {min_frames}-{max_frames} frames."
	    )

	    progress(1.0, desc="Done!")
	    return gallery, summary


	def _initial_sample_indices(num_frames: int, init_frames: int = 4) -> List[int]:
	    """Return evenly spaced frame indices that always end on the last frame."""
	    if num_frames <= init_frames:
	        return list(range(num_frames))
	    interval = max(1, num_frames // (init_frames - 1))
	    indices = list(range(0, num_frames, interval))
	    if indices[-1] != num_frames - 1:
	        indices.append(num_frames - 1)
	    return indices


	def _select_segments_with_vlm(images, goal: str, segment_des_str: str, search_strategy: str) -> str:
	    """Send the strategy-specific segment-selection prompt and return the raw reply."""
	    if search_strategy == "bfs":
	        prompt = f"""You are provided with sequential images sampled from a video.
	Goal: {goal}
	Candidate segments:
	{segment_des_str}
	Select MULTIPLE segments that likely contain crucial missing actions.
	Return JSON: {{"frame_descriptions": [{{"segment_id": "1", "description": "..."}}]}}"""
	    elif search_strategy == "gbfs":
	        prompt = f"""You are provided with sequential images sampled from a video.
	Goal: {goal}
	Candidate segments:
	{segment_des_str}
	Select the SINGLE segment MOST LIKELY to contain crucial missing actions.
	Return JSON: {{"frame_descriptions": [{{"segment_id": "1", "description": "..."}}]}}"""
	    elif search_strategy == "dijkstra":
	        prompt = f"""You are provided with sequential images sampled from a video.
	Candidate segments:
	{segment_des_str}
	Select the SINGLE segment with the MOST significant visual state transition.
	Return JSON: {{"frame_descriptions": [{{"segment_id": "1", "description": "..."}}]}}"""
	    else:  # a_star
	        return a_star_select_segment(images, goal, segment_des_str)
	    return vlm_call(images, prompt)


	def _parse_selected_segment_ids(parsed, candidate_ids) -> set:
	    """Collect the segment ids named in a parsed VLM reply that are valid candidates."""
	    selected = set()
	    if not isinstance(parsed, dict):
	        return selected

	    descriptions = parsed.get("frame_descriptions")
	    if isinstance(descriptions, dict):
	        # Tolerate a single description object instead of a list.
	        descriptions = [descriptions]
	    if not isinstance(descriptions, list):
	        return selected

	    for desc in descriptions:
	        if not isinstance(desc, dict):
	            continue
	        for key, value in desc.items():
	            if str(key).lower() != "segment_id":
	                continue
	            # Accept "1", 1, "Segment 1", ... by taking the first integer.
	            nums = re.findall(r'\d+', str(value).strip())
	            if nums and int(nums[0]) in candidate_ids:
	                selected.add(int(nums[0]))
	            break
	    return selected


	# ── Gradio UI ───────────────────────────────────────────────────────────────

	CUSTOM_CSS = """
	#header { text-align: center; margin-bottom: 20px; }
	#header h1 { font-size: 2em; margin-bottom: 5px; }
	#header p { color: #666; font-size: 1.1em; }
	"""

	with gr.Blocks(css=CUSTOM_CSS, title="TASKER Keyframe Extractor") as demo:
	    # Page header, styled by the #header rules in CUSTOM_CSS.
	    gr.HTML("""
	<div id="header">
	<h1>TASKER: Task-driven and Scene-aware Keyframe Search</h1>
	<p>Extract task-relevant keyframes from a video using VLM-guided tree search (A* / BFS / GBFS / Dijkstra)</p>
	</div>
	""")

	    with gr.Row():
	        # Left column: inputs and the trigger button.
	        with gr.Column(scale=1):
	            video_input = gr.Video(label="Upload Video", sources=["upload"])
	            goal_input = gr.Textbox(
	                label="Task Query / Goal",
	                placeholder="e.g., How to send an email with an attachment?",
	                lines=2,
	            )
	            strategy_input = gr.Dropdown(
	                choices=["a_star", "bfs", "gbfs", "dijkstra"],
	                value="a_star",
	                label="Search Strategy",
	                info="A* balances goal-relevance and visual changes. BFS explores broadly. GBFS focuses on goal. Dijkstra focuses on visual changes.",
	            )
	            with gr.Accordion("Advanced Settings", open=False):
	                max_frames_slider = gr.Slider(4, 16, value=10, step=1, label="Max Keyframes")
	                min_frames_slider = gr.Slider(2, 8, value=6, step=1, label="Min Keyframes (before confidence check)")
	                min_steps_slider = gr.Slider(1, 8, value=3, step=1, label="Min Search Steps")
	                conf_slider = gr.Slider(1, 3, value=3, step=1, label="Confidence Threshold (3=strictest)")

	            extract_btn = gr.Button("Extract Keyframes", variant="primary")

	        # Right column: textual summary above the keyframe gallery.
	        with gr.Column(scale=2):
	            summary_output = gr.Markdown(label="Summary")
	            gallery_output = gr.Gallery(
	                label="Extracted Keyframes",
	                columns=3,
	                height=600,
	                object_fit="contain",
	            )

	    # The input order matches the positional parameters of
	    # extract_keyframes; its two return values map to gallery and summary.
	    extraction_inputs = [
	        video_input,
	        goal_input,
	        strategy_input,
	        max_frames_slider,
	        min_frames_slider,
	        min_steps_slider,
	        conf_slider,
	    ]
	    extract_btn.click(
	        fn=extract_keyframes,
	        inputs=extraction_inputs,
	        outputs=[gallery_output, summary_output],
	    )

	demo.launch()