# ashleshp — first commit (fca155a)
from sentence_transformers import SentenceTransformer
from PIL import Image
import numpy as np
from typing import Union, List, Tuple
from sklearn.metrics.pairwise import cosine_similarity
class VisualScout:
    """
    The 'Scout' Agent.

    Handles the fast, lightweight semantic analysis of the video.
    It uses CLIP (Vision Transformer) to convert video frames into
    embedding vectors, allowing us to 'search' the video content
    numerically without needing a heavy LLM.
    """

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        """Load the CLIP embedding model.

        Args:
            model_name: A sentence-transformers CLIP checkpoint name.
        """
        print(f"Initializing Visual Scout with model: {model_name}...")
        # We use CPU here to save the GPU VRAM for the main Chat Model (Qwen)
        self.embedding_model = SentenceTransformer(model_name, device="cpu")

    def embed_image(self, image_data: Union[np.ndarray, Image.Image]) -> np.ndarray:
        """Convert a single video frame into an embedding vector.

        Args:
            image_data: A PIL image, or a numpy array (H, W, C).

        Returns:
            The frame's embedding (512-dim for the ViT-B/32 checkpoint).
        """
        if isinstance(image_data, np.ndarray):
            # Convert the numpy array to PIL for the model.
            # NOTE(review): Image.fromarray assumes RGB channel order, but
            # OpenCV frames are BGR by default — confirm callers convert first.
            image_data = Image.fromarray(image_data)
        return self.embedding_model.encode(image_data)

    def embed_text(self, search_text: str) -> np.ndarray:
        """Converts a user's search query into a vector for comparison."""
        return self.embedding_model.encode(search_text)

    def detect_semantic_changes(
        self,
        video_frames: List[Tuple[float, np.ndarray]],
        sensitivity: float = 0.85,
    ) -> List[Tuple[float, np.ndarray]]:
        """
        Scans the video to find 'Scenes' rather than just raw frames.

        It compares each frame to the previous one using vector similarity.
        If the similarity drops below the 'sensitivity' threshold, we mark
        it as a new event.

        Args:
            video_frames: List of (timestamp_seconds, frame_array) pairs,
                in chronological order.
            sensitivity: Cosine-similarity threshold in [0, 1]; lower values
                keep fewer frames (only bigger scene changes).

        Returns:
            The subset of `video_frames` that starts new scenes. The first
            frame is always included. Empty input yields an empty list.
        """
        if not video_frames:
            return []

        print(f"🦅 Scout: Analyzing {len(video_frames)} frames for scene changes...")

        # Optimization: Batch process all images at once instead of a loop
        pil_images = [Image.fromarray(frame) for _, frame in video_frames]
        # This is the heavy lifting - encoding all frames
        frame_embeddings = self.embedding_model.encode(
            pil_images, batch_size=32, show_progress_bar=True
        )

        significant_events = []
        # Always include the very first frame
        significant_events.append(video_frames[0])
        previous_vector = frame_embeddings[0].reshape(1, -1)

        # Iterate through the timeline, comparing consecutive frames.
        for i in range(1, len(frame_embeddings)):
            current_vector = frame_embeddings[i].reshape(1, -1)
            # Calculate how similar this frame is to the previous one (0.0 to 1.0)
            similarity_score = cosine_similarity(previous_vector, current_vector)[0][0]
            # If the scene changed drastically (low similarity), keep this frame
            if similarity_score < sensitivity:
                timestamp = video_frames[i][0]
                print(f"  ✂️ New Scene detected at {timestamp:.1f}s (Similarity: {similarity_score:.2f})")
                significant_events.append(video_frames[i])
            # Always advance the comparison baseline to the current frame so
            # each frame is measured against its immediate predecessor.
            previous_vector = current_vector

        print(f"🦅 Scout: Condensed video into {len(significant_events)} key semantic events.")
        return significant_events