# ashleshp — first commit (fca155a)
from sentence_transformers import SentenceTransformer
from PIL import Image
import numpy as np
from typing import Union, List, Tuple
from sklearn.metrics.pairwise import cosine_similarity
class VisualScout:
    """
    The 'Scout' Agent.

    Handles the fast, lightweight semantic analysis of the video.
    It uses CLIP (Vision Transformer) to convert video frames into
    embedding vectors, allowing us to 'search' the video content
    numerically without needing a heavy LLM.
    """

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        """Load the CLIP embedding model.

        Args:
            model_name: A sentence-transformers CLIP checkpoint name.
        """
        print(f"Initializing Visual Scout with model: {model_name}...")
        # We use CPU here to save the GPU VRAM for the main Chat Model (Qwen)
        self.embedding_model = SentenceTransformer(model_name, device="cpu")

    def embed_image(self, image_data: Union[np.ndarray, Image.Image]) -> np.ndarray:
        """Convert a single video frame into an embedding vector.

        Args:
            image_data: A PIL image, or a numpy array (H, W, C).

        Returns:
            The frame's embedding (512-dim for the ViT-B/32 checkpoint).
        """
        if isinstance(image_data, np.ndarray):
            # Convert the numpy array to PIL for the model.
            # NOTE(review): Image.fromarray assumes RGB channel order, but
            # OpenCV frames are BGR by default — confirm callers convert first.
            image_data = Image.fromarray(image_data)
        return self.embedding_model.encode(image_data)

    def embed_text(self, search_text: str) -> np.ndarray:
        """Converts a user's search query into a vector for comparison."""
        return self.embedding_model.encode(search_text)

    def detect_semantic_changes(
        self,
        video_frames: List[Tuple[float, np.ndarray]],
        sensitivity: float = 0.85,
    ) -> List[Tuple[float, np.ndarray]]:
        """
        Scans the video to find 'Scenes' rather than just raw frames.

        It compares each frame to the previous one using vector similarity.
        If the similarity drops below the 'sensitivity' threshold, we mark
        it as a new event.

        Args:
            video_frames: List of (timestamp_seconds, frame_array) pairs,
                in chronological order.
            sensitivity: Cosine-similarity threshold in [0, 1]; lower values
                keep fewer frames (only bigger scene changes).

        Returns:
            The subset of `video_frames` that starts new scenes. The first
            frame is always included. Empty input yields an empty list.
        """
        if not video_frames:
            return []

        print(f"🦅 Scout: Analyzing {len(video_frames)} frames for scene changes...")

        # Optimization: Batch process all images at once instead of a loop
        pil_images = [Image.fromarray(frame) for _, frame in video_frames]
        # This is the heavy lifting - encoding all frames
        frame_embeddings = self.embedding_model.encode(
            pil_images, batch_size=32, show_progress_bar=True
        )

        significant_events = []
        # Always include the very first frame
        significant_events.append(video_frames[0])
        previous_vector = frame_embeddings[0].reshape(1, -1)

        # Iterate through the timeline, comparing consecutive frames.
        for i in range(1, len(frame_embeddings)):
            current_vector = frame_embeddings[i].reshape(1, -1)
            # Calculate how similar this frame is to the previous one (0.0 to 1.0)
            similarity_score = cosine_similarity(previous_vector, current_vector)[0][0]
            # If the scene changed drastically (low similarity), keep this frame
            if similarity_score < sensitivity:
                timestamp = video_frames[i][0]
                print(f"  ✂️ New Scene detected at {timestamp:.1f}s (Similarity: {similarity_score:.2f})")
                significant_events.append(video_frames[i])
            # Always advance the comparison baseline to the current frame so
            # each frame is measured against its immediate predecessor.
            previous_vector = current_vector

        print(f"🦅 Scout: Condensed video into {len(significant_events)} key semantic events.")
        return significant_events