from typing import List, Tuple, Union

import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class VisualScout:
    """
    The 'Scout' Agent.

    This class handles the fast, lightweight semantic analysis of the video.
    It uses CLIP (Vision Transformer) to convert video frames into
    mathematical vectors, allowing us to 'search' the video content
    numerically without needing a heavy LLM.
    """

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        """Load the CLIP embedding model onto the CPU.

        Args:
            model_name: Name of a sentence-transformers CLIP checkpoint.
        """
        print(f"Initializing Visual Scout with model: {model_name}...")
        # We use CPU here to save the GPU VRAM for the main Chat Model (Qwen)
        self.embedding_model = SentenceTransformer(model_name, device="cpu")

    def embed_image(self, image_data: Union[np.ndarray, Image.Image]) -> np.ndarray:
        """Converts a single video frame into a 512-dimensional vector.

        Args:
            image_data: A PIL image, or a numpy frame (e.g. from OpenCV).

        Returns:
            The CLIP embedding of the frame.
        """
        if isinstance(image_data, np.ndarray):
            # Convert OpenCV format (numpy) to PIL for the model.
            # NOTE(review): OpenCV frames are BGR while CLIP expects RGB;
            # no channel swap happens here — confirm callers convert upstream.
            image_data = Image.fromarray(image_data)
        return self.embedding_model.encode(image_data)

    def embed_text(self, search_text: str) -> np.ndarray:
        """Converts a user's search query into a vector for comparison."""
        return self.embedding_model.encode(search_text)

    def detect_semantic_changes(
        self,
        video_frames: List[Tuple[float, np.ndarray]],
        sensitivity: float = 0.85,
    ) -> List[Tuple[float, np.ndarray]]:
        """
        Scans the video to find 'Scenes' rather than just raw frames.

        It compares each frame to the previous one using vector similarity.
        If the similarity drops below the 'sensitivity' threshold, we mark it
        as a new event.

        Args:
            video_frames: (timestamp_seconds, frame) pairs in timeline order.
            sensitivity: Cosine-similarity threshold in [0, 1]; a frame whose
                similarity to its predecessor falls BELOW this is kept.

        Returns:
            The subset of ``video_frames`` that starts each detected scene;
            always includes the first frame. Empty input yields an empty list.
        """
        if not video_frames:
            return []

        print(f"🦅 Scout: Analyzing {len(video_frames)} frames for scene changes...")

        # Optimization: Batch process all images at once instead of a loop
        pil_images = [Image.fromarray(frame) for _, frame in video_frames]

        # This is the heavy lifting - encoding all frames
        frame_embeddings = self.embedding_model.encode(
            pil_images, batch_size=32, show_progress_bar=True
        )
        frame_embeddings = np.asarray(frame_embeddings)

        # Normalize once so each consecutive-pair similarity is a plain row-wise
        # dot product — replaces one sklearn call (plus two reshapes) per frame.
        # clip() guards against a zero-norm row; CLIP embeddings are non-zero
        # in practice, so this never changes a real similarity value.
        norms = np.linalg.norm(frame_embeddings, axis=1, keepdims=True)
        unit_vectors = frame_embeddings / np.clip(norms, 1e-12, None)
        # similarities[i-1] = cosine similarity of frame i to frame i-1
        similarities = np.einsum("ij,ij->i", unit_vectors[:-1], unit_vectors[1:])
        # NOTE(review): the (formatting-mangled) original may have compared each
        # frame to the last *kept* keyframe instead of the immediately previous
        # frame; this implements the "previous frame" contract its docstring
        # and comments state — confirm against version history.

        # Always include the very first frame
        significant_events = [video_frames[0]]

        # Iterate through the timeline (frame 0 has no predecessor)
        for i, similarity_score in enumerate(similarities, start=1):
            # If the scene changed drastically (low similarity), keep this frame
            if similarity_score < sensitivity:
                timestamp = video_frames[i][0]
                print(f"   ✂️ New Scene detected at {timestamp:.1f}s (Similarity: {similarity_score:.2f})")
                significant_events.append(video_frames[i])

        print(f"🦅 Scout: Condensed video into {len(significant_events)} key semantic events.")
        return significant_events