# VisualScout — CLIP-based video frame embedding and scene-change detection.
# (Recovered from a mangled Spaces paste; original header lines were page residue.)
from typing import List, Tuple, Union

import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
class VisualScout:
    """
    The 'Scout' Agent.

    This class handles the fast, lightweight semantic analysis of the video.
    It uses CLIP (Vision Transformer) to convert video frames into mathematical
    vectors, allowing us to 'search' the video content numerically without
    needing a heavy LLM.
    """

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        print(f"Initializing Visual Scout with model: {model_name}...")
        # We use CPU here to save the GPU VRAM for the main Chat Model (Qwen)
        self.embedding_model = SentenceTransformer(model_name, device="cpu")

    def embed_image(self, image_data: Union[np.ndarray, Image.Image]) -> np.ndarray:
        """Converts a single video frame into a 512-dimensional vector."""
        if isinstance(image_data, np.ndarray):
            # Convert OpenCV format (numpy) to PIL for the model.
            # NOTE(review): OpenCV delivers frames in BGR channel order while
            # PIL/CLIP expect RGB — confirm callers convert upstream, otherwise
            # embeddings are computed on channel-swapped images.
            image_data = Image.fromarray(image_data)
        return self.embedding_model.encode(image_data)

    def embed_text(self, search_text: str) -> np.ndarray:
        """Converts a user's search query into a vector for comparison."""
        return self.embedding_model.encode(search_text)

    def detect_semantic_changes(
        self,
        video_frames: List[Tuple[float, np.ndarray]],
        sensitivity: float = 0.85,
    ) -> List[Tuple[float, np.ndarray]]:
        """
        Scans the video to find 'Scenes' rather than just raw frames.

        Each frame is compared to the immediately preceding frame via cosine
        similarity of their CLIP embeddings. If the similarity drops below the
        `sensitivity` threshold, the frame is kept as the start of a new event.

        Args:
            video_frames: ordered (timestamp_seconds, frame_array) pairs.
            sensitivity: similarity threshold in [0, 1]; a LOWER value means
                only more drastic changes are kept.

        Returns:
            The subset of `video_frames` marking scene changes. The first
            frame is always included; empty input returns [].
        """
        if not video_frames:
            return []
        print(f"🦅 Scout: Analyzing {len(video_frames)} frames for scene changes...")

        # Optimization: batch process all images at once instead of a loop.
        pil_images = [Image.fromarray(frame) for _, frame in video_frames]
        # This is the heavy lifting — encoding all frames.
        frame_embeddings = self.embedding_model.encode(
            pil_images, batch_size=32, show_progress_bar=True
        )

        # Vectorized cosine similarity between consecutive frames: normalize
        # every embedding once, then dot adjacent rows. Same math as calling
        # sklearn's cosine_similarity per pair, without a library call and two
        # reshapes on every loop iteration. np.clip guards against a zero-norm
        # embedding producing a division by zero.
        embeddings = np.asarray(frame_embeddings, dtype=np.float64)
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        unit = embeddings / np.clip(norms, 1e-12, None)
        similarities = np.sum(unit[:-1] * unit[1:], axis=1)

        # Always include the very first frame.
        significant_events = [video_frames[0]]
        # Iterate through the timeline; similarities[i-1] compares frame i
        # against frame i-1.
        for i, similarity_score in enumerate(similarities, start=1):
            # If the scene changed drastically (low similarity), keep this frame.
            if similarity_score < sensitivity:
                timestamp = video_frames[i][0]
                print(f" ✂️ New Scene detected at {timestamp:.1f}s (Similarity: {similarity_score:.2f})")
                significant_events.append(video_frames[i])

        print(f"🦅 Scout: Condensed video into {len(significant_events)} key semantic events.")
        return significant_events