Spaces:
Runtime error
Runtime error
| import numpy as np | |
| from pathlib import Path | |
| from typing import List, Tuple, Dict, Any, Optional | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import pickle | |
class VectorIndex:
    """In-memory vector index that maps timestamps to embedding vectors.

    This acts as the 'Long Term Memory' for visual concepts: each entry pairs
    a timestamp (when something happened) with a vector (what it looked like)
    and an optional metadata dict.

    ``timestamps``, ``metadata_store`` and the rows of ``embedding_matrix``
    are index-aligned: entry ``i`` of each describes the same memory. Rows
    written by :meth:`add` are unit length (or all-zero), which is what lets
    :meth:`search` compute cosine similarity as a plain dot product.
    """

    def __init__(self, index_file_path: Path):
        """Create the index, loading existing data if the file is present.

        Args:
            index_file_path: Location of the pickle file used by
                :meth:`save` and :meth:`load`.
        """
        # Coerce so that a plain string path works as well as a Path.
        self.file_path = Path(index_file_path)
        self.timestamps: List[float] = []
        self.embedding_matrix: Optional[np.ndarray] = None
        self.metadata_store: List[Dict[str, Any]] = []

        # Load existing index if available.
        if self.file_path.exists():
            self.load()

    def add(
        self,
        timestamp_seconds: float,
        vector: np.ndarray,
        extra_data: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Add a new memory entry (timestamp + vector + optional metadata).

        Args:
            timestamp_seconds: Timestamp associated with the vector.
            vector: Embedding to store; it is flattened to 1-D and scaled to
                unit length (an all-zero vector is stored unchanged).
            extra_data: Optional metadata stored alongside the entry; an
                empty dict is stored when omitted.

        Raises:
            ValueError: If the vector's length differs from the vectors
                already stored. The index is left unmodified in that case.
        """
        flat = np.asarray(vector).reshape(-1)

        # Normalize to length 1 so that cosine similarity at query time is
        # just a dot product.
        norm = np.linalg.norm(flat)
        if norm > 0:
            flat = flat / norm
        row = flat.reshape(1, -1)

        # Build the new matrix BEFORE touching the parallel lists: vstack
        # raises on a dimension mismatch, and the lists must not grow if the
        # matrix did not, or every later lookup would be misaligned.
        if self.embedding_matrix is None:
            stacked = row
        else:
            stacked = np.vstack([self.embedding_matrix, row])

        self.embedding_matrix = stacked
        self.timestamps.append(timestamp_seconds)
        self.metadata_store.append(extra_data or {})

    def search(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[float, float]]:
        """Find the stored moments most similar to the query.

        Args:
            query_vector: Query embedding with the same length as the stored
                vectors.
            top_k: Maximum number of results to return.

        Returns:
            A list of ``(timestamp_seconds, similarity_score)`` tuples sorted
            by descending score. Empty when the index holds no vectors or
            ``top_k`` is not positive.

        Raises:
            ValueError: If the query length does not match the stored vectors.
        """
        # A non-positive top_k yields nothing; without this guard a negative
        # value would slice "all but the last k" results.
        if self.embedding_matrix is None or top_k <= 0:
            return []

        # Normalize the query too.
        query = np.asarray(query_vector).reshape(-1)
        query_norm = np.linalg.norm(query)
        if query_norm > 0:
            query = query / query_norm

        # Stored rows are unit length (see add), so one matrix-vector product
        # gives the cosine similarity against ALL stored memories at once.
        similarity_scores = self.embedding_matrix @ query

        # Sort by highest score first.
        best_indices = np.argsort(similarity_scores)[::-1][:top_k]
        return [
            (self.timestamps[index], float(similarity_scores[index]))
            for index in best_indices
        ]

    def save(self) -> None:
        """Persist the index to ``file_path`` using pickle.

        The data is written to a sibling ``<name>.tmp`` file and then moved
        over the target, so an interrupted write cannot leave a truncated
        index behind. Missing parent directories are created.
        """
        data_packet = {
            "timestamps": self.timestamps,
            "vectors": self.embedding_matrix,
            "metadata": self.metadata_store,
        }
        self.file_path.parent.mkdir(parents=True, exist_ok=True)
        temp_path = self.file_path.with_name(self.file_path.name + ".tmp")
        with open(temp_path, "wb") as f:
            pickle.dump(data_packet, f)
        temp_path.replace(self.file_path)

    def load(self) -> None:
        """Load the index from ``file_path``, replacing the in-memory state.

        Raises:
            KeyError: If the file lacks the "timestamps" or "vectors" entry.
        """
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only load index files produced by a trusted source.
        with open(self.file_path, "rb") as f:
            data_packet = pickle.load(f)

        self.timestamps = data_packet["timestamps"]
        self.embedding_matrix = data_packet["vectors"]

        # Files without (or with short) metadata are padded with empty dicts
        # so metadata_store stays index-aligned with timestamps; otherwise
        # later add() calls would attach metadata to the wrong entries.
        metadata = list(data_packet.get("metadata") or [])
        missing = len(self.timestamps) - len(metadata)
        if missing > 0:
            metadata.extend({} for _ in range(missing))
        self.metadata_store = metadata