Spaces:

ashleshp
/

Video-Scout

Runtime error

App Files Files Community

Video-Scout / src /memory /vector_index.py

ashleshp

first commit

fca155a 7 days ago

raw

history blame contribute delete

3.2 kB

	import numpy as np
	from pathlib import Path
	from typing import List, Tuple, Dict, Any, Optional
	from sklearn.metrics.pairwise import cosine_similarity
	import pickle

	class VectorIndex:
	    """In-memory vector database.

	    This acts as the 'Long Term Memory' for visual concepts: it maps a
	    timestamp (when something happened) to a vector (what it looked like).

	    Entries are kept in three parallel containers; entry ``i`` is described
	    by ``timestamps[i]``, row ``i`` of ``embedding_matrix`` and
	    ``metadata_store[i]``.
	    """

	    def __init__(self, index_file_path: Path):
	        """Create an index bound to *index_file_path*, loading it if it exists.

	        Args:
	            index_file_path: Location of the pickle file used by ``save`` and
	                ``load``. Anything accepted by ``Path(...)`` works.
	        """
	        self.file_path = Path(index_file_path)
	        self.timestamps: List[float] = []
	        self.embedding_matrix: Optional[np.ndarray] = None
	        self.metadata_store: List[Dict[str, Any]] = []

	        # Load existing index if available
	        if self.file_path.exists():
	            self.load()

	    def add(
	        self,
	        timestamp_seconds: float,
	        vector: np.ndarray,
	        extra_data: Optional[Dict[str, Any]] = None,
	    ):
	        """Add a new memory entry (timestamp + vector).

	        Args:
	            timestamp_seconds: Timestamp to associate with the vector.
	            vector: Embedding to store. It is flattened to one dimension and
	                scaled to unit length (an all-zero vector is stored as is).
	            extra_data: Optional metadata stored alongside the entry; an
	                empty dict is stored when omitted.

	        Raises:
	            ValueError: If the vector's length differs from the vectors
	                already stored. The index is left unchanged in that case.
	        """
	        row = np.asarray(vector).reshape(-1)

	        # Validate before touching any state so that a rejected vector cannot
	        # leave the timestamp/metadata lists longer than the matrix.
	        if (
	            self.embedding_matrix is not None
	            and row.shape[0] != self.embedding_matrix.shape[1]
	        ):
	            raise ValueError(
	                f"vector has dimension {row.shape[0]}, "
	                f"but the index stores dimension {self.embedding_matrix.shape[1]}"
	            )

	        # Normalize the vector to length 1.
	        # This is crucial so that 'Cosine Similarity' is just a Dot Product (faster).
	        norm = np.linalg.norm(row)
	        if norm > 0:
	            row = row / norm

	        if self.embedding_matrix is None:
	            matrix = row.reshape(1, -1)
	        else:
	            matrix = np.vstack([self.embedding_matrix, row])

	        self.timestamps.append(timestamp_seconds)
	        self.metadata_store.append(extra_data or {})
	        self.embedding_matrix = matrix

	    def search(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[float, float]]:
	        """Find the stored moments that are most similar to the query.

	        Args:
	            query_vector: Embedding to compare against the stored vectors. It
	                is flattened and scaled to unit length before comparison.
	            top_k: Maximum number of results; values <= 0 yield no results.

	        Returns:
	            A list of tuples ``(timestamp_seconds, similarity_score)``,
	            ordered from highest to lowest score.

	        Raises:
	            ValueError: If the query's length differs from the stored vectors.
	        """
	        if self.embedding_matrix is None or top_k <= 0:
	            return []

	        # Normalize the query too
	        query = np.asarray(query_vector).reshape(-1)
	        query_norm = np.linalg.norm(query)
	        if query_norm > 0:
	            query = query / query_norm

	        # Rows written by `add` are unit length (or all zero), so the cosine
	        # similarity against ALL stored memories is a single matrix-vector
	        # product.
	        similarity_scores = self.embedding_matrix @ query

	        # Sort by highest score first
	        best_indices = np.argsort(similarity_scores)[::-1][:top_k]

	        return [
	            (self.timestamps[index], float(similarity_scores[index]))
	            for index in best_indices
	        ]

	    def save(self):
	        """Persist the index to disk using pickle.

	        Missing parent directories of the index file are created first.
	        """
	        data_packet = {
	            "timestamps": self.timestamps,
	            "vectors": self.embedding_matrix,
	            "metadata": self.metadata_store,
	        }
	        self.file_path.parent.mkdir(parents=True, exist_ok=True)
	        with self.file_path.open("wb") as f:
	            pickle.dump(data_packet, f)

	    def load(self):
	        """Load the index from disk, replacing the in-memory contents.

	        Raises:
	            KeyError: If the file lacks the ``"timestamps"`` or ``"vectors"``
	                entries.
	        """
	        # SECURITY: pickle can execute arbitrary code while loading. Only
	        # point this at index files produced by `save` from a trusted source.
	        with self.file_path.open("rb") as f:
	            data_packet = pickle.load(f)

	        self.timestamps = data_packet["timestamps"]
	        self.embedding_matrix = data_packet["vectors"]

	        metadata = data_packet.get("metadata")
	        if metadata is None:
	            # Files without metadata still need one dict per entry so the
	            # three containers stay aligned.
	            metadata = [{} for _ in self.timestamps]
	        self.metadata_store = metadata