# scrapeRL/backend/app/memory/long_term.py
# Commit bfe0e24 (NeerajCodz): fix: replace deprecated datetime.utcnow
# with timezone-aware datetimes.
"""Long-term memory with persistent vector storage using ChromaDB."""
from __future__ import annotations
import asyncio
import hashlib
import logging
from datetime import datetime, timezone
from typing import Any
from uuid import uuid4
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
class Document(BaseModel):
    """A document stored in long-term memory.

    Attributes:
        id: Unique identifier; a random UUID4 string unless supplied.
        content: Raw text content of the document.
        embedding: Optional pre-computed embedding vector.
        metadata: Arbitrary metadata attached to the document.
        created_at: Timezone-aware UTC creation timestamp.
        updated_at: Timezone-aware UTC last-update timestamp.
    """

    id: str = Field(default_factory=lambda: str(uuid4()))
    content: str
    embedding: list[float] | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)
    # datetime.utcnow is deprecated (Python 3.12+) and returns naive
    # datetimes; use aware UTC timestamps, consistent with the
    # datetime.now(timezone.utc) usage in LongTermMemory.store().
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    model_config = {"arbitrary_types_allowed": True}
class SearchResult(BaseModel):
    """A search result from long-term memory.

    Pairs a retrieved document with its relevance score and, when the
    backend reports one, the raw vector distance.
    """
    # The matched document (may be reconstructed from backend fields).
    document: Document
    # Similarity score; LongTermMemory.search() computes 1 - distance
    # (cosine), defaulting to 1.0 when no distance is available.
    score: float
    # Raw distance from the vector store; None for the in-memory fallback
    # or when the backend omits distances.
    distance: float | None = None
    model_config = {"arbitrary_types_allowed": True}
class LongTermMemory:
    """
    Long-term persistent memory using ChromaDB for vector storage.

    This memory layer provides semantic search capabilities using embeddings.
    It persists across episodes and sessions, storing knowledge that should
    be retained long-term. If ChromaDB is not installed or fails to
    initialize, a non-persistent in-memory dict with substring matching is
    used as a degraded fallback.

    Attributes:
        collection_name: Name of the ChromaDB collection.
        persist_directory: Directory for persistent storage.
        top_k: Default number of results to return from search.
    """
    def __init__(
        self,
        collection_name: str = "scraperl_memory",
        persist_directory: str = "./data/chroma",
        top_k: int = 10,
        embedding_function: Any | None = None,
    ) -> None:
        """
        Initialize long-term memory.

        Args:
            collection_name: Name of the ChromaDB collection.
            persist_directory: Directory for persistent storage.
            top_k: Default number of results to return from search.
            embedding_function: Optional custom embedding function, passed
                through to ChromaDB's collection.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.top_k = top_k
        self._embedding_function = embedding_function
        self._client: Any = None
        self._collection: Any = None
        self._initialized = False
        # Serializes all store/search/delete operations: the ChromaDB client
        # is synchronous and not guaranteed safe for concurrent use.
        self._lock = asyncio.Lock()
    async def initialize(self) -> None:
        """
        Initialize ChromaDB client and collection.

        This should be called before using other methods (other methods also
        call it lazily). Safe to call multiple times; subsequent calls are
        no-ops. Falls back to an in-memory store when ChromaDB is
        unavailable or raises during setup.
        """
        if self._initialized:
            return
        async with self._lock:
            # Re-check under the lock: another task may have completed
            # initialization while we were waiting.
            if self._initialized:
                return
            try:
                import chromadb
                from chromadb.config import Settings
                # Create persistent client.
                if hasattr(chromadb, "PersistentClient"):
                    # chromadb >= 0.4 removed the Settings-based
                    # "chroma_db_impl" configuration (passing it raises, which
                    # previously forced the in-memory fallback and silently
                    # lost persistence). PersistentClient is the supported
                    # on-disk client on modern versions.
                    self._client = chromadb.PersistentClient(
                        path=self.persist_directory,
                        settings=Settings(anonymized_telemetry=False),
                    )
                else:
                    # Legacy (pre-0.4) client configuration.
                    self._client = chromadb.Client(
                        Settings(
                            chroma_db_impl="duckdb+parquet",
                            persist_directory=self.persist_directory,
                            anonymized_telemetry=False,
                        )
                    )
                # Get or create collection; cosine space so search() can map
                # distance -> similarity as 1 - distance.
                self._collection = self._client.get_or_create_collection(
                    name=self.collection_name,
                    embedding_function=self._embedding_function,
                    metadata={"hnsw:space": "cosine"},
                )
                self._initialized = True
                logger.info(
                    f"Initialized long-term memory: collection={self.collection_name}"
                )
            except ImportError:
                logger.warning(
                    "ChromaDB not available. Long-term memory will use in-memory fallback."
                )
                self._use_fallback()
            except Exception as e:
                logger.warning(
                    f"Failed to initialize ChromaDB: {e}. Using in-memory fallback."
                )
                self._use_fallback()
    def _use_fallback(self) -> None:
        """Use in-memory fallback when ChromaDB is unavailable."""
        self._client = None
        self._collection = None
        # Maps document ID -> Document; lost on process exit.
        self._fallback_store: dict[str, Document] = {}
        self._initialized = True
    @property
    def is_initialized(self) -> bool:
        """Check if memory is initialized."""
        return self._initialized
    @property
    def _using_fallback(self) -> bool:
        """Check if using in-memory fallback (no ChromaDB collection)."""
        return self._collection is None
    def _generate_id(self, content: str) -> str:
        """Generate a deterministic ID from content (truncated SHA-256)."""
        return hashlib.sha256(content.encode()).hexdigest()[:16]
    async def store(
        self,
        content: str,
        document_id: str | None = None,
        metadata: dict[str, Any] | None = None,
        embedding: list[float] | None = None,
    ) -> Document:
        """
        Store a document in long-term memory.

        Storing with an existing ID upserts (replaces) the prior document.

        Args:
            content: Text content to store.
            document_id: Optional custom ID. Generated from content if not provided.
            metadata: Optional metadata dictionary.
            embedding: Optional pre-computed embedding vector.
        Returns:
            The stored document.
        Raises:
            Exception: Propagates storage errors from the ChromaDB backend.
        """
        if not self._initialized:
            await self.initialize()
        async with self._lock:
            doc_id = document_id or self._generate_id(content)
            now = datetime.now(timezone.utc)
            document = Document(
                id=doc_id,
                content=content,
                embedding=embedding,
                metadata=metadata or {},
                created_at=now,
                updated_at=now,
            )
            if self._using_fallback:
                self._fallback_store[doc_id] = document
            else:
                # Store in ChromaDB; timestamps are flattened into metadata
                # because Chroma metadata values must be primitives.
                try:
                    self._collection.upsert(
                        ids=[doc_id],
                        documents=[content],
                        metadatas=[
                            {
                                **document.metadata,
                                "created_at": now.isoformat(),
                                "updated_at": now.isoformat(),
                            }
                        ],
                        embeddings=[embedding] if embedding else None,
                    )
                except Exception as e:
                    logger.error(f"Failed to store document: {e}")
                    raise
            return document
    async def search(
        self,
        query: str,
        top_k: int | None = None,
        where: dict[str, Any] | None = None,
        query_embedding: list[float] | None = None,
    ) -> list[SearchResult]:
        """
        Search for similar documents using semantic search.

        The fallback store only supports case-insensitive substring matching
        (score fixed at 1.0) and ignores `where` and `query_embedding`.

        Args:
            query: Search query text.
            top_k: Number of results to return. Uses default if not specified.
            where: Optional metadata filter.
            query_embedding: Optional pre-computed query embedding.
        Returns:
            List of search results with scores; empty on backend errors.
        """
        if not self._initialized:
            await self.initialize()
        k = top_k or self.top_k
        async with self._lock:
            if self._using_fallback:
                # Simple substring matching for fallback
                results = []
                query_lower = query.lower()
                for doc in self._fallback_store.values():
                    if query_lower in doc.content.lower():
                        results.append(
                            SearchResult(document=doc, score=1.0, distance=0.0)
                        )
                return results[:k]
            try:
                # Query ChromaDB; prefer a pre-computed embedding when given
                # so Chroma does not have to embed the query text itself.
                query_params: dict[str, Any] = {
                    "n_results": k,
                }
                if query_embedding:
                    query_params["query_embeddings"] = [query_embedding]
                else:
                    query_params["query_texts"] = [query]
                if where:
                    query_params["where"] = where
                results = self._collection.query(**query_params)
                # Parse results: Chroma returns parallel lists, one entry
                # per query; we only ever issue a single query, hence [0].
                search_results = []
                if results and results.get("ids"):
                    for i, doc_id in enumerate(results["ids"][0]):
                        content = (
                            results["documents"][0][i]
                            if results.get("documents")
                            else ""
                        )
                        metadata = (
                            results["metadatas"][0][i]
                            if results.get("metadatas")
                            else {}
                        )
                        distance = (
                            results["distances"][0][i]
                            if results.get("distances")
                            else None
                        )
                        doc = Document(
                            id=doc_id,
                            content=content,
                            metadata=metadata,
                        )
                        # Convert cosine distance to a similarity-style score.
                        score = 1 - distance if distance is not None else 1.0
                        search_results.append(
                            SearchResult(
                                document=doc,
                                score=score,
                                distance=distance,
                            )
                        )
                return search_results
            except Exception as e:
                logger.error(f"Search failed: {e}")
                return []
    async def get(self, document_id: str) -> Document | None:
        """
        Retrieve a document by ID.

        Args:
            document_id: The document ID to retrieve.
        Returns:
            The document, or None if not found or on backend error. Documents
            fetched from ChromaDB are reconstructed without embeddings and
            with fresh (non-original) timestamps.
        """
        if not self._initialized:
            await self.initialize()
        async with self._lock:
            if self._using_fallback:
                return self._fallback_store.get(document_id)
            try:
                result = self._collection.get(ids=[document_id])
                if result and result["ids"]:
                    return Document(
                        id=result["ids"][0],
                        content=result["documents"][0] if result.get("documents") else "",
                        metadata=result["metadatas"][0] if result.get("metadatas") else {},
                    )
                return None
            except Exception as e:
                logger.error(f"Failed to get document: {e}")
                return None
    async def delete(self, document_id: str) -> bool:
        """
        Delete a document from long-term memory.

        Args:
            document_id: The document ID to delete.
        Returns:
            True if the delete call succeeded, False otherwise. Note that
            ChromaDB deletes are idempotent, so True does not guarantee the
            document previously existed.
        """
        if not self._initialized:
            await self.initialize()
        async with self._lock:
            if self._using_fallback:
                if document_id in self._fallback_store:
                    del self._fallback_store[document_id]
                    return True
                return False
            try:
                self._collection.delete(ids=[document_id])
                return True
            except Exception as e:
                logger.error(f"Failed to delete document: {e}")
                return False
    async def delete_where(self, where: dict[str, Any]) -> int:
        """
        Delete documents matching a metadata filter.

        The fallback store interprets `where` as an exact-equality AND over
        metadata keys; ChromaDB uses its native filter syntax.

        Args:
            where: Metadata filter for documents to delete.
        Returns:
            Number of documents deleted (0 on backend error).
        """
        if not self._initialized:
            await self.initialize()
        async with self._lock:
            if self._using_fallback:
                # Collect first, then delete: never mutate a dict while
                # iterating it.
                to_delete = []
                for doc_id, doc in self._fallback_store.items():
                    if all(doc.metadata.get(k) == v for k, v in where.items()):
                        to_delete.append(doc_id)
                for doc_id in to_delete:
                    del self._fallback_store[doc_id]
                return len(to_delete)
            try:
                # Get matching IDs first so we can report an accurate count.
                result = self._collection.get(where=where)
                if result and result["ids"]:
                    self._collection.delete(ids=result["ids"])
                    return len(result["ids"])
                return 0
            except Exception as e:
                logger.error(f"Failed to delete documents: {e}")
                return 0
    async def count(self) -> int:
        """
        Get the total number of documents stored.

        Returns:
            Document count (0 on backend error).
        """
        if not self._initialized:
            await self.initialize()
        async with self._lock:
            if self._using_fallback:
                return len(self._fallback_store)
            try:
                return self._collection.count()
            except Exception as e:
                logger.error(f"Failed to count documents: {e}")
                return 0
    async def clear(self) -> int:
        """
        Clear all documents from memory.

        For ChromaDB this drops and recreates the collection, which is far
        faster than deleting documents one by one.

        Returns:
            Number of documents that were cleared (0 on backend error).
        """
        if not self._initialized:
            await self.initialize()
        async with self._lock:
            if self._using_fallback:
                count = len(self._fallback_store)
                self._fallback_store.clear()
                return count
            try:
                count = self._collection.count()
                # Delete and recreate collection
                self._client.delete_collection(self.collection_name)
                self._collection = self._client.create_collection(
                    name=self.collection_name,
                    embedding_function=self._embedding_function,
                    metadata={"hnsw:space": "cosine"},
                )
                return count
            except Exception as e:
                # NOTE(review): if delete_collection succeeded but
                # create_collection failed, self._collection is stale;
                # a subsequent call will likely error and log again.
                logger.error(f"Failed to clear memory: {e}")
                return 0
    async def persist(self) -> None:
        """Persist changes to disk (no-op on clients without a persist()
        method; chromadb >= 0.4 PersistentClient persists automatically)."""
        if self._client and hasattr(self._client, "persist"):
            try:
                self._client.persist()
            except Exception as e:
                logger.error(f"Failed to persist memory: {e}")
    async def shutdown(self) -> None:
        """Shutdown long-term memory and persist data."""
        if self._initialized and not self._using_fallback:
            await self.persist()
        self._initialized = False
        logger.info("Long-term memory shutdown complete")
    async def get_stats(self) -> dict[str, Any]:
        """
        Get statistics about long-term memory.

        Returns:
            Dictionary with memory statistics (init state, backend mode,
            collection name/path, document count, default top_k).
        """
        count = await self.count()
        return {
            "initialized": self._initialized,
            "using_fallback": self._using_fallback,
            "collection_name": self.collection_name,
            "persist_directory": self.persist_directory,
            "document_count": count,
            "top_k": self.top_k,
        }