final_project2

Sleeping

App Files Community

dnj0 committed on Nov 20, 2025

Commit

e4ac86d

verified ·

1 Parent(s): cffffd0

Update src/vector_store.py

Browse files

Files changed (1) hide show

src/vector_store.py +3 -30

src/vector_store.py CHANGED Viewed

@@ -1,7 +1,4 @@
-"""
-Vector Store and Embeddings Module using ChromaDB with sentence-transformers
-UPDATED for ChromaDB v0.4.22+ (auto-persist, no manual persist needed)
-"""
 import os
 import json
 from typing import List, Dict
@@ -12,14 +9,12 @@ from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
 class CLIPEmbedder:
-    """Custom embedder using sentence-transformers for multimodal content"""
     def __init__(self, model_name: str = EMBEDDING_MODEL):
         print(f"🔄 Loading embedding model: {model_name}")
         self.model = SentenceTransformer(model_name)
         print(f"✅ Model loaded successfully")
     def embed(self, text: str) -> List[float]:
-        """Generate embedding for text"""
         try:
             embedding = self.model.encode(text, convert_to_numpy=False)
             return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
@@ -28,7 +23,6 @@ class CLIPEmbedder:
             return [0.0] * EMBEDDING_DIM
     def embed_batch(self, texts: List[str]) -> List[List[float]]:
-        """Generate embeddings for batch of texts"""
         try:
             embeddings = self.model.encode(texts, convert_to_numpy=False)
             return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
@@ -38,14 +32,12 @@ class CLIPEmbedder:
 class VectorStore:
-    """Vector store manager using ChromaDB (v0.4.22+ with auto-persist)"""
     def __init__(self):
         self.persist_directory = CHROMA_DB_PATH
         self.embedder = CLIPEmbedder()
         print(f"\n🔄 Initializing ChromaDB at: {self.persist_directory}")
-        # NEW ChromaDB v0.4.22+ - PersistentClient auto-persists
         try:
             self.client = chromadb.PersistentClient(
                 path=self.persist_directory
@@ -58,7 +50,6 @@ class VectorStore:
                 path=self.persist_directory
             )
-        # Get or create collection
         try:
             self.collection = self.client.get_or_create_collection(
                 name="multimodal_rag",
@@ -73,14 +64,12 @@ class VectorStore:
             )
     def add_documents(self, documents: List[Dict], doc_id: str):
-        """Add documents to vector store"""
         texts = []
         metadatas = []
         ids = []
         print(f"\n📚 Adding documents for: {doc_id}")
-        # Add text chunks
         if 'text' in documents and documents['text']:
             chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
             for idx, chunk in enumerate(chunks):
@@ -93,7 +82,6 @@ class VectorStore:
                 ids.append(f"{doc_id}_text_{idx}")
             print(f"  ✅ Text: {len(chunks)} chunks")
-        # Add image descriptions and OCR text
         if 'images' in documents:
             image_count = 0
             for idx, image_data in enumerate(documents['images']):
@@ -110,7 +98,6 @@ class VectorStore:
             if image_count > 0:
                 print(f"  ✅ Images: {image_count} with OCR text")
-        # Add table content
         if 'tables' in documents:
             table_count = 0
             for idx, table_data in enumerate(documents['tables']):
@@ -127,11 +114,9 @@ class VectorStore:
                 print(f"  ✅ Tables: {table_count}")
         if texts:
-            # Generate embeddings
             print(f"  🔄 Generating {len(texts)} embeddings...")
             embeddings = self.embedder.embed_batch(texts)
-            # Add to collection
             try:
                 self.collection.add(
                     ids=ids,
@@ -140,13 +125,11 @@ class VectorStore:
                     metadatas=metadatas
                 )
                 print(f"✅ Successfully added {len(texts)} items to vector store")
-                # Auto-persist happens here
                 print(f"✅ Data persisted automatically to: {self.persist_directory}")
             except Exception as e:
                 print(f"❌ Error adding to collection: {e}")
     def search(self, query: str, n_results: int = 5) -> List[Dict]:
-        """Search vector store for similar documents"""
         try:
             query_embedding = self.embedder.embed(query)
@@ -155,7 +138,6 @@ class VectorStore:
                 n_results=n_results
             )
-            # Format results
             formatted_results = []
             if results['documents']:
                 for i, doc in enumerate(results['documents'][0]):
@@ -175,7 +157,6 @@ class VectorStore:
             return []
     def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
-        """Split text into chunks with overlap"""
         chunks = []
         start = 0
         while start < len(text):
@@ -185,7 +166,6 @@ class VectorStore:
         return chunks
     def get_collection_info(self) -> Dict:
-        """Get information about the collection"""
         try:
             count = self.collection.count()
             return {
@@ -199,7 +179,6 @@ class VectorStore:
             return {'status': 'error', 'message': str(e)}
     def delete_by_doc_id(self, doc_id: str):
-        """Delete all documents related to a specific doc_id"""
         try:
             # Get all IDs with this doc_id
             results = self.collection.get(where={'doc_id': doc_id})
@@ -212,17 +191,11 @@ class VectorStore:
             print(f"Error deleting documents: {e}")
     def persist(self):
-        """
-        No-op for compatibility with older code.
-        ChromaDB v0.4.22+ uses PersistentClient which auto-persists.
-        This method kept for backward compatibility.
-        """
-        print("✅ Vector store is using auto-persist (no manual persist needed)")
     def clear_all(self):
-        """Clear all documents from collection"""
         try:
-            # Delete collection and recreate
             self.client.delete_collection(name="multimodal_rag")
             self.collection = self.client.get_or_create_collection(
                 name="multimodal_rag",

 import os
 import json
 from typing import List, Dict
 class CLIPEmbedder:
     def __init__(self, model_name: str = EMBEDDING_MODEL):
         print(f"🔄 Loading embedding model: {model_name}")
         self.model = SentenceTransformer(model_name)
         print(f"✅ Model loaded successfully")
     def embed(self, text: str) -> List[float]:
         try:
             embedding = self.model.encode(text, convert_to_numpy=False)
             return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
             return [0.0] * EMBEDDING_DIM
     def embed_batch(self, texts: List[str]) -> List[List[float]]:
         try:
             embeddings = self.model.encode(texts, convert_to_numpy=False)
             return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
 class VectorStore:
     def __init__(self):
         self.persist_directory = CHROMA_DB_PATH
         self.embedder = CLIPEmbedder()
         print(f"\n🔄 Initializing ChromaDB at: {self.persist_directory}")
         try:
             self.client = chromadb.PersistentClient(
                 path=self.persist_directory
                 path=self.persist_directory
             )
         try:
             self.collection = self.client.get_or_create_collection(
                 name="multimodal_rag",
             )
     def add_documents(self, documents: List[Dict], doc_id: str):
         texts = []
         metadatas = []
         ids = []
         print(f"\n📚 Adding documents for: {doc_id}")
         if 'text' in documents and documents['text']:
             chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
             for idx, chunk in enumerate(chunks):
                 ids.append(f"{doc_id}_text_{idx}")
             print(f"  ✅ Text: {len(chunks)} chunks")
         if 'images' in documents:
             image_count = 0
             for idx, image_data in enumerate(documents['images']):
             if image_count > 0:
                 print(f"  ✅ Images: {image_count} with OCR text")
         if 'tables' in documents:
             table_count = 0
             for idx, table_data in enumerate(documents['tables']):
                 print(f"  ✅ Tables: {table_count}")
         if texts:
             print(f"  🔄 Generating {len(texts)} embeddings...")
             embeddings = self.embedder.embed_batch(texts)
             try:
                 self.collection.add(
                     ids=ids,
                     metadatas=metadatas
                 )
                 print(f"✅ Successfully added {len(texts)} items to vector store")
                 print(f"✅ Data persisted automatically to: {self.persist_directory}")
             except Exception as e:
                 print(f"❌ Error adding to collection: {e}")
     def search(self, query: str, n_results: int = 5) -> List[Dict]:
         try:
             query_embedding = self.embedder.embed(query)
                 n_results=n_results
             )
             formatted_results = []
             if results['documents']:
                 for i, doc in enumerate(results['documents'][0]):
             return []
     def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
         chunks = []
         start = 0
         while start < len(text):
         return chunks
     def get_collection_info(self) -> Dict:
         try:
             count = self.collection.count()
             return {
             return {'status': 'error', 'message': str(e)}
     def delete_by_doc_id(self, doc_id: str):
         try:
             # Get all IDs with this doc_id
             results = self.collection.get(where={'doc_id': doc_id})
             print(f"Error deleting documents: {e}")
     def persist(self):
+        print("✅ Vector store is using auto-persist")
     def clear_all(self):
         try:
             self.client.delete_collection(name="multimodal_rag")
             self.collection = self.client.get_or_create_collection(
                 name="multimodal_rag",