Spaces:

MahatirTusher
/

DocuDoodle

Sleeping

App Files Files Community

MahatirTusher committed on Mar 19, 2025

Commit

1d53a6e

verified ·

1 Parent(s): 76685ff

Update chroma_db_utils.py

Browse files

Files changed (1) hide show

chroma_db_utils.py +25 -13

chroma_db_utils.py CHANGED Viewed

@@ -35,7 +35,7 @@ embedding_function = MistralEmbeddingFunction()
 def create_chroma_db(documents: List[str]):
     """
-    Creates a persistent Chroma database using the provided documents.
     """
     # Create a persistent directory for ChromaDB
     persist_directory = "chroma_db"
@@ -46,22 +46,20 @@ def create_chroma_db(documents: List[str]):
         path=persist_directory,
     )
-    # Get or create collection
     try:
-        # Try to get existing collection
-        db = chroma_client.get_collection(
             name="document_collection",
             embedding_function=embedding_function
         )
-        # Clear existing documents
-        db.delete(db.get()["ids"])
     except Exception as e:
-        print(f"Error getting collection: {e}. Creating a new collection...")
-        # Create new collection if it doesn't exist
-        db = chroma_client.create_collection(
-            name="document_collection",
-            embedding_function=embedding_function
-        )
     # Add documents in batches to avoid memory issues
     batch_size = 20
@@ -72,9 +70,11 @@ def create_chroma_db(documents: List[str]):
                 documents=batch,
                 ids=[f"doc_{j}" for j in range(i, i + len(batch))]
             )
         except Exception as e:
             print(f"Error adding batch {i} to ChromaDB: {e}")
     return db
 def get_relevant_passage(query: str, db, n_results: int = 5) -> List[str]:
@@ -108,4 +108,16 @@ def get_relevant_passage(query: str, db, n_results: int = 5) -> List[str]:
         return documents  # Return only valid results
     except Exception as e:
         print(f"Error in get_relevant_passage: {str(e)}")
-        return []

 def create_chroma_db(documents: List[str]):
     """
+    Creates or updates a persistent Chroma database using the provided documents.
     """
     # Create a persistent directory for ChromaDB
     persist_directory = "chroma_db"
         path=persist_directory,
     )
+    # Use get_or_create_collection to avoid UniqueConstraintError
     try:
+        db = chroma_client.get_or_create_collection(
             name="document_collection",
             embedding_function=embedding_function
         )
+        # Clear any existing documents so the collection starts fresh on every call
+        existing_ids = db.get()["ids"]
+        if existing_ids:
+            print(f"Clearing {len(existing_ids)} existing documents from collection...")
+            db.delete(ids=existing_ids)
     except Exception as e:
+        print(f"Error accessing or creating collection: {e}")
+        raise  # Re-raise to halt execution if something goes wrong
     # Add documents in batches to avoid memory issues
     batch_size = 20
                 documents=batch,
                 ids=[f"doc_{j}" for j in range(i, i + len(batch))]
             )
+            print(f"Added batch {i} to {i + len(batch) - 1} successfully.")
         except Exception as e:
             print(f"Error adding batch {i} to ChromaDB: {e}")
+    print(f"ChromaDB collection 'document_collection' created/updated with {db.count()} documents.")
     return db
 def get_relevant_passage(query: str, db, n_results: int = 5) -> List[str]:
         return documents  # Return only valid results
     except Exception as e:
         print(f"Error in get_relevant_passage: {str(e)}")
+        return []
+# Example usage (runs only when this file is executed directly)
+if __name__ == "__main__":
+    sample_docs = [
+        "The quick brown fox jumps over the lazy dog.",
+        "Artificial intelligence is transforming the world.",
+        "ChromaDB is a vector database for embeddings."
+    ]
+    db = create_chroma_db(sample_docs)
+    query = "What is AI doing to the world?"
+    passages = get_relevant_passage(query, db)
+    print("Retrieved passages:", passages)