Spaces:

veerukhannan
/

advisor

Sleeping

App Files Files Community

veerukhannan commited on Nov 23, 2024

Commit

e55cfd3

verified ·

1 Parent(s): 97a7638

Update test_embeddings.py

Browse files

Files changed (1) hide show

test_embeddings.py +117 -15

test_embeddings.py CHANGED Viewed

@@ -11,26 +11,134 @@ class SentenceTransformerEmbeddings:
         embeddings = self.model.encode(input)
         return embeddings.tolist()
-def test_chromadb_content():
-    """Test if ChromaDB has the required content"""
     try:
-        # Set up ChromaDB path
         base_path = os.path.dirname(os.path.abspath(__file__))
         chroma_path = os.path.join(base_path, 'chroma_db')
-        if not os.path.exists(chroma_path):
-            logger.error(f"ChromaDB directory not found at {chroma_path}")
             return False
         # Initialize ChromaDB
         chroma_client = chromadb.PersistentClient(path=chroma_path)
-        # Check if collection exists
         collections = chroma_client.list_collections()
-        if not any(col.name == "legal_documents" for col in collections):
-            logger.error("Legal documents collection not found in ChromaDB")
             return False
         # Get collection
         collection = chroma_client.get_collection(
             name="legal_documents",
@@ -55,13 +163,7 @@ def test_chromadb_content():
             logger.error("Test query returned no results")
             return False
-        # Print sample content
-        logger.info("Sample content from ChromaDB:")
-        for i, (doc, metadata) in enumerate(zip(test_results['documents'][0], test_results['metadatas'][0])):
-            logger.info(f"\nDocument {i+1}:")
-            logger.info(f"Title: {metadata['title']}")
-            logger.info(f"Content preview: {doc[:200]}...")
         return True
     except Exception as e:

         embeddings = self.model.encode(input)
         return embeddings.tolist()
+def initialize_chromadb():
+    """Initialize ChromaDB and load documents if needed"""
     try:
+        # Set up paths
         base_path = os.path.dirname(os.path.abspath(__file__))
+        doc_path = os.path.join(base_path, 'a2023-45.txt')
+        index_path = os.path.join(base_path, 'index.txt')
         chroma_path = os.path.join(base_path, 'chroma_db')
+        # Check if required files exist
+        if not os.path.exists(doc_path):
+            logger.error(f"Document file not found at {doc_path}")
+            return False
+        if not os.path.exists(index_path):
+            logger.error(f"Index file not found at {index_path}")
             return False
+        # Ensure ChromaDB directory exists
+        os.makedirs(chroma_path, exist_ok=True)
         # Initialize ChromaDB
         chroma_client = chromadb.PersistentClient(path=chroma_path)
+        embedding_function = SentenceTransformerEmbeddings()
+        # Check if collection exists and has content
         collections = chroma_client.list_collections()
+        collection_exists = any(col.name == "legal_documents" for col in collections)
+        if collection_exists:
+            collection = chroma_client.get_collection(
+                name="legal_documents",
+                embedding_function=embedding_function
+            )
+            if collection.count() > 0:
+                logger.info("ChromaDB collection already exists and has content")
+                return True
+        # If we get here, we need to create or repopulate the collection
+        logger.info("Loading documents into ChromaDB...")
+        # Delete existing collection if it exists
+        if collection_exists:
+            chroma_client.delete_collection("legal_documents")
+        # Create new collection
+        collection = chroma_client.create_collection(
+            name="legal_documents",
+            embedding_function=embedding_function
+        )
+        # Read and process documents
+        with open(doc_path, 'r', encoding='utf-8') as f:
+            document = f.read().strip()
+        with open(index_path, 'r', encoding='utf-8') as f:
+            index_content = [line.strip() for line in f.readlines() if line.strip()]
+        # Process document into sections
+        sections = []
+        current_section = ""
+        current_title = ""
+        for line in document.split('\n'):
+            line = line.strip()
+            if any(index_line in line for index_line in index_content):
+                if current_section and current_title:
+                    sections.append({
+                        "title": current_title,
+                        "content": current_section.strip()
+                    })
+                current_title = line
+                current_section = ""
+            else:
+                if line:
+                    current_section += line + "\n"
+        if current_section and current_title:
+            sections.append({
+                "title": current_title,
+                "content": current_section.strip()
+            })
+        # Prepare and add data to ChromaDB
+        if sections:
+            documents = []
+            metadatas = []
+            ids = []
+            for i, section in enumerate(sections):
+                if section["content"].strip():
+                    documents.append(section["content"])
+                    metadatas.append({
+                        "title": section["title"],
+                        "source": "a2023-45.txt",
+                        "section_number": i + 1
+                    })
+                    ids.append(f"section_{i+1}")
+            collection.add(
+                documents=documents,
+                metadatas=metadatas,
+                ids=ids
+            )
+            logger.info(f"Successfully loaded {len(documents)} sections into ChromaDB")
+            return True
+        else:
+            logger.error("No valid sections found in document")
+            return False
+    except Exception as e:
+        logger.error(f"Error initializing ChromaDB: {str(e)}")
+        return False
+def test_chromadb_content():
+    """Test if ChromaDB has the required content"""
+    try:
+        # First ensure ChromaDB is initialized
+        if not initialize_chromadb():
             return False
+        # Set up ChromaDB path
+        base_path = os.path.dirname(os.path.abspath(__file__))
+        chroma_path = os.path.join(base_path, 'chroma_db')
+        # Initialize ChromaDB
+        chroma_client = chromadb.PersistentClient(path=chroma_path)
         # Get collection
         collection = chroma_client.get_collection(
             name="legal_documents",
             logger.error("Test query returned no results")
             return False
+        logger.info("ChromaDB content verification successful")
         return True
     except Exception as e: