Spaces:

NEXAS
/

docling_rag

Sleeping

App Files Community

NEXAS committed on Mar 2, 2025

Commit

7a013c2

verified ·

1 Parent(s): a8e2b6e

Update utils/ingestion.py

Browse files

Files changed (1) hide show

utils/ingestion.py +26 -17

utils/ingestion.py CHANGED Viewed

@@ -23,7 +23,7 @@ class DocumentProcessor:
         """Initialize document processor with necessary components"""
         self.setup_document_converter()
         self.embed_model = FastEmbedEmbeddings()
-        self.client = chromadb.PersistentClient(path="chroma_db")  # Fixed storage
     def setup_document_converter(self):
         """Configure document converter with advanced processing capabilities"""
@@ -33,9 +33,17 @@ class DocumentProcessor:
         pipeline_options.table_structure_options.do_cell_matching = True
         pipeline_options.ocr_options.lang = ["en"]
         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
-        pipeline_options.accelerator_options = AcceleratorOptions(
-            num_threads=8, device=AcceleratorDevice.MPS
-        )
         self.converter = DocumentConverter(
             format_options={
@@ -49,7 +57,7 @@ class DocumentProcessor:
     def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
         """Extract essential metadata from a chunk"""
         metadata = {
-            "text": chunk.text,
             "headings": [],
             "page_info": None,
             "content_type": None
@@ -73,7 +81,7 @@ class DocumentProcessor:
     def process_document(self, pdf_path: str):
         """Process document and create searchable index with metadata"""
-        print(f"Processing document: {pdf_path}")
         start_time = time.time()
         result = self.converter.convert(pdf_path)
@@ -87,7 +95,7 @@ class DocumentProcessor:
             metadata = self.extract_chunk_metadata(chunk)
             processed_chunks.append(metadata)
-        print("\nCreating vector database...")
         collection = self.client.get_or_create_collection(name="document_chunks")
         documents = []
@@ -98,10 +106,10 @@ class DocumentProcessor:
         for idx, chunk in enumerate(processed_chunks):
             text = chunk.get('text', '').strip()
             if not text:
-                print(f"Skipping empty chunk at index {idx}")
                 continue  # Skip empty chunks
-            embedding = self.embed_model.embed_documents([text])[0]  # ✅ Correct method
             documents.append(text)
             embeddings.append(embedding)
             metadata_list.append({
@@ -111,14 +119,15 @@ class DocumentProcessor:
             })
             ids.append(str(idx))
-        collection.add(
-            ids=ids,
-            embeddings=embeddings,
-            documents=documents,
-            metadatas=metadata_list
-        )
         processing_time = time.time() - start_time
-        print(f"\nDocument processing completed in {processing_time:.2f} seconds")
         return collection

         """Initialize document processor with necessary components"""
         self.setup_document_converter()
         self.embed_model = FastEmbedEmbeddings()
+        self.client = chromadb.PersistentClient(path="chroma_db")  # Persistent Storage
     def setup_document_converter(self):
         """Configure document converter with advanced processing capabilities"""
         pipeline_options.table_structure_options.do_cell_matching = True
         pipeline_options.ocr_options.lang = ["en"]
         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
+        # ✅ Automatically handle CPU fallback
+        try:
+            pipeline_options.accelerator_options = AcceleratorOptions(
+                num_threads=8, device=AcceleratorDevice.MPS
+            )
+        except Exception as e:
+            print("⚠️ MPS is not available. Falling back to CPU.")
+            pipeline_options.accelerator_options = AcceleratorOptions(
+                num_threads=8, device=AcceleratorDevice.CPU
+            )
         self.converter = DocumentConverter(
             format_options={
     def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
         """Extract essential metadata from a chunk"""
         metadata = {
+            "text": chunk.text.strip(),
             "headings": [],
             "page_info": None,
             "content_type": None
     def process_document(self, pdf_path: str):
         """Process document and create searchable index with metadata"""
+        print(f"📄 Processing document: {pdf_path}")
         start_time = time.time()
         result = self.converter.convert(pdf_path)
             metadata = self.extract_chunk_metadata(chunk)
             processed_chunks.append(metadata)
+        print("✅ Chunking completed. Creating vector database...")
         collection = self.client.get_or_create_collection(name="document_chunks")
         documents = []
         for idx, chunk in enumerate(processed_chunks):
             text = chunk.get('text', '').strip()
             if not text:
+                print(f"⚠️ Skipping empty chunk at index {idx}")
                 continue  # Skip empty chunks
+            embedding = self.embed_model.embed_documents([text])[0]  # ✅ Corrected method
             documents.append(text)
             embeddings.append(embedding)
             metadata_list.append({
             })
             ids.append(str(idx))
+        if documents:
+            collection.add(
+                ids=ids,
+                embeddings=embeddings,
+                documents=documents,
+                metadatas=metadata_list
+            )
+            print(f"✅ Successfully added {len(documents)} chunks to the database.")
         processing_time = time.time() - start_time
+        print(f"✅ Document processing completed in {processing_time:.2f} seconds")
         return collection