Spaces:

Dinesh310
/

demo2

Sleeping

Dinesh310 committed on Jan 24

Commit

a76c973

verified ·

1 Parent(s): 6ff38d9

Update src/document_ingestion/document_processor.py

Files changed (1) hide show

src/document_ingestion/document_processor.py CHANGED Viewed

@@ -31,16 +31,32 @@ class DocumentProcessor:
             chunk_overlap=chunk_overlap
         )
-    def process_pdf(self, file_path: str):
-        """Load a PDF from a file path and split into chunks"""
-        try:
-            loader = PyPDFLoader(file_path)
-            # Load and split in one go
-            documents = loader.load_and_split(text_splitter=self.text_splitter)
-            return documents
-        except Exception as e:
-            print(f"Error loading PDF {file_path}: {e}")
-            return []
     def load_from_url(self, url: str) -> List[Document]:
         """Load document(s) from a URL"""

             chunk_overlap=chunk_overlap
         )
+    def process_pdfs(self, file_paths: List[str]):
+        """Load multiple PDFs and return a combined list of chunks.
+
+        Each path is opened with ``PyPDFLoader`` and split using
+        ``self.text_splitter``; the resulting chunks are collected in the
+        order the paths are given.
+
+        Args:
+            file_paths: Paths of the PDF files to load.
+
+        Returns:
+            A single list holding the chunks of every PDF that loaded
+            without raising; an empty list if none did.
+
+        Note:
+            An exception raised while handling one path is printed and that
+            path is skipped; it is not re-raised to the caller.
+        """
+        all_documents = []
+        for path in file_paths:
+            try:
+                loader = PyPDFLoader(path)
+                # Load this PDF and split it into chunks with the shared splitter
+                chunks = loader.load_and_split(text_splitter=self.text_splitter)
+                # Accumulate this PDF's chunks into the combined result
+                all_documents.extend(chunks)
+            except Exception as e:
+                # The try sits inside the loop, so one bad file does not stop
+                # the remaining paths from being processed
+                print(f"Error loading PDF {path}: {e}")
+        return all_documents
+    # def process_pdf(self, file_path: str):
+    #     """Load a PDF from a file path and split into chunks"""
+    #     try:
+    #         loader = PyPDFLoader(file_path)
+    #         # Load and split in one go
+    #         documents = loader.load_and_split(text_splitter=self.text_splitter)
+    #         return documents
+    #     except Exception as e:
+    #         print(f"Error loading PDF {file_path}: {e}")
+    #         return []
     def load_from_url(self, url: str) -> List[Document]:
         """Load document(s) from a URL"""