Spaces:

Dinesh310
/

demo2

Sleeping

Dinesh310 committed on 30 days ago

Commit

1ba8003

verified ·

1 Parent(s): 6b98cd9

Update src/document_ingestion/document_processor.py

Files changed (1) hide show

src/document_ingestion/document_processor.py CHANGED Viewed

@@ -30,6 +30,18 @@ class DocumentProcessor:
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap
         )
     def load_from_url(self, url: str) -> List[Document]:
         """Load document(s) from a URL"""
         loader = WebBaseLoader(url)

             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap
         )
+    def process_pdf(self, file_path: str):
+        """Load a PDF from a file path and split into chunks"""
+        try:
+            loader = PyPDFLoader(file_path)
+            # Load and split in one go
+            documents = loader.load_and_split(text_splitter=self.text_splitter)
+            return documents
+        except Exception as e:
+            print(f"Error loading PDF {file_path}: {e}")
+            return []
     def load_from_url(self, url: str) -> List[Document]:
         """Load document(s) from a URL"""
         loader = WebBaseLoader(url)