Dinesh310 committed on
Commit
a76c973
·
verified ·
1 Parent(s): 6ff38d9

Update src/document_ingestion/document_processor.py

Browse files
src/document_ingestion/document_processor.py CHANGED
@@ -31,16 +31,32 @@ class DocumentProcessor:
31
  chunk_overlap=chunk_overlap
32
  )
33
 
34
- def process_pdf(self, file_path: str):
35
- """Load a PDF from a file path and split into chunks"""
36
- try:
37
- loader = PyPDFLoader(file_path)
38
- # Load and split in one go
39
- documents = loader.load_and_split(text_splitter=self.text_splitter)
40
- return documents
41
- except Exception as e:
42
- print(f"Error loading PDF {file_path}: {e}")
43
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  def load_from_url(self, url: str) -> List[Document]:
46
  """Load document(s) from a URL"""
 
31
  chunk_overlap=chunk_overlap
32
  )
33
 
34
+ def process_pdfs(self, file_paths: List[str]):
35
+ """Load multiple PDFs and return a combined list of chunks"""
36
+ all_documents = []
37
+
38
+ for path in file_paths:
39
+ try:
40
+ loader = PyPDFLoader(path)
41
+ # This splits the specific PDF into chunks
42
+ chunks = loader.load_and_split(text_splitter=self.text_splitter)
43
+ # We add those chunks to our master list
44
+ all_documents.extend(chunks)
45
+ except Exception as e:
46
+ print(f"Error loading PDF {path}: {e}")
47
+
48
+ return all_documents
49
+
50
+ # def process_pdf(self, file_path: str):
51
+ # """Load a PDF from a file path and split into chunks"""
52
+ # try:
53
+ # loader = PyPDFLoader(file_path)
54
+ # # Load and split in one go
55
+ # documents = loader.load_and_split(text_splitter=self.text_splitter)
56
+ # return documents
57
+ # except Exception as e:
58
+ # print(f"Error loading PDF {file_path}: {e}")
59
+ # return []
60
 
61
  def load_from_url(self, url: str) -> List[Document]:
62
  """Load document(s) from a URL"""