Dinesh310 committed on
Commit
1ba8003
·
verified ·
1 Parent(s): 6b98cd9

Update src/document_ingestion/document_processor.py

Browse files
src/document_ingestion/document_processor.py CHANGED
@@ -30,6 +30,18 @@ class DocumentProcessor:
30
  chunk_size=chunk_size,
31
  chunk_overlap=chunk_overlap
32
  )
 
 
 
 
 
 
 
 
 
 
 
 
33
  def load_from_url(self, url: str) -> List[Document]:
34
  """Load document(s) from a URL"""
35
  loader = WebBaseLoader(url)
 
30
  chunk_size=chunk_size,
31
  chunk_overlap=chunk_overlap
32
  )
33
+
34
+ def process_pdf(self, file_path: str):
35
+ """Load a PDF from a file path and split into chunks"""
36
+ try:
37
+ loader = PyPDFLoader(file_path)
38
+ # Load and split in one go
39
+ documents = loader.load_and_split(text_splitter=self.text_splitter)
40
+ return documents
41
+ except Exception as e:
42
+ print(f"Error loading PDF {file_path}: {e}")
43
+ return []
44
+
45
  def load_from_url(self, url: str) -> List[Document]:
46
  """Load document(s) from a URL"""
47
  loader = WebBaseLoader(url)