Dinesh310 committed on
Commit
160beae
·
verified ·
1 Parent(s): f52da2a

Update src/ingestion/document_processor.py

Browse files
Files changed (1) hide show
  1. src/ingestion/document_processor.py +16 -13
src/ingestion/document_processor.py CHANGED
@@ -8,19 +8,22 @@ class DocumentProcessor:
8
  self.embeddings = embeddings
9
 
10
  def process_pdfs(self, pdf_paths):
11
- try:
12
- documents = []
13
- for path in pdf_paths:
14
- loader = PyPDFLoader(path)
15
- documents.extend(loader.load())
16
 
17
- splitter = RecursiveCharacterTextSplitter(
18
- chunk_size=Config.CHUNK_SIZE,
19
- chunk_overlap=Config.CHUNK_OVERLAP
20
- )
21
- splits = splitter.split_documents(documents)
22
 
23
- return FAISS.from_documents(splits, self.embeddings)
 
 
24
 
25
- except Exception as e:
26
- raise RuntimeError(f"Document processing failed: {e}")
 
 
 
 
 
 
 
 
8
  self.embeddings = embeddings
9
 
10
  def process_pdfs(self, pdf_paths):
11
+ documents = []
 
 
 
 
12
 
13
+ for path in pdf_paths:
14
+ loader = PyPDFLoader(path)
15
+ docs = loader.load()
 
 
16
 
17
+ # Ensure source filename exists
18
+ for d in docs:
19
+ d.metadata["source"] = path.split("/")[-1]
20
 
21
+ documents.extend(docs)
22
+
23
+ splitter = RecursiveCharacterTextSplitter(
24
+ chunk_size=Config.CHUNK_SIZE,
25
+ chunk_overlap=Config.CHUNK_OVERLAP
26
+ )
27
+
28
+ splits = splitter.split_documents(documents)
29
+ return FAISS.from_documents(splits, self.embeddings)