Dinesh310 committed on
Commit
dc6497c
·
verified ·
1 Parent(s): 7479a8f

Create ingestion/document_processor.py

Browse files
Files changed (1) hide show
  1. src/ingestion/document_processor.py +26 -0
src/ingestion/document_processor.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from langchain_community.vectorstores import FAISS
4
+ from config.config import Config
5
+
6
class DocumentProcessor:
    """Build a FAISS vector store from the text of one or more PDF files.

    The embeddings object supplied at construction time is handed to
    ``FAISS.from_documents`` for every store this processor builds.
    """

    def __init__(self, embeddings):
        """Store the embeddings object used when indexing document chunks."""
        self.embeddings = embeddings

    def process_pdfs(self, pdf_paths):
        """Load, chunk, and index the given PDF files.

        Args:
            pdf_paths: An iterable of PDF file paths. A single string is
                treated as one path instead of being iterated per character.

        Returns:
            The vector store returned by ``FAISS.from_documents`` for the
            chunks produced from all loaded documents.

        Raises:
            RuntimeError: If any step fails, including when no paths are
                given or no chunks could be produced. The underlying
                exception is chained as ``__cause__``.
        """
        try:
            # A bare string would otherwise be walked one character at a time.
            if isinstance(pdf_paths, str):
                paths = [pdf_paths]
            else:
                paths = list(pdf_paths)
            if not paths:
                raise ValueError("no PDF paths were provided")

            documents = []
            for path in paths:
                documents.extend(PyPDFLoader(path).load())

            splitter = RecursiveCharacterTextSplitter(
                chunk_size=Config.CHUNK_SIZE,
                chunk_overlap=Config.CHUNK_OVERLAP,
            )
            splits = splitter.split_documents(documents)
            # Fail with a clear message rather than handing FAISS an empty
            # chunk list to index.
            if not splits:
                raise ValueError(
                    "no text could be extracted from the given PDFs"
                )

            return FAISS.from_documents(splits, self.embeddings)
        except Exception as e:
            # Keep the RuntimeError contract callers rely on, but chain the
            # original exception so its traceback is preserved as the cause.
            raise RuntimeError(f"Document processing failed: {e}") from e