File size: 911 Bytes
dc6497c
 
 
f52da2a
dc6497c
 
 
 
 
 
d6afd8b
 
 
 
 
dc6497c
d6afd8b
 
 
 
 
dc6497c
d6afd8b
dc6497c
d6afd8b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from src.config.config import Config

class DocumentProcessor:
    """Build a FAISS vector store from the text of PDF files.

    The embeddings object supplied at construction is passed to
    ``FAISS.from_documents`` when the index is built.
    """

    def __init__(self, embeddings):
        """Store the embeddings object used when indexing document chunks."""
        self.embeddings = embeddings

    def process_pdfs(self, pdf_paths):
        """Load, split, and index the given PDF files.

        Each path is loaded with ``PyPDFLoader``; the loaded documents are
        split with ``RecursiveCharacterTextSplitter`` using
        ``Config.CHUNK_SIZE`` and ``Config.CHUNK_OVERLAP``; the resulting
        chunks are indexed with ``FAISS.from_documents``.

        Args:
            pdf_paths: Iterable of PDF file paths. A single ``str`` or
                ``os.PathLike`` value is treated as a one-element list
                instead of being iterated character by character.

        Returns:
            The vector store returned by ``FAISS.from_documents``.

        Raises:
            RuntimeError: If no paths are provided, if no chunks are
                produced, or if loading, splitting, or indexing fails. The
                underlying exception is attached as ``__cause__``.
        """
        # Function-scope import keeps this block self-contained without
        # touching the file's top-level import section.
        import os

        try:
            if isinstance(pdf_paths, (str, os.PathLike)):
                pdf_paths = [pdf_paths]

            # Fail early with a clear message rather than letting an empty
            # input reach the vector store constructor.
            if not pdf_paths:
                raise ValueError("no PDF paths were provided")

            documents = []
            for path in pdf_paths:
                loader = PyPDFLoader(path)
                documents.extend(loader.load())

            splitter = RecursiveCharacterTextSplitter(
                chunk_size=Config.CHUNK_SIZE,
                chunk_overlap=Config.CHUNK_OVERLAP,
            )
            splits = splitter.split_documents(documents)

            # Covers exhausted/empty iterables and PDFs with no usable text.
            if not splits:
                raise ValueError(
                    "no text chunks could be produced from the provided PDFs"
                )

            return FAISS.from_documents(splits, self.embeddings)

        except Exception as e:
            # Boundary wrapper: callers only need to handle RuntimeError,
            # while "from e" keeps the original traceback as the cause.
            raise RuntimeError(f"Document processing failed: {e}") from e