File size: 911 Bytes
dc6497c f52da2a dc6497c d6afd8b dc6497c d6afd8b dc6497c d6afd8b dc6497c d6afd8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from src.config.config import Config
class DocumentProcessor:
    """Load PDF files, split them into chunks, and index them in FAISS."""

    def __init__(self, embeddings):
        """Store the embeddings object later handed to ``FAISS.from_documents``.

        Args:
            embeddings: Embedding model passed through unchanged to FAISS.
        """
        self.embeddings = embeddings

    def process_pdfs(self, pdf_paths):
        """Build a FAISS vector store from the given PDF files.

        Every PDF is loaded with ``PyPDFLoader``; the loaded documents are
        split with ``RecursiveCharacterTextSplitter`` using
        ``Config.CHUNK_SIZE`` and ``Config.CHUNK_OVERLAP``, and the resulting
        chunks are indexed with ``self.embeddings``.

        Args:
            pdf_paths: Iterable of PDF paths. A single ``str`` is treated as
                one path rather than being iterated character by character.

        Returns:
            The vector store produced by ``FAISS.from_documents``.

        Raises:
            RuntimeError: If any step fails, or if no documents could be
                loaded. The underlying exception is available as
                ``__cause__``.
        """
        # A bare string is iterable; without this guard each character would
        # be handed to PyPDFLoader as if it were a separate path.
        if isinstance(pdf_paths, str):
            pdf_paths = [pdf_paths]

        try:
            documents = []
            for path in pdf_paths:
                documents.extend(PyPDFLoader(path).load())

            # Fail early with a clear message instead of letting an empty
            # document list reach the vector-store constructor.
            if not documents:
                raise ValueError(
                    "no documents were loaded from the provided PDF paths"
                )

            splitter = RecursiveCharacterTextSplitter(
                chunk_size=Config.CHUNK_SIZE,
                chunk_overlap=Config.CHUNK_OVERLAP,
            )
            splits = splitter.split_documents(documents)
            return FAISS.from_documents(splits, self.embeddings)
        except Exception as e:
            # Chain explicitly so the root cause is preserved as __cause__.
            raise RuntimeError(f"Document processing failed: {e}") from e
|