Demo_1 / src /ingestion /document_processor.py
Dinesh310's picture
Update src/ingestion/document_processor.py
d6afd8b verified
raw
history blame contribute delete
911 Bytes
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from src.config.config import Config
class DocumentProcessor:
    """Load PDF files, split them into chunks, and index the chunks in FAISS."""

    def __init__(self, embeddings):
        """Store the embeddings object later handed to ``FAISS.from_documents``.

        Args:
            embeddings: Embedding model passed through unchanged to the
                vector store when an index is built.
        """
        self.embeddings = embeddings

    def process_pdfs(self, pdf_paths):
        """Build a FAISS vector store from the given PDF files.

        Each path is loaded with ``PyPDFLoader``; the combined documents are
        split with ``RecursiveCharacterTextSplitter`` using
        ``Config.CHUNK_SIZE`` and ``Config.CHUNK_OVERLAP``, and the resulting
        chunks are indexed with ``FAISS.from_documents``.

        Args:
            pdf_paths: Iterable of PDF file paths. A single string is
                treated as one path rather than iterated per character.

        Returns:
            The object returned by ``FAISS.from_documents``.

        Raises:
            RuntimeError: If no paths are given, no chunks are produced, or
                any loading, splitting, or indexing step fails. The
                underlying exception, when there is one, is chained as
                ``__cause__``.
        """
        try:
            # A lone string would otherwise be walked character by character.
            paths = [pdf_paths] if isinstance(pdf_paths, str) else list(pdf_paths)
        except TypeError as e:
            # Non-iterable input (e.g. None): keep the RuntimeError contract.
            raise RuntimeError(f"Document processing failed: {e}") from e

        if not paths:
            raise RuntimeError(
                "Document processing failed: no PDF paths were provided"
            )

        try:
            documents = []
            for path in paths:
                documents.extend(PyPDFLoader(path).load())

            splitter = RecursiveCharacterTextSplitter(
                chunk_size=Config.CHUNK_SIZE,
                chunk_overlap=Config.CHUNK_OVERLAP,
            )
            splits = splitter.split_documents(documents)
        except Exception as e:
            raise RuntimeError(f"Document processing failed: {e}") from e

        if not splits:
            # Fail early with a clear message instead of letting the vector
            # store choke on an empty chunk list.
            raise RuntimeError(
                "Document processing failed: no text chunks were produced "
                "from the provided PDFs"
            )

        try:
            return FAISS.from_documents(splits, self.embeddings)
        except Exception as e:
            raise RuntimeError(f"Document processing failed: {e}") from e