Spaces:
Sleeping
Sleeping
File size: 1,132 Bytes
bf06fa6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
import PyPDF2
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
def process_pdfs(uploaded_files):
    """
    Extract text from uploaded PDF files and split it into chunks.

    Each file is read with ``PyPDF2.PdfReader``; the text of its pages is
    combined into a single ``Document`` whose ``source`` metadata is the
    file's ``name`` attribute. All documents are then split with a
    ``RecursiveCharacterTextSplitter`` (chunk_size=1000, chunk_overlap=200).

    Args:
        uploaded_files: Iterable of uploaded PDF files. Each item is handed
            to ``PyPDF2.PdfReader`` and must expose a ``name`` attribute.

    Returns:
        list: List of document chunks
    """
    documents = []
    for file in uploaded_files:
        reader = PyPDF2.PdfReader(file)
        # extract_text() can give a falsy result for a page; drop those so
        # they do not contribute empty lines to the combined text.
        page_texts = [
            page_text
            for page_text in (page.extract_text() for page in reader.pages)
            if page_text
        ]
        # Join pages with a newline so the last word of one page is not fused
        # with the first word of the next page.
        text = "\n".join(page_texts)
        documents.append(
            Document(page_content=text, metadata={"source": file.name})
        )

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.split_documents(documents)
def create_vector_store(documents, embedding):
    """
    Build a FAISS vector store from the given documents.

    This is a direct delegation to ``FAISS.from_documents``.

    Args:
        documents: List of document chunks to index
        embedding: Embedding model passed through to FAISS
    Returns:
        FAISS: The vector store built from ``documents``
    """
    vector_store = FAISS.from_documents(documents, embedding)
    return vector_store