Spaces:
Sleeping
Sleeping
File size: 669 Bytes
59e5e32 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def extract_pdf_text(pdf_docs):
    """Return the concatenated text of every page in the given PDF files.

    Args:
        pdf_docs: Iterable of PDF sources; each item is passed directly to
            ``PdfReader``. ``None`` is treated like an empty iterable.

    Returns:
        A single string containing the text of all pages, in document order
        and then page order, with no separator inserted between pages.
        Returns ``""`` when there are no documents.
    """
    # Nothing supplied (e.g. no upload yet): return empty text instead of
    # failing with "TypeError: 'NoneType' object is not iterable".
    if pdf_docs is None:
        return ""

    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        # `or ""` keeps the join safe if a page produces no text (a falsy
        # result such as None would otherwise break string concatenation).
        page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)

    # Join once at the end rather than growing a string with repeated `+=`.
    return "".join(page_texts)
def split_text_into_chunks(text, chunk_size=10000, chunk_overlap=500):
    """Break ``text`` into chunks using a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded to the splitter as its ``chunk_size`` setting.
        chunk_overlap: Forwarded to the splitter as its ``chunk_overlap``
            setting.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``
        (per the LangChain API, a list of string chunks).
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_text(text)
    return chunks
|