Spaces:
Sleeping
Sleeping
| from PyPDF2 import PdfReader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
def extract_pdf_text(pdf_docs):
    """
    Extract and concatenate the text of every page in the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources (e.g. uploaded files); each item
            is passed unchanged to ``PdfReader``. ``None`` or an empty
            collection is treated as "no documents".

    Returns:
        One string holding the text of all pages, in document order and
        then page order, with no separator inserted between pages.
        Returns ``""`` when there are no documents.
    """
    # Nothing uploaded: return early instead of failing on iteration.
    if not pdf_docs:
        return ""

    # Build the result with a single join rather than repeated ``+=``.
    # ``or ""`` is a defensive guard: if a page yields no text (None or an
    # empty string) it contributes nothing instead of raising TypeError.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
def split_text_into_chunks(text, chunk_size=10000, chunk_overlap=500):
    """
    Break ``text`` into chunks using ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded as the splitter's ``chunk_size`` setting.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap`` setting.

    Returns:
        The result of the splitter's ``split_text`` call on ``text``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_text(text)
    return chunks