Spaces:
Runtime error
Runtime error
| import os | |
| from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain.schema.document import Document | |
def create_cache_dir(directory=None):
    """Ensure the cache directory exists and return its path.

    Args:
        directory: Path of the cache directory. Any falsy value (``None``,
            ``''``) selects the default ``'./.cache'``.

    Returns:
        The path that was created (or already existed), exactly as it was
        passed in or defaulted.
    """
    if not directory:
        directory = './.cache'
    # Create the directory that is actually returned; the path used to be
    # hard-coded to './.cache', so a caller-supplied directory was handed
    # back without ever being created.
    os.makedirs(directory, exist_ok=True)
    return directory
def load_pdf(file_path):
    """Load a single PDF through ``PyPDFLoader``.

    Args:
        file_path: Location of the PDF, forwarded unchanged to
            ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(file_path).load()`` returns (presumably a
        list of ``Document`` objects -- confirm against the loader docs).
    """
    return PyPDFLoader(file_path).load()
def load_pdf_directory(directory):
    """Load the PDFs found in a directory through ``PyPDFDirectoryLoader``.

    Args:
        directory: Directory path, forwarded unchanged to
            ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the directory loader's ``load()`` returns (presumably a
        list of ``Document`` objects -- confirm against the loader docs).
    """
    directory_loader = PyPDFDirectoryLoader(directory)
    documents = directory_loader.load()
    return documents
def split_pdf(
    pdfs: list[Document],
    chunk_size: int = 512,
    chunk_overlap: int = 64,
) -> list[Document]:
    """Split loaded documents into overlapping chunks.

    Args:
        pdfs: Documents to split (e.g. the output of ``load_pdf``).
        chunk_size: Maximum chunk length as measured by ``len``. Defaults
            to 512, the value that used to be hard-coded.
        chunk_overlap: Length shared between consecutive chunks, measured
            the same way. Defaults to 64, the value that used to be
            hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``
        applied to ``pdfs``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return splitter.split_documents(pdfs)
def extract_pdf(uploaded_pdf):
    """Write uploaded PDF file(s) into the cache and return that folder.

    Args:
        uploaded_pdf: A single uploaded file, a list/tuple of them, or
            ``None`` for "nothing uploaded". Each file object must expose
            ``.name`` and ``.getvalue()`` returning bytes (presumably a
            Streamlit ``UploadedFile`` -- confirm against the caller).

    Returns:
        Path of the ``temp_files`` directory (inside the cache directory)
        that the files were written to.

    Raises:
        ValueError: If an upload's name does not reduce to a usable file
            name (empty, ``.`` or ``..``).
    """
    cache_dir = create_cache_dir()
    cache_dir = os.path.join(cache_dir, 'temp_files')
    os.makedirs(cache_dir, exist_ok=True)

    # Support both single file and list of files; None means no upload.
    if uploaded_pdf is None:
        uploaded_pdf = []
    elif not isinstance(uploaded_pdf, (list, tuple)):
        uploaded_pdf = [uploaded_pdf]

    for file in uploaded_pdf:
        # The name comes from the client and is untrusted: keep only its
        # final path component so values like '../../x' or '/etc/x' cannot
        # escape cache_dir. Backslashes are normalised first so that
        # Windows-style paths are stripped on POSIX hosts as well.
        safe_name = os.path.basename(str(file.name).replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            raise ValueError(f'Invalid uploaded file name: {file.name!r}')
        file_path = os.path.join(cache_dir, safe_name)
        with open(file_path, 'wb') as w:
            w.write(file.getvalue())

    # NOTE(review): files from earlier calls are left in this directory, so
    # a later directory-wide load would pick them up too -- confirm that is
    # the intended behaviour.
    return cache_dir