Spaces:
Runtime error
Runtime error
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import UnstructuredAPIFileLoader | |
def load_documents_OCR(
    file_path,
    unstructured_api,
    *,
    api_url='https://paf-stkjy1b5.api.unstructuredapp.io/',
):
    """Load documents that require OCR via the Unstructured API.

    Args:
        file_path: Path of the file handed to ``UnstructuredAPIFileLoader``.
        unstructured_api: API key forwarded to the loader as ``api_key``.
        api_url: Endpoint forwarded to the loader as ``url``. Defaults to the
            deployment this module was originally written against; override
            it to target a different Unstructured deployment.

    Returns:
        Whatever ``UnstructuredAPIFileLoader.load()`` returns for the file
        (presumably a list of LangChain documents -- confirm against the
        installed langchain_community version).
    """
    loader = UnstructuredAPIFileLoader(
        file_path=file_path,
        api_key=unstructured_api,
        url=api_url,
        # NOTE(review): 'paged' presumably groups extracted elements per
        # page; confirm against the loader documentation.
        mode='paged',
    )
    return loader.load()
def load_documents(file_path):
    """Read the PDF at ``file_path`` with ``PyPDFLoader`` and return what it loads."""
    return PyPDFLoader(file_path).load()
def split_documents(documents, *, chunk_size=2000, chunk_overlap=500):
    """Split documents into overlapping chunks using LangChain's splitter.

    Args:
        documents: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. The default
            (2000) preserves this function's previous fixed setting.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. The
            default (500) preserves this function's previous fixed setting.

    Returns:
        The chunked documents produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
def load_and_split_documents(file_path):
    """Load a PDF with ``PyPDFLoader`` and chunk it via ``split_documents``.

    Prints a diagnostic and returns an empty list when the loader yields
    nothing. When splitting yields nothing, prints a diagnostic and returns
    the (empty) splitter result unchanged.
    """
    pages = PyPDFLoader(file_path).load()
    if pages:
        chunks = split_documents(pages)
        if not chunks:
            print("Document splitting resulted in no output for file:", file_path)
        return chunks

    print("No documents loaded from file:", file_path)
    return []
def update_metadata(documents, original_name):
    """Stamp ``original_name`` into each document's ``metadata['source']``.

    The documents are modified in place. The return value is a new list
    holding the same document objects, in their original order.
    """

    def _relabel(document):
        # Overwrite (or create) the 'source' entry on the document itself.
        document.metadata['source'] = original_name
        return document

    return [_relabel(document) for document in documents]