Spaces:
Paused
Paused
| from chainlit import AskFileMessage | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import PyMuPDFLoader | |
def split_file(file: AskFileMessage):
    """Load an uploaded file with PyMuPDFLoader and split it into chunks.

    The bytes in ``file.content`` are written to a temporary file because
    the loader is constructed from a filesystem path. The loaded documents
    are then split with a ``RecursiveCharacterTextSplitter`` configured
    with ``chunk_size=500`` and ``chunk_overlap=100``.

    Args:
        file: Uploaded file object; only its ``content`` attribute (the raw
            bytes of the upload) is read here.

    Returns:
        The list of split documents. Each document's ``metadata["source"]``
        is set to ``"source_<i>"``, where ``<i>`` is the chunk's position in
        the returned list.
    """
    import os
    import tempfile

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

    # delete=False lets us close the handle before the loader reopens the
    # file by name (an open NamedTemporaryFile cannot be reopened on every
    # platform); the file is removed explicitly in the ``finally`` below.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
        tmp.write(file.content)
        tmp_path = tmp.name

    try:
        documents = PyMuPDFLoader(tmp_path).load()
    finally:
        # Always clean up, even if loading fails, so uploads do not
        # accumulate in the temp directory.
        os.unlink(tmp_path)

    docs = text_splitter.split_documents(documents)

    # Label each chunk with its index so it can be cited individually.
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"

    return docs