Spaces:
Sleeping
Sleeping
| # document_processing.py | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from docx import Document | |
| import pdfkit | |
def convert_docx_to_pdf(docx_file, pdf_file):
    """
    Convert a .docx file to a .pdf using pdfkit.

    pdfkit drives wkhtmltopdf, which renders HTML rather than Word
    documents. The paragraph text is therefore read with python-docx,
    wrapped in a minimal HTML page, and that page is rendered to
    ``pdf_file``. The source .docx is only read, never rewritten.

    Only body paragraph text is carried over; tables, images and styling
    are not reproduced in the PDF.

    Args:
        docx_file: Path to the input .docx file.
        pdf_file: Path where the generated PDF is written.
    """
    from html import escape  # local import: stdlib helper used only here

    document = Document(docx_file)

    # Escape the text so characters such as "<" or "&" appear literally
    # in the PDF instead of being interpreted as markup.
    paragraphs = "\n".join(
        f"<p>{escape(paragraph.text)}</p>" for paragraph in document.paragraphs
    )
    html_page = (
        "<!DOCTYPE html>\n"
        '<html><head><meta charset="utf-8"></head>'
        f"<body>\n{paragraphs}\n</body></html>"
    )

    # Render the generated HTML; the encoding option keeps non-ASCII text intact.
    pdfkit.from_string(html_page, pdf_file, options={"encoding": "UTF-8"})
def load_and_split_document(file_path, file_type, chunk_size=1200, chunk_overlap=200):
    """
    Load a PDF or DOCX file and split it into overlapping text chunks.

    A DOCX file is first converted to a PDF stored next to it (same name
    with a ``.pdf`` extension); the PDF is then loaded and chunked.

    Args:
        file_path: Path to the input document.
        file_type: ``"docx"`` (case-insensitive, optional leading dot) to
            convert before loading; any other value is loaded as a PDF.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The list of chunked documents produced by the text splitter.
    """
    import os  # local import: stdlib path helpers used only here

    # Convert DOCX to PDF if necessary
    if isinstance(file_type, str) and file_type.lower().lstrip(".") == "docx":
        source_path = os.fspath(file_path)
        root, extension = os.path.splitext(source_path)
        # Swap only a trailing ".docx"; for any other name append ".pdf",
        # so the generated PDF can never overwrite the input file.
        base = root if extension.lower() == ".docx" else source_path
        pdf_file = base + ".pdf"
        convert_docx_to_pdf(source_path, pdf_file)
        file_path = pdf_file  # Update file path to newly created PDF

    # Load the PDF document
    loader = PyPDFLoader(file_path)
    raw_documents = loader.load()

    # Chunk the text using recursive character splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(raw_documents)