File size: 1,244 Bytes
b0b3cb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# document_processing.py
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docx import Document
import pdfkit


def convert_docx_to_pdf(docx_file, pdf_file):
    """Render the paragraph text of a .docx file into a PDF using pdfkit.

    pdfkit drives wkhtmltopdf, which renders HTML rather than Word
    documents, so the .docx is first read with python-docx and its
    paragraphs are turned into a minimal HTML page that is then converted.
    The source .docx is only read; it is never rewritten.

    Only the body paragraphs exposed by ``Document.paragraphs`` are carried
    over. Tables, images, headers/footers and styling are not reproduced.

    Args:
        docx_file: Path of the source .docx file.
        pdf_file: Path where the generated PDF is written.
    """
    from html import escape

    document = Document(docx_file)

    # Escape the text so characters such as "<" and "&" are rendered
    # literally instead of being interpreted as markup.
    body = "\n".join(
        f"<p>{escape(paragraph.text)}</p>" for paragraph in document.paragraphs
    )

    # Declare the charset explicitly so non-ASCII text is rendered correctly.
    html_page = (
        "<!DOCTYPE html>\n"
        '<html><head><meta charset="utf-8"></head>\n'
        f"<body>\n{body}\n</body></html>"
    )

    # Convert the generated HTML (not the raw .docx) to PDF using pdfkit
    pdfkit.from_string(html_page, pdf_file)

def load_and_split_document(file_path, file_type):
    """Load a PDF or DOCX file and split its content into overlapping chunks.

    A DOCX input is first converted to a PDF written next to the source
    file (same name, ``.pdf`` extension); the PDF is then loaded with
    ``PyPDFLoader`` and chunked with ``RecursiveCharacterTextSplitter``.

    Args:
        file_path: Path of the document to process.
        file_type: Type of the document. ``"docx"`` (any letter case)
            triggers the conversion step; any other value is loaded
            directly as a PDF.

    Returns:
        The chunks produced by ``split_documents`` with ``chunk_size=1200``
        and ``chunk_overlap=200``.
    """
    import os

    # Convert DOCX to PDF if necessary
    if isinstance(file_type, str) and file_type.lower() == "docx":
        # Swap only the final extension. A plain str.replace would also
        # rewrite ".docx" inside directory names and would leave ".DOCX"
        # paths unchanged, making the PDF path equal to the source path.
        pdf_file = os.path.splitext(file_path)[0] + ".pdf"
        convert_docx_to_pdf(file_path, pdf_file)
        file_path = pdf_file  # Update file path to newly created PDF

    # Load the PDF document
    loader = PyPDFLoader(file_path)
    raw_documents = loader.load()

    # Chunk the text using recursive character splitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=200)
    documents = text_splitter.split_documents(raw_documents)

    return documents