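# Source: rag-assignment / doc_preprocessing.py
# Hosting-page metadata (not part of the program):
#   uploader avatar: Harsh12's picture
#   commit message: "Upload 5 files" (b0b3cb1, verified)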
# doc_preprocessing.py
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docx import Document
import pdfkit
def convert_docx_to_pdf(docx_file, pdf_file):
    """
    Convert a .docx file to a .pdf using python-docx and pdfkit.

    pdfkit is a wrapper around wkhtmltopdf, which renders HTML; it cannot
    read the .docx container directly. The document text is therefore
    extracted with python-docx, wrapped in a minimal HTML page, and that
    page is rendered to ``pdf_file``.

    Args:
        docx_file: Path to the source .docx file. It is only read, never
            rewritten.
        pdf_file: Path where the generated PDF is written.

    Note:
        Only text is carried over: body paragraphs first, followed by the
        text of table rows. Images and rich formatting are not reproduced,
        and tables lose their position relative to the paragraphs.
    """
    from html import escape

    document = Document(docx_file)

    # Collect the text of every body paragraph in document order.
    text_blocks = [paragraph.text for paragraph in document.paragraphs]

    # Table text is not part of ``document.paragraphs``; append one line per
    # row so it is not lost.
    for table in document.tables:
        for row in table.rows:
            text_blocks.append(" | ".join(cell.text for cell in row.cells))

    # Escape the text so characters such as "<" or "&" render literally, and
    # skip blocks that contain only whitespace.
    body = "\n".join(
        f"<p>{escape(text)}</p>" for text in text_blocks if text.strip()
    )

    # Declare UTF-8 explicitly so non-ASCII text is decoded as intended by
    # the HTML renderer.
    html_page = (
        '<!DOCTYPE html><html><head><meta charset="utf-8"></head>'
        f"<body>\n{body}\n</body></html>"
    )

    pdfkit.from_string(html_page, pdf_file)
def load_and_split_document(file_path, file_type, chunk_size=1200, chunk_overlap=200):
    """
    Load a PDF or DOCX file and split it into overlapping text chunks.

    A DOCX input is first converted to a PDF written next to the source
    file (same path with the final extension replaced by ``.pdf``); the PDF
    is then loaded with ``PyPDFLoader`` and split with
    ``RecursiveCharacterTextSplitter``.

    Args:
        file_path: Path to the input document.
        file_type: Document type, e.g. ``"pdf"`` or ``"docx"``. Matching is
            case-insensitive and a leading dot is ignored. Any value other
            than docx is loaded as a PDF.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.

    Returns:
        The chunks returned by ``text_splitter.split_documents``.
    """
    import os

    # Accept "DOCX", ".docx", etc.; leave non-string values untouched so
    # they simply fail the comparison below.
    if isinstance(file_type, str):
        normalized_type = file_type.lower().lstrip(".")
    else:
        normalized_type = file_type

    # Convert DOCX to PDF if necessary
    if normalized_type == "docx":
        # Swap only the final extension. A plain str.replace would also
        # rewrite ".docx" inside directory names, and would leave the path
        # unchanged (overwriting the source) when the suffix is missing or
        # spelled in a different case.
        base_path, _extension = os.path.splitext(file_path)
        pdf_file = base_path + ".pdf"
        convert_docx_to_pdf(file_path, pdf_file)
        file_path = pdf_file  # Continue with the newly created PDF

    # Load the PDF document
    loader = PyPDFLoader(file_path)
    raw_documents = loader.load()

    # Chunk the text using recursive character splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    documents = text_splitter.split_documents(raw_documents)
    return documents