Spaces:

Harsh12
/

rag-assignment

Sleeping

rag-assignment / doc_preprocessing.py

Upload 5 files

b0b3cb1 verified over 1 year ago

1.24 kB

	# doc_preprocessing.py
	from langchain_community.document_loaders import PyPDFLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from docx import Document
	import pdfkit


	def convert_docx_to_pdf(docx_file, pdf_file):
	    """
	    Convert a .docx file to a .pdf using python-docx and pdfkit.

	    pdfkit hands its input to wkhtmltopdf, which renders HTML and cannot
	    read the binary .docx container. The document's paragraph text is
	    therefore extracted with python-docx, wrapped in a minimal HTML page,
	    and that HTML is rendered to ``pdf_file``.

	    The source .docx is only read; it is never written back.

	    NOTE(review): only body paragraphs are carried over. Tables, images,
	    headers/footers and run-level formatting are not reproduced.

	    Args:
	        docx_file: Path to the input .docx file.
	        pdf_file: Path where the generated PDF is written.
	    """
	    # Local import keeps this block self-contained; html is stdlib.
	    from html import escape

	    document = Document(docx_file)

	    # Escape the text so characters such as "<" or "&" in the document are
	    # rendered literally instead of being interpreted as markup.
	    body = "\n".join(
	        f"<p>{escape(paragraph.text)}</p>" for paragraph in document.paragraphs
	    )
	    html_page = (
	        '<!DOCTYPE html><html><head><meta charset="utf-8"></head>'
	        f"<body>{body}</body></html>"
	    )

	    # Render the generated HTML (not the raw .docx) to PDF.
	    pdfkit.from_string(html_page, pdf_file, options={"encoding": "UTF-8"})

	def load_and_split_document(file_path, file_type, chunk_size=1200, chunk_overlap=200):
	    """
	    Load a PDF or DOCX file and split it into overlapping text chunks.

	    A DOCX input is first converted to a PDF written next to the source
	    file (same name, ``.pdf`` suffix); the PDF is then loaded and chunked.

	    Args:
	        file_path: Path to the input document (str or path-like).
	        file_type: ``"docx"`` (case-insensitive, optional leading dot) to
	            trigger conversion; any other value is treated as a PDF.
	        chunk_size: Maximum characters per chunk passed to the splitter.
	        chunk_overlap: Characters shared between consecutive chunks.

	    Returns:
	        The chunked documents produced by
	        ``RecursiveCharacterTextSplitter.split_documents``.
	    """
	    # Local import keeps this block self-contained; os is stdlib.
	    import os

	    file_path = os.fspath(file_path)

	    # Convert DOCX to PDF if necessary
	    if str(file_type).lower().lstrip(".") == "docx":
	        # Swap only the real extension. A plain str.replace would also
	        # rewrite ".docx" inside directory names and would miss ".DOCX",
	        # leaving the output path equal to the input and overwriting it.
	        root, ext = os.path.splitext(file_path)
	        base = root if ext.lower() == ".docx" else file_path
	        pdf_file = base + ".pdf"
	        convert_docx_to_pdf(file_path, pdf_file)
	        file_path = pdf_file  # Continue with the newly created PDF

	    # Load the PDF document
	    loader = PyPDFLoader(file_path)
	    raw_documents = loader.load()

	    # Chunk the text using recursive character splitter
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )
	    return text_splitter.split_documents(raw_documents)