Spaces:

genaitiwari
/

RagLangchainStreamlit

Sleeping

App Files Files Community

RagLangchainStreamlit / Loaders /pdfloader.py

genaitiwari

initial check in

c618768 almost 2 years ago

raw

history blame contribute delete

2.69 kB

	import tempfile
	import streamlit as st
	import os
	from glob import iglob
	from langchain_community.document_loaders import PyPDFLoader


	class PDFLoader:
	    """Turn in-memory PDF uploads into documents via ``PyPDFLoader``.

	    ``PyPDFLoader`` is handed a filesystem path, so every upload is first
	    written to a temporary ``.pdf`` file, which is removed again as soon as
	    that file has been processed (whether or not parsing succeeded).
	    """

	    def __init__(self, repo_path, rag_path_ext) -> None:
	        """Store the loader inputs.

	        Args:
	            repo_path: Despite the name, this is iterated as a collection of
	                binary file-like objects: ``load_and_process_pdfs`` calls
	                ``.read()`` on each element and writes the result to a
	                binary temp file. Presumably Streamlit uploads -- confirm
	                against the caller. A falsy value means "no files".
	            rag_path_ext: Stored as-is; not read anywhere in this class.
	        """
	        self.repo_path = repo_path
	        self.rag_path_ext = rag_path_ext

	    def load_and_process_pdfs(self):
	        """Load every PDF in ``self.repo_path`` and return the parsed items.

	        Best-effort: the first exception stops processing, is printed, and
	        whatever was collected up to that point is returned. Nothing is
	        raised to the caller.

	        Returns:
	            list: The items returned by ``PyPDFLoader.load()`` for each
	            input, concatenated in input order. Empty when
	            ``self.repo_path`` is falsy or the first file fails.
	        """
	        documents = []
	        try:
	            for pdf_file in self.repo_path or ():
	                documents.extend(self._load_single_pdf(pdf_file))
	        except Exception as e:
	            # Kept broad on purpose: failures are reported on stdout and the
	            # partial result is returned instead of propagating.
	            print(e)
	        return documents

	    def _load_single_pdf(self, pdf_file):
	        """Spill one upload to a temp ``.pdf``, parse it, and delete the file."""
	        # delete=False because the file must outlive the handle: it is closed
	        # first and then reopened by path by PyPDFLoader.
	        tmp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
	        # Same path normalisation as before (forward slashes only).
	        tmp_file_path = tmp_file.name.replace("\\", "/")
	        try:
	            with tmp_file:
	                tmp_file.write(pdf_file.read())
	            return PyPDFLoader(tmp_file_path).load()
	        finally:
	            # Runs on success and on failure, so a bad upload or a parser
	            # error no longer leaves the temp file behind.
	            os.remove(tmp_file_path)