Spaces:

sunny333
/

multimodalRAG

Runtime error

initial commit

568cd7b 12 months ago

1.48 kB

	from langchain_community.document_loaders import UnstructuredPDFLoader
	import os
	import pickle

	# Path of the PDF that extractor_text_image_table() passes to the loader as file_path.
	doc = 'data/filteredData.pdf'

	def extractor_text_image_table():
	    """Load the PDF at ``doc`` with UnstructuredPDFLoader and cache the result.

	    The loader is configured with the ``hi_res`` strategy, image extraction,
	    table-structure inference and ``by_title`` chunking; extracted images
	    are directed to ``./figures`` via ``image_output_dir_path``.  The loaded
	    documents are pickled to ``data.pkl`` in the current working directory
	    and then summarised on stdout through ``print_retrived_data``.

	    Returns:
	        None.  The result is delivered through ``data.pkl``.
	    """
	    loader = UnstructuredPDFLoader(
	        file_path=doc,
	        strategy='hi_res',
	        extract_images_in_pdf=True,
	        infer_table_structure=True,
	        # section-based chunking
	        chunking_strategy="by_title",
	        max_characters=4000,  # max size of chunks
	        new_after_n_chars=4000,  # preferred size of chunks
	        # smaller chunks < 2000 chars will be combined into a larger chunk
	        combine_text_under_n_chars=2000,
	        mode='elements',
	        image_output_dir_path='./figures',
	    )
	    data = loader.load()

	    # Persist before printing: the summary is only diagnostic, and a failure
	    # there must not discard the result of the extraction above.
	    with open('data.pkl', 'wb') as f:
	        pickle.dump(data, f)

	    print_retrived_data(data)


	def print_retrived_data(data):
	    """Print the ``category`` metadata value of every item in *data*.

	    Args:
	        data: Iterable of objects exposing a ``metadata`` mapping.

	    Items whose metadata has no ``category`` key are reported as ``None``
	    instead of raising ``KeyError``, so this diagnostic dump cannot abort
	    the caller.
	    """
	    print(">>>>>>>>>>>>>>data retrived>>>>>>>>")
	    # ``element`` rather than ``doc``: avoids shadowing the module-level
	    # ``doc`` path constant inside the comprehension.
	    print([element.metadata.get('category') for element in data])
	    print(">>>>>>>>>>>>>>end -- data retrived>>>>>>>>")


	# Run the extraction only when the cached pickle is not already on disk.
	file_path="data.pkl"
	if os.path.exists(file_path):
	    print(f"✅ File '{file_path}' found")
	else:
	    print(">>>>>>>> generating: extracting text images tables >>>>>")
	    extractor_text_image_table()