multimodalRAG / RAG_MLM /extractor.py
sunny333's picture
initial commit
568cd7b
from langchain_community.document_loaders import UnstructuredPDFLoader
import os
import pickle
# Path of the source PDF read by extractor_text_image_table().
doc = 'data/filteredData.pdf'
def extractor_text_image_table(pdf_path=None, output_path='data.pkl',
                               image_output_dir='./figures'):
    """Load a PDF with UnstructuredPDFLoader and pickle the resulting documents.

    The loaded documents are summarised on stdout via ``print_retrived_data``
    and then serialised with ``pickle`` to *output_path*.

    Args:
        pdf_path: PDF to load. ``None`` (the default) falls back to the
            module-level ``doc`` path, looked up at call time.
        output_path: Destination of the pickle file.
        image_output_dir: Value forwarded to the loader as
            ``image_output_dir_path``.

    Returns:
        Whatever ``loader.load()`` returned (the same object that was pickled).

    Raises:
        Any exception raised by the loader, by ``pickle.dump`` or by the
        file-system calls is propagated unchanged.
    """
    if pdf_path is None:
        pdf_path = doc

    loader = UnstructuredPDFLoader(
        file_path=pdf_path,
        strategy='hi_res',
        extract_images_in_pdf=True,
        infer_table_structure=True,
        # section-based chunking
        chunking_strategy="by_title",
        max_characters=4000,  # max size of chunks
        new_after_n_chars=4000,  # preferred size of chunks
        # smaller chunks < 2000 chars will be combined into a larger chunk
        combine_text_under_n_chars=2000,
        mode='elements',
        image_output_dir_path=image_output_dir,
    )
    data = loader.load()
    print_retrived_data(data)

    # The module-level driver skips extraction as soon as the pickle file
    # exists, so a half-written file must never appear under the final name:
    # write to a sibling temp file and rename it only after a complete dump.
    tmp_path = f"{output_path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        os.replace(tmp_path, output_path)
    except BaseException:
        # Do not leave the partial temp file behind; re-raise the real error.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return data
def print_retrived_data(data):
    """Print the ``category`` metadata of every item in *data*.

    The list of categories is printed between an opening and a closing
    banner line. Each item must expose a ``metadata`` mapping that has a
    ``'category'`` key.
    """
    print(">>>>>>>>>>>>>>data retrived>>>>>>>>")
    # Collected after the opening banner so the banner is always emitted
    # first, even if reading an item's metadata fails.
    categories = []
    for item in data:
        categories.append(item.metadata['category'])
    print(categories)
    print(">>>>>>>>>>>>>>end -- data retrived>>>>>>>>")
# Runs at import time: extract text, images and tables only when the pickle
# file is not on disk yet; otherwise just report that it was found.
file_path = "data.pkl"
if not os.path.exists(file_path):
    print(">>>>>>>> generating: extracting text images tables >>>>>")
    extractor_text_image_table()
else:
    print(f"✅ File '{file_path}' found")