Spaces:
Runtime error
Runtime error
| from langchain_community.document_loaders import UnstructuredPDFLoader | |
| import os | |
| import pickle | |
# Default input PDF, given relative to the current working directory.
doc = "data/filteredData.pdf"
def extractor_text_image_table(pdf_path=None, output_path='data.pkl'):
    """Load a PDF with UnstructuredPDFLoader and cache the result as a pickle.

    The loaded documents are passed to ``print_retrived_data`` and then
    pickled to ``output_path``.

    Args:
        pdf_path: Path of the PDF to load. When ``None`` (the default), the
            module-level ``doc`` path is used.
        output_path: Destination of the pickle file. Defaults to
            ``'data.pkl'``.

    Returns:
        The object returned by ``loader.load()``.
    """
    if pdf_path is None:
        pdf_path = doc

    loader = UnstructuredPDFLoader(
        file_path=pdf_path,
        strategy='hi_res',
        extract_images_in_pdf=True,
        infer_table_structure=True,
        # section-based chunking
        chunking_strategy="by_title",
        max_characters=4000,  # max size of chunks
        new_after_n_chars=4000,  # preferred size of chunks
        # smaller chunks < 2000 chars will be combined into a larger chunk
        combine_text_under_n_chars=2000,
        mode='elements',
        image_output_dir_path='./figures',
    )
    data = loader.load()
    print_retrived_data(data)

    # Write to a temporary file first and rename it into place afterwards:
    # a failure during pickling must not leave a truncated file at
    # output_path, because callers may treat its mere existence as a valid
    # cache.
    tmp_path = f"{output_path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        os.replace(tmp_path, output_path)
    finally:
        # After a successful replace the temp file is gone; this only
        # cleans up when dumping or renaming failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return data
def print_retrived_data(data):
    """Print the ``metadata['category']`` of every item in *data*.

    The list of categories is printed on a single line, framed by a start
    and an end banner line.

    Args:
        data: Iterable of objects exposing a ``metadata`` mapping that
            contains a ``'category'`` key.
    """
    categories = []
    for element in data:
        categories.append(element.metadata['category'])

    print(">>>>>>>>>>>>>>data retrived>>>>>>>>")
    print(categories)
    print(">>>>>>>>>>>>>>end -- data retrived>>>>>>>>")
# Run the extraction only when the pickle file is not already present in
# the current working directory; otherwise just report that it was found.
file_path = "data.pkl"
if not os.path.exists(file_path):
    print(">>>>>>>> generating: extracting text images tables >>>>>")
    extractor_text_image_table()
else:
    print(f"✅ File '{file_path}' found")