Spaces:
Runtime error
Runtime error
| import os | |
| import fitz | |
| import PyPDF2 | |
def extract_text_and_figures(pdf_path):
    """
    Extracts text and figures from a PDF file, then deletes the file.

    The text is collected in two passes: first page by page with PyMuPDF
    (fitz), then page by page with PyPDF2. Both results are appended to the
    same list, so the returned list holds one entry per page per pass.

    Args:
        pdf_path (str): The path to the PDF file. NOTE: the file at this
            path is removed after extraction; a failed removal is reported
            with a printed message instead of raising.

    Returns:
        tuple: A tuple containing two lists:
            * A list of extracted text blocks (PyMuPDF page texts followed
              by PyPDF2 page texts).
            * A list of extracted figures (as PNG-encoded bytes).
    """
    texts = []
    figures = []

    # Pass 1: PyMuPDF (fitz) for plain text and embedded images. The context
    # manager closes the document so its file handle is released before the
    # file is reopened and later deleted below.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            texts.append(page.get_text("text"))  # Extract text as plain text

            # Process images on the page
            for img in page.get_images():
                xref = img[0]  # Image XREF
                pix = fitz.Pixmap(doc, xref)  # Create Pixmap image

                # pix.n counts every component including alpha, so subtract
                # the alpha channel before testing. More than 3 colour
                # components (e.g. CMYK) must be converted to RGB before the
                # pixmap can be encoded as PNG.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                figures.append(pix.tobytes("png"))

    # Pass 2: extract additional text using PyPDF2 (in case fitz didn't get
    # everything). These page texts are appended after the fitz ones.
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            texts.append(page.extract_text())

    # Best-effort cleanup of the input file; problems are reported, not raised.
    try:
        os.remove(pdf_path)
    except FileNotFoundError:
        print(f"File '{pdf_path}' not found.")
    except PermissionError:
        print(f"Unable to remove '{pdf_path}'. Check permissions.")

    return texts, figures