# Hugging Face Spaces status at capture time: Runtime error
| import fitz | |
| import io | |
| import base64 | |
| from PIL import Image | |
| import gradio as gr | |
| import cv2 | |
| import tempfile | |
| import os | |
def pdf_to_img(pdf_path):
    """Extract every embedded image of a PDF and save each one to disk.

    Each image reference found on each page is decoded with Pillow and
    written as ``image<N>.<ext>`` (N counts from 1 across the whole
    document, ``ext`` is the extension reported by PyMuPDF) inside a fresh
    temporary directory, so separate calls never overwrite each other's
    files.

    Args:
        pdf_path: Path of the PDF file to read. A falsy value (``None`` or
            an empty string, e.g. when no file was supplied) yields an
            empty result.

    Returns:
        A list with the paths of the saved image files, in extraction order.
        The temporary directory is not removed by this function; cleaning it
        up is left to the caller.
    """
    if not pdf_path:
        return []

    img_list = []
    pdf_document = fitz.open(pdf_path)
    try:
        # A per-call directory keeps the fixed "image<N>" names from colliding
        # between requests.
        out_dir = tempfile.mkdtemp(prefix="pdf_images_")
        counter = 1
        for page in pdf_document:
            for image in page.get_images():
                # image[0] is the entry passed to extract_image().
                base_img = pdf_document.extract_image(image[0])
                img = Image.open(io.BytesIO(base_img["image"]))
                img_path = os.path.join(
                    out_dir, f"image{counter}.{base_img['ext']}"
                )
                # Saving by path lets Pillow open and close the file itself,
                # so no file handle is left dangling.
                img.save(img_path)
                img_list.append(img_path)
                counter += 1
    finally:
        # Release the document even if decoding or saving an image fails.
        pdf_document.close()
    return img_list
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path of the PDF file to read. A falsy value (``None`` or
            an empty string, e.g. when no file was supplied) yields an
            empty string.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        without any separator.
    """
    if not pdf_file:
        return ""

    doc = fitz.open(pdf_file)
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Close the document even if text extraction fails on some page.
        doc.close()
# Heading rendered at the top of the page.
title = "Extract Image and Text"

# NOTE(review): the container nesting below was reconstructed from a source
# whose indentation was lost — confirm it against the intended layout.
with gr.Blocks(theme=gr.themes.Glass(primary_hue=gr.themes.colors.slate)) as demo:
    gr.Markdown(f'<h1 style="text-align: center;">{title}</h1>')

    with gr.Row():
        # Upload control, trigger button and gallery for the extracted images.
        with gr.Row():
            with gr.Column():
                file_input = gr.File(type="filepath", label="Upload .pdf file")
                upload_button = gr.Button(value="Show Images")
                img_gallery = gr.Gallery(
                    label="Generated images",
                    show_label=True,
                    elem_id="gallery",
                    object_fit="contain",
                    height="auto",
                    allow_preview=True,
                )
        # Text box that receives the extracted text.
        with gr.Row():
            with gr.Column():
                text = gr.Textbox(label="Output", lines=4, autoscroll=False)
                output_text = text

    # The same button is wired to both handlers: one fills the gallery, the
    # other fills the text box, each from the uploaded file path.
    upload_button.click(fn=pdf_to_img, inputs=file_input, outputs=[img_gallery])
    upload_button.click(
        fn=extract_text_from_pdf, inputs=file_input, outputs=[output_text]
    )

demo.launch()