Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pdf_to_image | |
| import image_to_text | |
| from ml_engine.model_functions import is_it_title | |
def process_pdf(pdf):
    """Split the OCR text of an uploaded PDF into title-delimited pages.

    Every page image returned by ``pdf_to_image.pdfToImg2`` is converted to
    text with ``image_to_text.img2string``. The text is scanned line by line:
    empty lines are dropped, and each line accepted by ``is_it_title`` closes
    the page collected so far and becomes the first line of the next one.
    Pages therefore follow titles, not the physical pages of the PDF.

    Args:
        pdf: The uploaded file as passed in by the Gradio ``File`` input,
            i.e. an object exposing the file path as ``.name``. A plain path
            string is accepted as well, and ``None`` (button clicked without
            an upload) is tolerated.

    Returns:
        A list of strings, one per logical page. Each page is the
        concatenation of its non-empty lines, every line followed by a
        newline character. The list is empty when no file was supplied or
        no text was extracted.
    """
    # Nothing uploaded: return an empty result instead of failing on `.name`.
    if pdf is None:
        return []

    # Upload objects carry the path in `.name`; fall back to the value itself
    # so a bare path string works too.
    pdf_path = getattr(pdf, "name", pdf)

    pages = []
    current_lines = []  # lines of the page currently being collected
    for img in pdf_to_image.pdfToImg2(pdf_path):
        text = image_to_text.img2string(img)
        for line in text.split("\n"):
            if not line:
                continue
            # A title starts a new page, but only flush when something has
            # been collected, so a leading title never yields an empty page.
            if is_it_title(line) and current_lines:
                pages.append("".join(current_lines))
                current_lines = []
            current_lines.append(line + "\n")

    # Flush the trailing page; skip it when no text was found at all.
    if current_lines:
        pages.append("".join(current_lines))
    return pages
# Gradio UI: one upload field, one JSON viewer and a button that ties them
# together through process_pdf.
with gr.Blocks() as demo:
    # Heading and a one-line usage hint.
    gr.Markdown("# PDF to Pages Processor")
    gr.Markdown("Upload a PDF and get a list of extracted pages as output.")

    # Input, output and trigger widgets, created in display order.
    pdf_input = gr.File(label="Upload a PDF")
    output = gr.JSON(label="Extracted Pages")
    submit_button = gr.Button("Process PDF")

    # A click sends the uploaded file to process_pdf and renders the
    # returned list in the JSON viewer.
    submit_button.click(process_pdf, inputs=pdf_input, outputs=output)


if __name__ == "__main__":
    # Launch the app only when this file is run as a script.
    demo.launch()