Spaces:
Sleeping
Sleeping
| import json | |
| import base64 | |
| import pathlib | |
| import pdfplumber | |
| import streamlit as st | |
| import fillpdf | |
| from fillpdf import fillpdfs | |
| ########################################################## | |
| # Display PDF function | |
def displayPDF(file):
    """Show the PDF stored at ``file`` inline on the Streamlit page.

    The file is read as raw bytes, base64-encoded, and placed in an HTML
    ``<embed>`` element through a ``data:application/pdf`` URI. The markup
    is then handed to ``st.markdown`` with raw HTML enabled.

    Args:
        file: Location of the PDF to display, in any form ``open`` accepts.
    """
    # Read the whole document in binary mode.
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()

    # The data URI is text, so the base64 bytes are decoded to ``str``.
    encoded = base64.b64encode(raw_bytes).decode('utf-8')

    # Fixed 800x800 viewer element carrying the document inline.
    embed_html = (
        '<embed src="data:application/pdf;base64,' + encoded + '" '
        'width="800" height="800" type="application/pdf">'
    )

    # Raw HTML has to be enabled for the <embed> element to be rendered.
    st.markdown(embed_html, unsafe_allow_html=True)
| ########################################################## | |
# Page setup: the same title is used for the browser tab and the on-page
# heading, followed by a short description of what the app does.
_APP_TITLE = "AcroForms Data Extractor"
st.set_page_config(page_title=_APP_TITLE)
st.title(_APP_TITLE)
st.markdown(
    "\n"
    "This app allows you to extract AcroForms data from PDF files. Simply upload a PDF file and the app will generate a downloadable text file containing the extracted data.\n"
)

# Upload widget restricted to PDF files; the rest of the script checks the
# result against None before processing it.
pdf_file = st.file_uploader(label="Upload PDF File", type=["pdf"])
# Everything below runs only once a PDF has been uploaded: the file is saved
# to disk, previewed, its AcroForm fields are shown and offered as JSON, and
# its text is extracted and offered as a .txt download.
if pdf_file is not None:
    # The upload name comes from the client; keep only its final path
    # component so it cannot point outside the uploads directory.
    upload_name = pathlib.Path(pdf_file.name).name

    # Save the upload to disk (displayPDF and fillpdfs are given a path).
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    uploads_dir = pathlib.Path("uploads")
    uploads_dir.mkdir(parents=True, exist_ok=True)
    saved_pdf = uploads_dir / upload_name
    saved_pdf.write_bytes(pdf_file.getbuffer())

    # Kept as a plain string, as before, for the calls below.
    pdf_path = str(saved_pdf)

    # Display PDF
    st.divider()
    st.markdown("**PDF Display:**")
    displayPDF(pdf_path)

    # Print form data
    st.divider()
    form_data = fillpdfs.get_form_fields(pdf_path)
    st.markdown("\n\n**PDF AcroForm:**")
    st.write(form_data)

    # Serialize the form fields so they can be downloaded as JSON.
    form_txt = json.dumps(form_data)
    st.download_button(
        label='Download AcroForm JSON',
        data=form_txt,
        file_name='form.json',
        mime='application/json',
    )

    # Print number of pages, metadata, and extracted text
    st.divider()
    st.markdown("**PDF to Text:**")
    with pdfplumber.open(pdf_file) as pdf:
        pages = pdf.pages

        # Number of pages
        st.markdown("**Number of Pages**")
        st.write(f"Number of Pages: {len(pages)}")

        # Metadata
        st.markdown("**Metadata**")
        metadata = pdf.metadata
        st.code(metadata)

        # Extract text page by page, each page followed by a blank line.
        # ``or ""`` guards against a page yielding None instead of a string
        # (NOTE(review): reported for some pdfplumber versions - confirm),
        # which would otherwise raise TypeError on concatenation.
        text = "".join(
            (page.extract_text(layout=True) or "") + "\n\n"
            for page in pages
        )

    st.markdown("**Text**")
    st.text(text)

    # Allow the text to be downloaded. Path.stem drops only the final
    # extension, whatever its case, whereas str.replace('.pdf', '') removed
    # every '.pdf' substring and left '.PDF' in place.
    st.download_button(
        label="Download PDF Text",
        data=text,
        file_name=f"{pathlib.Path(upload_name).stem}_text.txt",
        mime="text/plain",
    )