Spaces:

dminhk
/

AcroForms-Data-Extractor

Sleeping

App Files Files Community

AcroForms-Data-Extractor / app.py

dminhk

Create app.py

ac446e5 about 2 years ago

raw

history blame contribute delete

2.79 kB

	import json
	import base64
	import pathlib
	import pdfplumber
	import streamlit as st
	import fillpdf
	from fillpdf import fillpdfs

	##########################################################
	# Display PDF function
	def displayPDF(file):
	    """Show the PDF stored at path ``file`` inline on the Streamlit page.

	    The file is read in binary mode, base64-encoded, and placed in an
	    800x800 ``<embed>`` element as a ``data:`` URI. The element is emitted
	    through ``st.markdown`` with ``unsafe_allow_html=True`` so the raw HTML
	    is passed through rather than escaped.
	    """
	    # Load the whole document; the handle is released before encoding.
	    with open(file, "rb") as pdf_stream:
	        raw_bytes = pdf_stream.read()
	    encoded_pdf = base64.b64encode(raw_bytes).decode('utf-8')
	    # Build the HTML element carrying the document as a data URI.
	    embed_tag = f'<embed src="data:application/pdf;base64,{encoded_pdf}" width="800" height="800" type="application/pdf">'
	    st.markdown(embed_tag, unsafe_allow_html=True)
	##########################################################

	# Page chrome: the same title is used for the browser tab and the heading.
	app_title = "AcroForms Data Extractor"
	st.set_page_config(page_title=app_title)
	st.title(app_title)

	# Short usage blurb shown under the heading.
	intro_text = """
	This app allows you to extract AcroForms data from PDF files. Simply upload a PDF file and the app will generate a downloadable text file containing the extracted data.
	"""
	st.markdown(intro_text)

	# Uploader widget restricted to PDFs; the value is None until a file is chosen.
	pdf_file = st.file_uploader(label="Upload PDF File", type=["pdf"])

	if pdf_file is not None:

	    # Persist the upload to disk: displayPDF and fillpdfs.get_form_fields
	    # below are both handed a filesystem path.
	    uploads_dir = pathlib.Path("uploads")
	    # exist_ok replaces the separate exists() check, so creation cannot
	    # fail if the directory appears between the check and the mkdir.
	    uploads_dir.mkdir(exist_ok=True)

	    # The file name is supplied by the client. Keep only its final path
	    # component so a name like "../../x.pdf" cannot be written outside
	    # the uploads directory.
	    safe_name = pathlib.Path(pdf_file.name).name or "upload.pdf"
	    pdf_path = str(uploads_dir / safe_name)

	    with open(pdf_path, "wb") as f:
	        f.write(pdf_file.getbuffer())

	    # Display PDF
	    st.divider()
	    st.markdown("PDF Display:")
	    displayPDF(pdf_path)

	    # Print form data and offer it as a JSON download
	    st.divider()
	    form_data = fillpdfs.get_form_fields(pdf_path)
	    st.markdown("\n\nPDF AcroForm:")
	    st.write(form_data)
	    # Serialize the field dictionary for the download button
	    form_txt = json.dumps(form_data)
	    st.download_button(
	        label='Download AcroForm JSON',
	        data=form_txt,
	        file_name='form.json',
	        mime='application/json',
	    )

	    # Print number of pages, metadata, and extracted text
	    st.divider()
	    st.markdown("PDF to Text:")
	    with pdfplumber.open(pdf_file) as pdf:
	        pages = pdf.pages
	        # Number of Pages
	        st.markdown("Number of Pages")
	        st.write(f"Number of Pages: {len(pages)}")
	        # Extract Metadata
	        st.markdown("Metadata")
	        metadata = pdf.metadata
	        st.code(metadata)
	        # Extract text page by page, each page followed by a blank line.
	        # `or ""` guards against a None result for a page without text
	        # (NOTE(review): some pdfplumber versions appear to return None
	        # there - confirm), which would otherwise raise TypeError on "+".
	        text = "".join(
	            (page.extract_text(layout=True) or "") + "\n\n"
	            for page in pages
	        )

	    st.markdown("Text")
	    st.text(text)

	    # Allow text to be downloaded. Path.stem drops only the final
	    # extension (whatever its letter case) instead of removing every
	    # ".pdf" substring from the name.
	    st.download_button(
	        label="Download PDF Text",
	        data=text,
	        file_name=f"{pathlib.Path(safe_name).stem}_text.txt",
	        mime="text/plain",
	    )