"""Streamlit app: upload a PDF, preview it as Markdown, download it as .md or .docx."""

import pathlib
import tempfile

import markdown2
import pymupdf4llm
import streamlit as st
from bs4 import BeautifulSoup
from docx import Document

# HTML heading tag -> Word heading level.
_HEADING_LEVELS = {f"h{level}": level for level in range(1, 7)}

# HTML list tag -> paragraph style name from python-docx's default template.
_LIST_STYLES = {"ul": "List Bullet", "ol": "List Number"}


def _output_path(file_name, output_dir=None):
    """Return *file_name* inside *output_dir*, or as a bare relative path if no directory is given."""
    if output_dir is None:
        return pathlib.Path(file_name)
    return pathlib.Path(output_dir) / file_name


def pdf_to_markdown(pdf_file, output_dir=None):
    """Convert an uploaded PDF to Markdown and save the result as ``Output.md``.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` (here, the
            object returned by ``st.file_uploader``).
        output_dir: Directory that receives ``Output.md``. ``None`` (the
            default) writes it relative to the current working directory.

    Returns:
        A ``(markdown_text, markdown_path)`` tuple.
    """
    # pymupdf4llm is handed a filesystem path, so the upload is spooled to disk first.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_file_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(pdf_file.read())
        pdf_text = pymupdf4llm.to_markdown(temp_file_path)
    finally:
        # delete=False means nothing else removes the spooled copy; without
        # this every upload would leave a PDF behind in the temp directory.
        try:
            pathlib.Path(temp_file_path).unlink()
        except OSError:
            # Best-effort cleanup: a file that is still locked or already
            # gone must not mask the conversion result or its exception.
            pass

    md_file_path = _output_path("Output.md", output_dir)
    md_file_path.write_bytes(pdf_text.encode())

    return pdf_text, md_file_path


def _append_block(doc, element):
    """Append one top-level HTML node to *doc* as a heading or paragraph(s)."""
    tag = getattr(element, "name", None)
    if tag is None:
        # Bare text between block elements (typically just newlines).
        return

    text = element.get_text()

    if tag in _HEADING_LEVELS:
        doc.add_heading(text, level=_HEADING_LEVELS[tag])
    elif tag == "p":
        doc.add_paragraph(text)
    elif tag in _LIST_STYLES:
        # One Word paragraph per direct <li>; a nested list is flattened
        # into the text of the item that contains it.
        for item in element.find_all("li", recursive=False):
            doc.add_paragraph(item.get_text().strip(), style=_LIST_STYLES[tag])
    elif tag == "strong":
        # Styles are looked up by name ("Intense Quote"), not by style id.
        doc.add_paragraph(text, style="Intense Quote")
    elif tag == "em":
        doc.add_paragraph().add_run(text).italic = True
    elif text.strip():
        # Any other element that carries text (code block, blockquote, ...)
        # is kept as a plain paragraph instead of being dropped.
        doc.add_paragraph(text)


def create_docx_from_markdown(md_content, output_dir=None):
    """Render Markdown into a Word document saved as ``Output.docx``.

    The Markdown is converted to HTML with markdown2, parsed with
    BeautifulSoup, and each top-level element is appended to the document.
    Inline formatting inside paragraphs is reduced to plain text.

    Args:
        md_content: Markdown source text.
        output_dir: Directory that receives ``Output.docx``. ``None`` (the
            default) writes it relative to the current working directory.

    Returns:
        ``pathlib.Path`` of the saved document.
    """
    html_content = markdown2.markdown(md_content)

    doc = Document()
    doc.add_heading('Converted PDF Content', level=1)

    soup = BeautifulSoup(html_content, "html.parser")
    for element in soup:
        _append_block(doc, element)

    docx_file_path = _output_path("Output.docx", output_dir)
    doc.save(str(docx_file_path))

    return docx_file_path


# Streamlit application
st.title("📄 Structured PDF Data Extractor")
st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

# File uploader for PDF
pdf_input = st.file_uploader("Upload PDF", type="pdf")

if pdf_input is not None:
    # Convert to Markdown when the PDF is uploaded
    with st.spinner("Converting PDF to Markdown..."):
        try:
            # Outputs go to a per-run scratch directory: fixed file names in
            # the working directory would be shared by every run of the app.
            with tempfile.TemporaryDirectory() as work_dir:
                pdf_text, md_file_path = pdf_to_markdown(pdf_input, output_dir=work_dir)

                # Display the Markdown content
                st.markdown("### Markdown Content Preview:", unsafe_allow_html=True)
                # NOTE(review): pdf_text comes from an uploaded file and is
                # rendered with raw HTML enabled. Kept as-is because turning
                # it off may change how the preview renders; confirm whether
                # this is acceptable for untrusted PDFs.
                st.markdown(pdf_text, unsafe_allow_html=True)

                # Create a download button for the Markdown file
                st.markdown("### Download Markdown File:")
                st.download_button(
                    label="Download Markdown",
                    # Bytes are read now because the scratch directory is
                    # removed when this block exits.
                    data=md_file_path.read_bytes(),
                    file_name=md_file_path.name,
                    mime="text/markdown",
                )

                # Create the .docx file from rendered Markdown content
                docx_file_path = create_docx_from_markdown(pdf_text, output_dir=work_dir)

                # Create a download button for the .docx file
                st.markdown("### Download Word Document:")
                st.download_button(
                    label="Download Word Document",
                    data=docx_file_path.read_bytes(),
                    file_name=docx_file_path.name,
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                )
        except Exception as e:
            # Top-level boundary of the app: show the failure in the UI
            # instead of a traceback.
            st.error(f"An error occurred during conversion: {e}")

# Add some styling to make it visually appealing
# (the markup passed here is currently empty)
st.markdown(
    """ """,
    unsafe_allow_html=True
)