Spaces:

ShayanRl
/

pdf2text

Sleeping

App Files Files Community

ShayanRl committed on Nov 29, 2025

Commit

159e468

verified ·

1 Parent(s): 36a28df

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -47

app.py CHANGED Viewed

@@ -1,62 +1,84 @@
 import streamlit as st
-import io
 import requests
 import pdfplumber
 import os
-def fextractURL(pdf_path):
-    extracted_data = ""
     try:
-        if pdf_path.endswith('.pdf'):
-            # If the URL ends with .pdf, use pdfplumber directly
-            r = requests.get(pdf_path)
-            f = io.BytesIO(r.content)
-            with pdfplumber.open(f) as pdf:
-                for page in pdf.pages:
-                    extracted_data += page.extract_text() + "\n"  # Extract text
-                    tables = page.extract_tables()  # Extract tables
-                    for table in tables:
-                        for row in table:
-                            extracted_data += "\t".join(str(cell) for cell in row) + "\n"
-        else:
-            # If the URL does not end with .pdf, download the PDF first
-            response = requests.get(pdf_path)
-            pdf_content = response.content
-            # Save the PDF locally
-            pdf_filename = 'downloaded_document.pdf'
-            with open(pdf_filename, 'wb') as pdf_file:
-                pdf_file.write(pdf_content)
-            # Extract content using pdfplumber
-            with pdfplumber.open(pdf_filename) as pdf:
-                for page in pdf.pages:
-                    extracted_data += page.extract_text() + "\n"  # Extract text
-                    tables = page.extract_tables()  # Extract tables
-                    for table in tables:
-                        for row in table:
-                            extracted_data += "\t".join(str(cell) for cell in row) + "\n"
-            # Delete the PDF file
-            os.remove(pdf_filename)
     except Exception as e:
-        st.error(f"An error occurred: {str(e)}")
-    return extracted_data
 vert_space = '<div style="padding: 3rem 1rem;"></div>'
 st.markdown(vert_space, unsafe_allow_html=True)
-st.write("Extract full text from PDF URL")
-pdfURL = st.text_input(label="PDF URL", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
-button = st.button(label='Extract', key=None, help=None, on_click=None, args=None, kwargs=None, type="secondary", disabled=False, use_container_width=False)
-extractedText = st.empty()
-if button:
-    try:
-        text = fextractURL(pdfURL)
-        extractedText.text(text)
-    except Exception as e:
-        st.error(f"An error occurred: {str(e)}")

 import streamlit as st
 import requests
 import pdfplumber
 import os
+import fitz  # PyMuPDF
+def download_pdf(pdf_path):
+    """Downloads PDF from URL or returns local path if it exists."""
+    if os.path.isfile(pdf_path):
+        return pdf_path
+    try:
+        response = requests.get(pdf_path)
+        response.raise_for_status()
+        pdf_filename = 'downloaded_document.pdf'
+        with open(pdf_filename, 'wb') as pdf_file:
+            pdf_file.write(response.content)
+        return pdf_filename
+    except Exception as e:
+        st.error(f"Error downloading PDF: {e}")
+        return None
+def extract_content(pdf_path):
+    """Extracts raw text using pdfplumber and HTML using PyMuPDF."""
+    text_data = ""
+    html_data = ""
+    # 1. Extract Raw Text using pdfplumber (keeping existing logic)
     try:
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                text_data += (page.extract_text() or "") + "\n"
+                tables = page.extract_tables()
+                for table in tables:
+                    for row in table:
+                        # Handle None cells in tables
+                        row_text = "\t".join(str(cell) if cell is not None else "" for cell in row)
+                        text_data += row_text + "\n"
     except Exception as e:
+        st.error(f"Error extracting text with pdfplumber: {e}")
+    # 2. Extract HTML using PyMuPDF (fitz)
+    try:
+        doc = fitz.open(pdf_path)
+        for page in doc:
+            html_data += page.get_text("html")
+        doc.close()
+    except Exception as e:
+        st.error(f"Error extracting HTML with PyMuPDF: {e}")
+    return text_data, html_data
 vert_space = '<div style="padding: 3rem 1rem;"></div>'
 st.markdown(vert_space, unsafe_allow_html=True)
+st.title("PDF Content Scraper")
+st.write("Extract full text and HTML from PDF URL")
+pdfURL = st.text_input(label="PDF URL", value="", placeholder="Enter PDF URL here")
+button = st.button(label='Extract')
+if button and pdfURL:
+    with st.spinner("Downloading and extracting..."):
+        local_pdf = download_pdf(pdfURL)
+        if local_pdf:
+            text, html = extract_content(local_pdf)
+            # Clean up downloaded file if it was downloaded
+            if local_pdf == 'downloaded_document.pdf' and os.path.exists(local_pdf):
+                os.remove(local_pdf)
+            st.subheader("Raw Text Content")
+            st.text_area("Extracted Text", text, height=300)
+            st.subheader("HTML Content")
+            st.download_button(
+                label="Download HTML",
+                data=html,
+                file_name="extracted_content.html",
+                mime="text/html"
+            )
+            with st.expander("View HTML Source"):
+                st.code(html, language='html')