Spaces:

ositamiles
/

Pdf-Text-Extractor

Build error

App Files Files Community

ositamiles committed on Sep 27, 2024

Commit

9afab0a

verified ·

1 Parent(s): 83bfe83

Create app.py

Browse files

Files changed (1) hide show

app.py +74 -0

app.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import streamlit as st
+import pytesseract
+from pdf2image import convert_from_path
+import os
+import tempfile
+import base64
# Configure the page: browser-tab title "PDF Text Extractor" and the wide layout.
st.set_page_config(page_title="PDF Text Extractor", layout="wide")
+# Function to convert PDF pages to images
def pdf_to_images(pdf_path, dpi=300):
    """Render the pages of a PDF file as images.

    Thin wrapper around ``pdf2image.convert_from_path``.

    Args:
        pdf_path: Filesystem path of the PDF to render.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Returns:
        The value returned by ``convert_from_path`` for the given path.
    """
    page_images = convert_from_path(pdf_path, dpi=dpi)
    return page_images
+# Function to extract text from images
def extract_text_from_images(images, progress_bar):
    """OCR each image and return the combined text, labelled per page.

    Args:
        images: Sized sequence of page images accepted by
            ``pytesseract.image_to_string``.
        progress_bar: Object exposing ``progress(fraction)``; it is called
            once per page with the completed fraction in ``(0, 1]``.

    Returns:
        One string made of ``"Page N:\\n<text>\\n\\n"`` sections in page
        order, or ``""`` when ``images`` is empty.
    """
    total_pages = len(images)  # loop-invariant, so computed once
    page_sections = []
    for page_number, image in enumerate(images, start=1):
        text = pytesseract.image_to_string(image)
        page_sections.append(f"Page {page_number}:\n{text}\n\n")
        progress_bar.progress(page_number / total_pages)
    # Join once instead of growing a string with += on every page.
    return "".join(page_sections)
+# Function to create a download link
def get_download_link(text, filename, link_text):
    """Build an HTML anchor that downloads ``text`` as a plain-text file.

    The text is UTF-8 encoded and embedded in a base64 ``data:`` URL, so no
    server-side file is needed.

    Args:
        text: Content of the file to offer for download.
        filename: Name suggested to the browser via the ``download`` attribute.
        link_text: Visible text of the link.

    Returns:
        An ``<a>`` element as a string. ``filename`` and ``link_text`` are
        HTML-escaped so they cannot inject markup when the result is rendered
        as raw HTML.
    """
    import html  # local import: the module-level import block is left untouched

    b64 = base64.b64encode(text.encode("utf-8")).decode("ascii")
    safe_filename = html.escape(filename, quote=True)
    safe_link_text = html.escape(link_text)
    # "text/plain" is the registered media type for .txt ("file/txt" is not);
    # the charset parameter matches the encoding used above.
    href = f"data:text/plain;charset=utf-8;base64,{b64}"
    return f'<a href="{href}" download="{safe_filename}">{safe_link_text}</a>'
+# Main Streamlit app
def main():
    """Run the Streamlit page: upload a PDF, OCR its pages, show the text.

    The uploaded PDF is written to a temporary file, converted to images at
    the DPI chosen in the sidebar, and passed through OCR. The extracted text
    is shown in a text area together with a download link. Any error raised
    while processing is reported with ``st.error``; the temporary file is
    removed in all cases.
    """
    st.title("PDF Text Extractor")

    st.sidebar.header("Settings")
    dpi = st.sidebar.slider("DPI (Higher for better quality, slower processing)", 100, 600, 300, 50)

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        return

    st.success("File successfully uploaded!")

    # The conversion helper takes a filesystem path, so the upload is saved to
    # disk first. The path is captured before writing so that the cleanup in
    # `finally` also runs when the write itself fails.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_file_path = tmp_file.name
    try:
        with tmp_file:
            tmp_file.write(uploaded_file.getvalue())

        with st.spinner("Converting PDF to images..."):
            images = pdf_to_images(tmp_file_path, dpi)
        st.info(f"Total pages: {len(images)}")

        progress_bar = st.progress(0)
        status_text = st.empty()
        with st.spinner("Extracting text from images..."):
            status_text.text("Extracting text... (This may take a while)")
            extracted_text = extract_text_from_images(images, progress_bar)
        status_text.text("Text extraction complete!")

        st.subheader("Extracted Text")
        # Streamlit asks for a non-empty label for accessibility; the label is
        # hidden because the subheader above already titles the widget.
        st.text_area(
            "Extracted Text",
            extracted_text,
            height=300,
            label_visibility="collapsed",
        )
        st.markdown(
            get_download_link(extracted_text, "extracted_text.txt", "Download Extracted Text"),
            unsafe_allow_html=True,
        )
    except Exception as e:
        # Top-level UI boundary: show the failure to the user instead of a traceback.
        st.error(f"An error occurred: {str(e)}")
    finally:
        # Clean up the temporary file
        os.unlink(tmp_file_path)
# Launch the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()