ositamiles committed on
Commit
9afab0a
·
verified ·
1 Parent(s): 83bfe83

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pytesseract
3
+ from pdf2image import convert_from_path
4
+ import os
5
+ import tempfile
6
+ import base64
7
+
8
# Browser-tab title and wide layout for the app page.
st.set_page_config(
    page_title="PDF Text Extractor",
    layout="wide",
)
10
+
11
+ # Function to convert PDF pages to images
12
def pdf_to_images(pdf_path, dpi=300):
    """Convert the pages of a PDF file to images.

    Args:
        pdf_path: Filesystem path of the PDF to convert.
        dpi: Resolution forwarded to ``convert_from_path`` (default 300).

    Returns:
        The result of ``convert_from_path`` for the file; the caller takes
        ``len()`` of it and iterates over it page by page.
    """
    page_images = convert_from_path(pdf_path, dpi=dpi)
    return page_images
14
+
15
+ # Function to extract text from images
16
def extract_text_from_images(images, progress_bar):
    """Run OCR over each page image and return the page-labelled text.

    Args:
        images: Sized sequence of page images, each passed to
            ``pytesseract.image_to_string``.
        progress_bar: Object exposing ``progress(fraction)``; it is updated
            after every page with the fraction of pages processed so far.

    Returns:
        str: One ``"Page N:"`` section per image (numbered from 1), each
        followed by a blank line. Empty string when ``images`` is empty.
    """
    total_pages = len(images)
    sections = []
    for page_number, page_image in enumerate(images, start=1):
        page_text = pytesseract.image_to_string(page_image)
        sections.append(f"Page {page_number}:\n{page_text}\n\n")
        progress_bar.progress(page_number / total_pages)
    return "".join(sections)
23
+
24
+ # Function to create a download link
25
def get_download_link(text, filename, link_text):
    """Build an HTML anchor that downloads *text* as a file via a data URI.

    Args:
        text: String content of the file to offer for download.
        filename: Suggested file name; HTML-escaped before being placed in
            the ``download`` attribute.
        link_text: Visible content of the link. It is inserted into the
            anchor as-is, so callers must only pass trusted text or markup.

    Returns:
        str: An ``<a>`` element whose ``href`` is a base64 ``data:`` URI
        holding the UTF-8 encoded text.
    """
    # Function-scope import so the block does not depend on a new
    # file-level import.
    from html import escape

    # Encode explicitly as UTF-8 and declare the charset in the media type
    # so the payload's encoding is unambiguous.
    payload = base64.b64encode(text.encode("utf-8")).decode("ascii")
    # "text/plain" is a registered media type; the previous "file/txt" is not.
    href = f"data:text/plain;charset=utf-8;base64,{payload}"
    # Escape the file name so quotes or ampersands cannot break the attribute.
    safe_name = escape(filename, quote=True)
    return f'<a href="{href}" download="{safe_name}">{link_text}</a>'
28
+
29
+ # Main Streamlit app
30
def main():
    """Render the Streamlit UI and run the PDF-to-text pipeline.

    Shows a DPI slider in the sidebar and a PDF uploader. Once a file is
    uploaded it is written to a temporary ``.pdf`` file, converted to page
    images, run through OCR, and the extracted text is displayed together
    with a download link. Any error raised while processing is reported
    with ``st.error``; the temporary file is removed afterwards.
    """
    st.title("PDF Text Extractor")

    st.sidebar.header("Settings")
    dpi = st.sidebar.slider(
        "DPI (Higher for better quality, slower processing)", 100, 600, 300, 50
    )

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        # Nothing to process until the user picks a file.
        return

    st.success("File successfully uploaded!")

    tmp_file_path = None
    try:
        # pdf_to_images needs a path on disk, so persist the upload first.
        # Creating and writing the file inside the try block means a failed
        # write is reported below and the file is still cleaned up.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(uploaded_file.getvalue())

        with st.spinner("Converting PDF to images..."):
            images = pdf_to_images(tmp_file_path, dpi)

        st.info(f"Total pages: {len(images)}")

        progress_bar = st.progress(0)
        status_text = st.empty()

        with st.spinner("Extracting text from images..."):
            status_text.text("Extracting text... (This may take a while)")
            extracted_text = extract_text_from_images(images, progress_bar)

        status_text.text("Text extraction complete!")

        st.subheader("Extracted Text")
        st.text_area("", extracted_text, height=300)

        st.markdown(
            get_download_link(
                extracted_text, "extracted_text.txt", "Download Extracted Text"
            ),
            unsafe_allow_html=True,
        )

    except Exception as e:
        # Top-level UI boundary: show the failure in the page.
        st.error(f"An error occurred: {e}")

    finally:
        # Clean up the temporary file if it was created.
        if tmp_file_path is not None:
            os.unlink(tmp_file_path)
72
+
73
# Start the app only when this file is executed as the main module.
if __name__ == "__main__":
    main()