AzizWazir committed on
Commit
03dbb03
·
verified ·
1 Parent(s): 46a8f59

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -0
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pytesseract
3
+ from PIL import Image
4
+ import docx
5
+ import pdf2image
6
+
7
# Only fall back to the default Windows install location when no `tesseract`
# executable can be found on PATH. Forcing the Windows path unconditionally
# breaks OCR on hosts (e.g. Linux/macOS) where Tesseract is already on PATH.
import shutil

if shutil.which("tesseract") is None:
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
9
+
10
def extract_text_from_image_pdf(pdf_file):
    """Extract text from a PDF by converting it to images and running OCR.

    Args:
        pdf_file: The PDF to process. May be a filesystem path, the raw PDF
            content as ``bytes``/``bytearray``, or a binary file-like object
            exposing ``read()``.

    Returns:
        The OCR text of every page image, in order, each followed by a
        newline. An empty string is returned when no images are produced.
    """
    # Normalise the supported input kinds down to raw PDF bytes.
    if isinstance(pdf_file, (bytes, bytearray)):
        pdf_bytes = bytes(pdf_file)
    elif hasattr(pdf_file, 'read'):
        pdf_bytes = pdf_file.read()
    else:
        with open(pdf_file, 'rb') as f:
            pdf_bytes = f.read()

    # Convert the PDF content into a sequence of images.
    images = pdf2image.convert_from_bytes(pdf_bytes)

    # OCR each image; a newline after every page keeps pages visually apart.
    return ''.join(
        pytesseract.image_to_string(image) + '\n' for image in images
    )
27
+
28
def main():
    """Streamlit app for converting PDF images to text.

    Renders the page title, accepts a PDF upload, runs OCR on it through
    ``extract_text_from_image_pdf``, displays the extracted text and offers
    it to the user as a ``.txt`` download.
    """
    import os
    import tempfile

    # Title and description
    st.title("PDF to Text Converter")
    st.subheader("Convert your PDF images to editable text documents.")

    # Upload PDF file
    uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
    if uploaded_file is None:
        return

    # The upload is held in memory, so `uploaded_file.name` is only the
    # client-side filename and not a readable path on this machine. Spool the
    # bytes to a temporary file and hand that path to the extractor.
    # delete=False lets the file be reopened by path on Windows; it is removed
    # explicitly once extraction finishes.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getvalue())
        tmp_path = tmp.name
    try:
        extracted_text = extract_text_from_image_pdf(tmp_path)
    finally:
        os.remove(tmp_path)

    # Display extracted text
    st.success("Text extracted from PDF:")
    st.write(extracted_text)

    # Download option: stream the text to the user's browser rather than
    # writing a file into the server's working directory.
    if st.download_button(
        "Download text as .txt file",
        data=extracted_text,
        file_name="extracted_text.txt",
        mime="text/plain",
    ):
        st.success("Text downloaded!")


if __name__ == "__main__":
    main()