AzizWazir committed on
Commit
5fa7f28
·
verified ·
1 Parent(s): 03dbb03

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -53
app.py DELETED
@@ -1,53 +0,0 @@
1
- import streamlit as st
2
- import pytesseract
3
- from PIL import Image
4
- import docx
5
- import pdf2image
6
-
7
# Point pytesseract at the standard Windows install location, but only when
# that binary actually exists. On any other host (e.g. a Linux container) the
# hard-coded path would override pytesseract's default of finding `tesseract`
# on PATH and make every OCR call fail.
import os

_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD
9
-
10
def extract_text_from_image_pdf(pdf_file):
    """Extract text from a PDF by converting its pages to images and running OCR.

    Args:
        pdf_file: Either a filesystem path to the PDF, or a binary file-like
            object exposing ``read()`` (for example an in-memory upload).

    Returns:
        The OCR text of every page image in order, each followed by a
        newline. An empty string when no page images are produced.
    """
    # File-like objects are read directly; anything else is treated as a path,
    # which keeps the original path-based call style working unchanged.
    if hasattr(pdf_file, 'read'):
        pdf_bytes = pdf_file.read()
    else:
        with open(pdf_file, 'rb') as f:
            pdf_bytes = f.read()

    # Rasterize the PDF into one image per page.
    images = pdf2image.convert_from_bytes(pdf_bytes)

    # OCR each page; the trailing newline keeps page boundaries readable.
    return ''.join(
        pytesseract.image_to_string(image) + '\n' for image in images
    )
27
-
28
def main():
    """Streamlit app for converting PDF images to text.

    Renders a file uploader, runs OCR on the uploaded PDF via
    ``extract_text_from_image_pdf``, displays the extracted text, and offers
    it as a ``.txt`` download in the browser.
    """
    # Function-scope imports: only this entry point needs them.
    import os
    import tempfile

    # Title and description
    st.title("PDF to Text Converter")
    st.subheader("Convert your PDF images to editable text documents.")

    # Upload PDF file
    uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
    if uploaded_file is None:
        return

    # The upload is held in memory; `uploaded_file.name` is only the filename
    # reported by the browser, not a path that exists on this machine. Spool
    # the bytes to a temporary file so the extractor can open a real path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getvalue())
        tmp_path = tmp.name
    try:
        extracted_text = extract_text_from_image_pdf(tmp_path)
    finally:
        # Always clean up the temporary copy, even if OCR fails.
        os.remove(tmp_path)

    # Display extracted text
    st.success("Text extracted from PDF:")
    st.write(extracted_text)

    # Download option: send the text to the user's browser rather than
    # writing a file into the server's working directory.
    if st.download_button(
        "Download text as .txt file",
        data=extracted_text,
        file_name="extracted_text.txt",
        mime="text/plain",
    ):
        st.success("Text downloaded!")


if __name__ == "__main__":
    main()