AzizWazir committed on
Commit
1168986
·
verified ·
1 Parent(s): bdc3ab9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -38
app.py CHANGED
@@ -1,53 +1,66 @@
 
1
  import streamlit as st
 
2
  import pytesseract
3
  from PIL import Image
4
- import docx
5
- import pdf2image
6
 
7
- # Set Tesseract path if not set already
8
- pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
 
9
 
10
- def extract_text_from_image_pdf(pdf_file):
11
- """Extracts text from a PDF by converting it to images and performing OCR."""
 
 
 
 
 
 
12
 
13
- # Read the PDF file
14
- with open(pdf_file, 'rb') as f:
15
- pdf_bytes = f.read()
 
 
16
 
17
- # Extract images from the PDF
18
- images = pdf2image.convert_from_bytes(pdf_bytes)
19
-
20
- # Perform OCR on each image and combine the text
21
- extracted_text = ''
22
- for image in images:
23
- text = pytesseract.image_to_string(image)
24
- extracted_text += text + '\n' # Add newline for better readability
25
-
26
- return extracted_text
27
 
28
  def main():
29
- """Streamlit app for converting PDF images to text."""
30
-
31
- # Title and description
32
- st.title("PDF to Text Converter")
33
- st.subheader("Convert your PDF images to editable text documents.")
34
 
35
- # Upload PDF file
36
- uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
 
 
 
 
 
 
 
 
 
37
 
38
- if uploaded_file is not None:
39
- # Extract text from the PDF
40
- extracted_text = extract_text_from_image_pdf(uploaded_file.name)
 
 
 
 
41
 
42
- # Display extracted text
43
- st.success("Text extracted from PDF:")
44
- st.write(extracted_text)
45
 
46
- # Download option (optional)
47
- if st.button("Download text as .txt file"):
48
- with open("extracted_text.txt", "w") as f:
49
- f.write(extracted_text)
50
- st.success("Text downloaded!")
51
 
52
  if __name__ == "__main__":
53
- main()
 
1
+ import os
2
  import streamlit as st
3
+ from pdf2image import convert_from_path
4
  import pytesseract
5
  from PIL import Image
6
+ import pandas as pd
7
+ from docx import Document
8
 
9
+ # Set paths for poppler and tesseract (for local testing or adjust as per your environment)
10
+ POPPLER_PATH = "/usr/bin"
11
+ pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
12
 
13
+ # Function to extract text from an image-based PDF
14
def extract_text_from_image_pdf(pdf_path):
    """Return the OCR text of every page of the PDF at *pdf_path*.

    The PDF is rasterized with ``convert_from_path`` (using the module-level
    ``POPPLER_PATH``) and each resulting page image is passed to
    ``pytesseract.image_to_string``. Every page's text is prefixed with a
    ``Page N:`` header (N starting at 1) and the pages are joined with
    newlines into a single string.
    """
    page_images = convert_from_path(pdf_path, poppler_path=POPPLER_PATH)
    page_sections = [
        f"Page {number}:\n{pytesseract.image_to_string(page_image)}"
        for number, page_image in enumerate(page_images, start=1)
    ]
    return "\n".join(page_sections)
21
 
22
+ # Function to save extracted text to a Word file
23
def save_text_to_word(text, output_path):
    """Write *text* as a single paragraph of a new Word file at *output_path*.

    Control characters that XML 1.0 forbids (everything below U+0020 except
    tab, newline and carriage return) are removed before the text is added.
    OCR output commonly contains a form feed at the end of each page, and
    the XML layer underneath ``Document`` rejects such characters with a
    ``ValueError`` instead of writing the file.
    """
    # Mapping code point -> None makes str.translate delete the character.
    xml_illegal = dict.fromkeys(
        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    )
    doc = Document()
    doc.add_paragraph(text.translate(xml_illegal))
    doc.save(output_path)
27
 
28
+ # Function to save extracted text to an Excel file
29
def save_text_to_excel(text, output_path):
    """Write *text* to an Excel file at *output_path*, one row per line.

    The text is split on ``"\n"`` and stored in a single column named
    ``Text``; the DataFrame index is not written.

    Control characters below U+0020 other than tab, newline and carriage
    return are removed first. OCR output commonly contains a form feed at
    the end of each page, and the Excel writer refuses cells containing
    such characters (openpyxl raises ``IllegalCharacterError``).
    """
    # Mapping code point -> None makes str.translate delete the character.
    excel_illegal = dict.fromkeys(
        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    )
    rows = text.translate(excel_illegal).split("\n")
    df = pd.DataFrame({"Text": rows})
    df.to_excel(output_path, index=False)
 
 
 
 
 
33
 
34
def main():
    """Run the Streamlit page: upload a PDF, OCR it, offer Word/Excel downloads.

    The uploaded PDF and any generated output files live in a per-run
    temporary directory, so concurrent sessions cannot overwrite each
    other's files and nothing is left behind in the working directory.
    Generated files are read into memory before being handed to
    ``st.download_button`` so no file handle stays open.
    """
    import tempfile  # local import: only this function needs it

    st.title("PDF Image to Text Converter")
    st.write("Upload an image-based PDF to extract text and save as text, Word, or Excel format.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if uploaded_file is None:
        return

    # The temporary directory is removed automatically when the block exits,
    # which replaces the manual os.remove() cleanup.
    with st.spinner("Processing..."), tempfile.TemporaryDirectory() as work_dir:
        try:
            pdf_path = os.path.join(work_dir, "uploaded_file.pdf")
            with open(pdf_path, "wb") as pdf_file:
                # getvalue() returns the full buffer regardless of the
                # current read position of the uploaded stream.
                pdf_file.write(uploaded_file.getvalue())

            extracted_text = extract_text_from_image_pdf(pdf_path)
            st.success("Text extracted successfully!")
            st.text_area("Extracted Text", extracted_text, height=300)

            # Options to download text in different formats
            if st.button("Download as Word"):
                docx_path = os.path.join(work_dir, "output.docx")
                save_text_to_word(extracted_text, docx_path)
                with open(docx_path, "rb") as docx_file:
                    st.download_button("Download Word File", docx_file.read(), "output.docx")
            if st.button("Download as Excel"):
                xlsx_path = os.path.join(work_dir, "output.xlsx")
                save_text_to_excel(extracted_text, xlsx_path)
                with open(xlsx_path, "rb") as xlsx_file:
                    st.download_button("Download Excel File", xlsx_file.read(), "output.xlsx")

        except Exception as e:
            # Top-level UI boundary: show the failure to the user instead of
            # letting the page crash with a traceback.
            st.error(f"An error occurred: {e}")
 
 
64
 
65
  if __name__ == "__main__":
66
+ main()