AzizWazir committed on
Commit
2df8377
·
verified ·
1 Parent(s): 0016867

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -56
app.py CHANGED
@@ -1,71 +1,53 @@
1
  import streamlit as st
2
  import pytesseract
3
- from pdf2image import convert_from_path
4
  from PIL import Image
5
- import pandas as pd
6
- from docx import Document
7
- import io
8
- import tempfile
9
 
10
- # OCR function to convert image-based PDF to text
11
- def extract_text_from_image_pdf(uploaded_file):
12
- # Save the uploaded file to a temporary file
13
- with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
14
- tmp_file.write(uploaded_file.read()) # Write the file content to the temporary file
15
- tmp_file_path = tmp_file.name # Get the temporary file path
16
-
17
- # Convert PDF to images using pdf2image
18
- images = convert_from_path(tmp_file_path)
19
- extracted_text = []
20
-
21
- for image in images:
22
- # Use pytesseract to do OCR on the image
23
- text = pytesseract.image_to_string(image)
24
- extracted_text.append(text)
25
-
26
- return "\n".join(extracted_text)
27
 
28
- # Save text to Word document
29
- def save_to_word(text, output_filename):
30
- doc = Document()
31
- doc.add_paragraph(text)
32
- doc.save(output_filename)
33
 
34
- # Save text to Excel document
35
- def save_to_excel(text, output_filename):
36
- # Split the text into rows and columns (simplified, adjust based on your data)
37
- rows = text.split("\n")
38
- table_data = [row.split() for row in rows if row] # You can adjust this for proper column splitting
39
-
40
- df = pd.DataFrame(table_data)
41
- df.to_excel(output_filename, index=False)
 
 
 
 
 
 
42
 
43
- # Main function
44
  def main():
45
- st.title("PDF (Image-based) to Text-based Document Converter")
46
 
47
- # File uploader widget in Streamlit
48
- uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
 
49
 
50
- if uploaded_file is not None:
51
- # Convert image-based PDF to text using OCR
52
- extracted_text = extract_text_from_image_pdf(uploaded_file)
53
 
54
- st.write("Extracted Text:")
55
- st.text_area("Text from PDF", extracted_text, height=300)
 
56
 
57
- # Convert the extracted text to Word or Excel
58
- if st.button("Save as Word"):
59
- # Save to Word file
60
- word_filename = "extracted_text.docx"
61
- save_to_word(extracted_text, word_filename)
62
- st.success(f"Saved to {word_filename}")
63
 
64
- if st.button("Save as Excel"):
65
- # Save to Excel file
66
- excel_filename = "extracted_text.xlsx"
67
- save_to_excel(extracted_text, excel_filename)
68
- st.success(f"Saved to {excel_filename}")
69
 
70
  if __name__ == "__main__":
71
- main()
 
import os
import tempfile

import docx
import pdf2image
import pytesseract
import streamlit as st
from PIL import Image
 
 
6
 
# Point pytesseract at the default Windows install location only when that
# binary actually exists. On other hosts (e.g. a Linux deployment) the
# setting is left at pytesseract's default so `tesseract` is found via PATH
# instead of failing on a path that does not exist.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
def extract_text_from_image_pdf(pdf_file):
    """Extract text from a PDF by converting it to images and performing OCR.

    Args:
        pdf_file: The PDF to process. May be a filesystem path (``str`` or
            path-like), the raw PDF content as ``bytes``/``bytearray``, or a
            binary file-like object exposing ``read()``.

    Returns:
        str: The OCR text of every converted image, in order, each followed
        by a newline. An empty string if the PDF yields no images.
    """
    # Normalise the supported input kinds to raw PDF bytes.
    if isinstance(pdf_file, (bytes, bytearray)):
        pdf_bytes = bytes(pdf_file)
    elif hasattr(pdf_file, 'read'):
        pdf_bytes = pdf_file.read()
    else:
        with open(pdf_file, 'rb') as f:
            pdf_bytes = f.read()

    # Convert the PDF into a list of images.
    images = pdf2image.convert_from_bytes(pdf_bytes)

    # OCR each image; join once instead of growing a string in the loop.
    # The trailing newline per image keeps the pages visually separated.
    return ''.join(
        pytesseract.image_to_string(image) + '\n' for image in images
    )
27
 
 
def main():
    """Streamlit app for converting PDF images to text.

    Renders a PDF uploader. Once a file is provided, the upload is written
    to a temporary file, OCR'd with ``extract_text_from_image_pdf``, shown
    on the page, and offered to the user as a ``.txt`` download.
    """
    # Title and description
    st.title("PDF to Text Converter")
    st.subheader("Convert your PDF images to editable text documents.")

    # Upload PDF file
    uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
    if uploaded_file is None:
        return

    # The upload is held in memory; no file exists on disk under
    # uploaded_file.name. Spool the bytes to a temporary file and pass that
    # path to the extractor. delete=False lets the file be reopened by path
    # after this handle is closed (required on Windows).
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_path = tmp_file.name
    try:
        extracted_text = extract_text_from_image_pdf(tmp_path)
    finally:
        # Always clean up the temporary copy, even if OCR fails.
        os.remove(tmp_path)

    # Display extracted text. st.text shows it verbatim, whereas st.write
    # would render the string as Markdown and reflow the OCR output.
    st.success("Text extracted from PDF:")
    st.text(extracted_text)

    # Download option: send the text to the user's browser rather than
    # writing a file on the machine running the app.
    if st.download_button(
        label="Download text as .txt file",
        data=extracted_text,
        file_name="extracted_text.txt",
        mime="text/plain",
    ):
        st.success("Text downloaded!")
51
 
52
  if __name__ == "__main__":
53
+ main()