AzizWazir committed on
Commit
a856839
·
verified ·
1 Parent(s): 401483c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -13
app.py CHANGED
@@ -1,18 +1,21 @@
1
- import streamlit as st
2
  import os
 
3
  from pdf2image import convert_from_path
4
  from PIL import Image
5
  import pytesseract
 
6
 
7
- # Ensure Poppler is in the PATH (you might need to adjust this for your server)
8
- os.environ["PATH"] += ":/usr/local/bin" # Update this with the correct Poppler path if needed
 
9
 
10
  def pdf_to_text(pdf_path):
11
  try:
12
  # Convert PDF to images
13
- images = convert_from_path(pdf_path)
14
  text = ""
15
 
 
16
  for image in images:
17
  text += pytesseract.image_to_string(image)
18
 
@@ -21,30 +24,36 @@ def pdf_to_text(pdf_path):
21
  st.error(f"Error during PDF to image conversion: {e}")
22
  return None
23
 
 
 
 
 
 
 
24
  def main():
25
  st.title("PDF to Text Converter")
26
-
27
  # Upload PDF file
28
  uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
29
-
30
  if uploaded_file is not None:
31
  # Save uploaded file temporarily
32
  with open("uploaded_file.pdf", "wb") as f:
33
  f.write(uploaded_file.getbuffer())
34
-
35
  st.text("Converting PDF to text...")
36
-
 
37
  text = pdf_to_text("uploaded_file.pdf")
38
 
39
  if text:
40
  st.text_area("Extracted Text", text, height=300)
41
-
42
  # Create downloadable Word file
43
  word_file = "output.docx"
44
- with open(word_file, "w") as f:
45
- f.write(text)
46
-
47
  st.download_button("Download Word File", word_file)
48
-
49
  if __name__ == "__main__":
50
  main()
 
 
1
  import os
2
+ import streamlit as st
3
  from pdf2image import convert_from_path
4
  from PIL import Image
5
  import pytesseract
6
+ from docx import Document
7
 
8
# Location of Poppler's 'bin' directory, handed to pdf2image's convert_from_path.
# Resolution order:
#   1. the POPPLER_PATH environment variable, when set;
#   2. the conventional Windows install location below, when it exists;
#   3. None, which tells pdf2image to find Poppler on the system PATH
#      (the usual case on Linux/macOS hosts).
_DEFAULT_WINDOWS_POPPLER = r'C:\poppler\bin'
poppler_path = os.environ.get("POPPLER_PATH") or (
    _DEFAULT_WINDOWS_POPPLER if os.path.isdir(_DEFAULT_WINDOWS_POPPLER) else None
)
11
 
12
  def pdf_to_text(pdf_path):
13
  try:
14
  # Convert PDF to images
15
+ images = convert_from_path(pdf_path, poppler_path=poppler_path)
16
  text = ""
17
 
18
+ # Extract text from each image using pytesseract
19
  for image in images:
20
  text += pytesseract.image_to_string(image)
21
 
 
24
  st.error(f"Error during PDF to image conversion: {e}")
25
  return None
26
 
27
def save_text_to_word(text, filename="output.docx"):
    """Write ``text`` into a new Word document saved at ``filename``.

    Args:
        text: Text to store; it is placed in a single paragraph.
        filename: Destination path of the .docx file.
    """
    import re

    # OCR output may carry form feeds (page separators) or other control
    # characters. XML 1.0 forbids them, and python-docx raises ValueError
    # when asked to store such text, so drop everything in the C0 range
    # except tab, line feed and carriage return.
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)

    doc = Document()
    doc.add_paragraph(cleaned)
    doc.save(filename)
32
+
33
  def main():
34
  st.title("PDF to Text Converter")
35
+
36
  # Upload PDF file
37
  uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
38
+
39
  if uploaded_file is not None:
40
  # Save uploaded file temporarily
41
  with open("uploaded_file.pdf", "wb") as f:
42
  f.write(uploaded_file.getbuffer())
43
+
44
  st.text("Converting PDF to text...")
45
+
46
+ # Convert PDF to text
47
  text = pdf_to_text("uploaded_file.pdf")
48
 
49
  if text:
50
  st.text_area("Extracted Text", text, height=300)
51
+
52
  # Create downloadable Word file
53
  word_file = "output.docx"
54
+ save_text_to_word(text, word_file)
55
+
 
56
  st.download_button("Download Word File", word_file)
57
+
58
  if __name__ == "__main__":
59
  main()