AzizWazir committed on
Commit
72c095c
·
verified ·
1 Parent(s): 757beef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -39
app.py CHANGED
@@ -4,56 +4,63 @@ from pdf2image import convert_from_path
4
  from PIL import Image
5
  import pytesseract
6
  from docx import Document
 
7
 
8
- # Ensure Poppler's path is correct
9
- # Set the full path to Poppler's 'bin' directory (update this path according to your system)
10
- poppler_path = r'C:\poppler\bin' # Update this with your actual Poppler path
11
 
12
- def pdf_to_text(pdf_path):
 
13
  try:
14
- # Convert PDF to images
15
- images = convert_from_path(pdf_path, poppler_path=poppler_path)
16
- text = ""
17
-
18
- # Extract text from each image using pytesseract
19
- for image in images:
20
- text += pytesseract.image_to_string(image)
21
-
 
 
22
  return text
23
  except Exception as e:
24
- st.error(f"Error during PDF to image conversion: {e}")
25
  return None
26
 
27
- def save_text_to_word(text, filename="output.docx"):
28
- # Create a Word document and write the text to it
29
  doc = Document()
30
  doc.add_paragraph(text)
31
- doc.save(filename)
32
-
33
- def main():
34
- st.title("PDF to Text Converter")
35
-
36
- # Upload PDF file
37
- uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
38
 
39
- if uploaded_file is not None:
40
- # Save uploaded file temporarily
41
- with open("uploaded_file.pdf", "wb") as f:
42
- f.write(uploaded_file.getbuffer())
43
 
44
- st.text("Converting PDF to text...")
 
45
 
46
- # Convert PDF to text
47
- text = pdf_to_text("uploaded_file.pdf")
 
48
 
49
- if text:
50
- st.text_area("Extracted Text", text, height=300)
51
-
52
- # Create downloadable Word file
53
- word_file = "output.docx"
54
- save_text_to_word(text, word_file)
55
-
56
- st.download_button("Download Word File", word_file)
 
 
57
 
58
- if __name__ == "__main__":
59
- main()
 
 
 
 
 
 
4
  from PIL import Image
5
  import pytesseract
6
  from docx import Document
7
+ import tempfile
8
 
9
+ # Folder path for PDFs
10
+ pdf_folder_path = "D:/General"
 
11
 
12
+ # Function to convert PDF to image
13
+ def pdf_to_image(pdf_path):
14
  try:
15
+ images = convert_from_path(pdf_path, 500)
16
+ return images
17
+ except Exception as e:
18
+ st.error(f"Error during PDF to image conversion: {str(e)}")
19
+ return None
20
+
21
+ # Function to extract text from an image using pytesseract
22
+ def image_to_text(image):
23
+ try:
24
+ text = pytesseract.image_to_string(image)
25
  return text
26
  except Exception as e:
27
+ st.error(f"Error during image to text conversion: {str(e)}")
28
  return None
29
 
30
+ # Function to save text to a Word document
31
+ def save_to_word(text, file_name):
32
  doc = Document()
33
  doc.add_paragraph(text)
34
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx', prefix=file_name)
35
+ doc.save(temp_file.name)
36
+ return temp_file.name
 
 
 
 
37
 
38
+ # Streamlit UI
39
+ st.title("PDF to Word Converter")
40
+ st.write("Converting PDFs from the D:/General folder")
 
41
 
42
+ # Get all PDFs in the specified folder
43
+ pdf_files = [f for f in os.listdir(pdf_folder_path) if f.lower().endswith('.pdf')]
44
 
45
+ if pdf_files:
46
+ for pdf_file in pdf_files:
47
+ pdf_path = os.path.join(pdf_folder_path, pdf_file)
48
 
49
+ # Convert PDF to images
50
+ images = pdf_to_image(pdf_path)
51
+
52
+ if images:
53
+ # Extract text from images
54
+ extracted_text = ""
55
+ for img in images:
56
+ text = image_to_text(img)
57
+ if text:
58
+ extracted_text += text + "\n"
59
 
60
+ # Save the extracted text to Word
61
+ if extracted_text:
62
+ word_file = save_to_word(extracted_text, pdf_file)
63
+ st.success(f"Conversion of {pdf_file} complete! Download the Word file below.")
64
+ st.download_button(f"Download {pdf_file} as Word", word_file, file_name=f"{pdf_file}.docx")
65
+ else:
66
+ st.write("No PDFs found in the specified folder.")