AzizWazir committed on
Commit
13590ad
·
verified ·
1 Parent(s): 7c01c8e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pytesseract
3
+ from PIL import Image
4
+ import docx
5
+ import pdf2image
6
+ import camelot
7
+
8
# Point pytesseract at the Tesseract binary only when it is actually needed.
# The original assignment forced the Windows install path unconditionally,
# which overrides a working `tesseract` on PATH and breaks OCR on hosts
# where that Windows path does not exist.
import os
import shutil

# Default install location of the Tesseract-OCR Windows installer.
WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Fall back to the Windows path only if `tesseract` is not discoverable on
# PATH and the fallback binary really exists; otherwise leave pytesseract's
# own setting untouched.
if shutil.which("tesseract") is None and os.path.isfile(WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = WINDOWS_TESSERACT_CMD
10
+
11
def _strip_xml_invalid_chars(text):
    """Return *text* without characters that XML 1.0 cannot represent."""
    # Function-scope import keeps this block self-contained.
    import re

    # Keep tab, newline, carriage return and the XML 1.0 legal ranges; drop
    # everything else (e.g. the form feed OCR output may carry between pages).
    return re.sub(
        "[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
        "",
        text,
    )


def pdf_to_docx(pdf_file, dpi=200):
    """Converts a PDF file to a Word document (.docx) using OCR.

    Each PDF page is rasterised, run through Tesseract, and appended to the
    document as one paragraph.

    Args:
        pdf_file: The path to the PDF file, or a binary file-like object
            (anything with ``read()``, such as a Streamlit upload).
        dpi: Rasterisation resolution passed to pdf2image.

    Returns:
        A Word document object.
    """
    # Render the PDF pages as images. File-like input has no path on disk,
    # so it is handed to pdf2image as raw bytes instead.
    if hasattr(pdf_file, "read"):
        if hasattr(pdf_file, "seek"):
            # An earlier reader may have left the cursor at end-of-file.
            pdf_file.seek(0)
        pages = pdf2image.convert_from_bytes(pdf_file.read(), dpi=dpi)
    else:
        pages = pdf2image.convert_from_path(pdf_file, dpi=dpi)

    # Create a Word document
    doc = docx.Document()

    # OCR every page image and add its text as a paragraph. Control
    # characters are stripped first because the .docx XML writer rejects
    # strings that are not XML compatible.
    for page in pages:
        text = pytesseract.image_to_string(page)
        doc.add_paragraph(_strip_xml_invalid_chars(text))

    return doc
33
+
34
def pdf_to_xlsx(pdf_file, flavor="stream", pages="all"):
    """Extracts tables from a PDF file using Camelot.

    Args:
        pdf_file: The path to the PDF file, or a binary file-like object
            (anything with ``read()``, such as a Streamlit upload).
        flavor: Camelot parsing method, ``"stream"`` or ``"lattice"``.
        pages: Camelot page selection, e.g. ``"1"``, ``"1,3-5"`` or ``"all"``.

    Returns:
        The Camelot table list extracted from the PDF; each table exposes
        its data as a DataFrame via ``.df``.
    """
    # Function-scope imports keep this block self-contained.
    import contextlib
    import os
    import tempfile

    # NOTE: the original passed flavor='streamlit', which is not a Camelot
    # flavor and made every call fail; 'stream' is the intended value.
    if not hasattr(pdf_file, "read"):
        return camelot.read_pdf(os.fspath(pdf_file), flavor=flavor, pages=pages)

    # Camelot needs a real file on disk, so spool file-like input to a
    # temporary PDF. delete=False lets the file be reopened by name on
    # Windows; it is removed explicitly below.
    if hasattr(pdf_file, "seek"):
        # An earlier reader may have left the cursor at end-of-file.
        pdf_file.seek(0)
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp_path = tmp.name
        tmp.write(pdf_file.read())

    try:
        return camelot.read_pdf(tmp_path, flavor=flavor, pages=pages)
    finally:
        with contextlib.suppress(OSError):
            os.remove(tmp_path)
46
+
47
def main():
    """Streamlit app for converting PDF files to Word and Excel.

    Lets the user upload a PDF, converts it with ``pdf_to_docx`` and
    ``pdf_to_xlsx``, and offers the results as browser downloads. Any
    conversion error is shown to the user via ``st.error``.
    """
    # Function-scope imports keep this block self-contained.
    import contextlib
    import io
    import os
    import tempfile

    # Title and description
    st.title("PDF Converter App")
    st.subheader("Convert your PDFs to editable Word documents and Excel spreadsheets.")

    # Upload PDF file
    uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
    if uploaded_file is None:
        return

    pdf_path = None
    try:
        # The converters take a filesystem path, so the in-memory upload is
        # written to a temporary PDF first. delete=False allows reopening by
        # name on Windows; the file is removed in the finally clause.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            pdf_path = tmp.name
            tmp.write(uploaded_file.getvalue())

        # Convert PDF to Word and Excel
        doc = pdf_to_docx(pdf_path)
        tables = pdf_to_xlsx(pdf_path)

        # Serialise the Word document in memory so it can be sent to the
        # browser rather than written to the server's working directory.
        docx_buffer = io.BytesIO()
        doc.save(docx_buffer)
    except Exception as e:
        st.error(f"Error converting PDF: {e}")
        return
    finally:
        if pdf_path is not None:
            with contextlib.suppress(OSError):
                os.remove(pdf_path)

    # Download options
    st.download_button(
        "Download Word document",
        data=docx_buffer.getvalue(),
        file_name="output.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    if tables:
        st.header("Extracted Excel Tables")
        for i, table in enumerate(tables, start=1):
            st.subheader(f"Table {i}")
            st.dataframe(table.df)

            xlsx_buffer = io.BytesIO()
            try:
                table.df.to_excel(xlsx_buffer, index=False)
            except Exception as e:
                # Keep showing the remaining tables if one export fails.
                st.error(f"Error converting PDF: {e}")
                continue

            st.download_button(
                f"Download Excel table {i}",
                data=xlsx_buffer.getvalue(),
                file_name=f"table_{i}.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )
80
+
81
# Build the UI only when this file is executed as the main script, so that
# importing the module does not start the app.
if __name__ == "__main__":
    main()