Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

App Files Files Community

AzizWazir committed on Dec 29, 2024

Commit

13590ad

verified ·

1 Parent(s): 7c01c8e

Create app.py

Browse files

Files changed (1) hide show

app.py +82 -0

app.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import streamlit as st
+import pytesseract
+from PIL import Image
+import docx
+import pdf2image
+import camelot
import os

# Location used by the Windows Tesseract installer. Only point pytesseract
# at it when the binary is really there; on every other host (for example a
# Linux container) keep pytesseract's default so that "tesseract" is
# resolved through PATH instead of a path that does not exist.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
  pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD
def pdf_to_docx(pdf_file):
  """Convert a PDF to a Word document by running OCR on each rendered page.

  Args:
    pdf_file: The PDF to convert. May be a filesystem path (``str`` or
      ``os.PathLike``), the raw PDF bytes, or a binary file-like object
      with a ``read()`` method (such as a Streamlit upload).

  Returns:
    A ``docx`` document object holding one paragraph of recognised text
    per PDF page.
  """
  import os

  # Render every page to an image. A path goes straight to pdf2image; any
  # in-memory input is handed over as bytes instead.
  if isinstance(pdf_file, (str, os.PathLike)):
    pages = pdf2image.convert_from_path(pdf_file, dpi=200)
  else:
    if isinstance(pdf_file, (bytes, bytearray)):
      pdf_bytes = bytes(pdf_file)
    else:
      # Rewind first: an earlier reader may have left the stream at EOF.
      if hasattr(pdf_file, "seek"):
        pdf_file.seek(0)
      pdf_bytes = pdf_file.read()
    pages = pdf2image.convert_from_bytes(pdf_bytes, dpi=200)

  doc = docx.Document()
  for page in pages:
    text = pytesseract.image_to_string(page)
    # Tesseract's default page separator is a form feed; it is a control
    # character that does not belong in the document text, so drop it.
    doc.add_paragraph(text.replace("\x0c", ""))
  return doc
def pdf_to_xlsx(pdf_file):
  """Extract the tables of a PDF with Camelot.

  Args:
    pdf_file: The PDF to read. May be a filesystem path (``str`` or
      ``os.PathLike``), the raw PDF bytes, or a binary file-like object
      with a ``read()`` method (such as a Streamlit upload).

  Returns:
    The table collection returned by ``camelot.read_pdf``. Each table
    exposes its content as a pandas DataFrame through ``.df``, which the
    caller can write out with ``DataFrame.to_excel``.
  """
  import os
  import tempfile

  # 'stream' is the Camelot flavor that detects tables from whitespace
  # between text; the previous value 'streamlit' is not a Camelot flavor.
  if isinstance(pdf_file, (str, os.PathLike)):
    return camelot.read_pdf(os.fspath(pdf_file), flavor='stream')

  # Camelot reads from a file path, so in-memory input is spooled to a
  # temporary file for the duration of the call.
  if isinstance(pdf_file, (bytes, bytearray)):
    pdf_bytes = bytes(pdf_file)
  else:
    # Rewind first: an earlier reader may have left the stream at EOF.
    if hasattr(pdf_file, "seek"):
      pdf_file.seek(0)
    pdf_bytes = pdf_file.read()

  fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
  try:
    with os.fdopen(fd, "wb") as tmp:
      tmp.write(pdf_bytes)
    return camelot.read_pdf(tmp_path, flavor='stream')
  finally:
    os.remove(tmp_path)
def main():
  """Streamlit app for converting PDF files to Word and Excel.

  Shows an uploader for a single PDF. Once a file is uploaded it is
  converted with ``pdf_to_docx`` and ``pdf_to_xlsx``; the Word document
  and every extracted table are then offered as browser downloads. Any
  exception raised during conversion or export is reported with
  ``st.error`` instead of crashing the app.
  """
  import io
  import os
  import tempfile

  # Title and description
  st.title("PDF Converter App")
  st.subheader("Convert your PDFs to editable Word documents and Excel spreadsheets.")

  # Upload PDF file
  uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
  if uploaded_file is None:
    return

  try:
    # Both converters are documented to take a file *path*, so persist the
    # in-memory upload to a temporary file and pass that path to each.
    fd, pdf_path = tempfile.mkstemp(suffix=".pdf")
    try:
      with os.fdopen(fd, "wb") as tmp:
        tmp.write(uploaded_file.getvalue())
      doc = pdf_to_docx(pdf_path)
      tables = pdf_to_xlsx(pdf_path)
    finally:
      os.remove(pdf_path)

    # Serialise into memory and let the browser download the result;
    # writing to the server's working directory never reaches the user.
    docx_buffer = io.BytesIO()
    doc.save(docx_buffer)
    st.download_button(
      "Download Word document",
      data=docx_buffer.getvalue(),
      file_name="output.docx",
      mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    if tables:
      st.header("Extracted Excel Tables")
      for number, table in enumerate(tables, start=1):
        st.subheader(f"Table {number}")
        st.dataframe(table.df)
        xlsx_buffer = io.BytesIO()
        table.df.to_excel(xlsx_buffer, index=False)
        st.download_button(
          f"Download Excel table {number}",
          data=xlsx_buffer.getvalue(),
          file_name=f"table_{number}.xlsx",
          mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
          # Explicit key keeps each table's widget distinct.
          key=f"download_table_{number}",
        )
  except Exception as e:
    st.error(f"Error converting PDF: {e}")


# Run the app when the file is executed as a script.
if __name__ == "__main__":
  main()