Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

App Files Community

AzizWazir committed on Dec 29, 2024

Commit

145992a

verified ·

1 Parent(s): 529e2f8

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -75

app.py CHANGED Viewed

@@ -1,82 +1,31 @@
-import streamlit as st
-import pytesseract
-from PIL import Image
-import docx
-import pdf2image
-import camelot
-# Set Tesseract path if not set already
-pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
-def pdf_to_docx(pdf_file):
-  """Converts a PDF file to a Word document (.docx) using OCR.
-  Args:
-    pdf_file: The path to the PDF file.
-  Returns:
-    A Word document object.
-  """
-  # Extract images from the PDF file
-  pages = pdf2image.convert_from_path(pdf_file, dpi=200)
-  # Create a Word document
-  doc = docx.Document()
-  # Iterate over the extracted images and perform OCR
-  for page in pages:
-    text = pytesseract.image_to_string(page)
-    doc.add_paragraph(text)
-  return doc
-def pdf_to_xlsx(pdf_file):
-  """Converts a PDF file to an Excel spreadsheet (.xlsx) using Camelot.
-  Args:
-    pdf_file: The path to the PDF file.
-  Returns:
-    A list of Excel tables extracted from the PDF.
-  """
-  tables = camelot.read_pdf(pdf_file, flavor='streamlit')
-  return tables
-def main():
-  """Streamlit app for converting PDF files to Word and Excel."""
-  # Title and description
-  st.title("PDF Converter App")
-  st.subheader("Convert your PDFs to editable Word documents and Excel spreadsheets.")
-  # Upload PDF file
-  uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
-  if uploaded_file is not None:
-    # Convert PDF to Word and Excel
     try:
-      doc = pdf_to_docx(uploaded_file)
-      tables = pdf_to_xlsx(uploaded_file)
-      # Download options
-      if st.button("Download Word document"):
-        with open("output.docx", "wb") as f:
-          doc.save(f)
-        st.success("Word document downloaded!")
-      if tables:
-        st.header("Extracted Excel Tables")
-        for i, table in enumerate(tables):
-          st.subheader(f"Table {i+1}")
-          st.dataframe(table.df)
-          if st.button(f"Download Excel table {i+1}"):
-            table.df.to_excel(f"table_{i+1}.xlsx", index=False)
-            st.success(f"Excel table {i+1} downloaded!")
     except Exception as e:
-      st.error(f"Error converting PDF: {e}")
 if __name__ == "__main__":
-  main()

+from docx import Document
+def extract_text_from_docx(file_path):
+    """Extracts all text from a .docx file"""
     try:
+        # Open the .docx file
+        doc = Document(file_path)
+        # Extract text from each paragraph in the document
+        text = ""
+        for paragraph in doc.paragraphs:
+            text += paragraph.text + '\n'
+        return text
     except Exception as e:
+        print(f"Error processing the document: {e}")
+        return None
+def main():
+    file_path = "your_document.docx"  # Replace with your actual file path
+    text = extract_text_from_docx(file_path)
+    if text:
+        print("Extracted Text:")
+        print(text)
+    else:
+        print("Failed to extract text.")
 if __name__ == "__main__":
+    main()