Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

App Files Files Community

AzizWazir committed on Dec 29, 2024

Commit

da16d5a

verified ·

1 Parent(s): 09938bc

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -29

app.py CHANGED Viewed

@@ -1,38 +1,61 @@
-import streamlit as st
 from docx import Document
 import io
-def extract_text_from_docx(file):
-    """Extracts all text from an uploaded .docx file"""
-    try:
-        # Open the uploaded .docx file
-        doc = Document(io.BytesIO(file.read()))
-        # Extract text from each paragraph in the document
-        text = ""
-        for paragraph in doc.paragraphs:
-            text += paragraph.text + '\n'
-        return text
-    except Exception as e:
-        st.error(f"Error processing the document: {e}")
-        return None
-def main():
-    st.title("Extract Text from DOCX")
-    # File upload
-    uploaded_file = st.file_uploader("Choose a DOCX file", type=["docx"])
-    if uploaded_file is not None:
-        text = extract_text_from_docx(uploaded_file)
-        if text:
-            st.subheader("Extracted Text")
-            st.text(text)
-        else:
-            st.error("Failed to extract text.")
-if __name__ == "__main__":
-    main()

 import io
+import os
+import tempfile
+
+import fitz  # PyMuPDF
+import pytesseract
 from docx import Document
+from pdf2image import convert_from_path
+# OCR setup: the hard-coded path below only exists on a default Windows install.
+# Point pytesseract at it when present; otherwise keep pytesseract's default,
+# which looks up the `tesseract` executable on PATH (e.g. on Linux/macOS).
+_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this path based on your installation
+if os.path.isfile(_WINDOWS_TESSERACT_CMD):
+    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD
+# Function to extract images from a PDF
+def extract_images_from_pdf(pdf_path):
+    """Render every page of a PDF to an image.
+
+    Args:
+        pdf_path: Path of the PDF file to open with PyMuPDF.
+
+    Returns:
+        A list with one ``bytes`` object per page, in page order, each holding
+        the page rendered as PNG data.
+    """
+    # The context manager closes the PDF even if rendering a page fails.
+    with fitz.open(pdf_path) as pdf:
+        return [page.get_pixmap().tobytes("png") for page in pdf]
+# Function to perform OCR on images and extract text
+def ocr_from_images(images):
+    """Run Tesseract OCR over a sequence of images and concatenate the text.
+
+    Args:
+        images: Iterable of images. Encoded image data (``bytes`` or
+            ``bytearray``, as produced by ``extract_images_from_pdf``) is
+            written to a temporary file first, because pytesseract accepts PIL
+            images, NumPy arrays or file paths but not raw bytes. Any other
+            item is passed to pytesseract unchanged.
+
+    Returns:
+        The recognised text of all images in order, each followed by a newline.
+    """
+    page_texts = []
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        for index, img in enumerate(images):
+            if isinstance(img, (bytes, bytearray)):
+                # Hand Tesseract a real file instead of the in-memory bytes.
+                img_path = os.path.join(tmp_dir, f"page_{index}.png")
+                with open(img_path, "wb") as handle:
+                    handle.write(img)
+                img = img_path
+            page_texts.append(pytesseract.image_to_string(img))
+    return "".join(text + "\n" for text in page_texts)
+# Function to convert PDF with images to a Word document
+def pdf_to_word(pdf_path, word_output_path):
+    """Convert a PDF into a Word document holding its text and OCR output.
+
+    The Word document gets a title heading, the text layer of the PDF, and
+    then the text recognised by OCR on the images extracted from the PDF.
+
+    Args:
+        pdf_path: Path of the PDF file to convert.
+        word_output_path: Path where the Word file is saved.
+    """
+    # Extract images from PDF
+    images = extract_images_from_pdf(pdf_path)
+    # Perform OCR on the images
+    ocr_text = ocr_from_images(images)
+    # Extract PDF text (non-image content). Open by path and keep the PDF
+    # handle under its own name so it cannot shadow the Word document below.
+    with fitz.open(pdf_path) as pdf:
+        pdf_text = "".join(page.get_text() for page in pdf)
+    # Convert PDF text to Word
+    word_doc = Document()
+    word_doc.add_heading('Converted PDF Text', 0)
+    # Add both PDF text and OCR extracted text to Word
+    word_doc.add_paragraph(pdf_text)
+    word_doc.add_paragraph("Extracted Text from Images (OCR):")
+    word_doc.add_paragraph(ocr_text)
+    word_doc.save(word_output_path)
+    print(f"Word document saved as: {word_output_path}")
+# Example usage. Guarded so that importing this module does not start a
+# conversion against the placeholder path below.
+if __name__ == "__main__":
+    pdf_path = "your_pdf_file.pdf"  # Provide the path to your PDF file
+    word_output_path = "output.docx"  # Provide the desired output Word file path
+    pdf_to_word(pdf_path, word_output_path)