Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

App Files Files Community

AzizWazir committed on Dec 29, 2024

Commit

ea1977e

verified ·

1 Parent(s): da16d5a

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -54

app.py CHANGED Viewed

@@ -1,61 +1,30 @@
-import pytesseract
-from pdf2image import convert_from_path
-from docx import Document
-import io
-import fitz  # PyMuPDF
-# OCR Setup
-pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this path based on your installation
-# Function to extract images from a PDF
-def extract_images_from_pdf(pdf_path):  # Render every page of the PDF and return a list with one pix.tobytes() result per page
-    images = []
-    doc = fitz.open(pdf_path)  # NOTE(review): the document is never closed in this function
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)
-        pix = page.get_pixmap()  # pixmap of the whole page (not only embedded pictures) — confirm against PyMuPDF docs
-        img = pix.tobytes()  # presumably encoded image bytes; output format is left at the library default — TODO confirm
-        images.append(img)
-    return images
-# Function to perform OCR on images and extract text
-def ocr_from_images(images):  # OCR each item with pytesseract and return the texts concatenated, each followed by "\n"
-    extracted_text = ""
-    for img in images:
-        text = pytesseract.image_to_string(img)  # NOTE(review): the only visible caller passes pix.tobytes() output here; confirm image_to_string accepts that input type
-        extracted_text += text + "\n"
-    return extracted_text
-# Function to convert PDF with images to a Word document
-def pdf_to_word(pdf_path, word_output_path):  # Intended to write the PDF's text layer plus OCR text into a Word file at word_output_path
-    # Extract images from PDF
-    images = extract_images_from_pdf(pdf_path)
-    # Perform OCR on the images
-    ocr_text = ocr_from_images(images)
-    # Convert PDF text to Word
-    doc = Document()  # python-docx document that is meant to receive the output
-    doc.add_heading('Converted PDF Text', 0)
-    # Extract PDF text (non-image content)
-    pdf_text = ""
-    with open(pdf_path, 'rb') as file:
-        doc = fitz.open(file)  # NOTE(review): rebinds `doc`, discarding the Document created above; also passes a file object rather than a path — confirm fitz.open supports this
-        for page in doc:
-            pdf_text += page.get_text()
-    # Add both PDF text and OCR extracted text to Word
-    doc.add_paragraph(pdf_text)  # NOTE(review): `doc` is the fitz document at this point, not the Word Document
-    doc.add_paragraph("Extracted Text from Images (OCR):")
-    doc.add_paragraph(ocr_text)
-    doc.save(word_output_path)  # NOTE(review): likewise called on the fitz document because of the rebinding above
-    print(f"Word document saved as: {word_output_path}")
 # Example usage
-pdf_path = "your_pdf_file.pdf"  # Provide the path to your PDF file (placeholder value)
-word_output_path = "output.docx"  # Provide the desired output Word file path
-pdf_to_word(pdf_path, word_output_path)  # executed at module level, i.e. whenever the file is run or imported

+import pandas as pd
+def pdf_to_excel(pdf_path, excel_output_path):
+    # Example: If your PDF has structured data that can be parsed into a table
+    # (You can use libraries like pdfplumber for extracting tables)
+    tables = []  # List to store the extracted tables
+    # Example of extracting a table (this part would depend on your PDF content)
+    # Extract tables using pdfplumber, PyMuPDF, or a similar library
+    # Example with pdfplumber (if tables are present in your PDF)
+    import pdfplumber
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            table = page.extract_table()
+            if table:
+                tables.append(table)
+    # Write the extracted tables to an Excel file
+    with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
+        for i, table in enumerate(tables):
+            df = pd.DataFrame(table[1:], columns=table[0])  # Converting to DataFrame
+            df.to_excel(writer, sheet_name=f"Sheet{i+1}", index=False)
+    print(f"Excel file saved as: {excel_output_path}")
 # Example usage
+pdf_path = "your_pdf_file.pdf"  # placeholder: path of the PDF to convert
+excel_output_path = "output.xlsx"  # path of the Excel workbook to create
+pdf_to_excel(pdf_path, excel_output_path)  # executed at module level, i.e. whenever the file is run or imported