Spaces:

Vishwas1
/

PDF2Marathi

Sleeping

App Files Files Community

Vishwas1 committed on Dec 7, 2024

Commit

0d509f3

verified ·

1 Parent(s): b948fd6

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -51

app.py CHANGED Viewed

@@ -1,28 +1,18 @@
-import os
-from popplerqt5 import Poppler
-from PyQt5.QtCore import QByteArray
-import pytesseract
 from PIL import Image
-# Ensure pytesseract is configured for Marathi language
-pytesseract.pytesseract.tesseract_cmd = r"/path/to/tesseract"  # Update if needed
-marathi_lang = "mar"  # Ensure Marathi language is installed in Tesseract
 def extract_images_from_pdf(pdf_path):
     """
-    Extract images from the PDF file using python-poppler-qt5.
     """
-    document = Poppler.Document.load(pdf_path)
-    if not document:
-        raise ValueError(f"Unable to open {pdf_path}")
     images = []
-    for i in range(document.numPages()):
-        page = document.page(i)
-        if page:
-            image = page.renderToImage(300, 300)  # DPI: 300x300 for better OCR
-            images.append(image)
     return images
 def perform_ocr_on_images(images):
@@ -30,44 +20,20 @@ def perform_ocr_on_images(images):
     Perform OCR on the extracted images.
     """
     ocr_results = []
-    for i, image in enumerate(images):
-        # Convert Qt Image to PIL Image
-        pil_image = Image.fromqimage(image)
-        text = pytesseract.image_to_string(pil_image, lang=marathi_lang)
         ocr_results.append(text)
-        print(f"OCR for Page {i + 1}: {text}")
-    return ocr_results
 def ocr_marathi_from_pdf(pdf_path):
     """
     Main function to handle Marathi OCR from a PDF.
     """
-    # Step 1: Extract images from the PDF
-    print("Extracting images from PDF...")
     images = extract_images_from_pdf(pdf_path)
-    # Step 2: Perform OCR on the extracted images
-    print("Performing OCR on images...")
-    ocr_results = perform_ocr_on_images(images)
-    # Combine results
-    combined_text = "\n".join(ocr_results)
-    print(f"Combined OCR Text: {combined_text}")
-    return combined_text
 if __name__ == "__main__":
-    pdf_path = "path/to/marathi/pdf.pdf"  # Replace with the path to your PDF
-    if not os.path.exists(pdf_path):
-        print(f"PDF file not found: {pdf_path}")
-    else:
-        print("Processing Marathi PDF...")
-        ocr_text = ocr_marathi_from_pdf(pdf_path)
-        with open("output.txt", "w", encoding="utf-8") as f:
-            f.write(ocr_text)
-        print("OCR text saved to output.txt")

+import fitz  # PyMuPDF
 from PIL import Image
+import pytesseract
 def extract_images_from_pdf(pdf_path):
     """
+    Extract images from the PDF file using PyMuPDF.
     """
     images = []
+    document = fitz.open(pdf_path)
+    for page_number in range(len(document)):
+        page = document.load_page(page_number)
+        pix = page.get_pixmap(dpi=300)  # Render page to an image with 300 DPI
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        images.append(img)
     return images
 def perform_ocr_on_images(images):
     Perform OCR on the extracted images.
     """
     ocr_results = []
+    for img in images:
+        text = pytesseract.image_to_string(img, lang='mar')  # Specify 'mar' for Marathi
         ocr_results.append(text)
+    return "\n".join(ocr_results)
 def ocr_marathi_from_pdf(pdf_path):
     """
     Main function to handle Marathi OCR from a PDF.
     """
     images = extract_images_from_pdf(pdf_path)
+    ocr_text = perform_ocr_on_images(images)
+    return ocr_text
 if __name__ == "__main__":
+    pdf_path = "path/to/your/marathi.pdf"  # Replace with your PDF file path
+    ocr_text = ocr_marathi_from_pdf(pdf_path)
+    print(ocr_text)